From 93aef45017e97c79206fe97f438be446fe631210 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Sun, 28 Jan 2018 22:55:45 -0500 Subject: [PATCH] added parser.cpp --- .gitignore | 3 ++ Parser.cpp | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 .gitignore create mode 100644 Parser.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d607ccc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea/ +CMakeLists.txt +cmake-build-debug/ diff --git a/Parser.cpp b/Parser.cpp new file mode 100644 index 0000000..259a1f1 --- /dev/null +++ b/Parser.cpp @@ -0,0 +1,127 @@ +// +// Created by Veronica Day on 1/28/18. +// + +#include <string> +#include <functional> +#include <queue> +#include <iostream> +#include <fstream> + +using namespace std; + +// Doc Id +std::priority_queue<int> DOCID_PQ; +std::priority_queue<string> URL_PQ; +string PATH = "/doc"; + +//TODO +// get doc id from DocIDqueue (sent from crawler) +// go to disk and get the HTML file +// parse the html file + // if find url; send to crawler + // if find title send string to tokenizer +class Parser +{ + +public: + + /** + * Parser + * @return + */ + string execute() + { + ifstream inFile; + string docid = to_string(DOCID_PQ.top()); + + inFile.open(PATH+docid); + + // error checking + if (!inFile) + cerr << "Unable to open file datafile.txt"; + + + parse(inFile); + + + + + //TODO + // close file and remove + + } + + +private: + + /** + * Parses file + * @param inFile + * @return + */ + string parse(ifstream inFile) + { + string word = ""; + while (!inFile.eof()) + { + inFile >> word; + + // checks for url + check_url(word); + + // checks for title tags + Tokenizer.execute(check_title(word)); + } + + } + + /** + * Checks for url in string word + * @param word + */ + void check_url(string &word) + { + if (char* pos = strstr("href", word)) + { + while (pos != "\"" && pos != "\'") + ++pos; + + // take everything until next quote + string url = ""; + ++pos; + while (pos != "\"" && pos != "\'") + { + url += *pos; + } + + // send it back to the crawler + URL_PQ.push(url); + } + } + + /** + * <title >AJF</title> + * @param word + */ + string check_title(string &word) + { + if (char* pos = strstr("<title>", word)) + { + pos += 6; + end_pos = strstr("</title>", word); + string title = ""; + while (pos != end_pos) + { + ++pos; + title += *pos; + + } + + return title; + } + + } + + +}; \ No newline at end of file -- GitLab