// // Created by Veronica Day on 1/28/18. // // keep running count of offset, if stop word: don't incrememnt and remove stopword // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue // #include <string> #include <functional> #include <queue> #include <iostream> #include <fstream> #include "../util/Tokenizer.h" #include "../util/stringProcessing.h" #include "../shared/Document.h" #include "../shared/ProducerConsumerQueue.h" using namespace std; /** * This class uses the Doc object from the Crawler to parse the text * Returns a pointer to a dictionary that contains the tokenized input */ class Parser { public: Parser ( ProducerConsumerQueue < string > * urlFrontierIn) { urlFrontier = urlFrontierIn; } /** * Parser * @return */ const unordered_map< string, vector< int>> * execute ( Document* document) { Tokenizer tokenizer; parse ( document->DocToString (), &tokenizer ); return tokenizer.get ( ); } private: ProducerConsumerQueue < string >* urlFrontier; /** * Parses file * @param inFile * @return */ void parse ( string html, Tokenizer *tokenizer ) { string tokenizerInput = ""; string currentTerm = ""; for ( int i = 0; i < html.size ( ); ++i ) { while ( html.at( i ) != '\n' ) { currentTerm += html[ i ]; } string url = extract_url ( currentTerm ); if (url != "") { urlFrontier->Push (url); } else { string title = extract_title ( currentTerm ); if (title != "") { tokenizerInput += title; } } } tokenizer->execute ( tokenizerInput ); } /** * Returns a url, or "" if none * @param word * @return */ string extract_url ( string word ) { string url = ""; if ( *findStr ( word, "<a" ) != '\0' ) { auto foundHttp = findStr ( word, "href=http" ); if ( *foundHttp != '\0' ) { url = "http"; foundHttp += 9; while ( *foundHttp != *findStr ( word, "\">" ) ) { url += *foundHttp; ++foundHttp; } } } return url; } /** * Returns a title, or "" if none * @param word * @return */ string extract_title ( string & word ) { string title = ""; auto pos = findStr ( "<title>", word ); if ( *pos != '\0') { pos += 6; while ( *pos != *findStr ( "</title>", word ) ) { ++pos; title += *pos; } } return title; } };