From 20743e12416d98a5024e7a9e727a1199b53465fd Mon Sep 17 00:00:00 2001
From: aanvi <aanvi@umich.edu>
Date: Tue, 13 Mar 2018 15:32:50 -0400
Subject: [PATCH] Added script conditional, changed dictionary value to a
 vector of structs instead of just offsets

---
 parser/Parser.cpp | 42 +++++++++++++++++++++++++++++++++---------
 util/Tokenizer.h  | 24 ++++++++++++++++++++----
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index e412e36..9a37a91 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -14,8 +14,15 @@
  */
 //TODO instead of grabbing each line, look to see if beginning of
 // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
+// TODO eventually read the input in as a stream of chars - cat into a string?
+// TODO track different counts: frequency, total number of unique words, etc.
+// TODO handle bad HTML style (i.e. no closing <p> tag)
+// TODO flag different types of words - decide whether that belongs in the dict key or the value (the wordData struct)
+
 void Parser::parse ( string html, Tokenizer *tokenizer )
 	{
+
+	// maybe read the input as a stream and append chars to the string as they arrive
 	auto htmlIt = html.begin();
 	int offset = 0;
 	while (htmlIt != html.end())
@@ -23,25 +30,32 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 		// if open bracket
 		if ( *htmlIt == '<' )
 			{
+			// TODO ensure that the opening and closing tags are the same type
 			auto begCloseTag = findNext ("</", htmlIt);
 			auto endCloseTag = findNext ( ">", begCloseTag);
 			string line (htmlIt, endCloseTag + 1);
 			htmlIt = endCloseTag + 2;
 
-			// check if line is url
 			string url = extract_url ( line );
-			if (url != "")
+			string title = extract_title ( line );
+			// check if line is a script
+			if ( isScript ( line ) )
+				{
+				// DO NOTHING - skip script content
+				}
+			// check if line is url
+			else if ( url != "" )
 				{
+				// TODO where is urlFrontier defined?
 				urlFrontier->Push ( url );
 				}
 			// check if line is title
+			else if ( title != "" )
 				{
-				string title = extract_title ( line );
-				if (title != "")
-					{
-					tokenizer->execute ( title, offset );
-					}
+				tokenizer->execute ( title, offset );
 				}
 			//TODO fix offset?
 			offset = htmlIt - html.begin();
@@ -60,6 +74,16 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
  * @param word
  * @return
  */
+
+bool Parser::isScript ( string word )
+	{
+	// a chunk of html is script content if it contains a <script tag
+	return *findStr ( "<script", word ) != '\0';
+	}
+
 string Parser::extract_url ( string word )
 	{
 	string url = "";
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index ba27e43..2b16020 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -9,34 +9,50 @@
 
 using namespace std;
 
+// one record per occurrence of a word in the document
+struct wordData
+	{
+	int offset;
+	int frequency = 0;
+	// TODO also track total number of words / unique words?
+	};
 
 class Tokenizer
 	{
 public:
 	Tokenizer ( )
 		{
-		docIndex = new unordered_map< string, vector< int>>;
+		docIndex = new unordered_map< string, vector< wordData>>;
 		}
 
-	unordered_map< string, vector< int>> *get ( ) const
+	unordered_map< string, vector< wordData>> *get ( ) const
 		{
 		return docIndex;
 		}
-
+	// TODO add a word-type parameter, ie paragraph, url, etc
 	void execute ( string originalText, int offset )
 		{
 		vector< string > splitText = splitStr ( originalText, ' ' );
 		string lowerString = "";
 		for ( int i = 0; i < splitText.size ( ); ++i )
 			{
 			lowerString = toLower ( splitText[ i ] );
 			if ( !isStopWord ( lowerString ) )
 				{
-				( *docIndex )[ lowerString ].push_back ( offset );
+				// build the record on the stack; the vector stores a copy,
+				// so nothing here needs to be deleted off the heap
+				// (docIndex itself is still new'd in the ctor - when would a dtor come into play?)
+				wordData currentWord;
+				currentWord.offset = offset;
+				currentWord.frequency = 1;
+				( *docIndex )[ lowerString ].push_back ( currentWord );
 				++offset;
 				}
 			}
 		}
 
 private:
-	unordered_map< string, vector< int>> *docIndex;
+	unordered_map< string, vector< wordData>> *docIndex;
 	};
-- 
GitLab
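
Reviewer note: a minimal, self-contained sketch of how a caller might consume the new index shape (word -> vector of wordData) that this patch introduces. The struct mirrors util/Tokenizer.h; main ( ), the sample word, and the printing loop are illustrative only and not part of the commit:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

// mirrors the struct added in util/Tokenizer.h
struct wordData
	{
	int offset;
	int frequency = 0;
	};

int main ( )
	{
	// stand-in for what *tokenizer->get ( ) would hold after a parse
	unordered_map< string, vector< wordData>> docIndex;

	wordData currentWord;
	currentWord.offset = 3;
	currentWord.frequency = 1;
	docIndex[ "engine" ].push_back ( currentWord );
	currentWord.offset = 17;
	docIndex[ "engine" ].push_back ( currentWord );

	// each occurrence of a word carries its own offset and frequency,
	// instead of the bare int offsets stored before this patch
	for ( const auto &entry : docIndex )
		{
		cout << entry.first << " (" << entry.second.size ( ) << " occurrence(s)):";
		for ( const wordData &word : entry.second )
			cout << ' ' << word.offset;
		cout << '\n';
		}
	return 0;
	}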
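
The "when would a dtor come into play?" comment concerns the map that the constructor allocates with new and never frees. One hypothetical answer, sketched outside this commit: give Tokenizer a matching destructor (or hold the map by value and drop the new entirely). Names mirror the patch; the destructor and deleted copy operations are assumptions, not something this commit adds:

#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

struct wordData
	{
	int offset;
	int frequency = 0;
	};

class Tokenizer
	{
public:
	Tokenizer ( )
		{
		docIndex = new unordered_map< string, vector< wordData>>;
		}

	// sketch only: pairs with the new in the ctor so the crawler
	// never has to delete the index by hand
	~Tokenizer ( )
		{
		delete docIndex;
		}

	// with a raw owning pointer, copying should also be disabled
	Tokenizer ( const Tokenizer & ) = delete;
	Tokenizer &operator= ( const Tokenizer & ) = delete;

private:
	unordered_map< string, vector< wordData>> *docIndex;
	};

int main ( )
	{
	Tokenizer tokenizer; // docIndex is freed when tokenizer leaves scope
	return 0;
	}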