From b9acd359ec6621ca63c0a918af457babd335196b Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Mon, 5 Mar 2018 00:20:44 -0500 Subject: [PATCH] parse logic imporved --- parser/Parser.h | 68 ++++++++++++++++++++----------------- parser/tests/parserTest.cpp | 53 +++++++++++++++++++++++++++-- util/Tokenizer.h | 11 +++--- util/stringProcessing.h | 52 +++++++++++++++++++++++++++- 4 files changed, 144 insertions(+), 40 deletions(-) diff --git a/parser/Parser.h b/parser/Parser.h index 4f7203a..5d1c166 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -54,39 +54,46 @@ private: * @param inFile * @return */ + //TODO instead of grabbing each line, look to see if beginning of + // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found void parse ( string html, Tokenizer *tokenizer ) { - - string tokenizerInput = ""; - string currentTerm = ""; - int index = 0; - while (index != html.size()) + auto htmlIt = html.begin(); + int offset = 0; + while (htmlIt != html.end()) { - currentTerm = ""; - while ( html.at( index ) != '\n' ) - { - currentTerm += html[ index ]; - ++index; - } - ++index; - - - string url = extract_url ( currentTerm ); - if (url != "") + // if open bracket + if ( *htmlIt == '<' ) { - urlFrontier->Push (url); + auto begCloseTag = findNext ("</", htmlIt); + auto endCloseTag = findNext ( ">", begCloseTag); + string line (htmlIt, endCloseTag + 1); + htmlIt = endCloseTag + 2; + + // check if line is url + string url = extract_url ( line ); + if (url != "") + { + urlFrontier->Push ( url ); + } + // check if line is title + else + { + string title = extract_title ( line ); + if (title != "") + { + tokenizer->execute ( title, offset ); + } + } + //TODO fix offset? 
+ offset = htmlIt - html.begin(); } else { - string title = extract_title ( currentTerm ); - if (title != "") - { - tokenizerInput += title; - } + ++htmlIt; } - } - tokenizer->execute ( tokenizerInput ); + } @@ -98,16 +105,15 @@ private: string extract_url ( string word ) { string url = ""; - - if ( *findStr ( word, "<a" ) != '\0' ) + if ( *findStr ( "<a", word ) != '\0' ) { - auto foundHttp = findStr ( word, "href=http" ); + auto foundHref = findStr ( "href", word ); + auto foundHttp = findNext ( "http", foundHref ); if ( *foundHttp != '\0' ) { - url = "http"; - foundHttp += 9; - - while ( *foundHttp != *findStr ( word, "\">" ) ) + url = ""; + auto closeTag = findNext ( ">", word.begin ( ) ); + while ( *foundHttp != *closeTag ) { url += *foundHttp; ++foundHttp; diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index d160c4c..50c7469 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -11,11 +11,23 @@ using namespace std; +void testSimple ( ); +void testComplex ( ); + int main ( ) { cout << "Testing Parser ... " << endl << endl; + testSimple (); + testComplex (); + cout << "Parser Tests Passed! :D" << endl; + + } + +void testSimple ( ) + { + ProducerConsumerQueue < string > * urlFrontierTest; - Document document ( "<title>This Cat Title Cat</title>\n" ); + Document document ( "<title>This Cat Title Cat</title>" ); Parser parser ( urlFrontierTest ); auto dictionary = parser.execute ( &document ); @@ -28,7 +40,44 @@ int main ( ) assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 ); assert ( dictionary->at ( "title" )[ 0 ] == 1 ); - cout << "Parser Tests Passed! 
:D" << endl; + delete dictionary; } +void testComplex ( ) + { + + ProducerConsumerQueue < string > * urlFrontierTest; + ifstream file("../tests/cats.html"); + string temp; + string docString = "<title>Joe the Cat</title>\n"; + docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n"; + while(std::getline(file, temp)) { + docString += temp; + } + + Document document ( docString ); + + Parser parser ( urlFrontierTest ); + auto dictionary = parser.execute ( &document ); + +// cout << dictionary->size () << endl; +// for (auto p : *dictionary) +// cout << p.first << endl; + + assert ( dictionary != nullptr ); + assert ( dictionary->size () == 3); + + assert ( dictionary->find ( "cat" ) != dictionary->end () ); + assert ( dictionary->find ( "story" ) != dictionary->end () ); + assert ( dictionary->find ( "joe" ) != dictionary->end () ); + + assert ( dictionary->find ( "the" ) == dictionary->end () ); + assert ( dictionary->find ( "of" ) == dictionary->end () ); + +// assert ( dictionary->at ( "cat" )[ 0 ] == 1 ); +// assert ( dictionary->at ( "story" )[ 0 ] == 0 ); +// cout << urlFrontierTest->Size () << endl; +// cout << urlFrontierTest->Pop () << endl; + delete dictionary; + } \ No newline at end of file diff --git a/util/Tokenizer.h b/util/Tokenizer.h index 3e28002..3de99f5 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -14,17 +14,16 @@ class Tokenizer public: Tokenizer ( ) { - doc_index = new unordered_map< string, vector< int>>; + docIndex = new unordered_map< string, vector< int>>; } unordered_map< string, vector< int>> *get ( ) const { - return doc_index; + return docIndex; } - void execute ( string originalText ) + void execute ( string originalText, int offset ) { - int offset = 0; vector< string > splitText = splitStr ( originalText, ' ' ); string lowerString = ""; for ( int i = 0; i < splitText.size ( ); ++i ) @@ -32,12 +31,12 @@ public: lowerString = toLower ( splitText[ i ] ); if ( !isStopWord ( lowerString ) ) { - ( 
*doc_index )[ lowerString ].push_back ( offset ); + ( *docIndex )[ lowerString ].push_back ( offset ); ++offset; } } } private: - unordered_map< string, vector< int>> *doc_index; + unordered_map< string, vector< int>> *docIndex; }; diff --git a/util/stringProcessing.h b/util/stringProcessing.h index ed54713..40056e0 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -24,9 +24,10 @@ set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", " "you", "your" }; /** * Finds the needle in the haystack + * returns position of first match * @param haystack * @param needle - * @return + * @return string::iterator */ string::iterator findStr (string needle, string haystack ) { @@ -75,6 +76,55 @@ string::iterator findStr (string needle, string haystack ) } +/** + * Finds the next position of the needle in the string + * @param needle + * @param haystackPointer + * @return string::iterator + */ +string::iterator findNext (string needle, string::iterator haystackPointer ) + { + auto beginNeedle = needle.begin ( ); + auto beginHaystack = haystackPointer; + while ( *beginHaystack != '\0' ) + { + //keep looking for instance of a match + if ( *beginHaystack != *beginNeedle ) + { + ++beginHaystack; + } + + else if ( *beginHaystack == *beginNeedle ) + { + /* want to keep the original iterator where it is so it + can return the beginning of the matched word if found */ + auto temp = beginHaystack; + while ( *temp == *beginNeedle ) + { + ++temp; + ++beginNeedle; + //if it hits the end of the needle, it signifies an exact match + if ( *beginNeedle == '\0' ) + { + //this is pointing at the beginning of the match + return beginHaystack; + } + + } + //need to reset because still has to search rest of the string for a match + beginNeedle = needle.begin ( ); + //sets the original text pointer to where the last search left off + beginHaystack = temp; + } + + else + { + //DO NOTHING + } + } + + return beginHaystack; + } /** -- GitLab