From a041c8ffe5aac0bb4457db89a8a38be13ebb0e01 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Sun, 4 Mar 2018 19:12:52 -0500 Subject: [PATCH] parser tests pass --- parser/Parser.h | 18 ++++++++++++------ parser/tests/parserTest.cpp | 31 ++++++++++--------------------- util/stringProcessing.h | 2 +- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/parser/Parser.h b/parser/Parser.h index 6435fae..4f7203a 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -59,12 +59,17 @@ private: string tokenizerInput = ""; string currentTerm = ""; - for ( int i = 0; i < html.size ( ); ++i ) + int index = 0; + while (index != html.size()) { - while ( html.at( i ) != '\n' ) + currentTerm = ""; + while ( html.at( index ) != '\n' ) { - currentTerm += html[ i ]; + currentTerm += html[ index ]; + ++index; } + ++index; + string url = extract_url ( currentTerm ); if (url != "") @@ -121,14 +126,15 @@ private: string extract_title ( string & word ) { string title = ""; + char end = '<'; auto pos = findStr ( "<title>", word ); if ( *pos != '\0') { - pos += 6; - while ( *pos != *findStr ( "</title>", word ) ) + pos += 7; + while ( *pos != end ) { - ++pos; title += *pos; + ++pos; } } return title; diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index bc5248b..d160c4c 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -15,30 +15,19 @@ int main ( ) { cout << "Testing Parser ... " << endl << endl; ProducerConsumerQueue < string > * urlFrontierTest; - Document document ( "<!DOCTYPE html>\n" - "<html>\n" - "<head>\n" - "<!-- HTML Codes by Quackit.com -->\n" - "<title>\n" - "Story of Cat</title>\n" - "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n" - "<meta name=\"keywords\" content=\"cat story\">\n" - "<meta name=\"description\" content=\"This is the tale of a cat names joe\">\n" - "<style>\n" - "body {background-color:#ffffff;background-repeat:no-repeat;background-position:top left;background-attachment:fixed;}\n" - "h1{font-family:Arial, sans-serif;color:#000000;background-color:#ffffff;}\n" - "p {font-family:Georgia, serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;}\n" - "</style>\n" - "</head>\n" - "<body>\n" - "<h1>Joe the cat</h1>\n" - "<p>On Saturday, joe the cat went to the store. He climbed up a mountain? It was weird. The store was called Food Store</p>\n" - "</body>\n" - "</html>" ); + Document document ( "<title>This Cat Title Cat</title>\n" ); Parser parser ( urlFrontierTest ); auto dictionary = parser.execute ( &document ); - assert( dictionary != nullptr ); + + assert ( dictionary != nullptr ); + assert ( dictionary->size () == 2); + assert ( dictionary->find ( "cat" ) != dictionary->end () ); + assert ( dictionary->find ( "title" ) != dictionary->end () ); + assert ( dictionary->find ( "this" ) == dictionary->end () ); + assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 ); + assert ( dictionary->at ( "title" )[ 0 ] == 1 ); + cout << "Parser Tests Passed! :D" << endl; } diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 0afdee9..ed54713 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -28,7 +28,7 @@ set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", " * @param needle * @return */ -string::iterator findStr ( string haystack, string needle ) +string::iterator findStr (string needle, string haystack ) { auto beginNeedle = needle.begin ( ); -- GitLab