From b9a95befbaddb23f60359d739caf9a1ee0930c1d Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Thu, 15 Mar 2018 23:53:32 -0400 Subject: [PATCH] fixed stemmer --- parser/Parser.cpp | 1 - parser/tests/parserTest.cpp | 46 ++++++++++++++++-------------------- util/Stemmer.cpp | 2 +- util/tests/tokenizerTest.cpp | 1 + 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 1df6081..99b46d4 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -49,7 +49,6 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) if ( url != "" ) { urlFrontier->Push( url ); - cout << url << endl; } // check if line is title else diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 48ec8c5..7cd5f2b 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -12,34 +12,14 @@ void testSimple ( ); void testComplex ( ); +void testURL ( ); int main ( ) { cout << "Testing Parser ... " << endl << endl; - - const char * line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>"; - - ProducerConsumerQueue< string > urlFrontierTest; - ParsedUrl url = ParsedUrl( "testurl.com" ); - char docString[10240]; - strcpy( docString, line ); - Document document( url, docString ); - - Parser parser( &urlFrontierTest ); - auto dict = parser.execute( &document ); - - for ( auto it = dict->begin( ); it != dict->end( ); it++ ) - { - cout << it->first << ':'; - for ( int i = 0; i < it->second.size( ); ++i ) - { - cout << it->second[ i ] << " "; - } - cout << std::endl; - } - -// testSimple( ); -// testComplex( ); + testURL ( ); + testSimple( ); + testComplex( ); cout << "Parser Tests Passed! :D" << endl; } @@ -48,7 +28,7 @@ void testSimple ( ) { ProducerConsumerQueue< string > urlFrontierTest; - ParsedUrl url = ParsedUrl( "testurl.com" ); + ParsedUrl url = ParsedUrl( "http://www.testurl.com" ); char docString[10240]; strcpy( docString, "<title>This Cat Title Cat</title>" ); Document document( url, docString ); @@ -105,3 +85,19 @@ void testComplex ( ) delete[] writable; } + +void testURL ( ) + { + const char *line = "<li><span class=\"official-website\"><span class=\"url\"><a rel=\"nofollow\" class=\"external text\" href=\"http://www.bafta.org/\">Official website</a></span></span></li>"; + + ProducerConsumerQueue< string > urlFrontierTest; + ParsedUrl url = ParsedUrl( "testurl.com" ); + char docString[10240]; + strcpy( docString, line ); + Document document( url, docString ); + + Parser parser( &urlFrontierTest ); + auto dict = parser.execute( &document ); + assert ( urlFrontierTest.Pop( ) == "http://www.bafta.org/"); + delete dict; + } \ No newline at end of file diff --git a/util/Stemmer.cpp b/util/Stemmer.cpp index 52f47c1..952445f 100644 --- a/util/Stemmer.cpp +++ b/util/Stemmer.cpp @@ -323,7 +323,7 @@ std::string Stemmer::step1b ( std::string word ) else if ( *substrING != '\0' && isVowelPresent( word.begin( ), substrING, word ) ) { wordStem = subStr( word.begin( ), substrING ); - if ( addE( wordStem ) || ( m == 1 && endCVC( wordStem + 'e' ) ) ) + if ( addE( wordStem ) || ( measure ( wordStem ) == 1 && endCVC( wordStem + 'e' ) ) ) { wordStem += 'e'; } diff --git a/util/tests/tokenizerTest.cpp b/util/tests/tokenizerTest.cpp index 4755059..a89e22d 100644 --- a/util/tests/tokenizerTest.cpp +++ b/util/tests/tokenizerTest.cpp @@ -41,5 +41,6 @@ void testExecute ( string original ) } cout << std::endl; } + delete dict; } -- GitLab