diff --git a/crawler/Readers/LocalReader.cpp b/crawler/Readers/LocalReader.cpp index 08105912fee20bfce2f947d937149adac3ba0bf3..cef70bae71e13f472349b1950a57834e7298514e 100644 --- a/crawler/Readers/LocalReader.cpp +++ b/crawler/Readers/LocalReader.cpp @@ -31,7 +31,7 @@ string LocalReader::PageToString ( ) ParsedUrl LocalReader::getUrl ( ) { - ParsedUrl url(fileName); //Fixme + ParsedUrl url(test_url); return url; } diff --git a/tests/plaintext.txt b/tests/plaintext.txt index b570f9cfcf71339a326b71dfe96c93b56d7e4c80..5a34b1d751755671058a3879d0f7fb783e13abf6 100644 --- a/tests/plaintext.txt +++ b/tests/plaintext.txt @@ -2,7 +2,11 @@ <title> Apple Ardvark Anteater Alligator </title> -<body> +<p class="text-muted"> Basement Battle Bridge Bottle -</body> +</p> +<p class="text-muted"> +Hello Goodbye <a href="http://veronicaday.com/" class="btn btn-yes"> +Cat Cradle +</p> </html> \ No newline at end of file diff --git a/util/Stemmer.cpp b/util/Stemmer.cpp index 7b17f21ca769045a0d82720203451b25c7045ec9..674a93b5626a38ee8cccffd5bc7d618d6d8df605 100644 --- a/util/Stemmer.cpp +++ b/util/Stemmer.cpp @@ -49,6 +49,12 @@ int Stemmer::measure ( std::string word ) int m = 0; unsigned long wordIt = 0; unsigned long wordEnd = word.size( ) - 1; + + if ( word.empty( ) ) + { + return 0; + } + // Looking for CVC pattern while ( wordIt <= wordEnd ) { @@ -108,6 +114,12 @@ int Stemmer::measure ( std::string word ) */ bool Stemmer::isVowelPresent ( unsigned long wordBeg, unsigned long wordEnd, string word ) { + + if ( word.empty( ) ) + { + return false; + } + while ( wordBeg != wordEnd && wordBeg < word.size( ) ) { if ( !isConsonant( wordBeg, word ) ) @@ -159,6 +171,11 @@ bool Stemmer::isConsonant ( unsigned long wordIt, string word ) */ bool Stemmer::addE ( string word ) { + if ( word.empty( ) ) + { + return false; + } + // AT -> ATE // BL -> BLE // IZ -> IZE @@ -182,6 +199,11 @@ bool Stemmer::addE ( string word ) */ bool Stemmer::doubleCon ( string word ) { + if ( word.empty( ) ) + { + return false; + } + unsigned long endWord = word.size( ) - 1; if ( word.size( ) > 2 && word[ endWord ] == word[ endWord - 1 ] ) @@ -209,6 +231,12 @@ bool Stemmer::doubleCon ( string word ) */ bool Stemmer::endCVC ( std::string word ) { + + if ( word.empty( ) ) + { + return false; + } + unsigned long endWord = word.size( ) - 1; if ( word.size( ) > 2 ) @@ -236,6 +264,11 @@ bool Stemmer::endCVC ( std::string word ) std::string Stemmer::step1a ( std::string word ) { + if ( word.empty( ) ) + { + return word; + } + // check S at end if ( word[ word.size( ) - 1 ] == 's' && word.size() != 1) { @@ -286,6 +319,12 @@ std::string Stemmer::step1a ( std::string word ) */ std::string Stemmer::step1b ( std::string word ) { + + if ( word.empty( ) ) + { + return word; + } + unsigned long end = word.size( ) - 1; auto begPtr = word.begin( ); auto endPtr = begPtr + end; @@ -349,6 +388,10 @@ std::string Stemmer::step1b ( std::string word ) string Stemmer::step1c ( string word ) { + if ( word.empty( ) ) + { + return word; + } // Y -> I // happy -> happi // sky -> sky @@ -371,7 +414,7 @@ string Stemmer::step1c ( string word ) */ string Stemmer::step2 ( std::string word ) { - if ( measure( word ) == 0 ) + if ( measure( word ) == 0 || word.empty( ) ) { return word; } @@ -528,7 +571,7 @@ string Stemmer::step2 ( std::string word ) std::string Stemmer::step3 ( std::string word ) { - if ( measure( word ) == 0 ) + if ( measure( word ) == 0 || word.empty( ) ) { return word; } @@ -592,7 +635,7 @@ std::string Stemmer::step3 ( std::string word ) */ std::string Stemmer::step4 ( std::string word ) { - if ( measure( word ) <= 2 ) + if ( measure( word ) <= 2 || word.empty( ) ) { return word; } @@ -730,6 +773,11 @@ std::string Stemmer::step4 ( std::string word ) */ std::string Stemmer::step5a ( std::string word ) { + if ( word.empty( ) ) + { + return word; + } + auto m = measure( word ); // E -> // probabte -> probat @@ -757,6 +805,11 @@ std::string Stemmer::step5a ( std::string word ) */ std::string Stemmer::step5b ( std::string word ) { + if ( word.empty( ) ) + { + return word; + } + if ( word.size( ) > 2 && measure( word ) > 1 && word[ word.size( ) - 1 ] == 'l' && word[ word.size( ) - 2 ] == 'l' ) { word = subStr( word, 0, word.size( ) - 1 ); diff --git a/util/tests/stemmerTest.cpp b/util/tests/stemmerTest.cpp index c49415b87a47bd2cf09d7056b1888ec7db828a6b..79fabfa93c4e9e56cff20f22969c92c1b8d8f34f 100644 --- a/util/tests/stemmerTest.cpp +++ b/util/tests/stemmerTest.cpp @@ -110,6 +110,8 @@ int main ( ) assert ( stem.execute( "are" ) == "ar" ); assert ( stem.execute( "terrible" ) == "terribl" ); + assert ( stem.execute( "" ) == "" ); + assert ( stem.execute( "s" ) == "s" ); cout << "\nTests passed for Stemmer :D" << endl;