diff --git a/CMakeLists.txt b/CMakeLists.txt index 26aa9a42697d73a1ac5ea363fc6cf9592a802401..5ba668904de271ede10523abb5daf212c72eb267 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,19 +21,21 @@ add_executable(crawler-parser-test add_executable(StringProcessingTest util/stringProcessing.cpp - util/Stemmer.h + util/Stemmer.cpp util/tests/stringProcessingTest.cpp) add_executable(TokenizerTest util/Tokenizer.cpp util/stringProcessing.cpp + util/Stemmer.cpp util/tests/tokenizerTest.cpp) add_executable(StemmerTest + util/stringProcessing.cpp util/Stemmer.cpp util/tests/stemmerTest.cpp) -add_executable(ParserTest +add_executable(ParserTestE2E parser/Parser.cpp shared/Document.cpp shared/url.h @@ -41,6 +43,7 @@ add_executable(ParserTest util/Tokenizer.cpp shared/ProducerConsumerQueue.h util/stringProcessing.cpp + util/Stemmer.cpp parser/tests/parserTest.cpp) diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 2740f938d5e06e6d7fb354602a40d03a5a9c183b..14fbc83e3baaf256fb0ed92afac69ff84bc07556 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -55,7 +55,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) * @param word * @return */ -string Parser::extract_url ( string word ) +string Parser::extract_url ( string & word ) { string url = ""; if ( *findStr ( "<a", word ) != '\0' ) diff --git a/parser/Parser.h b/parser/Parser.h index 5fd995a7ab1dfce301b1d0594c7f90d41e6b6441..da60d764f0fe07690dfb8983cf4af45c65919489 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -58,7 +58,7 @@ private: * @param word * @return */ - string extract_url ( string word ); + string extract_url ( string & word ); /** diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 4d990d051db3b050f4916b42b23de1f871360694..542bfbe4c6fdea7309fc9bcfad55a62884e5754d 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -35,10 +35,10 @@ void testSimple ( ) assert ( dictionary != nullptr ); assert ( dictionary->size () == 2); assert ( dictionary->find ( "cat" ) != dictionary->end () ); - assert ( dictionary->find ( "title" ) != dictionary->end () ); + assert ( dictionary->find ( "titl" ) != dictionary->end () ); assert ( dictionary->find ( "this" ) == dictionary->end () ); assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 ); - assert ( dictionary->at ( "title" )[ 0 ] == 1 ); + assert ( dictionary->at ( "titl" )[ 0 ] == 1 ); delete dictionary; @@ -70,7 +70,7 @@ void testComplex ( ) assert ( dictionary->size () == 3); assert ( dictionary->find ( "cat" ) != dictionary->end () ); - assert ( dictionary->find ( "story" ) != dictionary->end () ); + assert ( dictionary->find ( "stori" ) != dictionary->end () ); assert ( dictionary->find ( "joe" ) != dictionary->end () ); assert ( dictionary->find ( "the" ) == dictionary->end () ); diff --git a/util/Stemmer.cpp b/util/Stemmer.cpp index 0604e3f9e208e9dd83c0fdec066dd4965beb18a4..4f6f5383e4f8e267eeb9dc9bd0c6736cabc762cc 100644 --- a/util/Stemmer.cpp +++ b/util/Stemmer.cpp @@ -1,5 +1,6 @@ #include "Stemmer.h" +#include "stringProcessing.h" /** * Stemmer Cstor @@ -15,14 +16,14 @@ Stemmer::Stemmer ( ) */ std::string Stemmer::execute ( std::string word ) { - word = step1a ( word ); - word = step1b ( word ); - word = step1c ( word ); - word = step2 ( word ); - word = step3 ( word ); - word = step4 ( word ); - word = step5a ( word ); - word = step5b ( word ); + word = step1a( word ); + word = step1b( word ); + word = step1c( word ); + word = step2( word ); + word = step3( word ); + word = step4( word ); + word = step5a( word ); + word = step5b( word ); return word; } @@ -41,7 +42,7 @@ int Stemmer::measure ( std::string word ) { int m = 0; int begin = 0; - unsigned long end = word.size ( ) - 1; + unsigned long end = word.size( ) - 1; while ( true ) { @@ -49,7 +50,7 @@ int Stemmer::measure ( std::string word ) { return m; } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) + if ( !isConsonant( word.begin( ) + begin, word.begin( ) ) ) { break; } @@ -65,7 +66,7 @@ int Stemmer::measure ( std::string word ) { return m; } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) + if ( isConsonant( word.begin( ) + begin, word.begin( ) ) ) { break; } @@ -79,7 +80,7 @@ int Stemmer::measure ( std::string word ) { return m; } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) + if ( !isConsonant( word.begin( ) + begin, word.begin( ) ) ) { break; } @@ -101,10 +102,11 @@ bool Stemmer::isVowelPresent ( string::iterator wordBeg, string::iterator wordEn { while ( wordBeg != wordEnd ) { - if ( !isConsonant ( wordBeg, word.begin ( ) ) ) + if ( !isConsonant( wordBeg, word.begin( ) ) ) { return true; } + ++wordBeg; } return false; } @@ -118,7 +120,7 @@ bool Stemmer::isVowelPresent ( string::iterator wordBeg, string::iterator wordEn */ bool Stemmer::isConsonant ( string::iterator wordIt, string::iterator wordBegin ) { - if ( *wordIt == 'a' || *wordIt == 'e' || *wordIt == 'i' || *wordIt == 'u' ) + if ( *wordIt == 'a' || *wordIt == 'e' || *wordIt == 'i' || *wordIt == 'o' || *wordIt == 'u' ) { return false; } @@ -130,7 +132,7 @@ bool Stemmer::isConsonant ( string::iterator wordIt, string::iterator wordBegin } else { - return ( !isConsonant ( wordIt - 1, wordBegin ) ); + return ( !isConsonant( wordIt - 1, wordBegin ) ); } } return true; @@ -147,11 +149,12 @@ bool Stemmer::addE ( string word ) // AT -> ATE // BL -> BLE // IZ -> IZE - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - auto substrAT = findPrev ( "at", endPtr ); - auto substrBL = findPrev ( "bl", endPtr ); - auto substrIZ = findPrev ( "iz", endPtr ); + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; + auto substrAT = findPrev( "at", endPtr, begPtr + word.size( ) - 3 ); + auto substrBL = findPrev( "bl", endPtr, begPtr + word.size( ) - 3 ); + auto substrIZ = findPrev( "iz", endPtr, begPtr + word.size( ) - 3 ); if ( *substrAT != '\0' || *substrBL != '\0' || *substrIZ != '\0' ) { @@ -171,10 +174,10 @@ bool Stemmer::addE ( string word ) */ bool Stemmer::doubleCon ( string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto endPtr = word.begin( ) + end; - if ( word.size ( ) > 2 && *endPtr == *( endPtr - 1 ) ) + if ( word.size( ) > 2 && *endPtr == *( endPtr - 1 ) ) { if ( *endPtr == 'l' || *endPtr == 's' || *endPtr == 'z' ) { @@ -198,14 +201,14 @@ bool Stemmer::doubleCon ( string word ) */ bool Stemmer::endCVC ( std::string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto endPtr = word.begin( ) + end - 1; - if ( word.size ( ) > 2 ) + if ( word.size( ) > 3 ) { // the stem ends cvc - if ( isConsonant ( endPtr, word.begin ( ) ) && !isConsonant ( endPtr - 1, word.begin ( ) ) && - isConsonant ( endPtr - 2, word.begin ( ) ) ) + if ( isConsonant( endPtr, word.begin( ) ) && !isConsonant( endPtr - 1, word.begin( ) ) && + isConsonant( endPtr - 2, word.begin( ) ) ) { // the second c is not W, X or Y if ( *( endPtr - 1 ) != 'w' && *( endPtr - 1 ) != 'x' && *( endPtr - 1 ) != 'y' ) @@ -225,45 +228,48 @@ bool Stemmer::endCVC ( std::string word ) */ std::string Stemmer::step1a ( std::string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; // check S at end - if ( word.at ( end ) == 's' ) + if ( word.at( end ) == 's' ) { - string wordStem ( word.begin ( ), word.end ( ) ); + string wordStem( word.begin( ), word.end( ) ); - auto substrSSES = findPrev ( "sses", endPtr ); - auto substrIES = findPrev ( "ies", endPtr ); - auto substrSS = findPrev ( "ss", endPtr ); - auto substrS = findPrev ( "s", endPtr ); + auto substrSSES = findPrev( "sses", endPtr, begPtr + word.size( ) - 5 ); + auto substrIES = findPrev( "ies", endPtr, begPtr + word.size( ) - 4 ); + auto substrSS = findPrev( "ss", endPtr, begPtr + word.size( ) - 3 ); + auto substrS = findPrev( "s", endPtr, begPtr + word.size( ) - 2 ); // sses -> ss // caresses -> caress if ( *substrSSES != '\0' ) { - string wordStem ( word.begin ( ), substrSSES + 1 ); + wordStem = subStr( word.begin( ), substrSSES ); + wordStem += "ss"; } // ies -> i // ponies -> poni else if ( *substrIES != '\0' ) { - string wordStem ( word.begin ( ), substrIES + 1 ); + wordStem = subStr( word.begin( ), substrIES ); + wordStem += 'i'; } // ss -> ss // caress -> caress else if ( *substrSS != '\0' ) { - string wordStem ( word.begin ( ), word.end ( ) ); + // do nothing } // s -> // cats -> cat else if ( *substrS != '\0' ) { - string wordStem ( word.begin ( ), substrS + 1 ); + wordStem = subStr( word.begin( ), substrS ); } else { - string wordStem ( word.begin ( ), word.end ( ) ); + wordStem = subStr( word.begin( ), word.end( ) ); } return wordStem; @@ -279,59 +285,65 @@ std::string Stemmer::step1a ( std::string word ) */ std::string Stemmer::step1b ( std::string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; - if ( measure ( word ) > 0 ) - { - string wordStem ( word.begin ( ), word.end ( ) ); + string wordStem( word.begin( ), word.end( ) ); - auto substrEED = findPrev ( "eed", endPtr ); - auto substrED = findPrev ( "ed", endPtr ); - auto substrING = findPrev ( "ing", endPtr ); + auto substrEED = findPrev( "eed", endPtr, begPtr + word.size( ) - 4 ); + auto substrED = findPrev( "ed", endPtr, begPtr + word.size( ) - 3 ); + auto substrING = findPrev( "ing", endPtr, begPtr + word.size( ) - 4 ); - // check EED at end and m > 0 - // feed -> feed - // agreed -> agree - if ( *substrEED != '\0' ) + // check EED at end and m > 0 + // feed -> feed + // agreed -> agree + if ( measure( word ) > 1 && *substrEED != '\0' ) + { + wordStem = subStr( word.begin( ), substrEED ); + wordStem += "ee"; + } + // check ED at end and preceeded by substr with vowel + // plastered -> plaster + // bled -> bled + else if ( measure( word ) > 1 && *substrED != '\0' && isVowelPresent( word.begin( ), substrED, word ) ) + { + wordStem = subStr( word.begin( ), substrED ); + if ( addE( wordStem ) ) { - string wordStem ( word.begin ( ), substrEED + 1 ); + wordStem += 'e'; } - // check ED at end and preceeded by substr with vowel - // plastered -> plaster - // bled -> bled - else if ( *substrED != '\0' && isVowelPresent ( word.begin ( ), substrED, word ) ) + else if ( doubleCon( wordStem ) ) { - - string wordStem ( word.begin ( ), substrED + 1 ); - if ( addE ( wordStem ) ) - { - wordStem += 'e'; - } - else if ( doubleCon ( wordStem ) ) - { - wordStem = wordStem.substr ( 0, wordStem.size ( ) - 1 ); - } - else if ( measure ( wordStem ) > 1 && endCVC ( wordStem ) ) - { - wordStem += 'e'; - } + wordStem = subStr( word, 0, wordStem.size( ) - 1 ); } - // check ING at end and proceeded by substr with vowel - // motoring -> motor - // sing -> sing - else if ( *substrING != '\0' && isVowelPresent ( word.begin ( ), substrING, word ) ) + else if ( measure( word ) == 1 && endCVC( wordStem + 'e' ) ) { - string wordStem ( word.begin ( ), substrING + 1 ); - if ( addE ( wordStem ) ) - { - wordStem += 'e'; - } + wordStem += 'e'; } - - return wordStem; } - return word; + // check ING at end and proceeded by substr with vowel + // motoring -> motor + // sing -> sing + else if ( *substrING != '\0' && isVowelPresent( word.begin( ), substrING, word ) ) + { + wordStem = subStr( word.begin( ), substrING ); + if ( addE( wordStem ) ) + { + wordStem += 'e'; + } + else if ( doubleCon( wordStem ) ) + { + wordStem = subStr( word, 0, wordStem.size( ) - 1 ); + } + else if ( measure( wordStem ) == 1 && endCVC( wordStem + 'e' ) ) + { + wordStem += 'e'; + } + } + + return wordStem; + } /** @@ -339,19 +351,19 @@ std::string Stemmer::step1b ( std::string word ) * @param word * @return */ -string step1c ( string word ) +string Stemmer::step1c ( string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto endPtr = word.begin( ) + end; // Y -> I // happy -> happi // sky -> sky if ( *endPtr == 'y' ) { - if ( isVowelPresent ( word.begin ( ), endPtr, word ) ) + if ( isVowelPresent( word.begin( ), endPtr, word ) ) { - word = word.substr ( 0, word.size ( ) - 1 ); + word = subStr( word, 0, word.size( ) - 1 ); word += 'i'; } } @@ -366,177 +378,177 @@ string step1c ( string word ) */ string Stemmer::step2 ( std::string word ) { - - if ( measure ( word ) == 0 ) + if ( measure( word ) == 0 ) { return word; } - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), word.end ( ) ); - - auto substrATIONAL = findPrev ( "ational", endPtr ); - auto substrTIONAL = findPrev ( "tional", endPtr ); - auto substrENCI = findPrev ( "enci", endPtr ); - auto substrANCI = findPrev ( "anci", endPtr ); - auto substrIZER = findPrev ( "izer", endPtr ); - auto substrABLI = findPrev ( "abli", endPtr ); - auto substrALLI = findPrev ( "alli", endPtr ); - auto substrENTLI = findPrev ( "entli", endPtr ); - auto substrELI = findPrev ( "eli", endPtr ); - auto substrOUSLI = findPrev ( "ousli", endPtr ); - auto substrIZATION = findPrev ( "ization", endPtr ); - auto substrATION = findPrev ( "ation", endPtr ); - auto substrATOR = findPrev ( "ator", endPtr ); - auto substrALISM = findPrev ( "alism", endPtr ); - auto substrIVENESS = findPrev ( "iveness", endPtr ); - auto substrFULNESS = findPrev ( "fulness", endPtr ); - auto substrOUSNESS = findPrev ( "ousness", endPtr ); - auto substrALITI = findPrev ( "aliti", endPtr ); - auto substrIVITI = findPrev ( "iviti", endPtr ); - auto substrBILITI = findPrev ( "biliti", endPtr ); + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; + string wordStem( word.begin( ), word.end( ) ); + + auto substrATIONAL = findPrev( "ational", endPtr, begPtr + word.size( ) - 8 ); + auto substrTIONAL = findPrev( "tional", endPtr, begPtr + word.size( ) - 7 ); + auto substrENCI = findPrev( "enci", endPtr, begPtr + word.size( ) - 5 ); + auto substrANCI = findPrev( "anci", endPtr, begPtr + word.size( ) - 5 ); + auto substrIZER = findPrev( "izer", endPtr, begPtr + word.size( ) - 5 ); + auto substrABLI = findPrev( "abli", endPtr, begPtr + word.size( ) - 5 ); + auto substrALLI = findPrev( "alli", endPtr, begPtr + word.size( ) - 5 ); + auto substrENTLI = findPrev( "entli", endPtr, begPtr + word.size( ) - 6 ); + auto substrELI = findPrev( "eli", endPtr, begPtr + word.size( ) - 4 ); + auto substrOUSLI = findPrev( "ousli", endPtr, begPtr + word.size( ) - 6 ); + auto substrIZATION = findPrev( "ization", endPtr, begPtr + word.size( ) - 8 ); + auto substrATION = findPrev( "ation", endPtr, begPtr + word.size( ) - 6 ); + auto substrATOR = findPrev( "ator", endPtr, begPtr + word.size( ) - 5 ); + auto substrALISM = findPrev( "alism", endPtr, begPtr + word.size( ) - 6 ); + auto substrIVENESS = findPrev( "iveness", endPtr, begPtr + word.size( ) - 8 ); + auto substrFULNESS = findPrev( "fulness", endPtr, begPtr + word.size( ) - 8 ); + auto substrOUSNESS = findPrev( "ousness", endPtr, begPtr + word.size( ) - 8 ); + auto substrALITI = findPrev( "aliti", endPtr, begPtr + word.size( ) - 6 ); + auto substrIVITI = findPrev( "iviti", endPtr, begPtr + word.size( ) - 6 ); + auto substrBILITI = findPrev( "biliti", endPtr, begPtr + word.size( ) - 7 ); // ATIONAL -> ATE // relational -> relate - if ( *substrATIONAL != '\0' ) + if ( *substrATIONAL != '\0' && ( begPtr + 1 ) != substrATIONAL ) { - string wordStem ( word.begin ( ), substrATIONAL + 1 ); - wordStem + 'ate'; + wordStem = subStr( word.begin( ), substrATIONAL ); + wordStem += "ate"; } // TIONAL -> TION // conditional -> condition // rational -> rational else if ( *substrTIONAL != '\0' ) { - string wordStem ( word.begin ( ), substrTIONAL + 1 ); - wordStem += 'tion'; + wordStem = subStr( word.begin( ), substrTIONAL ); + wordStem += "tion"; } // ENCI -> ENCE // valenci -> valence else if ( *substrENCI != '\0' ) { - string wordStem ( word.begin ( ), substrENCI + 1 ); - wordStem += 'ence'; + wordStem = subStr( word.begin( ), substrENCI ); + wordStem += "ence"; } // ANCI -> ANCE // hesitanci -> hesitance else if ( *substrANCI != '\0' ) { - string wordStem ( word.begin ( ), substrANCI + 1 ); - wordStem += 'ance'; + wordStem = subStr( word.begin( ), substrANCI ); + wordStem += "ance"; } // IZER -> IZE // digitizer -> digitize else if ( *substrIZER != '\0' ) { - string wordStem ( word.begin ( ), substrIZER + 1 ); - wordStem += 'ize'; + wordStem = subStr( word.begin( ), substrIZER ); + wordStem += "ize"; } // ABLI -> ABLE // conformabli -> comformable else if ( *substrABLI != '\0' ) { - string wordStem ( word.begin ( ), substrABLI + 1 ); - wordStem += 'able'; + wordStem = subStr( word.begin( ), substrABLI ); + wordStem += "able"; } // ALLI -> AL // radicalli -> radical else if ( *substrALLI != '\0' ) { - string wordStem ( word.begin ( ), substrALLI + 1 ); - wordStem += 'al'; + wordStem = subStr( word.begin( ), substrALLI ); + wordStem += "al"; } // ENTLI -> ENT // differentli -> different else if ( *substrENTLI != '\0' ) { - string wordStem ( word.begin ( ), substrENTLI + 1 ); - wordStem += 'ent'; + wordStem = subStr( word.begin( ), substrENTLI ); + wordStem += "ent"; } // ELI -> E // vileli -> vile else if ( *substrELI != '\0' ) { - string wordStem ( word.begin ( ), substrELI + 1 ); + wordStem = subStr( word.begin( ), substrELI ); wordStem += 'e'; } // OUSLI -> OUS // analogousli -> analogous else if ( *substrOUSLI != '\0' ) { - string wordStem ( word.begin ( ), substrOUSLI + 1 ); - wordStem += 'ous'; + wordStem = subStr( word.begin( ), substrOUSLI ); + wordStem += "ous"; } // IZATION -> IZE // vietnamization -> vietnamize else if ( *substrIZATION != '\0' ) { - string wordStem ( word.begin ( ), substrIZATION + 1 ); - wordStem += 'ize'; + wordStem = subStr( word.begin( ), substrIZATION ); + wordStem += "ize"; } // ATION -> ATE // predication -> predicate else if ( *substrATION != '\0' ) { - string wordStem ( word.begin ( ), substrATION + 1 ); - wordStem += 'ate'; + wordStem = subStr( word.begin( ), substrATION ); + wordStem += "ate"; } // ATOR -> ATE // predication -> predicate else if ( *substrATOR != '\0' ) { - string wordStem ( word.begin ( ), substrATOR + 1 ); - wordStem += 'ate'; + wordStem = subStr( word.begin( ), substrATOR ); + wordStem += "ate"; } // ALISM -> AL // feudalism -> feudal else if ( *substrALISM != '\0' ) { - string wordStem ( word.begin ( ), substrALISM + 1 ); - wordStem += 'al'; + wordStem = subStr( word.begin( ), substrALISM ); + wordStem += "al"; } // IVENESS -> IVE // decisivenss -> decisive else if ( *substrIVENESS != '\0' ) { - string wordStem ( word.begin ( ), substrIVENESS + 1 ); - wordStem += 'ive'; + wordStem = subStr( word.begin( ), substrIVENESS ); + wordStem += "ive"; } // FULNESS -> FUL // hopefulness -> hopeful else if ( *substrFULNESS != '\0' ) { - string wordStem ( word.begin ( ), substrFULNESS + 1 ); - wordStem += 'ful'; + wordStem = subStr( word.begin( ), substrFULNESS ); + wordStem += "ful"; } // OUSNESS -> OUS // callousness -> callous else if ( *substrOUSNESS != '\0' ) { - string wordStem ( word.begin ( ), substrOUSNESS + 1 ); - wordStem += 'ous'; + wordStem = subStr( word.begin( ), substrOUSNESS ); + wordStem += "ous"; } // ALITI -> AL // formalit -> callous else if ( *substrOUSNESS != '\0' ) { - string wordStem ( word.begin ( ), substrOUSNESS + 1 ); - wordStem += 'al'; + wordStem = subStr( word.begin( ), substrOUSNESS ); + wordStem += "al"; } // IVITI -> IVE // sensitiviti -> sensitive else if ( *substrIVITI != '\0' ) { - string wordStem ( word.begin ( ), substrIVITI + 1 ); - wordStem += 'ive'; + wordStem = subStr( word.begin( ), substrIVITI ); + wordStem += "ive"; } // BILITI -> BLE // sensibiliti -> sensible else if ( *substrBILITI != '\0' ) { - string wordStem ( word.begin ( ), substrBILITI + 1 ); - wordStem += 'ble'; + wordStem = subStr( word.begin( ), substrBILITI ); + wordStem += "ble"; } return wordStem; @@ -551,68 +563,69 @@ string Stemmer::step2 ( std::string word ) std::string Stemmer::step3 ( std::string word ) { - if ( measure ( word ) == 0 ) + if ( measure( word ) == 0 ) { return word; } - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), word.end ( ) ); + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; + string wordStem( word.begin( ), word.end( ) ); - auto substrICATE = findPrev ( "icate", endPtr ); - auto substrATIVE = findPrev ( "ative", endPtr ); - auto substrALIZE = findPrev ( "alize", endPtr ); - auto substrICITI = findPrev ( "iciti", endPtr ); - auto substrICAL = findPrev ( "ical", endPtr ); - auto substrFUL = findPrev ( "ful", endPtr ); - auto substrNESS = findPrev ( "ness", endPtr ); + auto substrICATE = findPrev( "icate", endPtr, begPtr + word.size( ) - 6 ); + auto substrATIVE = findPrev( "ative", endPtr, begPtr + word.size( ) - 6 ); + auto substrALIZE = findPrev( "alize", endPtr, begPtr + word.size( ) - 6 ); + auto substrICITI = findPrev( "iciti", endPtr, begPtr + word.size( ) - 6 ); + auto substrICAL = findPrev( "ical", endPtr, begPtr + word.size( ) - 4 ); + auto substrFUL = findPrev( "ful", endPtr, begPtr + word.size( ) - 4 ); + auto substrNESS = findPrev( "ness", endPtr, begPtr + word.size( ) - 5 ); // ICATE -> IC // triplicate -> triplic if ( *substrICATE != '\0' ) { - string wordStem ( word.begin ( ), substrICATE + 1 ); - wordStem + 'ic'; + wordStem = subStr( word.begin( ), substrICATE ); + wordStem += "ic"; } // ATIVE -> // formative -> form else if ( *substrATIVE != '\0' ) { - string wordStem ( word.begin ( ), substrATIVE + 1 ); + wordStem = subStr( word.begin( ), substrATIVE ); } // ALIZE -> AL // formalize -> formal else if ( *substrALIZE != '\0' ) { - string wordStem ( word.begin ( ), substrALIZE + 1 ); - wordStem += 'al'; + wordStem = subStr( word.begin( ), substrALIZE ); + wordStem += "al"; } // ICITI -> IC // electriciti -> electric else if ( *substrICITI != '\0' ) { - string wordStem ( word.begin ( ), substrICITI + 1 ); - wordStem += 'ic'; + wordStem = subStr( word.begin( ), substrICITI ); + wordStem += "ic"; } // ICAL -> IC // electrical -> electric else if ( *substrICAL != '\0' ) { - string wordStem ( word.begin ( ), substrICAL + 1 ); - wordStem += 'ic'; + wordStem = subStr( word.begin( ), substrICAL ); + wordStem += "ic"; } // FUL -> // hopeful -> hope else if ( *substrFUL != '\0' ) { - string wordStem ( word.begin ( ), substrFUL + 1 ); + wordStem = subStr( word.begin( ), substrFUL ); } // NESS -> // goodness -> good else if ( *substrNESS != '\0' ) { - string wordStem ( word.begin ( ), substrNESS + 1 ); + wordStem = subStr( word.begin( ), substrNESS ); } return wordStem; @@ -626,150 +639,151 @@ std::string Stemmer::step3 ( std::string word ) */ std::string Stemmer::step4 ( std::string word ) { - if ( measure ( word ) <= 1 ) + if ( measure( word ) <= 2 ) { return word; } - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), word.end ( ) ); - - - auto substrAL = findPrev ( "al", endPtr ); - auto substrANCE = findPrev ( "ance", endPtr ); - auto substrENCE = findPrev ( "ence", endPtr ); - auto substrER = findPrev ( "er", endPtr ); - auto substrIC = findPrev ( "ic", endPtr ); - auto substrABLE = findPrev ( "able", endPtr ); - auto substrIBLE = findPrev ( "ible", endPtr ); - auto substrANT = findPrev ( "ant", endPtr ); - auto substrEMENT = findPrev ( "ement", endPtr ); - auto substrMENT = findPrev ( "ment", endPtr ); - auto substrENT = findPrev ( "ent", endPtr ); - auto substrTION = findPrev ( "tion", endPtr ); - auto substrOU = findPrev ( "ou", endPtr ); - auto substrISM = findPrev ( "ism", endPtr ); - auto substrATE = findPrev ( "ate", endPtr ); - auto substrITI = findPrev ( "iti", endPtr ); - auto substrOUS = findPrev ( "ous", endPtr ); - auto substrIVE = findPrev ( "ive", endPtr ); - auto substrIZE = findPrev ( "ize", endPtr ); + unsigned long end = word.size( ) - 1; + auto begPtr = word.begin( ); + auto endPtr = begPtr + end; + string wordStem( word.begin( ), word.end( ) ); + + + auto substrAL = findPrev( "al", endPtr, begPtr + word.size( ) - 3 ); + auto substrANCE = findPrev( "ance", endPtr, begPtr + word.size( ) - 5 ); + auto substrENCE = findPrev( "ence", endPtr, begPtr + word.size( ) - 5 ); + auto substrER = findPrev( "er", endPtr, begPtr + word.size( ) - 3 ); + auto substrIC = findPrev( "ic", endPtr, begPtr + word.size( ) - 3 ); + auto substrABLE = findPrev( "able", endPtr, begPtr + word.size( ) - 5 ); + auto substrIBLE = findPrev( "ible", endPtr, begPtr + word.size( ) - 5 ); + auto substrANT = findPrev( "ant", endPtr, begPtr + word.size( ) - 4 ); + auto substrEMENT = findPrev( "ement", endPtr, begPtr + word.size( ) - 6 ); + auto substrMENT = findPrev( "ment", endPtr, begPtr + word.size( ) - 5 ); + auto substrENT = findPrev( "ent", endPtr, begPtr + word.size( ) - 4 ); + auto substrION = findPrev( "ion", endPtr, begPtr + word.size( ) - 4 ); + auto substrOU = findPrev( "ou", endPtr, begPtr + word.size( ) - 3 ); + auto substrISM = findPrev( "ism", endPtr, begPtr + word.size( ) - 4 ); + auto substrATE = findPrev( "ate", endPtr, begPtr + word.size( ) - 4 ); + auto substrITI = findPrev( "iti", endPtr, begPtr + word.size( ) - 4 ); + auto substrOUS = findPrev( "ous", endPtr, begPtr + word.size( ) - 4 ); + auto substrIVE = findPrev( "ive", endPtr, begPtr + word.size( ) - 4 ); + auto substrIZE = findPrev( "ize", endPtr, begPtr + word.size( ) - 4 ); // AL -> // revival -> reviv if ( *substrAL != '\0' ) { - string wordStem ( word.begin ( ), substrAL + 1 ); + wordStem = subStr( word.begin( ), substrAL ); } // ANCE -> // allowance -> allow else if ( *substrANCE != '\0' ) { - string wordStem ( word.begin ( ), substrANCE + 1 ); + wordStem = subStr( word.begin( ), substrANCE ); } // ENCE -> // inference -> infer else if ( *substrENCE != '\0' ) { - string wordStem ( word.begin ( ), substrENCE + 1 ); + wordStem = subStr( word.begin( ), substrENCE ); } // ER -> // airliner -> airlin else if ( *substrER != '\0' ) { - string wordStem ( word.begin ( ), substrER + 1 ); + wordStem = subStr( word.begin( ), substrER ); } // IC -> // gyroscopic -> gyroscope else if ( *substrIC != '\0' ) { - string wordStem ( word.begin ( ), substrIC + 1 ); + wordStem = subStr( word.begin( ), substrIC ); } // ABLE -> // adjustable -> adjust else if ( *substrABLE != '\0' ) { - string wordStem ( word.begin ( ), substrABLE + 1 ); + wordStem = subStr( word.begin( ), substrABLE ); } // IBLE -> // goodness -> good else if ( *substrIBLE != '\0' ) { - string wordStem ( word.begin ( ), substrIBLE + 1 ); + wordStem = subStr( word.begin( ), substrIBLE ); } // ANT -> // irritant -> irrit else if ( *substrANT != '\0' ) { - string wordStem ( word.begin ( ), substrANT + 1 ); + wordStem = subStr( word.begin( ), substrANT ); } // EMENT -> // replacement -> replace else if ( *substrEMENT != '\0' ) { - string wordStem ( word.begin ( ), substrEMENT + 1 ); + wordStem = subStr( word.begin( ), substrEMENT ); } // MENT -> // adjustment -> adjust else if ( *substrMENT != '\0' ) { - string wordStem ( word.begin ( ), substrMENT + 1 ); + wordStem = subStr( word.begin( ), substrMENT ); } // ENT -> // dependent -> depend else if ( *substrENT != '\0' ) { - string wordStem ( word.begin ( ), substrENT + 1 ); + wordStem = subStr( word.begin( ), substrENT ); } // TION -> // stem must end in 't' or 's' // adoption -> adopt - else if ( *substrTION != '\0' && ( *( substrTION - 1 ) == 's' || *( substrTION - 1 ) == 't' ) ) + else if ( *substrION != '\0' && ( *( substrION - 1 ) == 's' || *( substrION - 1 ) == 't' ) ) { - string wordStem ( word.begin ( ), substrTION + 1 ); + wordStem = subStr( word.begin( ), substrION ); } // OU -> // homologou -> homolog else if ( *substrOU != '\0' ) { - string wordStem ( word.begin ( ), substrOU + 1 ); + wordStem = subStr( word.begin( ), substrOU ); } // ISM -> // communism -> commun else if ( *substrISM != '\0' ) { - string wordStem ( word.begin ( ), substrISM + 1 ); + wordStem = subStr( word.begin( ), substrISM ); } // ATE -> // activate -> activ else if ( *substrATE != '\0' ) { - string wordStem ( word.begin ( ), substrATE + 1 ); + wordStem = subStr( word.begin( ), substrATE ); } // ITI -> // angulariti -> angular else if ( *substrITI != '\0' ) { - string wordStem ( word.begin ( ), substrITI + 1 ); + wordStem = subStr( word.begin( ), substrITI ); } // OUS -> // homologous -> homolog else if ( *substrOUS != '\0' ) { - string wordStem ( word.begin ( ), substrOUS + 1 ); + wordStem = subStr( word.begin( ), substrOUS ); } // IVE -> // effective -> effect else if ( *substrIVE != '\0' ) { - string wordStem ( word.begin ( ), substrIVE + 1 ); + wordStem = subStr( word.begin( ), substrIVE ); } // IZE -> // bowdlerize -> bowdler else if ( *substrIZE != '\0' ) { - string wordStem ( word.begin ( ), substrIZE + 1 ); + wordStem = subStr( word.begin( ), substrIZE ); } return wordStem; @@ -783,21 +797,21 @@ std::string Stemmer::step4 ( std::string word ) */ std::string Stemmer::step5a ( std::string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto endPtr = word.begin( ) + end; // E -> // probabte -> probat - if ( measure ( word ) > 1 && *endPtr == 'e' ) + if ( measure( word ) > 1 && *endPtr == 'e' ) { - word = word.substr ( 0, word.size ( ) - 1 ); + word = subStr( word, 0, word.size( ) - 1 ); return word; } // E -> // cease -> cease - if ( measure ( word ) == 1 && !endCVC ( word ) && *endPtr == 'e' ) + if ( measure( word ) == 1 && !endCVC( word ) && *endPtr == 'e' ) { - word = word.substr ( 0, word.size ( ) - 1 ); + word = subStr( word, 0, word.size( ) - 1 ); } return word; @@ -811,12 +825,12 @@ std::string Stemmer::step5a ( std::string word ) */ std::string Stemmer::step5b ( std::string word ) { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + unsigned long end = word.size( ) - 1; + auto endPtr = word.begin( ) + end; - if ( word.size ( ) > 2 && measure ( word ) > 1 && *endPtr == 'l' && *( endPtr - 1 ) == 'l' ) + if ( word.size( ) > 2 && measure( word ) > 1 && *endPtr == 'l' && *( endPtr - 1 ) == 'l' ) { - word = word.substr ( 0, word.size ( ) - 1 ); + word = subStr( word, 0, word.size( ) - 1 ); } return word; } \ No newline at end of file diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index 76f45b2798061c76ed752852157fe5bfd5fa363f..477bc4b9dba0d2e5025a96c8e8092d3c55860b08 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -11,6 +11,7 @@ Tokenizer::Tokenizer ( ) /** * Returns pointer to the docIndex dictionary + * * @return pointer to unordered_map< string, vector< int>> */ unordered_map< string, vector< int>> *Tokenizer::get ( ) const @@ -21,22 +22,27 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const /** * Executes the Tokenizer * Sends tokens to dictionary + * * token -> [offsets] * @param originalText * @param offset */ -void Tokenizer::execute ( string originalText, int offset ) +void Tokenizer::execute ( string & originalText, int offset ) { - vector< string > splitText = splitStr ( originalText, ' ' ); - //TODO make function to remove characters - //TODO normalize contractions - string lowerString = ""; - for ( int i = 0; i < splitText.size ( ); ++i ) + vector< string > splitText = splitStr( originalText, ' ' ); + string processedString = ""; + for ( int i = 0; i < splitText.size( ); ++i ) { - lowerString = toLower ( splitText[ i ] ); - if ( !isStopWord ( lowerString ) ) + // case fold + processedString = toLower( splitText[ i ] ); + //strip all characters + processedString = stripStr( processedString ); + + if ( !isStopWord( processedString ) ) { - ( *docIndex )[ lowerString ].push_back ( offset ); + // stem word + processedString = stem.execute( processedString ); + ( *docIndex )[ processedString ].push_back( offset ); ++offset; } } diff --git a/util/Tokenizer.h b/util/Tokenizer.h index 9e9cecd3fb91faee10ec6602167f8a4006d8c928..bccbcfaf23c7c9f19de168550ff57052696caa53 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -4,20 +4,38 @@ #include <unordered_map> #include <vector> #include "stringProcessing.h" +#include "Stemmer.h" using namespace std; class Tokenizer { - + public: + + /** + * Tokenizer Cstor + */ Tokenizer ( ); + /** + * Returns pointer to the docIndex dictionary + * + * @return pointer to unordered_map< string, vector< int>> + */ unordered_map< string, vector< int>> *get ( ) const; - void execute ( string originalText, int offset ); + /** + * Executes the Tokenizer + * Sends tokens to dictionary + * + * token -> [offsets] + * @param originalText + * @param offset + */ + void execute ( string &originalText, int offset ); private: unordered_map< string, vector< int>> *docIndex; - + Stemmer stem; }; diff --git a/util/stringProcessing.cpp b/util/stringProcessing.cpp index 14e1cf1ab948be895780449b7594d9d603b7362a..3b4d5366828dfd2263a46c4a2af792813410b2a8 100644 --- a/util/stringProcessing.cpp +++ b/util/stringProcessing.cpp @@ -4,6 +4,7 @@ #include "stringProcessing.h" #include "Stemmer.h" +#include <cassert> using namespace std; @@ -15,48 +16,48 @@ using namespace std; * @param needle * @return string::iterator */ -string::iterator findStr (string needle, string haystack ) +string::iterator findStr ( string needle, string haystack ) { - auto beginNeedle = needle.begin ( ); - auto beginHaystack = haystack.begin(); + auto beginNeedle = needle.begin( ); + auto beginHaystack = haystack.begin( ); while ( *beginHaystack != '\0' ) - { + { //keep looking for instance of a match if ( *beginHaystack != *beginNeedle ) - { + { ++beginHaystack; - } + } else if ( *beginHaystack == *beginNeedle ) - { + { /* want to keep the original iterator where it is so it can return the beginning of the matched word if found */ auto temp = beginHaystack; while ( *temp == *beginNeedle ) - { + { ++temp; ++beginNeedle; //if it hits the end of the needleing, it signifies an exact match if ( *beginNeedle == '\0' ) - { + { //this is pointing at the beginning of the match return beginHaystack; - } + } - } + } //need to reset because still has to search rest of the string for a match - beginNeedle = needle.begin ( ); + beginNeedle = needle.begin( ); //sets the original text pointer to where the last search left off beginHaystack = temp; - } + } else - { + { //DO NOTHING + } } - } return beginHaystack; @@ -69,48 +70,47 @@ string::iterator findStr (string needle, string haystack ) * @param pointer * @return string::iterator */ -string::iterator findNext (string needle, string::iterator haystackPointer ) - { - auto beginNeedle = needle.begin ( ); - auto beginHaystack = haystackPointer; - while ( *beginHaystack != '\0' ) +string::iterator findNext ( string needle, string::iterator haystackPointer ) { - //keep looking for instance of a match - if ( *beginHaystack != *beginNeedle ) + auto beginNeedle = needle.begin( ); + while ( *haystackPointer != '\0' ) { - ++beginHaystack; - } + //keep looking for instance of a match + if ( *haystackPointer != *beginNeedle ) + { + ++haystackPointer; + } - else if ( *beginHaystack == *beginNeedle ) - { + else if ( *haystackPointer == *beginNeedle ) + { /* want to keep the original iterator where it is so it can return the beginning of the matched word if found */ - auto temp = beginHaystack; + auto temp = haystackPointer; while ( *temp == *beginNeedle ) - { + { ++temp; ++beginNeedle; //if it hits the end of the needleing, it signifies an exact match if ( *beginNeedle == '\0' ) - { + { //this is pointing at the beginning of the match - return beginHaystack; - } + return haystackPointer; + } - } + } //need to reset because still has to search rest of the string for a match - beginNeedle = needle.begin ( ); + beginNeedle = needle.begin( ); //sets the original text pointer to where the last search left off - beginHaystack = temp; - } + haystackPointer = temp; + } else - { + { //DO NOTHING + } } - } - return beginHaystack; + return haystackPointer; } /** @@ -118,50 +118,58 @@ string::iterator findNext (string needle, string::iterator haystackPointer ) * * @param needle * @param haystackPointer - * @return + * @return string::iterator */ -string::iterator findPrev ( string needle, string::iterator haystackPointer ) +string::iterator findPrev ( string needle, string::iterator haystackPointer, string::iterator haystackBeg ) { - auto beginNeedle = needle.begin ( ); - auto beginHaystack = haystackPointer; - while ( *beginHaystack != '\0' ) + auto begNeedle = needle.begin( ); + auto endNeedle = begNeedle + ( needle.size( ) - 1 ); + + while ( haystackPointer != haystackBeg ) + { + //keep looking for instance of a match + if ( *haystackPointer != *endNeedle ) { - //keep looking for instance of a match - if ( *beginHaystack != *beginNeedle ) - { - --beginHaystack; - } + --haystackPointer; + } - else if ( *beginHaystack == *beginNeedle ) + else if ( *haystackPointer == *endNeedle ) + { + /* want to keep the original iterator where it is so it + can return the beginning of the matched word if found */ + auto temp = haystackPointer; + while ( *temp == *endNeedle ) { - /* want to keep the original iterator where it is so it - can return the beginning of the matched word if found */ - auto temp = beginHaystack; - while ( *temp == *beginNeedle ) + //if it hits the end of the needleing, it signifies an exact match + if ( endNeedle == begNeedle && *temp == *endNeedle ) { - --temp; - --beginNeedle; - //if it hits the end of the needleing, it signifies an exact match - if ( *beginNeedle == '\0' ) - { - //this is pointing at the beginning of the match - return beginHaystack; - } + //this is pointing at the beginning of the match + return temp; + } + if ( temp != haystackBeg ) + { + --temp; + } + if ( endNeedle != begNeedle ) + { + --endNeedle; } - //need to reset because still has to search rest of the string for a match - beginNeedle = needle.begin ( ); - //sets the original text pointer to where the last search left off - beginHaystack = temp; - } - else - { - //DO NOTHING } + //need to reset because still has to search rest of the string for a match + endNeedle = begNeedle + ( needle.size( ) - 1 ); + //sets the original text pointer to where the last search left off + haystackPointer = temp; + } + + else + { + //DO NOTHING } + } - return beginHaystack; + return needle.end( ); } /** @@ -169,25 +177,25 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer ) * * @param originalText * @param delim - * @return + * @return vector < string > */ -vector< string > splitStr ( string originalText, char delim ) +vector< string > splitStr ( string & originalText, char delim ) { vector< string > splitWords; - auto begin = originalText.begin ( ); + auto begin = originalText.begin( ); while ( *begin != '\0' ) - { + { string word = ""; while ( *begin != delim && *begin != '\0' ) - { + { word += *begin; ++begin; - } + } - splitWords.push_back ( word ); + splitWords.push_back( word ); ++begin; - } + } return splitWords; @@ -197,11 +205,11 @@ vector< string > splitStr ( string originalText, char delim ) * Returns true if @word is a stopword * * @param word - * @return + * @return bool */ -bool isStopWord ( string word ) +bool isStopWord ( string & word ) { - return ( stopWords.find ( word ) != stopWords.end ( ) ); + return ( stopWords.find( word ) != stopWords.end( ) ); } @@ -209,25 +217,25 @@ bool isStopWord ( string word ) * Returns lowercase @word * * @param word - * @return + * @return string */ -string toLower ( string word ) +string toLower ( string & word ) { - auto iter = word.begin ( ); + auto iter = word.begin( ); string lowerWord = ""; while ( *iter != '\0' ) - { - if ( *iter >= 'A' && *iter <= 'Z' ) { + if ( *iter >= 'A' && *iter <= 'Z' ) + { lowerWord += ( *iter + 32 ); - } + } else - { + { lowerWord += *iter; - } + } ++iter; - } + } return lowerWord; } @@ -236,11 +244,139 @@ string toLower ( string word ) * Returns stemmed @word * * @param word - * @return + * @return string */ -string stemWord(string word) +string stemWord ( string & word ) { Stemmer stemmer; - word = stemmer.execute ( word ); + word = stemmer.execute( word ); return word; } + +/** + * Returns a substring [ post, len ) + * + * @param word + * @param pos + * @param len + * @return string + */ +string subStr ( string & word, size_t pos, size_t len ) + { + string substr = ""; + for ( int i = 0; i < len; ++i ) + { + substr += word.at( pos ); + ++pos; + } + return substr; + } + +/** + * Returns a substring [ begin, end ) + * + * @param pos + * @param len + * @return string + */ +string subStr ( string::iterator begin, string::iterator end ) + { + string substr = ""; + while ( begin != end ) + { + substr += *begin; + ++begin; + } + return substr; + } + +/** + * Removes the chars in vector from word + * + * @param word + * @param chars + * @return string + */ +string stripStr ( string & word, vector< char > chars ) + { + string wordStripped = ""; + auto begin = word.begin( ); + bool isSymbol = false; + + while ( begin != word.end( ) ) + { + for ( int i = 0; i < chars.size( ); ++i ) + { + if ( *begin == chars[ i ] ) + { + isSymbol = true; + } + } + if ( !isSymbol ) + { + wordStripped += *begin; + } + ++begin; + } + return wordStripped; + } + +/** + * Removes all chars from word + * Assumes word is lowercase + * + * @param word + * @param chars + * @return string + */ +string stripStr ( string & word ) + { + string wordStripped = ""; + auto begin = word.begin( ); + + while ( begin != word.end( ) ) + { + if ( isAlpha( *begin ) || isNum( *begin ) ) + { + wordStripped += *begin; + } + ++begin; + } + return wordStripped; + } + +/** + * Returns true is character is a letter + * + * @param ch + * @return bool + */ +bool isAlpha ( char ch ) + { + // capital letter + if ( ch >= 'A' && ch <= 'Z' ) + { + return true; + } + // lowercase letter + if ( ch >= 'a' && ch <= 'z' ) + { + return true; + } + return false; + } + +/** + * Returns true is character is a number + * + * @param ch + * @return bool + */ +bool isNum ( char ch ) + { + if ( ch >= '0' && ch <= '9' ) + { + return true; + } + return false; + } \ No newline at end of file diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 4de025e504da9c03479835d1cf09b5c96a6d876e..c3e6a7c10eb9a0cae3060761c8e99b215af8100c 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -48,40 +48,93 @@ string::iterator findNext ( string needle, string::iterator haystackPointer ); * * @param needle * @param haystackPointer - * @return + * @param haystackBeg + * @return string::iterator */ -string::iterator findPrev ( string needle, string::iterator haystackPointer ); +string::iterator findPrev ( string needle, string::iterator haystackPointer, string::iterator haystackBeg ); /** * Returns a vector of strings from @originalText, split by @delim * * @param originalText * @param delim - * @return + * @return vector< string > */ -vector< string > splitStr ( string originalText, char delim ); +vector< string > splitStr ( string & originalText, char delim ); /** * Returns true if @word is a stopword * * @param word - * @return + * @return bool */ -bool isStopWord ( string word ); +bool isStopWord ( string & word ); /** * Returns lowercase @word * * @param word - * @return + * @return string */ -string toLower ( string word ); +string toLower ( string & word ); /** * Returns stemmed @word * * @param word - * @return + * @return string + */ +string stemWord ( string & word ); + +/** + * Returns a substring [ post, len ) + * + * @param word + * @param pos + * @param len + * @return string + */ +string subStr ( string & word, size_t pos, size_t len ); + +/** + * Returns a substring [ begin, end ) + * + * @param pos + * @param len + * @return string */ -string stemWord ( string word ); +string subStr ( string::iterator begin, string::iterator end ); +/** + * Removes the chars in vector from word + * + * @param word + * @param chars + * @return string + */ +string stripStr ( string & word, vector< char > chars ); + +/** + * Removes all chars from word + * Assumes word is lowercase + * + * @param word + * @return string + */ +string stripStr ( string & word ); + +/** + * Returns true is character is a letter + * + * @param ch + * @return bool + */ +bool isAlpha ( char ch ); + +/** + * Returns true is character is a number + * + * @param ch + * @return bool + */ +bool isNum ( char ch ); \ No newline at end of file diff --git a/util/tests/stemmerTest.cpp b/util/tests/stemmerTest.cpp index aaa8053a8c127b05b10a9e24ee103925b1065945..c49415b87a47bd2cf09d7056b1888ec7db828a6b 100644 --- a/util/tests/stemmerTest.cpp +++ b/util/tests/stemmerTest.cpp @@ -1,16 +1,115 @@ #include <string> #include <vector> -#include "../stringProcessing.h" +#include "../Stemmer.h" #include <iostream> #include <cassert> int main ( ) { - cout << "Beginning testing for Stemmer" << endl << endl; + cout << "Beginning testing for Stemmer" << endl; + + Stemmer stem; + + assert ( stem.execute( "caresses" ) == "caress" ); + assert ( stem.execute( "ponies" ) == "poni" ); + assert ( stem.execute( "ties" ) == "ti" ); + assert ( stem.execute( "caress" ) == "caress" ); + assert ( stem.execute( "cats" ) == "cat" ); + assert ( stem.execute( "feed" ) == "feed" ); + + assert ( stem.execute( "agreed" ) == "agre" ); + assert ( stem.execute( "plastered" ) == "plaster" ); + assert ( stem.execute( "bled" ) == "bled" ); + assert ( stem.execute( "motoring" ) == "motor" ); + assert ( stem.execute( "conflated" ) == "conflat" ); + + assert ( stem.execute( "troubled" ) == "troubl" ); + assert ( stem.execute( "sized" ) == "size" ); + assert ( stem.execute( "hopping" ) == "hop" ); + assert ( stem.execute( "tanning" ) == "tan" ); + assert ( stem.execute( "tanned" ) == "tan" ); + + assert ( stem.execute( "falling" ) == "fall" ); + assert ( stem.execute( "hissing" ) == "hiss" ); + assert ( stem.execute( "fizzed" ) == "fizz" ); + assert ( stem.execute( "failing" ) == "fail" ); + assert ( stem.execute( "filing" ) == "file" ); + + assert ( stem.execute( "happy" ) == "happi" ); + assert ( stem.execute( "sky" ) == "sky" ); + assert ( stem.execute( "relational" ) == "relat" ); + assert ( stem.execute( "conditional" ) == "condit" ); + assert ( stem.execute( "rational" ) == "ration" ); + + assert ( stem.execute( "valenci" ) == "valenc" ); + assert ( stem.execute( "hesitanci" ) == "hesit" ); + assert ( stem.execute( "digitizer" ) == "digit" ); + assert ( stem.execute( "conformabli" ) == "conform" ); + assert ( stem.execute( "radicalli" ) == "radic" ); + assert ( stem.execute( "differentli" ) == "differ" ); + + assert ( stem.execute( "vileli" ) == "vile" ); + assert ( stem.execute( "analogousli" ) == "analog" ); + assert ( stem.execute( "vietnamization" ) == "vietnam" ); + assert ( stem.execute( "predication" ) == "predic" ); + assert ( stem.execute( "operator" ) == "oper" ); + assert ( stem.execute( "feudalism" ) == "feudal" ); + + assert ( stem.execute( "decisiveness" ) == "decis" ); + assert ( stem.execute( "hopefulness" ) == "hope" ); + assert ( stem.execute( "callousness" ) == "callous" ); + assert ( stem.execute( "formaliti" ) == "formal" ); + assert ( stem.execute( "sensitiviti" ) == "sensit" ); + assert ( stem.execute( "sensibiliti" ) == "sensibl" ); + + assert ( stem.execute( "triplicate" ) == "triplic" ); + assert ( stem.execute( "formative" ) == "form" ); + assert ( stem.execute( "formalize" ) == "formal" ); + assert ( stem.execute( "electriciti" ) == "electr" ); + assert ( stem.execute( "electrical" ) == "electr" ); + assert ( stem.execute( "hopeful" ) == "hope" ); + + assert ( stem.execute( "goodness" ) == "good" ); + assert ( stem.execute( "revival" ) == "reviv" ); + assert ( stem.execute( "allowance" ) == "allow" ); + assert ( stem.execute( "inference" ) == "infer" ); + + assert ( stem.execute( "airliner" ) == "airlin" ); + assert ( stem.execute( "gyroscopic" ) == "gyroscop" ); + assert ( stem.execute( "adjustable" ) == "adjust" ); + assert ( stem.execute( "defensible" ) == "defens" ); + assert ( stem.execute( "irritant" ) == "irrit" ); + assert ( stem.execute( "replacement" ) == "replac" ); + assert ( stem.execute( "adjustment" ) == "adjust" ); + assert ( stem.execute( "dependent" ) == "depend" ); + + assert ( stem.execute( "adoption" ) == "adopt" ); + assert ( stem.execute( "homologou" ) == "homolog" ); + assert ( stem.execute( "communism" ) == "commun" ); + assert ( stem.execute( "activate" ) == "activ" ); + assert ( stem.execute( "angulariti" ) == "angular" ); + assert ( stem.execute( "homologous" ) == "homolog" ); + assert ( stem.execute( "effective" ) == "effect" ); + + assert ( stem.execute( "bowdlerize" ) == "bowdler" ); + assert ( stem.execute( "probate" ) == "probat" ); + assert ( stem.execute( "cease" ) == "ceas" ); + assert ( stem.execute( "controll" ) == "control" ); + assert ( stem.execute( "roll" ) == "roll" ); + assert ( stem.execute( "university" ) == "univers" ); + assert ( stem.execute( "example" ) == "exampl" ); + + assert ( stem.execute( "do" ) == "do" ); + assert ( stem.execute( "you" ) == "you" ); + assert ( stem.execute( "really" ) == "real" ); + assert ( stem.execute( "weakness" ) == "weak" ); + assert ( stem.execute( "yields" ) == "yield" ); + assert ( stem.execute( "temptation" ) == "temptat" ); + assert ( stem.execute( "are" ) == "ar" ); + assert ( stem.execute( "terrible" ) == "terribl" ); - assert ( false ); cout << "\nTests passed for Stemmer :D" << endl; diff --git a/util/tests/stringProcessingTest.cpp b/util/tests/stringProcessingTest.cpp index f54f4ab5cc0c4858200d6fb3bf55036f8fbd0025..c2fb6a34d2c4afaeeb1a72d543717cd686b72dfb 100644 --- a/util/tests/stringProcessingTest.cpp +++ b/util/tests/stringProcessingTest.cpp @@ -2,6 +2,7 @@ #include <string> #include <vector> #include "../stringProcessing.h" +#include "../Stemmer.h" #include <iostream> #include <cassert> @@ -9,15 +10,25 @@ using namespace std; void testFindStr ( string original ); +void testFindNext ( ); + +void testFindPrev ( ); + void testSplitStr ( string original ); +void testIsStopWord ( ); + void testToLower ( ); -void testIsStopWord ( ); +void testStemWord ( ); -void testFindNext ( ); +void testSubStr ( ); -void testFindPrev ( ); +void testStripStr ( ); + +void testIsAlpha ( ); + +void testIsNum ( ); int main ( ) { @@ -28,12 +39,17 @@ int main ( ) "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "making it look like readable English. "; - testFindStr ( original ); - testSplitStr ( original ); - testToLower ( ); - testIsStopWord ( ); - testFindNext ( ); - testFindPrev ( ); + testFindStr( original ); + testFindNext( ); + testFindPrev( ); + testSplitStr( original ); + testIsStopWord( ); + testToLower( ); + testStemWord( ); + testSubStr( ); + testStripStr( ); + testIsAlpha( ); + testIsNum( ); cout << "\nTests passed for StringProcessing :D" << endl; @@ -42,49 +58,115 @@ int main ( ) void testFindStr ( string original ) { cout << "Testing findStr..." << endl; - assert( *findStr ( original, "established" ) == 'e' ); - assert( *findStr ( original, "Lorem Ipsum" ) == 'L' ); + assert( *findStr( "established", original ) == 'e' ); + assert( *findStr( "Lorem Ipsum", original ) == 'L' ); string title = "<title> This is a test </title>"; - auto word = findStr ( title, "<title>" ); + auto word = findStr( "<title>", title ); assert( *word == '<' ); - auto titleIt = title.begin ( ); - while ( word != title.end ( ) && titleIt != title.end ( ) ) + auto titleIt = title.begin( ); + while ( word != title.end( ) && titleIt != title.end( ) ) { assert( *word == *titleIt ); ++word; ++titleIt; } - auto word1 = findStr ( title, "</title>" ); + auto word1 = findStr( "</title>", title ); assert( *word1 == '<' && *( word1 + 1 ) == '/' ); - assert( *findStr ( original, "</title>" ) == '\0' ); - assert( *findStr ( original, "orange" ) == '\0' ); - assert( *findStr ( "apple", "orange" ) == '\0' ); - auto word2 = findStr ( "bigbird", "bird" ); + assert( *findStr( "</title>", original ) == '\0' ); + assert( *findStr( "orange", original ) == '\0' ); + assert( *findStr( "orange", "apple" ) == '\0' ); + auto word2 = findStr( "bird", "bigbird" ); assert( *word2 == 'b' && *( word2 + 1 ) == 'i' && *( word2 + 2 ) == 'r' ); - cout << "testFindStr passed" << endl; + cout << "testFindStr passed" << endl << endl; + + } + +void testFindNext ( ) + { + cout << "Testing findNext..." << endl; + + string racecar = "racecar"; + string hello = "hello"; + string blank = ""; + + assert ( *findNext( "race", racecar.begin( ) ) == 'r' ); + assert ( *findNext( "race", racecar.begin( ) + 4 ) == '\0' ); + assert ( *findNext( "car", racecar.begin( ) + 4 ) == 'c' ); + + assert ( *findNext( "hello", hello.begin( ) ) == 'h' ); + assert ( *findNext( "ello", hello.begin( ) ) == 'e' ); + assert ( *findNext( "ello", hello.begin( ) + 2 ) == '\0' ); + + assert ( *findNext( "", blank.begin( ) ) == '\0' ); + + cout << "testFindNext passed" << endl << endl; } +void testFindPrev ( ) + { + cout << "Testing findPrev..." << endl; + + string racecar = "racecar"; + string hello = "hello"; + string blank = ""; + + assert ( *findPrev( "race", racecar.begin( ), racecar.begin( ) ) == '\0' ); + assert ( *findPrev( "race", racecar.begin( ) + 4, racecar.begin( ) ) == 'r' ); + assert ( *findPrev( "car", racecar.begin( ) + 4, racecar.begin( ) ) == '\0' ); + assert ( *findPrev( "car", racecar.begin( ) + 7, racecar.begin( ) ) == 'c' ); + + assert ( *findPrev( "hello", hello.begin( ), hello.begin( ) ) == '\0' ); + assert ( *findPrev( "ello", hello.begin( ) + 3, hello.begin( ) ) == '\0' ); + assert ( *findPrev( "ello", hello.begin( ) + 5, hello.begin( ) ) == 'e' ); + + assert ( *findPrev( "", blank.begin( ), blank.begin( ) ) == '\0' ); + + string fall = "fall"; + assert ( *findPrev( "bl", fall.begin( ) + 3, fall.begin( ) ) == '\0' ); + + cout << "testFindPrev passed" << endl << endl; + + } void testSplitStr ( string original ) { cout << "Testing splitStr..." << endl; - vector< string > vec = splitStr ( original, ' ' ); - assert( vec.size ( ) == 53 ); + vector< string > vec = splitStr( original, ' ' ); + assert( vec.size( ) == 53 ); string word = "hello\ngoodbye"; - vec = splitStr ( word, '\n' ); - assert( vec.size ( ) == 2 ); + vec = splitStr( word, '\n' ); + assert( vec.size( ) == 2 ); assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" ); - cout << "testSplitStr passed" << endl; + cout << "testSplitStr passed" << endl << endl; } +void testIsStopWord ( ) + { + cout << "Testing isStopWord..." << endl; + + string is = "is"; + string hello = "Hello"; + string none = "none"; + string blank = ""; + string blank2 = " "; + + assert ( isStopWord( is ) ); + assert ( !isStopWord( hello ) ); + assert ( isStopWord( none ) ); + assert ( !isStopWord( blank ) ); + assert ( !isStopWord( blank2 ) ); + + cout << "testIsStopWord passed" << endl << endl; + + } void testToLower ( ) { @@ -96,11 +178,11 @@ void testToLower ( ) string word4 = ""; string word5 = " "; - string test = toLower ( word ); - string test2 = toLower ( word2 ); - string test3 = toLower ( word3 ); - string test4 = toLower ( word4 ); - string test5 = toLower ( word5 ); + string test = toLower( word ); + string test2 = toLower( word2 ); + string test3 = toLower( word3 ); + string test4 = toLower( word4 ); + string test5 = toLower( word5 ); assert ( test == "hello" ); assert ( test2 == "hello" ); @@ -108,70 +190,144 @@ void testToLower ( ) assert ( test4 == "" ); assert ( test5 == " " ); - cout << "testToLower passed" << endl; + cout << "testToLower passed" << endl << endl; } - -void testIsStopWord ( ) +void testStemWord ( ) { - cout << "Testing isStopWord..." << endl; + cout << "Testing stemWord..." << endl; + Stemmer stem; - string is = "is"; - string hello = "Hello"; - string none = "none"; - string blank = ""; - string blank2 = " "; - - assert ( isStopWord ( is ) ); - assert ( !isStopWord ( hello ) ); - assert ( isStopWord ( none ) ); - assert ( !isStopWord ( blank ) ); - assert ( !isStopWord ( blank2 ) ); - - cout << "testIsStopWord passed" << endl; + assert ( stem.execute( "cats" ) == "cat" ); + assert ( stem.execute( "wilde" ) == "wild" ); + assert( stem.execute( "zoo" ) == "zoo" ); + assert( stem.execute( "troublesome" ) == "troublesom" ); + cout << "testStemWord passed" << endl << endl; } -void testFindNext ( ) +void testSubStr ( ) { - cout << "Testing findNext..." << endl; + cout << "Testing subStr..." << endl; - string racecar = "racecar"; string hello = "hello"; - string blank = ""; + string goodbye = "goodbye"; + string blank = " "; + string blank2 = ""; - assert ( *findNext ( "race", racecar.begin( ) ) == 'r' ); - assert ( *findNext ( "race", racecar.begin( ) + 4 ) == '\0' ); - assert ( *findNext ( "car", racecar.begin( ) + 4 ) == 'c' ); + assert ( subStr( hello, 1, 4 ) == "ello" ); + assert ( subStr( hello, 0, 5 ) == "hello" ); + assert ( subStr( hello, 0, 1 ) == "h" ); + assert ( subStr( hello, 1, 2 ) == "el" ); - assert ( *findNext ( "hello", hello.begin( ) ) == 'h' ); - assert ( *findNext ( "ello", hello.begin( ) ) == 'e' ); - assert ( *findNext ( "ello", hello.begin( ) + 2 ) == 'e' ); + assert ( subStr( goodbye, 0, 4 ) == "good" ); + assert ( subStr( goodbye, 4, 3 ) == "bye" ); + assert ( subStr( goodbye, 1, 0 ) == "" ); + assert ( subStr( goodbye, 0, 7 ) == "goodbye" ); - assert ( *findNext ( "", blank.begin( ) ) == '\0' ); + assert ( subStr( blank, 0, 1 ) == " " ); + assert ( subStr( blank, 0, 0 ) == "" ); + assert ( subStr( blank2, 0, 0 ) == "" ); - cout << "testFindNext passed" << endl; + assert ( subStr( hello.begin( ), hello.end( ) ) == "hello" ); + assert ( subStr( hello.begin( ) + 4, hello.begin( ) + 5 ) == "o" ); + assert ( subStr( hello.begin( ), hello.begin( ) + 1 ) == "h" ); + assert ( subStr( goodbye.begin( ) + 1, goodbye.begin( ) + 3 ) == "oo" ); - } -void testFindPrev ( ) - { - cout << "Testing findPrev..." << endl; + cout << "testSubStrpassed" << endl << endl; - string racecar = "racecar"; - string hello = "hello"; - string blank = ""; + } - assert ( *findNext ( "race", racecar.begin( ) ) == '\0' ); - assert ( *findNext ( "race", racecar.begin( ) + 4 ) == 'r' ); - assert ( *findNext ( "car", racecar.begin( ) + 4 ) == '\0' ); - assert ( *findNext ( "car", racecar.begin( ) + 7 ) == 'c' ); - assert ( *findNext ( "hello", hello.begin( ) ) == '\0' ); - assert ( *findNext ( "ello", hello.begin( ) + 3 ) == '\0' ); - assert ( *findNext ( "ello", hello.begin( ) + 5 ) == 'e' ); +void testStripStr ( ) + { + cout << "Testing stripStr..." << endl; + + char arr[] = { ',', '.', '*', '&', '^', '%', ';', ' ' }; + vector< char > chars( arr, arr + sizeof( arr ) / sizeof( arr[ 0 ] ) ); + + string hello = "!hello!"; + string allSym = "\"*&^%;"; + string comma = "comma,"; + string period = "period."; + string blank = " "; + + assert ( stripStr( hello ) == "hello" ); + assert ( stripStr( allSym ) == "" ); + assert ( stripStr( comma ) == "comma" ); + assert ( stripStr( period ) == "period" ); + assert ( stripStr( blank ) == "" ); + + assert ( stripStr( hello, chars ) == "!hello!" ); + assert ( stripStr( allSym, chars ) == "\"" ); + assert ( stripStr( comma, chars ) == "comma" ); + assert ( stripStr( period, chars ) == "period" ); + assert ( stripStr( blank, chars ) == "" ); + + cout << "testStripStrpassed" << endl << endl; + } - assert ( *findNext ( "", blank.begin( ) ) == '\0' ); +void testIsAlpha ( ) + { + cout << "Testing isAlpha..." << endl; + + assert ( isAlpha( 'a' ) ); + assert ( isAlpha( 'A' ) ); + assert ( isAlpha( 'z' ) ); + assert ( isAlpha( 'Z' ) ); + assert ( isAlpha( 'g' ) ); + assert ( isAlpha( 'i' ) ); + assert ( isAlpha( 'P' ) ); + + assert ( !isAlpha( '1' ) ); + assert ( !isAlpha( '0' ) ); + assert ( !isAlpha( '9' ) ); + assert ( !isAlpha( '5' ) ); + assert ( !isAlpha( '6' ) ); + + assert ( !isAlpha( ' ' ) ); + assert ( !isAlpha( '!' ) ); + assert ( !isAlpha( '/' ) ); + assert ( !isAlpha( '?' ) ); + assert ( !isAlpha( '*' ) ); + assert ( !isAlpha( '-' ) ); + assert ( !isAlpha( '.' ) ); + assert ( !isAlpha( ',' ) ); + assert ( !isAlpha( '(' ) ); + assert ( !isAlpha( '}' ) ); + + cout << "testIsAlpha passed" << endl << endl; + } - cout << "testFindPrev passed" << endl; +void testIsNum ( ) + { + cout << "Testing isNum..." << endl; + + assert ( !isNum( 'a' ) ); + assert ( !isNum( 'A' ) ); + assert ( !isNum( 'z' ) ); + assert ( !isNum( 'Z' ) ); + assert ( !isNum( 'g' ) ); + assert ( !isNum( 'i' ) ); + assert ( !isNum( 'P' ) ); + + assert ( isNum( '1' ) ); + assert ( isNum( '0' ) ); + assert ( isNum( '9' ) ); + assert ( isNum( '5' ) ); + assert ( isNum( '6' ) ); + + assert ( !isNum( ' ' ) ); + assert ( !isNum( '!' ) ); + assert ( !isNum( '/' ) ); + assert ( !isNum( '?' ) ); + assert ( !isNum( '*' ) ); + assert ( !isNum( '-' ) ); + assert ( !isNum( '.' ) ); + assert ( !isNum( ',' ) ); + assert ( !isNum( '(' ) ); + assert ( !isNum( '}' ) ); + + cout << "testIsNum passed" << endl; } \ No newline at end of file diff --git a/util/tests/tokenizerTest.cpp b/util/tests/tokenizerTest.cpp index 891c00509ef9fca7acacf8c1d8937ced14c6fab7..f377cb3e022fe68e38b4ea430260dd69a119f27d 100644 --- a/util/tests/tokenizerTest.cpp +++ b/util/tests/tokenizerTest.cpp @@ -13,29 +13,29 @@ void testExecute ( string original ); int main ( ) { - cout << "Beginning testing for TokenizerTest_unit" << endl << endl; + cout << "Beginning testing for TokenizerTest" << endl << endl; string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. " "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "making it look like readable English. "; - testExecute ( original ); + testExecute( original ); - cout << "\nTests passed for TokenizerTest_unit :D" << endl; + cout << "\nTests passed for TokenizerTest :D" << endl; } void testExecute ( string original ) { Tokenizer myTokenizer; - myTokenizer.execute ( original, 0 ); + myTokenizer.execute( original, 0 ); - auto dict = myTokenizer.get ( ); + auto dict = myTokenizer.get( ); - for ( auto it = dict->begin ( ); it != dict->end ( ); it++ ) + for ( auto it = dict->begin( ); it != dict->end( ); it++ ) { cout << it->first << ':'; - for ( int i = 0; i < it->second.size ( ); ++i ) + for ( int i = 0; i < it->second.size( ); ++i ) { cout << it->second[ i ] << " "; }