diff --git a/.gitignore b/.gitignore index 303db96a46c25523284beb4dca75eed007d795e0..779b7592c82aaec669d76cb5b14014a3254bd674 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,6 @@ cmake-build-debug/CMakeFiles/cmake.check_cache cmake-build-debug/CMakeFiles/feature_tests.bin cmake-build-debug/CMakeFiles/feature_tests.c cmake-build-debug/CMakeFiles/feature_tests.cxx +cmake-build-debug/CMakeFiles/Makefile2 cmake-build-debug/CMakeFiles/progress.marks +cmake-build-debug/Makefile diff --git a/CMakeLists.txt b/CMakeLists.txt index af64924ab77467a4c2f3ee55793dca9e107409a7..26aa9a42697d73a1ac5ea363fc6cf9592a802401 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,11 +25,12 @@ add_executable(StringProcessingTest util/tests/stringProcessingTest.cpp) add_executable(TokenizerTest - util/Tokenizer.h + util/Tokenizer.cpp + util/stringProcessing.cpp util/tests/tokenizerTest.cpp) add_executable(StemmerTest - util/Stemmer.h + util/Stemmer.cpp util/tests/stemmerTest.cpp) add_executable(ParserTest diff --git a/cmake-build-debug/CMakeFiles/Makefile2 b/cmake-build-debug/CMakeFiles/Makefile2 index 1ec7d98848e6710ff31bcee3f56bb30cc62bd53f..def2daa5b36808e923869635e366f13c4d13e7e7 100644 --- a/cmake-build-debug/CMakeFiles/Makefile2 +++ b/cmake-build-debug/CMakeFiles/Makefile2 @@ -66,7 +66,7 @@ CMAKE_BINARY_DIR = /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-s CMakeFiles/StemmerTest.dir/all: $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/depend $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=8,9 "Built target StemmerTest" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=8,9,10 "Built target 
StemmerTest" .PHONY : CMakeFiles/StemmerTest.dir/all # Include target in all. @@ -76,7 +76,7 @@ all: CMakeFiles/StemmerTest.dir/all # Build rule for subdir invocation for target. CMakeFiles/StemmerTest.dir/rule: cmake_check_build_system - $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 2 + $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 3 $(MAKE) -f CMakeFiles/Makefile2 CMakeFiles/StemmerTest.dir/all $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 0 .PHONY : CMakeFiles/StemmerTest.dir/rule @@ -103,7 +103,7 @@ clean: CMakeFiles/StemmerTest.dir/clean CMakeFiles/StringProcessingTest.dir/all: $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/depend $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=10,11,12 "Built target StringProcessingTest" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=11,12,13 "Built target StringProcessingTest" .PHONY : CMakeFiles/StringProcessingTest.dir/all # Include target in all. 
@@ -140,7 +140,7 @@ clean: CMakeFiles/StringProcessingTest.dir/clean CMakeFiles/TokenizerTest.dir/all: $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/depend $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=13,14 "Built target TokenizerTest" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=14,15,16,17 "Built target TokenizerTest" .PHONY : CMakeFiles/TokenizerTest.dir/all # Include target in all. @@ -150,7 +150,7 @@ all: CMakeFiles/TokenizerTest.dir/all # Build rule for subdir invocation for target. CMakeFiles/TokenizerTest.dir/rule: cmake_check_build_system - $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 2 + $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 4 $(MAKE) -f CMakeFiles/Makefile2 CMakeFiles/TokenizerTest.dir/all $(CMAKE_COMMAND) -E cmake_progress_start /Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles 0 .PHONY : CMakeFiles/TokenizerTest.dir/rule @@ -177,7 +177,7 @@ clean: CMakeFiles/TokenizerTest.dir/clean CMakeFiles/URLTEST.dir/all: $(MAKE) -f CMakeFiles/URLTEST.dir/build.make CMakeFiles/URLTEST.dir/depend $(MAKE) -f CMakeFiles/URLTEST.dir/build.make CMakeFiles/URLTEST.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=15,16 "Built target URLTEST" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) 
--progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=18,19 "Built target URLTEST" .PHONY : CMakeFiles/URLTEST.dir/all # Include target in all. @@ -251,7 +251,7 @@ clean: CMakeFiles/ParserTest.dir/clean CMakeFiles/search-engine.dir/all: $(MAKE) -f CMakeFiles/search-engine.dir/build.make CMakeFiles/search-engine.dir/depend $(MAKE) -f CMakeFiles/search-engine.dir/build.make CMakeFiles/search-engine.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=26,27,28 "Built target search-engine" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=29,30,31 "Built target search-engine" .PHONY : CMakeFiles/search-engine.dir/all # Include target in all. @@ -288,7 +288,7 @@ clean: CMakeFiles/search-engine.dir/clean CMakeFiles/crawler-parser-test.dir/all: $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/depend $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/build - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=17,18,19,20,21,22,23,24,25 "Built target crawler-parser-test" + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/Users/veronicaday/Desktop/EECS398/eecs_398/project/eecs398-search/cmake-build-debug/CMakeFiles --progress-num=20,21,22,23,24,25,26,27,28 "Built target crawler-parser-test" .PHONY : CMakeFiles/crawler-parser-test.dir/all # Include target in all. 
diff --git a/cmake-build-debug/CMakeFiles/progress.marks b/cmake-build-debug/CMakeFiles/progress.marks index 9902f17848a8974ab57d57999b74a63198fe6e23..e85087affded170efcbc6f9672a6fc671d839ed0 100644 --- a/cmake-build-debug/CMakeFiles/progress.marks +++ b/cmake-build-debug/CMakeFiles/progress.marks @@ -1 +1 @@ -28 +31 diff --git a/cmake-build-debug/Makefile b/cmake-build-debug/Makefile index 4f98c8bef62db2ebaf23d6d2cabb321946a694d6..8279700a81ea95666767f82aa2c2c895cc202ea4 100644 --- a/cmake-build-debug/Makefile +++ b/cmake-build-debug/Makefile @@ -477,12 +477,40 @@ shared/urlTest.cpp.s: $(MAKE) -f CMakeFiles/URLTEST.dir/build.make CMakeFiles/URLTEST.dir/shared/urlTest.cpp.s .PHONY : shared/urlTest.cpp.s +util/Stemmer.o: util/Stemmer.cpp.o + +.PHONY : util/Stemmer.o + +# target to build an object file +util/Stemmer.cpp.o: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.o +.PHONY : util/Stemmer.cpp.o + +util/Stemmer.i: util/Stemmer.cpp.i + +.PHONY : util/Stemmer.i + +# target to preprocess a source file +util/Stemmer.cpp.i: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.i +.PHONY : util/Stemmer.cpp.i + +util/Stemmer.s: util/Stemmer.cpp.s + +.PHONY : util/Stemmer.s + +# target to generate assembly for a file +util/Stemmer.cpp.s: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.s +.PHONY : util/Stemmer.cpp.s + util/Tokenizer.o: util/Tokenizer.cpp.o .PHONY : util/Tokenizer.o # target to build an object file util/Tokenizer.cpp.o: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.o $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.o .PHONY : util/Tokenizer.cpp.o @@ -492,6 +520,7 @@ util/Tokenizer.i: util/Tokenizer.cpp.i # target to preprocess a source file util/Tokenizer.cpp.i: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make 
CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.i $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.i .PHONY : util/Tokenizer.cpp.i @@ -501,6 +530,7 @@ util/Tokenizer.s: util/Tokenizer.cpp.s # target to generate assembly for a file util/Tokenizer.cpp.s: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.s $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.s .PHONY : util/Tokenizer.cpp.s @@ -511,6 +541,7 @@ util/stringProcessing.o: util/stringProcessing.cpp.o # target to build an object file util/stringProcessing.cpp.o: $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.o + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.o $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.o .PHONY : util/stringProcessing.cpp.o @@ -522,6 +553,7 @@ util/stringProcessing.i: util/stringProcessing.cpp.i # target to preprocess a source file util/stringProcessing.cpp.i: $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.i + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.i $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.i .PHONY : util/stringProcessing.cpp.i @@ -533,6 +565,7 @@ util/stringProcessing.s: util/stringProcessing.cpp.s # target to generate assembly for a file util/stringProcessing.cpp.s: $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make 
CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.s + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.s $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.s .PHONY : util/stringProcessing.cpp.s @@ -693,6 +726,9 @@ help: @echo "... shared/urlTest.o" @echo "... shared/urlTest.i" @echo "... shared/urlTest.s" + @echo "... util/Stemmer.o" + @echo "... util/Stemmer.i" + @echo "... util/Stemmer.s" @echo "... util/Tokenizer.o" @echo "... util/Tokenizer.i" @echo "... util/Tokenizer.s" diff --git a/util/Stemmer.cpp b/util/Stemmer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0604e3f9e208e9dd83c0fdec066dd4965beb18a4 --- /dev/null +++ b/util/Stemmer.cpp @@ -0,0 +1,822 @@
+
+#include "Stemmer.h"
+
+/**
+ * Stemmer Cstor
+ */
+Stemmer::Stemmer ( )
+    { }
+
+/**
+ * Returns the stem of a word by running the Porter steps in order:
+ * 1a, 1b, 1c, 2, 3, 4, 5a, 5b.
+ *
+ * @param word lower-case word to stem
+ * @return the stemmed word; an empty word is returned unchanged
+ */
+std::string Stemmer::execute ( std::string word )
+    {
+    // Nothing to stem; also guarantees no step is ever handed an empty string.
+    if ( word.empty ( ) )
+        {
+        return word;
+        }
+
+    word = step1a ( word );
+    word = step1b ( word );
+    word = step1c ( word );
+    word = step2 ( word );
+    word = step3 ( word );
+    word = step4 ( word );
+    word = step5a ( word );
+    word = step5b ( word );
+    return word;
+    }
+
+/**
+ * Number of vowel-consonant sequences (Porter's "measure" m), viewing the
+ * word as [C](VC)^m[V] where C / V are maximal consonant / vowel runs.
+ *
+ * <c><v>       -> 0
+ * <c>vc<v>     -> 1
+ * <c>vcvc<v>   -> 2
+ * <c>vcvcvc<v> -> 3
+ *
+ * @param word
+ * @return m; 0 for an empty word
+ */
+int Stemmer::measure ( std::string word )
+    {
+    const std::string::size_type length = word.size ( );
+    std::string::size_type pos = 0;
+    int m = 0;
+
+    // Skip the optional leading consonant run <c>.
+    while ( pos < length && isConsonant ( word.begin ( ) + pos, word.begin ( ) ) )
+        {
+        ++pos;
+        }
+
+    while ( pos < length )
+        {
+        // Vowel run.
+        while ( pos < length && !isConsonant ( word.begin ( ) + pos, word.begin ( ) ) )
+            {
+            ++pos;
+            }
+        if ( pos == length )
+            {
+            break;   // trailing <v> is not closed by a consonant, so it does not count
+            }
+
+        // The vowel run is followed by at least one consonant: one more "vc".
+        ++m;
+        while ( pos < length && isConsonant ( word.begin ( ) + pos, word.begin ( ) ) )
+            {
+            ++pos;
+            }
+        }
+    return m;
+    }
+
+/**
+ * Check if a vowel is present in the range [wordBeg, wordEnd).
+ *
+ * @param wordBeg start of the word; also used as the "first letter" anchor
+ *                when classifying 'y'
+ * @param wordEnd one past the last character to inspect
+ * @param word    unused; kept so existing callers keep compiling. It is a
+ *                by-value copy, so its iterators must not be mixed with
+ *                wordBeg / wordEnd.
+ * @return true if any character in the range is a vowel
+ */
+bool Stemmer::isVowelPresent ( string::iterator wordBeg, string::iterator wordEnd, string word )
+    {
+    ( void ) word;
+
+    for ( string::iterator it = wordBeg; it != wordEnd; ++it )
+        {
+        if ( !isConsonant ( it, wordBeg ) )
+            {
+            return true;
+            }
+        }
+    return false;
+    }
+
+/**
+ * Return true if wordIt points to a consonant.
+ *
+ * a, e, i, o, u are vowels. 'y' is a consonant at the start of the word or
+ * after a vowel, and a vowel after a consonant.
+ *
+ * @param wordIt    character to classify; must lie in the same string as wordBegin
+ * @param wordBegin first character of the word
+ * @return
+ */
+bool Stemmer::isConsonant ( string::iterator wordIt, string::iterator wordBegin )
+    {
+    switch ( *wordIt )
+        {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o':
+        case 'u':
+            return false;
+        case 'y':
+            // y takes the opposite class of the letter before it.
+            return wordIt == wordBegin || !isConsonant ( wordIt - 1, wordBegin );
+        default:
+            return true;
+        }
+    }
+
+/**
+ * Returns true if an 'e' should be added to the end of the stem:
+ *
+ * AT -> ATE
+ * BL -> BLE
+ * IZ -> IZE
+ *
+ * @param word
+ * @return true if word ends in "at", "bl" or "iz"
+ */
+bool Stemmer::addE ( string word )
+    {
+    if ( word.size ( ) < 2 )
+        {
+        return false;
+        }
+    const string tail = word.substr ( word.size ( ) - 2 );
+    return tail == "at" || tail == "bl" || tail == "iz";
+    }
+
+/**
+ * Returns true if word ends in a double consonant
+ * other than LL, SS, ZZ
+ *
+ * @param word
+ * @return
+ */
+bool Stemmer::doubleCon ( string word )
+    {
+    if ( word.size ( ) < 2 )
+        {
+        return false;
+        }
+
+    const string::iterator last = word.end ( ) - 1;
+    if ( *last != *( last - 1 ) || !isConsonant ( last, word.begin ( ) ) )
+        {
+        return false;
+        }
+    return *last != 'l' && *last != 's' && *last != 'z';
+    }
+
+namespace
+    {
+    /// One suffix-rewrite rule: a word ending in `suffix` gets it replaced by `replacement`.
+    struct SuffixRule
+        {
+        std::string suffix;
+        std::string replacement;
+        };
+
+    /// True when `word` ends with `suffix`.
+    bool endsWith ( const std::string &word, const std::string &suffix )
+        {
+        return word.size ( ) >= suffix.size ( )
+               && word.compare ( word.size ( ) - suffix.size ( ), suffix.size ( ), suffix ) == 0;
+        }
+
+    /// First rule, in table order, whose suffix ends `word`; nullptr when none does.
+    /// Tables list longer suffixes before any shorter suffix they end with, so
+    /// "first match" is also "longest match".
+    template < std::string::size_type N >
+    const SuffixRule *firstMatch ( const std::string &word, const SuffixRule ( &rules )[ N ] )
+        {
+        for ( const SuffixRule &rule : rules )
+            {
+            if ( endsWith ( word, rule.suffix ) )
+                {
+                return &rule;
+                }
+            }
+        return nullptr;
+        }
+
+    /// `word` without its last `count` characters; caller guarantees count <= word.size ( ).
+    std::string dropLast ( const std::string &word, std::string::size_type count )
+        {
+        return word.substr ( 0, word.size ( ) - count );
+        }
+    }
+
+/**
+ * Returns true if a word ends in a
+ * Consonant, Vowel, Consonant pattern
+ * Except when second C is W, X, or Y
+ *
+ * @param word
+ * @return
+ */
+bool Stemmer::endCVC ( std::string word )
+    {
+    if ( word.size ( ) < 3 )
+        {
+        return false;
+        }
+
+    const std::string::iterator last = word.end ( ) - 1;
+    const bool cvc = isConsonant ( last, word.begin ( ) )
+                     && !isConsonant ( last - 1, word.begin ( ) )
+                     && isConsonant ( last - 2, word.begin ( ) );
+
+    // The exclusion applies to the final consonant, not to the vowel before it.
+    return cvc && *last != 'w' && *last != 'x' && *last != 'y';
+    }
+
+/**
+ * Stem plural words
+ *
+ * sses -> ss    caresses -> caress
+ * ies  -> i     ponies   -> poni
+ * ss   -> ss    caress   -> caress
+ * s    ->       cats     -> cat
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step1a ( std::string word )
+    {
+    if ( endsWith ( word, "sses" ) || endsWith ( word, "ies" ) )
+        {
+        return dropLast ( word, 2 );
+        }
+    if ( endsWith ( word, "ss" ) )
+        {
+        return word;
+        }
+    if ( endsWith ( word, "s" ) )
+        {
+        return dropLast ( word, 1 );
+        }
+    return word;
+    }
+
+/**
+ * Stem ED and ING
+ *
+ * (m > 0) eed -> ee    feed -> feed, agreed -> agree
+ * (*v*)   ed  ->       plastered -> plaster, bled -> bled
+ * (*v*)   ing ->       motoring -> motor, sing -> sing
+ *
+ * After ED / ING is removed the stem is tidied up:
+ * at/bl/iz -> +e (conflat -> conflate), a double consonant other than
+ * l/s/z is undoubled (hopp -> hop), and a short cvc stem gets its 'e'
+ * back (fil -> file).
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step1b ( std::string word )
+    {
+    if ( endsWith ( word, "eed" ) )
+        {
+        // The measure condition is on what precedes "eed"; only the 'd' is dropped.
+        return measure ( dropLast ( word, 3 ) ) > 0 ? dropLast ( word, 1 ) : word;
+        }
+
+    std::string stem;
+    if ( endsWith ( word, "ed" ) )
+        {
+        stem = dropLast ( word, 2 );
+        }
+    else if ( endsWith ( word, "ing" ) )
+        {
+        stem = dropLast ( word, 3 );
+        }
+    else
+        {
+        return word;
+        }
+
+    if ( !isVowelPresent ( stem.begin ( ), stem.end ( ), stem ) )
+        {
+        return word;
+        }
+
+    if ( addE ( stem ) )
+        {
+        stem += 'e';
+        }
+    else if ( doubleCon ( stem ) )
+        {
+        stem = dropLast ( stem, 1 );
+        }
+    else if ( measure ( stem ) == 1 && endCVC ( stem ) )
+        {
+        stem += 'e';
+        }
+    return stem;
+    }
+
+/**
+ * Checks for Y -> I when the stem before the 'y' contains a vowel
+ *
+ * happy -> happi
+ * sky   -> sky
+ *
+ * @param word
+ * @return
+ */
+string Stemmer::step1c ( string word )
+    {
+    if ( word.size ( ) < 2 || word[ word.size ( ) - 1 ] != 'y' )
+        {
+        return word;
+        }
+
+    if ( isVowelPresent ( word.begin ( ), word.end ( ) - 1, word ) )
+        {
+        word[ word.size ( ) - 1 ] = 'i';
+        }
+    return word;
+    }
+
+/**
+ * Step 2: rewrite double suffixes when the remaining stem has m > 0
+ *
+ * relational -> relate, conditional -> condition, rational -> rational,
+ * valenci -> valence, digitizer -> digitize, radicalli -> radical,
+ * vietnamization -> vietnamize, feudalism -> feudal, formaliti -> formal,
+ * sensibiliti -> sensible
+ *
+ * @param word
+ * @return
+ */
+string Stemmer::step2 ( std::string word )
+    {
+    static const SuffixRule rules[ ] =
+        {
+            { "ational", "ate" }, { "tional", "tion" }, { "enci", "ence" }, { "anci", "ance" },
+            { "izer", "ize" }, { "abli", "able" }, { "alli", "al" }, { "entli", "ent" },
+            { "eli", "e" }, { "ousli", "ous" }, { "ization", "ize" }, { "ation", "ate" },
+            { "ator", "ate" }, { "alism", "al" }, { "iveness", "ive" }, { "fulness", "ful" },
+            { "ousness", "ous" }, { "aliti", "al" }, { "iviti", "ive" }, { "biliti", "ble" }
+        };
+
+    const SuffixRule *rule = firstMatch ( word, rules );
+    if ( rule == nullptr )
+        {
+        return word;
+        }
+
+    const std::string stem = dropLast ( word, rule->suffix.size ( ) );
+    return measure ( stem ) > 0 ? stem + rule->replacement : word;
+    }
+
+/**
+ * Step 3: rewrite or drop suffixes when the remaining stem has m > 0
+ *
+ * triplicate -> triplic, formative -> form, formalize -> formal,
+ * electriciti -> electric, electrical -> electric, hopeful -> hope,
+ * goodness -> good
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step3 ( std::string word )
+    {
+    static const SuffixRule rules[ ] =
+        {
+            { "icate", "ic" }, { "ative", "" }, { "alize", "al" }, { "iciti", "ic" },
+            { "ical", "ic" }, { "ful", "" }, { "ness", "" }
+        };
+
+    const SuffixRule *rule = firstMatch ( word, rules );
+    if ( rule == nullptr )
+        {
+        return word;
+        }
+
+    const std::string stem = dropLast ( word, rule->suffix.size ( ) );
+    return measure ( stem ) > 0 ? stem + rule->replacement : word;
+    }
+
+/**
+ * Step 4: drop a suffix when the remaining stem has m > 1
+ *
+ * revival -> reviv, allowance -> allow, inference -> infer,
+ * airliner -> airlin, adjustable -> adjust, replacement -> replac,
+ * adjustment -> adjust, dependent -> depend, homologous -> homolog,
+ * effective -> effect, bowdlerize -> bowdler
+ *
+ * "ion" is only dropped when the stem ends in 's' or 't':
+ * adoption -> adopt
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step4 ( std::string word )
+    {
+    static const SuffixRule rules[ ] =
+        {
+            { "al", "" }, { "ance", "" }, { "ence", "" }, { "er", "" }, { "ic", "" },
+            { "able", "" }, { "ible", "" }, { "ant", "" }, { "ement", "" }, { "ment", "" },
+            { "ent", "" }, { "ion", "" }, { "ou", "" }, { "ism", "" }, { "ate", "" },
+            { "iti", "" }, { "ous", "" }, { "ive", "" }, { "ize", "" }
+        };
+
+    const SuffixRule *rule = firstMatch ( word, rules );
+    if ( rule == nullptr )
+        {
+        return word;
+        }
+
+    const std::string stem = dropLast ( word, rule->suffix.size ( ) );
+    if ( rule->suffix == "ion" )
+        {
+        const bool afterSorT = !stem.empty ( )
+                               && ( stem[ stem.size ( ) - 1 ] == 's' || stem[ stem.size ( ) - 1 ] == 't' );
+        if ( !afterSorT )
+            {
+            return word;
+            }
+        }
+    return measure ( stem ) > 1 ? stem : word;
+    }
+
+/**
+ * Step 5a: drop a final 'e'
+ *
+ * (m > 1)               e ->    probate -> probat, rate -> rate
+ * (m == 1 and not cvc)  e ->    cease -> ceas
+ *
+ * Both conditions are evaluated on the stem without the 'e'.
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step5a ( std::string word )
+    {
+    if ( word.empty ( ) || word[ word.size ( ) - 1 ] != 'e' )
+        {
+        return word;
+        }
+
+    const std::string stem = dropLast ( word, 1 );
+    const int m = measure ( stem );
+    if ( m > 1 || ( m == 1 && !endCVC ( stem ) ) )
+        {
+        return stem;
+        }
+    return word;
+    }
+
+/**
+ * Step 5b: undouble a final "ll" when m > 1
+ *
+ * controll -> control
+ * roll     -> roll
+ *
+ * @param word
+ * @return
+ */
+std::string Stemmer::step5b ( std::string word )
+    {
+    if ( endsWith ( word, "ll" ) && measure ( word ) > 1 )
+        {
+        return dropLast ( word, 1 );
+        }
+    return word;
+    }
\ No newline at end of file
diff --git a/util/Stemmer.h b/util/Stemmer.h index ef86e32ff5fab4aca76da5648a44f9066e037523..354b54ecfe680348f16b3b509286dd392776a7c0 100644 --- a/util/Stemmer.h +++ b/util/Stemmer.h @@ -10,13 +10,22 @@ */ class Stemmer { +public: - Stemmer ( ) - { } + /** + * Stemmer Cstor + */ + Stemmer ( ); + + /** + * Returns the stem of a word + * + * @param word + * @return + */ + std::string execute ( std::string word ); - std::string stem ( std::string word ) - { - } +private: /** * Number of consonant sequences @@ -29,57 +38,7 @@ class Stemmer * @param word * @return */ - int measure ( std::string word ) - { - int m = 0; - int begin = 0; - unsigned long end = word.size ( ) - 1; - - while ( true ) - { - if ( begin > end ) - { - return m; - } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) - { - break; - } - begin += 1; - } - begin += 1; - - while ( true ) - { - while ( true ) - { - if ( begin > end ) - { - return m; - } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) - { - break; - } - begin += 1; - } - begin += 1; - m += 1; - while ( true ) - { - if ( begin > end ) - { - return m; - } - if ( isConsonant ( word.begin ( ) + begin, word.begin ( ) ) ) - { - break; - } - begin += 1; - } - begin += 1; - } - } +
int measure ( std::string word ); /** * Check if a vowel is present in the stem @@ -89,17 +48,7 @@ class Stemmer * @param word * @return */ - bool isVowelPresent ( string::iterator wordBeg, string::iterator wordEnd, string word ) - { - while ( wordBeg != wordEnd ) - { - if ( !isConsonant ( wordBeg, word.begin ( ) ) ) - { - return true; - } - } - return false; - } + bool isVowelPresent ( string::iterator wordBeg, string::iterator wordEnd, string word ); /** * Return true if the wordIt points to a consonant @@ -108,25 +57,7 @@ class Stemmer * @param wordBegin * @return */ - bool isConsonant ( string::iterator wordIt, string::iterator wordBegin ) - { - if ( *wordIt == 'a' || *wordIt == 'e' || *wordIt == 'i' || *wordIt == 'u' ) - { - return false; - } - if ( *wordIt == 'y' ) - { - if ( wordIt == wordBegin ) - { - return true; - } - else - { - return ( !isConsonant ( wordIt - 1, wordBegin ) ); - } - } - return true; - } + bool isConsonant ( string::iterator wordIt, string::iterator wordBegin ); /** * Returns true if should add 'e' to end @@ -134,26 +65,7 @@ class Stemmer * @param word * @return */ - bool addE ( string word ) - { - // AT -> ATE - // BL -> BLE - // IZ -> IZE - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - auto substrAT = findPrev ( "at", endPtr ); - auto substrBL = findPrev ( "bl", endPtr ); - auto substrIZ = findPrev ( "iz", endPtr ); - - if ( *substrAT != '\0' || *substrBL != '\0' || *substrIZ != '\0' ) - { - return true; - } - else - { - return false; - } - } + bool addE ( string word ); /** * Returns true if word ends in double constant @@ -161,24 +73,7 @@ class Stemmer * @param word * @return */ - bool doubleCon ( string word ) - { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - - if ( word.size ( ) > 2 && *endPtr == *( endPtr - 1 ) ) - { - if ( *endPtr == 'l' || *endPtr == 's' || *endPtr == 'z' ) - { - return false; - } - else - { - return true; - } - } - return false; - } + bool 
doubleCon ( string word ); /** * Returns true if a word ends in a @@ -188,27 +83,7 @@ class Stemmer * @param word * @return */ - bool endCVC ( std::string word ) - { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - - if ( word.size ( ) > 2 ) - { - // the stem ends cvc - if ( isConsonant ( endPtr, word.begin ( ) ) && !isConsonant ( endPtr - 1, word.begin ( ) ) && - isConsonant ( endPtr - 2, word.begin ( ) ) ) - { - // the second c is not W, X or Y - if ( *( endPtr - 1 ) != 'w' && *( endPtr - 1 ) != 'x' && *( endPtr - 1 ) != 'y' ) - { - return true; - } - } - } - return false; - - } + bool endCVC ( std::string word ); /** * Stem plural words @@ -216,53 +91,7 @@ class Stemmer * @param word * @return */ - std::string step_1a ( std::string word ) - { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - - // check S at end - if ( word.at ( end ) == 's' ) - { - string wordStem ( word.begin ( ), word.end ( ) ); - - auto substrSSES = findPrev ( "sses", endPtr ); - auto substrIES = findPrev ( "ies", endPtr ); - auto substrSS = findPrev ( "ss", endPtr ); - auto substrS = findPrev ( "s", endPtr ); - // sses -> ss - // caresses -> caress - if ( *substrSSES != '\0' ) - { - string wordStem ( word.begin ( ), substrSSES + 1 ); - } - // ies -> i - // ponies -> poni - else if ( *substrIES != '\0' ) - { - string wordStem ( word.begin ( ), substrIES + 1 ); - } - // ss -> ss - // caress -> caress - else if ( *substrSS != '\0' ) - { - string wordStem ( word.begin ( ), word.end ( ) ); - } - // s -> - // cats -> cat - else if ( *substrS != '\0' ) - { - string wordStem ( word.begin ( ), substrS + 1 ); - } - else - { - string wordStem ( word.begin ( ), word.end ( ) ); - } - - return wordStem; - } - return word; - } + std::string step1a ( std::string word ); /** * Stem ED and ING @@ -270,270 +99,23 @@ class Stemmer * @param word * @return */ - std::string step_1b ( std::string word ) - { - unsigned long end = word.size ( ) - 1; - 
auto endPtr = word.begin ( ) + end; - - if ( measure ( word ) > 0 ) - { - string wordStem ( word.begin ( ), word.end ( ) ); - - auto substrEED = findPrev ( "eed", endPtr ); - auto substrED = findPrev ( "ed", endPtr ); - auto substrING = findPrev ( "ing", endPtr ); - - // check EED at end and m > 0 - // feed -> feed - // agreed -> agree - if ( *substrEED != '\0' ) - { - string wordStem ( word.begin ( ), substrEED + 1 ); - } - // check ED at end and preceeded by substr with vowel - // plastered -> plaster - // bled -> bled - else if ( *substrED != '\0' && isVowelPresent ( word.begin ( ), substrED, word ) ) - { - - string wordStem ( word.begin ( ), substrED + 1 ); - if ( addE ( wordStem ) ) - { - wordStem += 'e'; - } - else if ( doubleCon ( wordStem ) ) - { - wordStem = wordStem.substr ( 0, wordStem.size ( ) - 1 ); - } - else if ( measure ( wordStem ) > 1 && endCVC ( wordStem ) ) - { - wordStem += 'e'; - } - } - // check ING at end and proceeded by substr with vowel - // motoring -> motor - // sing -> sing - else if ( *substrING != '\0' && isVowelPresent ( word.begin ( ), substrING, word ) ) - { - string wordStem ( word.begin ( ), substrING + 1 ); - if ( addE ( wordStem ) ) - { - wordStem += 'e'; - } - } - - return wordStem; - } - return word; - - } + std::string step1b ( std::string word ); /** * Checks for Y -> I + * * @param word * @return */ - string step1c ( string word ) - { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - - // Y -> I - // happy -> happi - // sky -> sky - if ( *endPtr == 'y' ) - { - if ( isVowelPresent ( word.begin ( ), endPtr, word ) ) - { - word = word.substr ( 0, word.size ( ) - 1 ); - word += 'i'; - } - } - return word; - } + string step1c ( string word ); /** * Step 2 + * * @param word * @return */ - string step2 ( std::string word ) - { - - if ( measure ( word ) == 0 ) - { - return word; - } - - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), 
word.end ( ) ); - - auto substrATIONAL = findPrev ( "ational", endPtr ); - auto substrTIONAL = findPrev ( "tional", endPtr ); - auto substrENCI = findPrev ( "enci", endPtr ); - auto substrANCI = findPrev ( "anci", endPtr ); - auto substrIZER = findPrev ( "izer", endPtr ); - auto substrABLI = findPrev ( "abli", endPtr ); - auto substrALLI = findPrev ( "alli", endPtr ); - auto substrENTLI = findPrev ( "entli", endPtr ); - auto substrELI = findPrev ( "eli", endPtr ); - auto substrOUSLI = findPrev ( "ousli", endPtr ); - auto substrIZATION = findPrev ( "ization", endPtr ); - auto substrATION = findPrev ( "ation", endPtr ); - auto substrATOR = findPrev ( "ator", endPtr ); - auto substrALISM = findPrev ( "alism", endPtr ); - auto substrIVENESS = findPrev ( "iveness", endPtr ); - auto substrFULNESS = findPrev ( "fulness", endPtr ); - auto substrOUSNESS = findPrev ( "ousness", endPtr ); - auto substrALITI = findPrev ( "aliti", endPtr ); - auto substrIVITI = findPrev ( "iviti", endPtr ); - auto substrBILITI = findPrev ( "biliti", endPtr ); - - // ATIONAL -> ATE - // relational -> relate - if ( *substrATIONAL != '\0' ) - { - string wordStem ( word.begin ( ), substrATIONAL + 1 ); - wordStem + 'ate'; - } - // TIONAL -> TION - // conditional -> condition - // rational -> rational - else if ( *substrTIONAL != '\0' ) - { - string wordStem ( word.begin ( ), substrTIONAL + 1 ); - wordStem += 'tion'; - } - // ENCI -> ENCE - // valenci -> valence - else if ( *substrENCI != '\0' ) - { - string wordStem ( word.begin ( ), substrENCI + 1 ); - wordStem += 'ence'; - } - // ANCI -> ANCE - // hesitanci -> hesitance - else if ( *substrANCI != '\0' ) - { - string wordStem ( word.begin ( ), substrANCI + 1 ); - wordStem += 'ance'; - } - // IZER -> IZE - // digitizer -> digitize - else if ( *substrIZER != '\0' ) - { - string wordStem ( word.begin ( ), substrIZER + 1 ); - wordStem += 'ize'; - } - // ABLI -> ABLE - // conformabli -> comformable - else if ( *substrABLI != '\0' ) - { - string wordStem 
( word.begin ( ), substrABLI + 1 ); - wordStem += 'able'; - } - // ALLI -> AL - // radicalli -> radical - else if ( *substrALLI != '\0' ) - { - string wordStem ( word.begin ( ), substrALLI + 1 ); - wordStem += 'al'; - } - // ENTLI -> ENT - // differentli -> different - else if ( *substrENTLI != '\0' ) - { - string wordStem ( word.begin ( ), substrENTLI + 1 ); - wordStem += 'ent'; - } - // ELI -> E - // vileli -> vile - else if ( *substrELI != '\0' ) - { - string wordStem ( word.begin ( ), substrELI + 1 ); - wordStem += 'e'; - } - // OUSLI -> OUS - // analogousli -> analogous - else if ( *substrOUSLI != '\0' ) - { - string wordStem ( word.begin ( ), substrOUSLI + 1 ); - wordStem += 'ous'; - } - // IZATION -> IZE - // vietnamization -> vietnamize - else if ( *substrIZATION != '\0' ) - { - string wordStem ( word.begin ( ), substrIZATION + 1 ); - wordStem += 'ize'; - } - // ATION -> ATE - // predication -> predicate - else if ( *substrATION != '\0' ) - { - string wordStem ( word.begin ( ), substrATION + 1 ); - wordStem += 'ate'; - } - // ATOR -> ATE - // predication -> predicate - else if ( *substrATOR != '\0' ) - { - string wordStem ( word.begin ( ), substrATOR + 1 ); - wordStem += 'ate'; - } - // ALISM -> AL - // feudalism -> feudal - else if ( *substrALISM != '\0' ) - { - string wordStem ( word.begin ( ), substrALISM + 1 ); - wordStem += 'al'; - } - // IVENESS -> IVE - // decisivenss -> decisive - else if ( *substrIVENESS != '\0' ) - { - string wordStem ( word.begin ( ), substrIVENESS + 1 ); - wordStem += 'ive'; - } - // FULNESS -> FUL - // hopefulness -> hopeful - else if ( *substrFULNESS != '\0' ) - { - string wordStem ( word.begin ( ), substrFULNESS + 1 ); - wordStem += 'ful'; - } - // OUSNESS -> OUS - // callousness -> callous - else if ( *substrOUSNESS != '\0' ) - { - string wordStem ( word.begin ( ), substrOUSNESS + 1 ); - wordStem += 'ous'; - } - // ALITI -> AL - // formalit -> callous - else if ( *substrOUSNESS != '\0' ) - { - string wordStem ( word.begin ( 
), substrOUSNESS + 1 ); - wordStem += 'al'; - } - // IVITI -> IVE - // sensitiviti -> sensitive - else if ( *substrIVITI != '\0' ) - { - string wordStem ( word.begin ( ), substrIVITI + 1 ); - wordStem += 'ive'; - } - // BILITI -> BLE - // sensibiliti -> sensible - else if ( *substrBILITI != '\0' ) - { - string wordStem ( word.begin ( ), substrBILITI + 1 ); - wordStem += 'ble'; - } - - return wordStem; - } + string step2 ( std::string word ); /** * Step 3 @@ -541,75 +123,7 @@ class Stemmer * @param word * @return */ - std::string step3 ( std:: string word ) - { - - if ( measure ( word ) == 0 ) - { - return word; - } - - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), word.end ( ) ); - - auto substrICATE = findPrev ( "icate", endPtr ); - auto substrATIVE = findPrev ( "ative", endPtr ); - auto substrALIZE = findPrev ( "alize", endPtr ); - auto substrICITI = findPrev ( "iciti", endPtr ); - auto substrICAL = findPrev ( "ical", endPtr ); - auto substrFUL = findPrev ( "ful", endPtr ); - auto substrNESS = findPrev ( "ness", endPtr ); - - // ICATE -> IC - // triplicate -> triplic - if ( *substrICATE != '\0' ) - { - string wordStem ( word.begin ( ), substrICATE + 1 ); - wordStem + 'ic'; - } - // ATIVE -> - // formative -> form - else if ( *substrATIVE != '\0' ) - { - string wordStem ( word.begin ( ), substrATIVE + 1 ); - } - // ALIZE -> AL - // formalize -> formal - else if ( *substrALIZE != '\0' ) - { - string wordStem ( word.begin ( ), substrALIZE + 1 ); - wordStem += 'al'; - } - // ICITI -> IC - // electriciti -> electric - else if ( *substrICITI != '\0' ) - { - string wordStem ( word.begin ( ), substrICITI + 1 ); - wordStem += 'ic'; - } - // ICAL -> IC - // electrical -> electric - else if ( *substrICAL != '\0' ) - { - string wordStem ( word.begin ( ), substrICAL + 1 ); - wordStem += 'ic'; - } - // FUL -> - // hopeful -> hope - else if ( *substrFUL != '\0' ) - { - string wordStem ( word.begin ( ), substrFUL 
+ 1 ); - } - // NESS -> - // goodness -> good - else if ( *substrNESS != '\0' ) - { - string wordStem ( word.begin ( ), substrNESS + 1 ); - } - - return wordStem; - } + std::string step3 ( std::string word ); /** * Step 4 @@ -617,188 +131,23 @@ class Stemmer * @param word * @return */ - std::string step4( std::string word ) - { - if ( measure ( word ) <= 1 ) - { - return word; - } - - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; - string wordStem ( word.begin ( ), word.end ( ) ); - - - auto substrAL = findPrev ( "al", endPtr ); - auto substrANCE = findPrev ( "ance", endPtr ); - auto substrENCE = findPrev ( "ence", endPtr ); - auto substrER = findPrev ( "er", endPtr ); - auto substrIC = findPrev ( "ic", endPtr ); - auto substrABLE = findPrev ( "able", endPtr ); - auto substrIBLE = findPrev ( "ible", endPtr ); - auto substrANT = findPrev ( "ant", endPtr ); - auto substrEMENT = findPrev ( "ement", endPtr ); - auto substrMENT = findPrev ( "ment", endPtr ); - auto substrENT = findPrev ( "ent", endPtr ); - auto substrTION = findPrev ( "tion", endPtr ); - auto substrOU = findPrev ( "ou", endPtr ); - auto substrISM = findPrev ( "ism", endPtr ); - auto substrATE = findPrev ( "ate", endPtr ); - auto substrITI = findPrev ( "iti", endPtr ); - auto substrOUS = findPrev ( "ous", endPtr ); - auto substrIVE = findPrev ( "ive", endPtr ); - auto substrIZE = findPrev ( "ize", endPtr ); - - // AL -> - // revival -> reviv - if ( *substrAL != '\0' ) - { - string wordStem ( word.begin ( ), substrAL + 1 ); - } - // ANCE -> - // allowance -> allow - else if ( *substrANCE != '\0' ) - { - string wordStem ( word.begin ( ), substrANCE + 1 ); - } - // ENCE -> - // inference -> infer - else if ( *substrENCE != '\0' ) - { - string wordStem ( word.begin ( ), substrENCE + 1 ); - } - // ER -> - // airliner -> airlin - else if ( *substrER != '\0' ) - { - string wordStem ( word.begin ( ), substrER + 1 ); - } - // IC -> - // gyroscopic -> gyroscope - else if ( *substrIC 
!= '\0' ) - { - string wordStem ( word.begin ( ), substrIC + 1 ); - } - // ABLE -> - // adjustable -> adjust - else if ( *substrABLE != '\0' ) - { - string wordStem ( word.begin ( ), substrABLE + 1 ); - } - // IBLE -> - // goodness -> good - else if ( *substrIBLE != '\0' ) - { - string wordStem ( word.begin ( ), substrIBLE + 1 ); - } - // ANT -> - // irritant -> irrit - else if ( *substrANT != '\0' ) - { - string wordStem ( word.begin ( ), substrANT + 1 ); - } - // EMENT -> - // replacement -> replace - else if ( *substrEMENT != '\0' ) - { - string wordStem ( word.begin ( ), substrEMENT + 1 ); - } - // MENT -> - // adjustment -> adjust - else if ( *substrMENT != '\0' ) - { - string wordStem ( word.begin ( ), substrMENT + 1 ); - } - // ENT -> - // dependent -> depend - else if ( *substrENT != '\0' ) - { - string wordStem ( word.begin ( ), substrENT + 1 ); - } - // TION -> - // stem must end in 't' or 's' - // adoption -> adopt - else if ( *substrTION != '\0' && ( *( substrTION - 1) == 's' || *( substrTION - 1) == 't' ) ) - { - string wordStem ( word.begin ( ), substrTION + 1 ); - } - // OU -> - // homologou -> homolog - else if ( *substrOU != '\0' ) - { - string wordStem ( word.begin ( ), substrOU + 1 ); - } - // ISM -> - // communism -> commun - else if ( *substrISM != '\0' ) - { - string wordStem ( word.begin ( ), substrISM + 1 ); - } - // ATE -> - // activate -> activ - else if ( *substrATE != '\0' ) - { - string wordStem ( word.begin ( ), substrATE + 1 ); - } - // ITI -> - // angulariti -> angular - else if ( *substrITI != '\0' ) - { - string wordStem ( word.begin ( ), substrITI + 1 ); - } - // OUS -> - // homologous -> homolog - else if ( *substrOUS != '\0' ) - { - string wordStem ( word.begin ( ), substrOUS + 1 ); - } - // IVE -> - // effective -> effect - else if ( *substrIVE != '\0' ) - { - string wordStem ( word.begin ( ), substrIVE + 1 ); - } - // IZE -> - // bowdlerize -> bowdler - else if ( *substrIZE != '\0' ) - { - string wordStem ( word.begin ( ), 
substrIZE + 1 ); - } - return wordStem; - - } - - std::string step5a ( std::string word ) - { - unsigned long end = word.size ( ) - 1; - auto endPtr = word.begin ( ) + end; + std::string step4 ( std::string word ); - // E -> - // probabte -> probat - if ( measure ( word ) > 1 && *endPtr == 'e' ) - { - word = word.substr ( 0, word.size ( ) - 1 ); - return word; - } - // E -> - // cease -> cease - if ( measure ( word ) == 1 && !endCVC ( word ) && *endPtr == 'e') - { - word = word.substr ( 0, word.size ( ) - 1 ); - return word; - } - return word; - } - - std::string step5b ( std::string word ) - { -/** - * Step 5b -(m > 1 and *d and *L) -> single letter controll -> control - roll -> roll + /** + * Step 5a + * + * @param word + * @return + */ + std::string step5a ( std::string word ); - */ - } + /** + * Step 5b + * + * @param word + * @return + */ + std::string step5b ( std::string word ); }; diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index 5c4f472311a6f08a62acb4069da6f5b1f24e96eb..76f45b2798061c76ed752852157fe5bfd5fa363f 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -28,6 +28,8 @@ unordered_map< string, vector< int>> *Tokenizer::get ( ) const void Tokenizer::execute ( string originalText, int offset ) { vector< string > splitText = splitStr ( originalText, ' ' ); + //TODO make function to remove characters + //TODO normalize contractions string lowerString = ""; for ( int i = 0; i < splitText.size ( ); ++i ) { diff --git a/util/stringProcessing.cpp b/util/stringProcessing.cpp index aaf6743512c4fffe16b347dd43c2fc6dad2d752a..14e1cf1ab948be895780449b7594d9d603b7362a 100644 --- a/util/stringProcessing.cpp +++ b/util/stringProcessing.cpp @@ -3,10 +3,18 @@ // #include "stringProcessing.h" +#include "Stemmer.h" using namespace std; - +/** + * Finds the needle in the haystack + * returns position of first match + * + * @param haystack + * @param needle + * @return string::iterator + */ string::iterator findStr (string needle, string haystack ) { @@ 
-54,7 +62,13 @@ string::iterator findStr (string needle, string haystack ) } - +/** + * Finds the next position of the needle in the string + * + * @param needle + * @param pointer + * @return string::iterator + */ string::iterator findNext (string needle, string::iterator haystackPointer ) { auto beginNeedle = needle.begin ( ); @@ -99,6 +113,13 @@ string::iterator findNext (string needle, string::iterator haystackPointer ) return beginHaystack; } +/** + * Finds the previous position of the needle in the string + * + * @param needle + * @param haystackPointer + * @return + */ string::iterator findPrev ( string needle, string::iterator haystackPointer ) { auto beginNeedle = needle.begin ( ); @@ -143,7 +164,13 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer ) return beginHaystack; } - +/** + * Returns a vector of strings from @originalText, split by @delim + * + * @param originalText + * @param delim + * @return + */ vector< string > splitStr ( string originalText, char delim ) { vector< string > splitWords; @@ -166,14 +193,24 @@ vector< string > splitStr ( string originalText, char delim ) } - +/** + * Returns true if @word is a stopword + * + * @param word + * @return + */ bool isStopWord ( string word ) { return ( stopWords.find ( word ) != stopWords.end ( ) ); } - +/** + * Returns lowercase @word + * + * @param word + * @return + */ string toLower ( string word ) { auto iter = word.begin ( ); @@ -195,9 +232,15 @@ string toLower ( string word ) return lowerWord; } - - +/** + * Returns stemmed @word + * + * @param word + * @return + */ string stemWord(string word) { - return ""; + Stemmer stemmer; + word = stemmer.execute ( word ); + return word; } diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 9337dc58c5d2807d7b1d5bb983d00bd851110387..4de025e504da9c03479835d1cf09b5c96a6d876e 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -14,8 +14,6 @@ using namespace std; /** * Set of stopwords */ - - 
static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how", @@ -29,6 +27,7 @@ static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", /** * Finds the needle in the haystack * returns position of first match + * * @param haystack * @param needle * @return string::iterator @@ -37,6 +36,7 @@ string::iterator findStr ( string needle, string haystack ); /** * Finds the next position of the needle in the string + * * @param needle * @param pointer * @return string::iterator @@ -45,6 +45,7 @@ string::iterator findNext ( string needle, string::iterator haystackPointer ); /** * Finds the previous position of the needle in the string + * * @param needle * @param haystackPointer * @return @@ -53,6 +54,7 @@ string::iterator findPrev ( string needle, string::iterator haystackPointer ); /** * Returns a vector of strings from @originalText, split by @delim + * * @param originalText * @param delim * @return @@ -61,6 +63,7 @@ vector< string > splitStr ( string originalText, char delim ); /** * Returns true if @word is a stopword + * * @param word * @return */ @@ -68,14 +71,15 @@ bool isStopWord ( string word ); /** * Returns lowercase @word + * * @param word * @return */ string toLower ( string word ); -//TODO /** * Returns stemmed @word + * * @param word * @return */ diff --git a/util/tests/stemmerTest.cpp b/util/tests/stemmerTest.cpp index f942e1a5f50d386c5b46084c891c94372295b9c7..aaa8053a8c127b05b10a9e24ee103925b1065945 100644 --- a/util/tests/stemmerTest.cpp +++ b/util/tests/stemmerTest.cpp @@ -1,4 +1,18 @@ -// -// Created by Veronica Day on 2/22/18. 
-// + +#include <string> +#include <vector> +#include "../stringProcessing.h" +#include <iostream> +#include <cassert> + +int main ( ) + { + + cout << "Beginning testing for Stemmer" << endl << endl; + + assert ( false ); + + cout << "\nTests passed for Stemmer :D" << endl; + + } diff --git a/util/tests/stringProcessingTest.cpp b/util/tests/stringProcessingTest.cpp index 8e35e1449805215f6bedc76f06df9e15fdcf300e..f54f4ab5cc0c4858200d6fb3bf55036f8fbd0025 100644 --- a/util/tests/stringProcessingTest.cpp +++ b/util/tests/stringProcessingTest.cpp @@ -1,6 +1,3 @@ -// -// Created by Veronica Day on 2/13/18. -// #include <string> #include <vector> @@ -25,7 +22,7 @@ void testFindPrev ( ); int main ( ) { - cout << "Beginning testing for StringProcessing_unit" << endl << endl; + cout << "Beginning testing for StringProcessing" << endl << endl; string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. " "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," @@ -38,7 +35,7 @@ int main ( ) testFindNext ( ); testFindPrev ( ); - cout << "\nTests passed for StringProcessing_unit :D" << endl; + cout << "\nTests passed for StringProcessing :D" << endl; } diff --git a/util/tests/tokenizerTest.cpp b/util/tests/tokenizerTest.cpp index 0ccb13becf22d4d9db5ff86597829d47b90bbd38..891c00509ef9fca7acacf8c1d8937ced14c6fab7 100644 --- a/util/tests/tokenizerTest.cpp +++ b/util/tests/tokenizerTest.cpp @@ -1,6 +1,3 @@ -// -// Created by Veronica Day on 2/13/18. -// #include <string> #include <vector> @@ -31,7 +28,7 @@ int main ( ) void testExecute ( string original ) { Tokenizer myTokenizer; - myTokenizer.execute ( original ); + myTokenizer.execute ( original, 0 ); auto dict = myTokenizer.get ( );