Merge branch 'milestone1' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into milestone1

a4133948 · jsclose · 1c16250a · e73ff17c · a4133948 · a4133948
Commit a4133948 authored 7 years ago by jsclose
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
-//
-// Created by Jake Close on 3/5/18.
-//
-
-

 #include "Parser.h"

@@ -60,7 +55,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 * @param word
 * @return
 */
-string Parser::extract_url ( string word )
+string Parser::extract_url ( string & word )
 	{
 	string url = "";
 	if ( *findStr ( "<a", word ) != '\0' )

--- a/parser/Parser.h
+++ b/parser/Parser.h
-//
-// Created by Veronica Day on 1/28/18.
-//
-
-// keep running count of offset, if stop word: don't incrememnt and remove stopword
-// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
-//

 #pragma once
 #include <string>
@@ -65,7 +58,7 @@ private:
 	 * @param word
 	 * @return
 	 */
-	string extract_url ( string word );
+	string extract_url ( string & word );


 	/**

--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
-//
-// Created by anvia on 2/6/2018.
-//

 #include <string>
 #include <cassert>
@@ -38,10 +35,10 @@ void testSimple ( )
 	assert ( dictionary != nullptr );
 	assert ( dictionary->size () == 2);
 	assert ( dictionary->find ( "cat" ) != dictionary->end () );
-	assert ( dictionary->find ( "title" ) != dictionary->end () );
+	assert ( dictionary->find ( "titl" ) != dictionary->end () );
 	assert ( dictionary->find ( "this" ) == dictionary->end () );
 	assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
-	assert ( dictionary->at ( "title" )[ 0 ] == 1 );
+	assert ( dictionary->at ( "titl" )[ 0 ] == 1 );

 	delete dictionary;

@@ -52,37 +49,34 @@ void testComplex ( )
 	ProducerConsumerQueue < string >  urlFrontierTest;
 	ifstream file("../tests/cats.html");
 	string temp;
-	char docString[10240];
-	strcpy(docString, "<title>Joe the Cat</title>\n");
-
-	strcat(docString, "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n");
-	while(std::getline(file, temp)) {
-		//strcat(docString, str(temp));
+	string docString = "<title>Joe the Cat</title>\n";
+	docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
+	while ( std::getline ( file, temp ) )
+		{
+		docString += temp;
 		}
+
 	ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html");
-	Document document ( url, docString );
+	char * writable = new char[docString.size( ) + 1];
+	std::copy(docString.begin( ), docString.end( ), writable);
+	writable[ docString.size( ) ] = '\0';
+
+	Document document ( url, writable );

 	Parser parser ( &urlFrontierTest );
 	auto dictionary = parser.execute ( &document );

-//	cout << dictionary->size () << endl;
-//	for (auto p : *dictionary)
-//		cout << p.first << endl;
-
 	assert ( dictionary != nullptr );
 	assert ( dictionary->size () == 3);

 	assert ( dictionary->find ( "cat" ) != dictionary->end () );
-	assert ( dictionary->find ( "story" ) != dictionary->end () );
+	assert ( dictionary->find ( "stori" ) != dictionary->end () );
 	assert ( dictionary->find ( "joe" ) != dictionary->end () );

 	assert ( dictionary->find ( "the" ) == dictionary->end () );
 	assert ( dictionary->find ( "of" ) == dictionary->end () );

-//	assert ( dictionary->at ( "cat" )[ 0 ] == 1 );
-//	assert ( dictionary->at ( "story" )[ 0 ] == 0 );
-//	cout << urlFrontierTest->Size () << endl;
-//	cout << urlFrontierTest->Pop () << endl;
 	delete dictionary;
+	delete[] writable;

 	}
\ No newline at end of file
--- a/shared/Document.cpp
+++ b/shared/Document.cpp
-//
-// Created by Jake Close on 3/5/18.
-//
-
-

 #include "Document.h"


--- a/util/Stemmer.cpp
+++ b/util/Stemmer.cpp
+
+#include "Stemmer.h"
+#include "stringProcessing.h"
+
+/**
+ * Default constructor: there is nothing to initialise.
+ */
+Stemmer::Stemmer ( ) = default;
+
+/**
+ * Returns the stem of a word by running it through the steps in order:
+ * 1a, 1b, 1c, 2, 3, 4, 5a, 5b.
+ *
+ * Words of at most two characters (including the empty string) are
+ * returned untouched, as the reference Porter implementation does. The
+ * steps compute `word.size( ) - 1` and search positions such as
+ * `begin + size - 5`, neither of which is meaningful for such input.
+ *
+ * @param word token to stem; the steps compare against lowercase letters only
+ * @return the stemmed word
+ */
+std::string Stemmer::execute ( std::string word )
+	{
+	if ( word.size( ) <= 2 )
+		{
+		return word;
+		}
+
+	word = step1a( word );
+	word = step1b( word );
+	word = step1c( word );
+	word = step2( word );
+	word = step3( word );
+	word = step4( word );
+	word = step5a( word );
+	word = step5b( word );
+	return word;
+	}
+
+/**
+ * Number of vowel-run / consonant-run pairs (VC) in the word, ignoring an
+ * optional leading consonant run and an optional trailing vowel run.
+ *
+ * <c><v>       -> 0
+ * <c>vc<v>     -> 1
+ * <c>vcvc<v>   -> 2
+ * <c>vcvcvc<v> -> 3
+ *
+ * The empty word has measure 0.
+ *
+ * @param word word to measure
+ * @return number of VC pairs
+ */
+int Stemmer::measure ( std::string word )
+	{
+	const std::string::size_type length = word.size( );
+	auto first = word.begin( );
+	std::string::size_type pos = 0;
+	int m = 0;
+
+	// Skip the optional leading consonant run <c>
+	while ( pos < length && isConsonant( first + pos, first ) )
+		{
+		++pos;
+		}
+	if ( pos >= length )
+		{
+		return m;
+		}
+	// pos is on the first vowel; step past it
+	++pos;
+
+	while ( pos < length )
+		{
+		// Consume the rest of the current vowel run
+		while ( pos < length && !isConsonant( first + pos, first ) )
+			{
+			++pos;
+			}
+		if ( pos >= length )
+			{
+			return m;
+			}
+		// A consonant closed the vowel run: that is one more VC pair
+		++pos;
+		++m;
+
+		// Consume the rest of the consonant run
+		while ( pos < length && isConsonant( first + pos, first ) )
+			{
+			++pos;
+			}
+		if ( pos >= length )
+			{
+			return m;
+			}
+		// pos is on the vowel that opens the next run; step past it
+		++pos;
+		}
+	return m;
+	}
+
+/**
+ * Check if a vowel is present in the range [wordBeg, wordEnd).
+ *
+ * wordBeg is used as the start-of-word anchor for the 'y' rule in
+ * isConsonant. Every caller in this file passes the word's own begin( ),
+ * so the anchor and the scanned iterators belong to the same string.
+ *
+ * @param wordBeg first position to examine; also treated as the word start
+ * @param wordEnd one past the last position to examine
+ * @param word    unused; kept so existing callers keep compiling. It is a
+ *                by-value copy, so its iterators must not be mixed with
+ *                wordBeg / wordEnd.
+ * @return true if any position in the range holds a vowel
+ */
+bool Stemmer::isVowelPresent ( string::iterator wordBeg, string::iterator wordEnd, string word )
+	{
+	( void ) word;
+
+	const string::iterator wordStart = wordBeg;
+	for ( ; wordBeg != wordEnd; ++wordBeg )
+		{
+		if ( !isConsonant( wordBeg, wordStart ) )
+			{
+			return true;
+			}
+		}
+	return false;
+	}
+
+/**
+ * Tells whether the letter at wordIt is a consonant.
+ *
+ * a, e, i, o, u are vowels. 'y' is a consonant at the start of the word;
+ * anywhere else it is a consonant only if the letter before it is not.
+ * Every other character counts as a consonant.
+ *
+ * @param wordIt    position of the letter to classify
+ * @param wordBegin start of the same word (stops the look-behind for 'y')
+ * @return true if the letter is a consonant
+ */
+bool Stemmer::isConsonant ( string::iterator wordIt, string::iterator wordBegin )
+	{
+	switch ( *wordIt )
+		{
+		case 'a':
+		case 'e':
+		case 'i':
+		case 'o':
+		case 'u':
+			return false;
+
+		case 'y':
+			// Leading y is a consonant; otherwise the opposite of its predecessor
+			return wordIt == wordBegin || !isConsonant( wordIt - 1, wordBegin );
+
+		default:
+			return true;
+		}
+	}
+
+/**
+ * Returns true if an 'e' should be appended, i.e. findPrev reports a match
+ * for one of the endings below:
+ *
+ * AT -> ATE
+ * BL -> BLE
+ * IZ -> IZE
+ *
+ * NOTE(review): for words shorter than three characters the third
+ * findPrev argument lies before begin( ); how findPrev copes is not
+ * visible here - confirm.
+ *
+ * @param word stem to inspect
+ * @return true if the stem ends in "at", "bl" or "iz"
+ */
+bool Stemmer::addE ( string word )
+	{
+	auto first = word.begin( );
+	auto last = first + ( word.size( ) - 1 );
+
+	auto hitAT = findPrev( "at", last, first + word.size( ) - 3 );
+	auto hitBL = findPrev( "bl", last, first + word.size( ) - 3 );
+	auto hitIZ = findPrev( "iz", last, first + word.size( ) - 3 );
+
+	// A hit is signalled by an iterator that does not point at '\0'
+	return *hitAT != '\0' || *hitBL != '\0' || *hitIZ != '\0';
+	}
+
+/**
+ * Returns true if the word ends in a double consonant
+ * other than LL, SS or ZZ.
+ *
+ * The doubled letter must actually be a consonant, so endings such as
+ * "ee" do not qualify. Words of two characters or fewer never qualify.
+ *
+ * @param word stem to inspect
+ * @return true if the last two letters are the same consonant and that
+ *         consonant is not l, s or z
+ */
+bool Stemmer::doubleCon ( string word )
+	{
+	// Check the size first so the end iterator below is always valid
+	if ( word.size( ) <= 2 )
+		{
+		return false;
+		}
+
+	auto last = word.end( ) - 1;
+	if ( *last != *( last - 1 ) || !isConsonant( last, word.begin( ) ) )
+		{
+		return false;
+		}
+
+	return *last != 'l' && *last != 's' && *last != 'z';
+	}
+
+/**
+ * Returns true if the stem ends in a
+ * Consonant, Vowel, Consonant pattern
+ * Except when second C is W, X, or Y
+ *
+ * The final character of `word` is not part of the pattern: the three
+ * letters before it are examined. Callers in this file pass either
+ * stem + 'e' or a word they also test for a trailing 'e'.
+ *
+ * @param word stem followed by one extra trailing character
+ * @return true if the three letters before the last one are CVC and the
+ *         second consonant is not w, x or y; false for words of three
+ *         characters or fewer
+ */
+bool Stemmer::endCVC ( std::string word )
+	{
+	if ( word.size( ) <= 3 )
+		{
+		return false;
+		}
+
+	auto start = word.begin( );
+	// Second consonant of the pattern = last letter before the trailing character
+	auto secondC = word.end( ) - 2;
+
+	const bool endsCVC = isConsonant( secondC, start ) && !isConsonant( secondC - 1, start ) &&
+	                     isConsonant( secondC - 2, start );
+
+	// The exclusion applies to the second consonant, not to the vowel before it
+	return endsCVC && *secondC != 'w' && *secondC != 'x' && *secondC != 'y';
+	}
+
+/**
+ * Step 1a: stem plural words.
+ *
+ * sses -> ss   caresses -> caress
+ * ies  -> i    ponies   -> poni
+ * ss   -> ss   caress   -> caress
+ * s    ->      cats     -> cat
+ *
+ * Words that do not end in 's' (including the empty word) are returned
+ * unchanged.
+ *
+ * NOTE(review): for words shorter than the suffix being tried, the third
+ * findPrev argument lies before begin( ); how findPrev copes is not
+ * visible here - confirm.
+ *
+ * @param word word to stem
+ * @return word with its plural ending rewritten
+ */
+std::string Stemmer::step1a ( std::string word )
+	{
+	// Testing emptiness first avoids word.size( ) - 1 wrapping around
+	if ( word.empty( ) || word.back( ) != 's' )
+		{
+		return word;
+		}
+
+	auto begPtr = word.begin( );
+	auto endPtr = word.end( ) - 1;
+	string wordStem( word.begin( ), word.end( ) );
+
+	auto substrSSES = findPrev( "sses", endPtr, begPtr + word.size( ) - 5 );
+	auto substrIES = findPrev( "ies", endPtr, begPtr + word.size( ) - 4 );
+	auto substrSS = findPrev( "ss", endPtr, begPtr + word.size( ) - 3 );
+	auto substrS = findPrev( "s", endPtr, begPtr + word.size( ) - 2 );
+
+	if ( *substrSSES != '\0' )
+		{
+		// sses -> ss
+		wordStem = subStr( word.begin( ), substrSSES );
+		wordStem += "ss";
+		}
+	else if ( *substrIES != '\0' )
+		{
+		// ies -> i
+		wordStem = subStr( word.begin( ), substrIES );
+		wordStem += 'i';
+		}
+	else if ( *substrSS != '\0' )
+		{
+		// ss -> ss: leave the word alone
+		}
+	else if ( *substrS != '\0' )
+		{
+		// s -> (removed)
+		wordStem = subStr( word.begin( ), substrS );
+		}
+	else
+		{
+		// No suffix reported by findPrev: keep the whole word
+		wordStem = subStr( word.begin( ), word.end( ) );
+		}
+
+	return wordStem;
+	}
+
+/**
+ * Step 1b: stem EED, ED and ING.
+ *
+ * EED -> EE   when measure( word ) > 1           agreed    -> agree, feed -> feed
+ * ED  ->      when measure( word ) > 1 and the
+ *             part before ED contains a vowel    plastered -> plaster, bled -> bled
+ * ING ->      when the part before ING contains
+ *             a vowel                            motoring  -> motor, sing -> sing
+ *
+ * The measure in the EED / ED conditions is taken on the whole word,
+ * suffix included. After ED or ING is removed the stem is tidied up:
+ * an 'e' is appended after AT / BL / IZ, a double consonant (other than
+ * l, s, z) loses its last letter, and a stem with measure 1 ending in
+ * consonant-vowel-consonant gets an 'e'.
+ *
+ * The empty word is returned unchanged.
+ *
+ * @param word word to stem
+ * @return word with EED / ED / ING handled
+ */
+std::string Stemmer::step1b ( std::string word )
+	{
+	// Guard against word.size( ) - 1 wrapping around
+	if ( word.empty( ) )
+		{
+		return word;
+		}
+
+	auto begPtr = word.begin( );
+	auto endPtr = word.end( ) - 1;
+
+	auto substrEED = findPrev( "eed", endPtr, begPtr + word.size( ) - 4 );
+	auto substrED = findPrev( "ed", endPtr, begPtr + word.size( ) - 3 );
+	auto substrING = findPrev( "ing", endPtr, begPtr + word.size( ) - 4 );
+
+	// Clean-up shared by the ED and ING rules
+	auto tidyStem = [ this, &word ] ( string stem ) -> string
+		{
+		if ( addE( stem ) )
+			{
+			stem += 'e';
+			}
+		else if ( doubleCon( stem ) )
+			{
+			stem = subStr( word, 0, stem.size( ) - 1 );
+			}
+		// The measure is taken on the stem; the word itself always has
+		// measure > 1 in the ED branch, so testing it there never fired
+		else if ( measure( stem ) == 1 && endCVC( stem + 'e' ) )
+			{
+			stem += 'e';
+			}
+		return stem;
+		};
+
+	string wordStem( word.begin( ), word.end( ) );
+	const int wordMeasure = measure( word );
+
+	if ( wordMeasure > 1 && *substrEED != '\0' )
+		{
+		wordStem = subStr( word.begin( ), substrEED );
+		wordStem += "ee";
+		}
+	else if ( wordMeasure > 1 && *substrED != '\0' && isVowelPresent( word.begin( ), substrED, word ) )
+		{
+		wordStem = subStr( word.begin( ), substrED );
+		wordStem = tidyStem( wordStem );
+		}
+	else if ( *substrING != '\0' && isVowelPresent( word.begin( ), substrING, word ) )
+		{
+		wordStem = subStr( word.begin( ), substrING );
+		wordStem = tidyStem( wordStem );
+		}
+
+	return wordStem;
+	}
+
+/**
+ * Step 1c: turns a final Y into I when the rest of the word has a vowel.
+ *
+ * happy -> happi
+ * sky   -> sky
+ *
+ * The empty word is returned unchanged.
+ *
+ * @param word word to stem
+ * @return word with a trailing 'y' replaced by 'i' where the rule applies
+ */
+string Stemmer::step1c ( string word )
+	{
+	// Testing emptiness first avoids dereferencing begin( ) + ( size - 1 )
+	if ( word.empty( ) || word.back( ) != 'y' )
+		{
+		return word;
+		}
+
+	auto lastLetter = word.end( ) - 1;
+	if ( isVowelPresent( word.begin( ), lastLetter, word ) )
+		{
+		word = subStr( word, 0, word.size( ) - 1 );
+		word += 'i';
+		}
+	return word;
+	}
+
+/**
+ * Step 2: maps a double suffix onto a simpler one.
+ *
+ * Words with measure 0 (and the empty word) are returned unchanged.
+ * Otherwise the rules in the table below are tried in order and the first
+ * one whose suffix findPrev reports is applied.
+ *
+ * For each suffix the third findPrev argument is the position one
+ * character before where the suffix would start.
+ * NOTE(review): for words shorter than that, the position lies before
+ * begin( ); how findPrev copes is not visible here - confirm.
+ *
+ * @param word word to stem
+ * @return word with the matching suffix replaced, or the word itself
+ */
+string Stemmer::step2 ( std::string word )
+	{
+	if ( word.empty( ) || measure( word ) == 0 )
+		{
+		return word;
+		}
+
+	struct SuffixRule
+		{
+		std::string suffix;
+		std::string replacement;
+		// Rule is skipped when exactly one letter precedes the suffix
+		bool skipAfterSingleLetter;
+		};
+
+	static const SuffixRule rules[ ] =
+		{
+		{ "ational", "ate",  true  },   // relational     -> relate
+		{ "tional",  "tion", false },   // conditional    -> condition
+		{ "enci",    "ence", false },   // valenci        -> valence
+		{ "anci",    "ance", false },   // hesitanci      -> hesitance
+		{ "izer",    "ize",  false },   // digitizer      -> digitize
+		{ "abli",    "able", false },   // conformabli    -> conformable
+		{ "alli",    "al",   false },   // radicalli      -> radical
+		{ "entli",   "ent",  false },   // differentli    -> different
+		{ "eli",     "e",    false },   // vileli         -> vile
+		{ "ousli",   "ous",  false },   // analogousli    -> analogous
+		{ "ization", "ize",  false },   // vietnamization -> vietnamize
+		{ "ation",   "ate",  false },   // predication    -> predicate
+		{ "ator",    "ate",  false },   // operator       -> operate
+		{ "alism",   "al",   false },   // feudalism      -> feudal
+		{ "iveness", "ive",  false },   // decisiveness   -> decisive
+		{ "fulness", "ful",  false },   // hopefulness    -> hopeful
+		{ "ousness", "ous",  false },   // callousness    -> callous
+		{ "aliti",   "al",   false },   // formaliti      -> formal
+		{ "iviti",   "ive",  false },   // sensitiviti    -> sensitive
+		{ "biliti",  "ble",  false },   // sensibiliti    -> sensible
+		};
+
+	auto begPtr = word.begin( );
+	auto endPtr = word.end( ) - 1;
+
+	for ( const SuffixRule & rule : rules )
+		{
+		const auto window = static_cast< std::string::difference_type >( rule.suffix.size( ) + 1 );
+		auto match = findPrev( rule.suffix, endPtr, begPtr + word.size( ) - window );
+
+		// An iterator pointing at '\0' means the suffix was not found
+		if ( *match == '\0' )
+			{
+			continue;
+			}
+		if ( rule.skipAfterSingleLetter && ( begPtr + 1 ) == match )
+			{
+			continue;
+			}
+
+		string wordStem = subStr( word.begin( ), match );
+		wordStem += rule.replacement;
+		return wordStem;
+		}
+
+	return word;
+	}
+
+/**
+ * Step 3: strips or shortens -icate, -ative, -alize, -iciti, -ical,
+ * -ful and -ness.
+ *
+ * Words with measure 0 (and the empty word) are returned unchanged.
+ * Otherwise the rules in the table below are tried in order and the first
+ * one whose suffix findPrev reports is applied.
+ *
+ * For each suffix the third findPrev argument is the position one
+ * character before where the suffix would start; "ical" now follows the
+ * same convention as every other suffix.
+ * NOTE(review): for words shorter than that, the position lies before
+ * begin( ); how findPrev copes is not visible here - confirm.
+ *
+ * @param word word to stem
+ * @return word with the matching suffix replaced, or the word itself
+ */
+std::string Stemmer::step3 ( std::string word )
+	{
+	if ( word.empty( ) || measure( word ) == 0 )
+		{
+		return word;
+		}
+
+	struct SuffixRule
+		{
+		std::string suffix;
+		std::string replacement;
+		};
+
+	static const SuffixRule rules[ ] =
+		{
+		{ "icate", "ic" },   // triplicate  -> triplic
+		{ "ative", ""   },   // formative   -> form
+		{ "alize", "al" },   // formalize   -> formal
+		{ "iciti", "ic" },   // electriciti -> electric
+		{ "ical",  "ic" },   // electrical  -> electric
+		{ "ful",   ""   },   // hopeful     -> hope
+		{ "ness",  ""   },   // goodness    -> good
+		};
+
+	auto begPtr = word.begin( );
+	auto endPtr = word.end( ) - 1;
+
+	for ( const SuffixRule & rule : rules )
+		{
+		const auto window = static_cast< std::string::difference_type >( rule.suffix.size( ) + 1 );
+		auto match = findPrev( rule.suffix, endPtr, begPtr + word.size( ) - window );
+
+		// An iterator pointing at '\0' means the suffix was not found
+		if ( *match == '\0' )
+			{
+			continue;
+			}
+
+		string wordStem = subStr( word.begin( ), match );
+		wordStem += rule.replacement;
+		return wordStem;
+		}
+
+	return word;
+	}
+
+/**
+ * Step 4: removes a residual suffix.
+ *
+ * Words whose measure is 2 or less are returned unchanged. Otherwise the
+ * suffixes in the table below are tried in order and the first one that
+ * findPrev reports is cut off. "ion" is only cut when the letter before
+ * it is 's' or 't'; if it is not, the remaining suffixes are still tried.
+ *
+ * @param word word to stem
+ * @return word without the matching suffix, or the word itself
+ */
+std::string Stemmer::step4 ( std::string word )
+	{
+	if ( measure( word ) <= 2 )
+		{
+		return word;
+		}
+
+	struct SuffixRule
+		{
+		std::string suffix;
+		// Suffix is only removed when preceded by 's' or 't'
+		bool needsSOrTBefore;
+		};
+
+	static const SuffixRule rules[ ] =
+		{
+		{ "al",    false },   // revival     -> reviv
+		{ "ance",  false },   // allowance   -> allow
+		{ "ence",  false },   // inference   -> infer
+		{ "er",    false },   // airliner    -> airlin
+		{ "ic",    false },   // gyroscopic  -> gyroscop
+		{ "able",  false },   // adjustable  -> adjust
+		{ "ible",  false },   // defensible  -> defens
+		{ "ant",   false },   // irritant    -> irrit
+		{ "ement", false },   // replacement -> replac
+		{ "ment",  false },   // adjustment  -> adjust
+		{ "ent",   false },   // dependent   -> depend
+		{ "ion",   true  },   // adoption    -> adopt
+		{ "ou",    false },   // homologou   -> homolog
+		{ "ism",   false },   // communism   -> commun
+		{ "ate",   false },   // activate    -> activ
+		{ "iti",   false },   // angulariti  -> angular
+		{ "ous",   false },   // homologous  -> homolog
+		{ "ive",   false },   // effective   -> effect
+		{ "ize",   false },   // bowdlerize  -> bowdler
+		};
+
+	auto begPtr = word.begin( );
+	auto endPtr = begPtr + ( word.size( ) - 1 );
+
+	for ( const SuffixRule & rule : rules )
+		{
+		const auto window = static_cast< std::string::difference_type >( rule.suffix.size( ) + 1 );
+		auto match = findPrev( rule.suffix, endPtr, begPtr + word.size( ) - window );
+
+		// An iterator pointing at '\0' means the suffix was not found
+		if ( *match == '\0' )
+			{
+			continue;
+			}
+		if ( rule.needsSOrTBefore && *( match - 1 ) != 's' && *( match - 1 ) != 't' )
+			{
+			continue;
+			}
+
+		string wordStem = subStr( word.begin( ), match );
+		return wordStem;
+		}
+
+	return word;
+	}
+
+/**
+ * Step 5a: removes a final 'e'.
+ *
+ * The 'e' is dropped when the measure of the word is greater than 1
+ * (probate -> probat), or when the measure is exactly 1 and endCVC( word )
+ * is false (cease -> ceas).
+ *
+ * Words that do not end in 'e' (including the empty word) are returned
+ * unchanged.
+ *
+ * @param word word to stem
+ * @return word with the trailing 'e' removed where the rule applies
+ */
+std::string Stemmer::step5a ( std::string word )
+	{
+	// Testing emptiness first avoids dereferencing begin( ) + ( size - 1 )
+	if ( word.empty( ) || word.back( ) != 'e' )
+		{
+		return word;
+		}
+
+	const int m = measure( word );
+	if ( m > 1 || ( m == 1 && !endCVC( word ) ) )
+		{
+		word = subStr( word, 0, word.size( ) - 1 );
+		}
+	return word;
+	}
+
+/**
+ * Step 5b: drops the last letter of a word that is longer than two
+ * characters, has a measure greater than 1 and ends in "ll".
+ *
+ * @param word word to stem
+ * @return word with the final 'l' removed where the rule applies
+ */
+std::string Stemmer::step5b ( std::string word )
+	{
+	const std::string::size_type length = word.size( );
+	if ( length <= 2 )
+		{
+		return word;
+		}
+
+	if ( measure( word ) > 1 && word[ length - 1 ] == 'l' && word[ length - 2 ] == 'l' )
+		{
+		word = subStr( word, 0, length - 1 );
+		}
+	return word;
+	}
\ No newline at end of file
--- a/util/Stemmer.h
+++ b/util/Stemmer.h
-//
-// Created by Veronica Day on 2/22/18.
-//
-

 #pragma once

+#include <string>
+#include "stringProcessing.h"
+
+/**
+ * Modeled after the Porter Stemmer algorithm
+ * http://snowball.tartarus.org/algorithms/porter/stemmer.html
+ *
+ * All string types are spelled std::string so this header does not rely
+ * on a using-directive from another header.
+ */
 class Stemmer
 	{
+public:
+
+	/**
+	 * Stemmer Cstor
+	 */
+	Stemmer ( );
+
+	/**
+	 * Returns the stem of a word by applying steps 1a through 5b in order
+	 *
+	 * @param word word to stem
+	 * @return the stemmed word
+	 */
+	std::string execute ( std::string word );
+
+private:
+
+	/**
+	 * Number of vowel-run / consonant-run pairs in the word
+	 *
+	 * <c><v>       -> 0
+	 * <c>vc<v>     -> 1
+	 * <c>vcvc<v>   -> 2
+	 * <c>vcvcvc<v> -> 3
+	 *
+	 * @param word word to measure
+	 * @return the measure
+	 */
+	int measure ( std::string word );
+
+	/**
+	 * Check if a vowel is present in [wordBeg, wordEnd)
+	 *
+	 * @param wordBeg first position to examine
+	 * @param wordEnd one past the last position to examine
+	 * @param word    the word (passed by value)
+	 * @return true if a vowel is found
+	 */
+	bool isVowelPresent ( std::string::iterator wordBeg, std::string::iterator wordEnd, std::string word );
+
+	/**
+	 * Return true if wordIt points to a consonant
+	 *
+	 * @param wordIt    position of the letter to classify
+	 * @param wordBegin start of the word wordIt belongs to
+	 * @return true for a consonant, false for a vowel
+	 */
+	bool isConsonant ( std::string::iterator wordIt, std::string::iterator wordBegin );
+
+	/**
+	 * Returns true if an 'e' should be added to the end
+	 * (stem ends in AT, BL or IZ)
+	 *
+	 * @param word stem to inspect
+	 * @return true if 'e' should be appended
+	 */
+	bool addE ( std::string word );
+
+	/**
+	 * Returns true if word ends in a double consonant
+	 * Not LL, SS, ZZ
+	 *
+	 * @param word stem to inspect
+	 * @return true if the word ends in a qualifying double letter
+	 */
+	bool doubleCon ( std::string word );
+
+	/**
+	 * Returns true if a word ends in a
+	 * Consonant, Vowel, Consonant pattern
+	 * Except when second C is W, X, or Y
+	 *
+	 * @param word word to inspect
+	 * @return true if the pattern is present
+	 */
+	bool endCVC ( std::string word );
+
+	/**
+	 * Step 1a: stem plural words
+	 *
+	 * @param word word to stem
+	 * @return word after step 1a
+	 */
+	std::string step1a ( std::string word );
+
+	/**
+	 * Step 1b: stem EED, ED and ING
+	 *
+	 * @param word word to stem
+	 * @return word after step 1b
+	 */
+	std::string step1b ( std::string word );
+
+	/**
+	 * Step 1c: checks for Y -> I
+	 *
+	 * @param word word to stem
+	 * @return word after step 1c
+	 */
+	std::string step1c ( std::string word );
+
+	/**
+	 * Step 2
+	 *
+	 * @param word word to stem
+	 * @return word after step 2
+	 */
+	std::string step2 ( std::string word );
+
+	/**
+	 * Step 3
+	 *
+	 * @param word word to stem
+	 * @return word after step 3
+	 */
+	std::string step3 ( std::string word );
+
+	/**
+	 * Step 4
+	 *
+	 * @param word word to stem
+	 * @return word after step 4
+	 */
+	std::string step4 ( std::string word );
+
+	/**
+	 * Step 5a
+	 *
+	 * @param word word to stem
+	 * @return word after step 5a
+	 */
+	std::string step5a ( std::string word );
+
+	/**
+	 * Step 5b
+	 *
+	 * @param word word to stem
+	 * @return word after step 5b
+	 */
+	std::string step5b ( std::string word );

 	};


--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
+
+#include "Tokenizer.h"
+
+/**
+ * Tokenizer Cstor: allocates the (initially empty) docIndex dictionary
+ * on the heap.
+ */
+Tokenizer::Tokenizer ( )
+		: docIndex( new unordered_map< string, vector< int>> )
+	{
+	}
+
+/**
+ * Returns the pointer to the docIndex dictionary that the constructor
+ * allocated and that execute( ) fills in.
+ *
+ * NOTE(review): this hands out the raw pointer itself, not a copy of the
+ * map; who is expected to delete it is not visible here - confirm.
+ *
+ * @return pointer to unordered_map< string, vector< int>>
+ */
+unordered_map< string, vector< int>> *Tokenizer::get ( ) const
+	{
+	return docIndex;
+	}
+
+/**
+ * Executes the Tokenizer
+ * Sends tokens to dictionary
+ *
+ * The text is split on spaces. Each piece is lower-cased and stripped;
+ * pieces that end up empty and stop words are skipped. Every remaining
+ * piece is stemmed and its offset is appended to the dictionary entry of
+ * the stem. The offset only advances for pieces that are stored.
+ *
+ * token -> [offsets]
+ * @param originalText text to tokenize
+ * @param offset offset assigned to the first stored token
+ */
+void Tokenizer::execute ( string & originalText, int offset )
+	{
+	vector< string > splitText = splitStr( originalText, ' ' );
+	for ( string & rawToken : splitText )
+		{
+		// case fold
+		string processedString = toLower( rawToken );
+		// strip all characters
+		processedString = stripStr( processedString );
+
+		// Nothing left after stripping: there is no word to stem or index
+		if ( processedString.empty( ) || isStopWord( processedString ) )
+			{
+			continue;
+			}
+
+		// stem word
+		processedString = stem.execute( processedString );
+		( *docIndex )[ processedString ].push_back( offset );
+		++offset;
+		}
+	}
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
-//
-// Created by anvia on 1/31/2018.
-//
+
 #pragma once
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "stringProcessing.h"
+#include "Stemmer.h"

 using namespace std;

 class Tokenizer
 	{
-	public:
-		Tokenizer ( )
-			{
-			docIndex = new unordered_map< string, vector< int>>;
-			}
-
-		unordered_map< string, vector< int>> *get ( ) const
-			{
-			return docIndex;
-			}
-
-		void execute ( string originalText, int offset )
-			{
-			vector< string > splitText = splitStr ( originalText, ' ' );
-			string lowerString = "";
-			for ( int i = 0; i < splitText.size ( ); ++i )
-				{
-				lowerString = toLower ( splitText[ i ] );
-				if ( !isStopWord ( lowerString ) )
-					{
-					( *docIndex )[ lowerString ].push_back ( offset );
-					++offset;
-					}
-				}
-			}
+
+public:
+
+	/**
+	 * Tokenizer Cstor
+	 */
+	Tokenizer ( );
+
+	/**
+	 * Returns pointer to the docIndex dictionary
+	 *
+	 * @return pointer to unordered_map< string, vector< int>>
+	 */
+	unordered_map< string, vector< int>> *get ( ) const;
+
+	/**
+	 * Executes the Tokenizer
+	 * Sends tokens to dictionary
+	 *
+	 * token -> [offsets]
+	 * @param originalText text to tokenize (now taken by reference)
+	 * @param offset offset recorded for the first token that is stored
+	 */
+	void execute ( string &originalText, int offset );

 	private:
 		unordered_map< string, vector< int>> *docIndex; // token -> offsets; NOTE(review): no destructor is declared in this class, so ownership of this pointer presumably passes to whoever calls get( ) - confirm
+		Stemmer stem; // stems each token before it is stored
 	};
--- a/util/stringProcessing.cpp
+++ b/util/stringProcessing.cpp
@@ -3,158 +3,380 @@
 //

 #include "stringProcessing.h"
+#include "Stemmer.h"
+#include <cassert>

 using namespace std;

-
-string::iterator findStr (string needle, string haystack )
+/**
+ * Finds the needle in the haystack
+ * returns position of first match
+ *
+ * Scanning stops at the first '\0' read through the iterator, so the end of
+ * the haystack is detected by dereferencing its end position rather than by
+ * comparing against haystack.end( ).
+ *
+ * NOTE(review): haystack is taken by value, so the returned iterator points
+ * into this function's local copy, which is destroyed on return — confirm
+ * whether callers can be moved to a by-reference search.
+ * NOTE(review): after a failed partial match the scan resumes where the
+ * partial match ended, so an occurrence that overlaps it (e.g. "aab" in
+ * "aaab") looks like it would be missed — verify.
+ *
+ * @param needle string to search for
+ * @param haystack string to search in
+ * @return string::iterator to the first character of the match, or to the
+ *         position where '\0' was read when there is no match
+ */
+string::iterator findStr ( string needle, string haystack )
 	{

-	auto beginNeedle = needle.begin ( );
-	auto beginHaystack = haystack.begin();
+	auto beginNeedle = needle.begin( );
+	auto beginHaystack = haystack.begin( );

 	while ( *beginHaystack != '\0' )
-	{
+		{
 		//keep looking for instance of a match
 		if ( *beginHaystack != *beginNeedle )
-		{
+			{
 			++beginHaystack;
-		}
+			}

 		else if ( *beginHaystack == *beginNeedle )
-		{
+			{
 			/* want to keep the original iterator where it is so it
 				can return the beginning of the matched word if found */
 			auto temp = beginHaystack;
 			while ( *temp == *beginNeedle )
-			{
+				{
 				++temp;
 				++beginNeedle;
 				//if it hits the end of the needleing, it signifies an exact match
 				if ( *beginNeedle == '\0' )
-				{
+					{
 					//this is pointing at the beginning of the match
 					return beginHaystack;
-				}
+					}

-			}
+				}
 			//need to reset because still has to search rest of the string for a match
-			beginNeedle = needle.begin ( );
+			beginNeedle = needle.begin( );
 			//sets the original text pointer to where the last search left off
 			beginHaystack = temp;
-		}
+			}

 		else
-		{
+			{
 			//DO NOTHING
+			}
 		}
-	}

 	return beginHaystack;

 	}

-
-string::iterator findNext (string needle, string::iterator haystackPointer )
+/**
+ * Finds the next position of the needle in the string
+ *
+ * Starts at haystackPointer and walks forward until the needle is matched or
+ * a '\0' is read through the iterator, which is how the end of the string
+ * is detected.
+ *
+ * NOTE(review): after a failed partial match the scan resumes where the
+ * partial match ended, so an occurrence overlapping it looks like it would
+ * be missed — verify.
+ *
+ * @param needle string to search for
+ * @param haystackPointer position in the caller's string to start from
+ * @return string::iterator to the first character of the match, or to the
+ *         position where '\0' was read when there is no match
+ */
+string::iterator findNext ( string needle, string::iterator haystackPointer )
 	{
-	auto beginNeedle = needle.begin ( );
-	auto beginHaystack = haystackPointer;
-	while ( *beginHaystack != '\0' )
-	{
-		//keep looking for instance of a match
-		if ( *beginHaystack != *beginNeedle )
+	auto beginNeedle = needle.begin( );
+	while ( *haystackPointer != '\0' )
 		{
-			++beginHaystack;
-		}
+		//keep looking for instance of a match
+		if ( *haystackPointer != *beginNeedle )
+			{
+			++haystackPointer;
+			}

-		else if ( *beginHaystack == *beginNeedle )
-		{
+		else if ( *haystackPointer == *beginNeedle )
+			{
 			/* want to keep the original iterator where it is so it
 				can return the beginning of the matched word if found */
-			auto temp = beginHaystack;
+			auto temp = haystackPointer;
 			while ( *temp == *beginNeedle )
-			{
+				{
 				++temp;
 				++beginNeedle;
 				//if it hits the end of the needleing, it signifies an exact match
 				if ( *beginNeedle == '\0' )
-				{
+					{
 					//this is pointing at the beginning of the match
-					return beginHaystack;
-				}
+					return haystackPointer;
+					}

-			}
+				}
 			//need to reset because still has to search rest of the string for a match
-			beginNeedle = needle.begin ( );
+			beginNeedle = needle.begin( );
 			//sets the original text pointer to where the last search left off
-			beginHaystack = temp;
-		}
+			haystackPointer = temp;
+			}

 		else
-		{
+			{
 			//DO NOTHING
+			}
 		}
-	}

-	return beginHaystack;
+	return haystackPointer;
 	}

+/**
+ * Finds the previous position of the needle in the string
+ *
+ * Walks backwards from haystackPointer toward haystackBeg, matching the
+ * needle from its last character to its first.
+ *
+ * NOTE(review): the "not found" result is needle.end( ), an iterator into
+ * this function's by-value copy of needle, which is destroyed on return;
+ * callers that dereference it are reading released storage — confirm and
+ * consider returning a position in the haystack instead.
+ * NOTE(review): for an empty needle, endNeedle is computed as
+ * begin( ) + ( 0 - 1 ), which lies outside the needle — verify this is
+ * acceptable.
+ *
+ * @param needle string to search for
+ * @param haystackPointer position in the caller's string to start from
+ * @param haystackBeg first position of the caller's string; the walk stops here
+ * @return string::iterator to the first character of the match, or
+ *         needle.end( ) when there is no match
+ */
+string::iterator findPrev ( string needle, string::iterator haystackPointer, string::iterator haystackBeg )
+	{
+	auto begNeedle = needle.begin( );
+	auto endNeedle = begNeedle + ( needle.size( ) - 1 );

+	while ( haystackPointer != haystackBeg )
+		{
+		//keep looking for instance of a match
+		if ( *haystackPointer != *endNeedle )
+			{
+			--haystackPointer;
+			}
+
+		else if ( *haystackPointer == *endNeedle )
+			{
+			/* want to keep the original iterator where it is so it
+				can return the beginning of the matched word if found */
+			auto temp = haystackPointer;
+			while ( *temp == *endNeedle )
+				{
+				//if the first character of the needle has been reached, it signifies an exact match
+				if ( endNeedle == begNeedle && *temp == *endNeedle )
+					{
+					//this is pointing at the beginning of the match
+					return temp;
+					}
+
+				if ( temp != haystackBeg )
+					{
+					--temp;
+					}
+				if ( endNeedle != begNeedle )
+					{
+					--endNeedle;
+					}

-vector< string > splitStr ( string originalText, char delim )
+				}
+			//need to reset because still has to search rest of the string for a match
+			endNeedle = begNeedle + ( needle.size( ) - 1 );
+			//sets the original text pointer to where the last search left off
+			haystackPointer = temp;
+			}
+
+		else
+			{
+			//DO NOTHING
+			}
+		}
+
+	return needle.end( );
+	}
+
+/**
+ * Returns a vector of strings from @originalText, split by @delim
+ *
+ * Consecutive delimiters produce empty strings in the result; a single
+ * trailing delimiter does not add a trailing empty string.
+ *
+ * NOTE(review): the end of the text is detected by reading '\0' through the
+ * iterator, and the iterator is advanced once more after every word. When
+ * the text does not end with @delim, that advance steps past the end of the
+ * string before the next '\0' check — looks like an out-of-bounds read;
+ * verify.
+ *
+ * @param originalText text to split
+ * @param delim character to split on
+ * @return vector < string >
+ */
+vector< string > splitStr ( string & originalText, char delim )
 	{
 	vector< string > splitWords;
-	auto begin = originalText.begin ( );
+	auto begin = originalText.begin( );

 	while ( *begin != '\0' )
-	{
+		{
 		string word = "";
 		while ( *begin != delim && *begin != '\0' )
-		{
+			{
 			word += *begin;
 			++begin;
-		}
+			}

-		splitWords.push_back ( word );
+		splitWords.push_back( word );
 		++begin;
-	}
+		}

 	return splitWords;

 	}

-
-bool isStopWord ( string word )
+/**
+ * Returns true if @word is a stopword
+ *
+ * The lookup is an exact match against the stopWords set, so lowercase
+ * @word first if case should not matter.
+ *
+ * @param word candidate word; it is only read
+ * @return bool
+ */
+bool isStopWord ( string & word )
 	{
-	return ( stopWords.find ( word ) != stopWords.end ( ) );
+	return ( stopWords.find( word ) != stopWords.end( ) );

 	}

-
-string toLower ( string word )
+/**
+ * Returns lowercase @word
+ *
+ * Only 'A'-'Z' are converted (by adding 32); every other character is
+ * copied unchanged. Copying stops at the first '\0' read through the
+ * iterator. @word itself is not modified.
+ *
+ * @param word string to convert
+ * @return string
+ */
+string toLower ( string & word )
 	{
-	auto iter = word.begin ( );
+	auto iter = word.begin( );
 	string lowerWord = "";
 	while ( *iter != '\0' )
-	{
-		if ( *iter >= 'A' && *iter <= 'Z' )
 		{
+		if ( *iter >= 'A' && *iter <= 'Z' )
+			{
 			lowerWord += ( *iter + 32 );
-		}
+			}

 		else
-		{
+			{
 			lowerWord += *iter;
-		}
+			}
 		++iter;
-	}
+		}

 	return lowerWord;
 	}

+/**
+ * Returns stemmed @word
+ *
+ * A new Stemmer is constructed on every call. @word itself is overwritten
+ * with the stemmed form, and a copy of it is returned.
+ *
+ * @param word word to stem; modified in place
+ * @return string
+ */
+string stemWord ( string & word )
+	{
+	Stemmer stemmer;
+	word = stemmer.execute( word );
+	return word;
+	}

+/**
+ * Returns the substring of @word covering positions [ pos, pos + len )
+ *
+ * Unlike string::substr, the range is not clamped to the end of @word:
+ * every position is read with string::at, so a range that runs past the
+ * end throws std::out_of_range.
+ *
+ * @param word string to copy from
+ * @param pos index of the first character to copy
+ * @param len number of characters to copy
+ * @return string
+ */
+string subStr ( string & word, size_t pos, size_t len )
+	{
+	string substr = "";
+	// the index is size_t to match len; a signed counter would be converted
+	// for the comparison and could overflow on very large lengths
+	for ( size_t i = 0; i < len; ++i )
+		{
+		substr += word.at( pos + i );
+		}
+	return substr;
+	}

-string stemWord(string word)
+/**
+ * Returns a substring [ begin, end )
+ *
+ * @param begin position of the first character to copy
+ * @param end position one past the last character to copy
+ * @return string
+ */
+string subStr ( string::iterator begin, string::iterator end )
 	{
-	return "";
+	string substr = "";
+	while ( begin != end )
+		{
+		substr += *begin;
+		++begin;
+		}
+	return substr;
+	}
+
+/**
+ * Removes the chars in vector from word
+ *
+ * Every character of @word that equals one of @chars is dropped; all other
+ * characters are kept in their original order. @word is not modified.
+ *
+ * @param word string to filter
+ * @param chars characters to remove
+ * @return string
+ */
+string stripStr ( string & word, vector< char > chars )
+	{
+	string wordStripped = "";
+
+	for ( char ch : word )
+		{
+		// decided per character: a flag carried across iterations would
+		// drop everything after the first stripped character
+		bool isSymbol = false;
+		for ( char symbol : chars )
+			{
+			if ( ch == symbol )
+				{
+				isSymbol = true;
+				break;
+				}
+			}
+		if ( !isSymbol )
+			{
+			wordStripped += ch;
+			}
+		}
+	return wordStripped;
+	}
+
+/**
+ * Removes every character that is not a letter or a digit from word
+ * Assumes word is lowercase
+ *
+ * @param word string to filter; it is not modified
+ * @return string containing only the characters accepted by isAlpha or isNum
+ */
+string stripStr ( string & word )
+	{
+	string wordStripped = "";
+	auto begin = word.begin( );
+
+	while ( begin != word.end( ) )
+		{
+		if ( isAlpha( *begin ) || isNum( *begin ) )
+			{
+			wordStripped += *begin;
+			}
+		++begin;
+		}
+	return wordStripped;
 	}
+
+/**
+ * Returns true if character is a letter
+ *
+ * @param ch character to classify
+ * @return bool true for 'A'-'Z' and 'a'-'z', false otherwise
+ */
+bool isAlpha ( char ch )
+	{
+	const bool isUpper = ( 'A' <= ch && ch <= 'Z' );
+	const bool isLower = ( 'a' <= ch && ch <= 'z' );
+	return isUpper || isLower;
+	}
+
+/**
+ * Returns true if character is a digit
+ *
+ * @param ch character to classify
+ * @return bool true for '0'-'9', false otherwise
+ */
+bool isNum ( char ch )
+	{
+	return '0' <= ch && ch <= '9';
+	}
\ No newline at end of file
--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -14,57 +14,127 @@ using namespace std;
 /**
 * Set of stopwords
 */
+// Entries must be lowercase with no surrounding whitespace: isStopWord does an exact match
+static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few",
+                                   "from",
+                                   "for", "have", "he", "her", "here", "him", "his", "how",
+                                   "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our",
+                                   "she",
+                                   "some", "the", "their", "them", "there", "they", "that",
+                                   "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will",
+                                   "with",
+                                   "you", "your" };

-
- static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
-                            "for", "have", "he", "her", "here", "him", "his", "how",
-                            "i", "in", "is", "it", "its", "many ", "me", "my", "none", "of", "on", "or", "our", "she",
-                            "some", "the", "their", "them", "there", "they", "that",
-                            "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
-                            "you", "your" };
 /**
 * Finds the needle in the haystack
 * returns position of first match
+ *
 * @param haystack
 * @param needle
 * @return string::iterator
 */
-string::iterator findStr (string needle, string haystack );
+string::iterator findStr ( string needle, string haystack );

 /**
 * Finds the next position of the needle in the string
+ *
 * @param needle
 * @param pointer
 * @return string::iterator
 */
-string::iterator findNext (string needle, string::iterator haystackPointer );
+string::iterator findNext ( string needle, string::iterator haystackPointer );

+/**
+ * Finds the previous position of the needle in the string
+ *
+ * @param needle
+ * @param haystackPointer
+ * @param haystackBeg
+ * @return string::iterator
+ */
+string::iterator findPrev ( string needle, string::iterator haystackPointer, string::iterator haystackBeg );

 /**
 * Returns a vector of strings from @originalText, split by @delim
+ *
 * @param originalText
 * @param delim
- * @return
+ * @return vector< string >
 */
-vector< string > splitStr ( string originalText, char delim );
+vector< string > splitStr ( string & originalText, char delim );
+
 /**
 * Returns true if @word is a stopword
+ *
 * @param word
- * @return
+ * @return bool
 */
-bool isStopWord ( string word );
+bool isStopWord ( string & word );
+
 /**
 * Returns lowercase @word
+ *
 * @param word
- * @return
+ * @return string
 */
-string toLower ( string word );
+string toLower ( string & word );

-//TODO
 /**
 * Returns stemmed @word
+ *
 * @param word
- * @return
+ * @return string
+ */
+string stemWord ( string & word );
+
+/**
+ * Returns the substring of @word covering positions [ pos, pos + len )
+ *
+ * @param word
+ * @param pos
+ * @param len
+ * @return string
+ */
+string subStr ( string & word, size_t pos, size_t len );
+
+/**
+ * Returns a substring [ begin, end )
+ *
+ * @param begin
+ * @param end
+ * @return string
 */
-string stemWord(string word);
+string subStr ( string::iterator begin, string::iterator end );

+/**
+ * Removes the chars in vector from word
+ *
+ * @param word
+ * @param chars
+ * @return string
+ */
+string stripStr ( string & word, vector< char > chars );
+
+/**
+ * Removes every character that is not a letter or a digit from word
+ * Assumes word is lowercase
+ *
+ * @param word
+ * @return string
+ */
+string stripStr ( string & word );
+
+/**
+ * Returns true if character is a letter
+ *
+ * @param ch
+ * @return bool
+ */
+bool isAlpha ( char ch );
+
+/**
+ * Returns true if character is a number
+ *
+ * @param ch
+ * @return bool
+ */
+bool isNum ( char ch );
\ No newline at end of file
--- a/util/tests/stemmerTest.cpp
+++ b/util/tests/stemmerTest.cpp
-//
-// Created by Veronica Day on 2/22/18.
-//
+
+#include <string>
+#include <vector>
+#include "../Stemmer.h"
+#include <iostream>
+#include <cassert>
+
+int main ( )
+	{
+
+	cout << "Beginning testing for Stemmer" << endl;
+
+	Stemmer stem;
+
+	assert ( stem.execute( "caresses" ) == "caress" );
+	assert ( stem.execute( "ponies" ) == "poni" );
+	assert ( stem.execute( "ties" ) == "ti" );
+	assert ( stem.execute( "caress" ) == "caress" );
+	assert ( stem.execute( "cats" ) == "cat" );
+	assert ( stem.execute( "feed" ) == "feed" );
+
+	assert ( stem.execute( "agreed" ) == "agre" );
+	assert ( stem.execute( "plastered" ) == "plaster" );
+	assert ( stem.execute( "bled" ) == "bled" );
+	assert ( stem.execute( "motoring" ) == "motor" );
+	assert ( stem.execute( "conflated" ) == "conflat" );
+
+	assert ( stem.execute( "troubled" ) == "troubl" );
+	assert ( stem.execute( "sized" ) == "size" );
+	assert ( stem.execute( "hopping" ) == "hop" );
+	assert ( stem.execute( "tanning" ) == "tan" );
+	assert ( stem.execute( "tanned" ) == "tan" );
+
+	assert ( stem.execute( "falling" ) == "fall" );
+	assert ( stem.execute( "hissing" ) == "hiss" );
+	assert ( stem.execute( "fizzed" ) == "fizz" );
+	assert ( stem.execute( "failing" ) == "fail" );
+	assert ( stem.execute( "filing" ) == "file" );
+
+	assert ( stem.execute( "happy" ) == "happi" );
+	assert ( stem.execute( "sky" ) == "sky" );
+	assert ( stem.execute( "relational" ) == "relat" );
+	assert ( stem.execute( "conditional" ) == "condit" );
+	assert ( stem.execute( "rational" ) == "ration" );
+
+	assert ( stem.execute( "valenci" ) == "valenc" );
+	assert ( stem.execute( "hesitanci" ) == "hesit" );
+	assert ( stem.execute( "digitizer" ) == "digit" );
+	assert ( stem.execute( "conformabli" ) == "conform" );
+	assert ( stem.execute( "radicalli" ) == "radic" );
+	assert ( stem.execute( "differentli" ) == "differ" );
+
+	assert ( stem.execute( "vileli" ) == "vile" );
+	assert ( stem.execute( "analogousli" ) == "analog" );
+	assert ( stem.execute( "vietnamization" ) == "vietnam" );
+	assert ( stem.execute( "predication" ) == "predic" );
+	assert ( stem.execute( "operator" ) == "oper" );
+	assert ( stem.execute( "feudalism" ) == "feudal" );
+
+	assert ( stem.execute( "decisiveness" ) == "decis" );
+	assert ( stem.execute( "hopefulness" ) == "hope" );
+	assert ( stem.execute( "callousness" ) == "callous" );
+	assert ( stem.execute( "formaliti" ) == "formal" );
+	assert ( stem.execute( "sensitiviti" ) == "sensit" );
+	assert ( stem.execute( "sensibiliti" ) == "sensibl" );
+
+	assert ( stem.execute( "triplicate" ) == "triplic" );
+	assert ( stem.execute( "formative" ) == "form" );
+	assert ( stem.execute( "formalize" ) == "formal" );
+	assert ( stem.execute( "electriciti" ) == "electr" );
+	assert ( stem.execute( "electrical" ) == "electr" );
+	assert ( stem.execute( "hopeful" ) == "hope" );
+
+	assert ( stem.execute( "goodness" ) == "good" );
+	assert ( stem.execute( "revival" ) == "reviv" );
+	assert ( stem.execute( "allowance" ) == "allow" );
+	assert ( stem.execute( "inference" ) == "infer" );
+
+	assert ( stem.execute( "airliner" ) == "airlin" );
+	assert ( stem.execute( "gyroscopic" ) == "gyroscop" );
+	assert ( stem.execute( "adjustable" ) == "adjust" );
+	assert ( stem.execute( "defensible" ) == "defens" );
+	assert ( stem.execute( "irritant" ) == "irrit" );
+	assert ( stem.execute( "replacement" ) == "replac" );
+	assert ( stem.execute( "adjustment" ) == "adjust" );
+	assert ( stem.execute( "dependent" ) == "depend" );
+
+	assert ( stem.execute( "adoption" ) == "adopt" );
+	assert ( stem.execute( "homologou" ) == "homolog" );
+	assert ( stem.execute( "communism" ) == "commun" );
+	assert ( stem.execute( "activate" ) == "activ" );
+	assert ( stem.execute( "angulariti" ) == "angular" );
+	assert ( stem.execute( "homologous" ) == "homolog" );
+	assert ( stem.execute( "effective" ) == "effect" );
+
+	assert ( stem.execute( "bowdlerize" ) == "bowdler" );
+	assert ( stem.execute( "probate" ) == "probat" );
+	assert ( stem.execute( "cease" ) == "ceas" );
+	assert ( stem.execute( "controll" ) == "control" );
+	assert ( stem.execute( "roll" ) == "roll" );
+	assert ( stem.execute( "university" ) == "univers" );
+	assert ( stem.execute( "example" ) == "exampl" );
+
+	assert ( stem.execute( "do" ) == "do" );
+	assert ( stem.execute( "you" ) == "you" );
+	assert ( stem.execute( "really" ) == "real" );
+	assert ( stem.execute( "weakness" ) == "weak" );
+	assert ( stem.execute( "yields" ) == "yield" );
+	assert ( stem.execute( "temptation" ) == "temptat" );
+	assert ( stem.execute( "are" ) == "ar" );
+	assert ( stem.execute( "terrible" ) == "terribl" );
+
+
+	cout << "\nTests passed for Stemmer :D" << endl;
+
+	}

--- a/util/tests/stringProcessingTest.cpp
+++ b/util/tests/stringProcessingTest.cpp
-//
-// Created by Veronica Day on 2/13/18.
-//

 #include <string>
 #include <vector>
 #include "../stringProcessing.h"
+#include "../Stemmer.h"
 #include <iostream>
 #include <cassert>

@@ -12,76 +10,163 @@ using namespace std;

 void testFindStr ( string original );

+void testFindNext ( );
+
+void testFindPrev ( );
+
 void testSplitStr ( string original );

+void testIsStopWord ( );
+
 void testToLower ( );

-void testIsStopWord ( );
+void testStemWord ( );
+
+void testSubStr ( );
+
+void testStripStr ( );
+
+void testIsAlpha ( );
+
+void testIsNum ( );

 int main ( )
 	{

-	cout << "Beginning testing for StringProcessing_unit" << endl << endl;
+	cout << "Beginning testing for StringProcessing" << endl << endl;

 	string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. "
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";

-	testFindStr ( original );
-	testSplitStr ( original );
-	testToLower ( );
-	testIsStopWord ( );
+	testFindStr( original );
+	testFindNext( );
+	testFindPrev( );
+	testSplitStr( original );
+	testIsStopWord( );
+	testToLower( );
+	testStemWord( );
+	testSubStr( );
+	testStripStr( );
+	testIsAlpha( );
+	testIsNum( );

-	cout << "\nTests passed for StringProcessing_unit :D" << endl;
+	cout << "\nTests passed for StringProcessing :D" << endl;

 	}

 void testFindStr ( string original )
 	{
 	cout << "Testing findStr..." << endl;
-	assert( *findStr ( original, "established" ) == 'e' );
-	assert( *findStr ( original, "Lorem Ipsum" ) == 'L' );
+	assert( *findStr( "established", original ) == 'e' );
+	assert( *findStr( "Lorem Ipsum", original ) == 'L' );

 	string title = "<title> This is a test </title>";
-	auto word = findStr ( title, "<title>" );
+	auto word = findStr( "<title>", title );
 	assert( *word == '<' );
-	auto titleIt = title.begin ( );
-	while ( word != title.end ( ) && titleIt != title.end ( ) )
+	auto titleIt = title.begin( );
+	while ( word != title.end( ) && titleIt != title.end( ) )
 		{
 		assert( *word == *titleIt );
 		++word;
 		++titleIt;
 		}

-	auto word1 = findStr ( title, "</title>" );
+	auto word1 = findStr( "</title>", title );
 	assert( *word1 == '<' && *( word1 + 1 ) == '/' );
-	assert( *findStr ( original, "</title>" ) == '\0' );
-	assert( *findStr ( original, "orange" ) == '\0' );
-	assert( *findStr ( "apple", "orange" ) == '\0' );
-	auto word2 = findStr ( "bigbird", "bird" );
+	assert( *findStr( "</title>", original ) == '\0' );
+	assert( *findStr( "orange", original ) == '\0' );
+	assert( *findStr( "orange", "apple" ) == '\0' );
+	auto word2 = findStr( "bird", "bigbird" );
 	assert( *word2 == 'b' && *( word2 + 1 ) == 'i' && *( word2 + 2 ) == 'r' );

-	cout << "testFindStr passed" << endl;
+	cout << "testFindStr passed" << endl << endl;
+
+	}
+
+void testFindNext ( )
+	{
+	cout << "Testing findNext..." << endl;
+
+	string racecar = "racecar";
+	string hello = "hello";
+	string blank = "";
+
+	assert ( *findNext( "race", racecar.begin( ) ) == 'r' );
+	assert ( *findNext( "race", racecar.begin( ) + 4 ) == '\0' );
+	assert ( *findNext( "car", racecar.begin( ) + 4 ) == 'c' );
+
+	assert ( *findNext( "hello", hello.begin( ) ) == 'h' );
+	assert ( *findNext( "ello", hello.begin( ) ) == 'e' );
+	assert ( *findNext( "ello", hello.begin( ) + 2 ) == '\0' );
+
+	assert ( *findNext( "", blank.begin( ) ) == '\0' );
+
+	cout << "testFindNext passed" << endl << endl;

 	}

+void testFindPrev ( )
+	{
+	cout << "Testing findPrev..." << endl;
+
+	// NOTE(review): every '\0' expectation below dereferences findPrev's
+	// not-found result, which is the end( ) of a string local to findPrev;
+	// these pass only if that released byte still reads as '\0' — verify
+	string racecar = "racecar";
+	string hello = "hello";
+	string blank = "";
+
+	assert ( *findPrev( "race", racecar.begin( ), racecar.begin( ) ) == '\0' );
+	assert ( *findPrev( "race", racecar.begin( ) + 4, racecar.begin( ) ) == 'r' );
+	assert ( *findPrev( "car", racecar.begin( ) + 4, racecar.begin( ) ) == '\0' );
+	assert ( *findPrev( "car", racecar.begin( ) + 7, racecar.begin( ) ) == 'c' );
+
+	assert ( *findPrev( "hello", hello.begin( ), hello.begin( ) ) == '\0' );
+	assert ( *findPrev( "ello", hello.begin( ) + 3, hello.begin( ) ) == '\0' );
+	assert ( *findPrev( "ello", hello.begin( ) + 5, hello.begin( ) ) == 'e' );
+
+	assert ( *findPrev( "", blank.begin( ), blank.begin( ) ) == '\0' );
+
+	string fall = "fall";
+	assert ( *findPrev( "bl", fall.begin( ) + 3, fall.begin( ) ) == '\0' );
+
+	cout << "testFindPrev passed" << endl << endl;
+
+	}

 void testSplitStr ( string original )
 	{
 	cout << "Testing splitStr..." << endl;

-	vector< string > vec = splitStr ( original, ' ' );
-	assert( vec.size ( ) == 53 );
+	vector< string > vec = splitStr( original, ' ' );
+	assert( vec.size( ) == 53 );

 	string word = "hello\ngoodbye";
-	vec = splitStr ( word, '\n' );
-	assert( vec.size ( ) == 2 );
+	vec = splitStr( word, '\n' );
+	assert( vec.size( ) == 2 );
 	assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );

-	cout << "testSplitStr passed" << endl;
+	cout << "testSplitStr passed" << endl << endl;

 	}

+void testIsStopWord ( )
+	{
+	cout << "Testing isStopWord..." << endl;
+
+	string is = "is";
+	string hello = "Hello";
+	string none = "none";
+	string blank = "";
+	string blank2 = " ";
+
+	assert ( isStopWord( is ) );
+	assert ( !isStopWord( hello ) );
+	assert ( isStopWord( none ) );
+	assert ( !isStopWord( blank ) );
+	assert ( !isStopWord( blank2 ) );
+
+	cout << "testIsStopWord passed" << endl << endl;
+
+	}

 void testToLower ( )
 	{
@@ -93,11 +178,11 @@ void testToLower ( )
 	string word4 = "";
 	string word5 = " ";

-	string test = toLower ( word );
-	string test2 = toLower ( word2 );
-	string test3 = toLower ( word3 );
-	string test4 = toLower ( word4 );
-	string test5 = toLower ( word5 );
+	string test = toLower( word );
+	string test2 = toLower( word2 );
+	string test3 = toLower( word3 );
+	string test4 = toLower( word4 );
+	string test5 = toLower( word5 );

 	assert ( test == "hello" );
 	assert ( test2 == "hello" );
@@ -105,26 +190,144 @@ void testToLower ( )
 	assert ( test4 == "" );
 	assert ( test5 == " " );

-	cout << "testToLower passed" << endl;
+	cout << "testToLower passed" << endl << endl;
 	}

+void testStemWord ( )
+	{
+	// NOTE(review): despite the name, these assertions call Stemmer::execute
+	// directly; stemWord itself is not exercised here
+	cout << "Testing stemWord..." << endl;
+	Stemmer stem;

-void testIsStopWord ( )
+	assert ( stem.execute( "cats" ) == "cat" );
+	assert ( stem.execute( "wilde" ) == "wild" );
+	assert( stem.execute( "zoo" ) == "zoo" );
+	assert( stem.execute( "troublesome" ) == "troublesom" );
+
+	cout << "testStemWord passed" << endl << endl;
+	}
+
+void testSubStr ( )
 	{
-	cout << "Testing isStopWord..." << endl;
+	cout << "Testing subStr..." << endl;

-	string is = "is";
-	string hello = "Hello";
-	string none = "none";
-	string blank = "";
-	string blank2 = " ";
+	string hello = "hello";
+	string goodbye = "goodbye";
+	string blank = " ";
+	string blank2 = "";
+
+	assert ( subStr( hello, 1, 4 ) == "ello" );
+	assert ( subStr( hello, 0, 5 ) == "hello" );
+	assert ( subStr( hello, 0, 1 ) == "h" );
+	assert ( subStr( hello, 1, 2 ) == "el" );
+
+	assert ( subStr( goodbye, 0, 4 ) == "good" );
+	assert ( subStr( goodbye, 4, 3 ) == "bye" );
+	assert ( subStr( goodbye, 1, 0 ) == "" );
+	assert ( subStr( goodbye, 0, 7 ) == "goodbye" );
+
+	assert ( subStr( blank, 0, 1 ) == " " );
+	assert ( subStr( blank, 0, 0 ) == "" );
+	assert ( subStr( blank2, 0, 0 ) == "" );
+
+	assert ( subStr( hello.begin( ), hello.end( ) ) == "hello" );
+	assert ( subStr( hello.begin( ) + 4, hello.begin( ) + 5 ) == "o" );
+	assert ( subStr( hello.begin( ), hello.begin( ) + 1 ) == "h" );
+	assert ( subStr( goodbye.begin( ) + 1, goodbye.begin( ) + 3 ) == "oo" );
+
+
+	cout << "testSubStrpassed" << endl << endl;

-	assert ( isStopWord ( is ) );
-	assert ( !isStopWord ( hello ) );
-	assert ( isStopWord ( none ) );
-	assert ( !isStopWord ( blank ) );
-	assert ( !isStopWord ( blank2 ) );
+	}
+
+
+void testStripStr ( )
+	{
+	cout << "Testing stripStr..." << endl;
+
+	// characters removed by the two-argument overload
+	char arr[] = { ',', '.', '*', '&', '^', '%', ';', ' ' };
+	vector< char > chars( arr, arr + sizeof( arr ) / sizeof( arr[ 0 ] ) );
+
+	string hello = "!hello!";
+	string allSym = "\"*&^%;";
+	string comma = "comma,";
+	string period = "period.";
+	string blank = " ";
+
+	// one-argument overload: keeps only letters and digits
+	assert ( stripStr( hello ) == "hello" );
+	assert ( stripStr( allSym ) == "" );
+	assert ( stripStr( comma ) == "comma" );
+	assert ( stripStr( period ) == "period" );
+	assert ( stripStr( blank ) == "" );
+
+	// NOTE(review): in every case below no kept character follows a stripped
+	// one, so these do not show whether text after a stripped symbol survives
+	assert ( stripStr( hello, chars ) == "!hello!" );
+	assert ( stripStr( allSym, chars ) == "\"" );
+	assert ( stripStr( comma, chars ) == "comma" );
+	assert ( stripStr( period, chars ) == "period" );
+	assert ( stripStr( blank, chars ) == "" );
+
+	cout << "testStripStrpassed" << endl << endl;
+	}

-	cout << "testIsStopWord passed" << endl;
+void testIsAlpha ( )
+	{
+	cout << "Testing isAlpha..." << endl;
+
+	assert ( isAlpha( 'a' ) );
+	assert ( isAlpha( 'A' ) );
+	assert ( isAlpha( 'z' ) );
+	assert ( isAlpha( 'Z' ) );
+	assert ( isAlpha( 'g' ) );
+	assert ( isAlpha( 'i' ) );
+	assert ( isAlpha( 'P' ) );
+
+	assert ( !isAlpha( '1' ) );
+	assert ( !isAlpha( '0' ) );
+	assert ( !isAlpha( '9' ) );
+	assert ( !isAlpha( '5' ) );
+	assert ( !isAlpha( '6' ) );
+
+	assert ( !isAlpha( ' ' ) );
+	assert ( !isAlpha( '!' ) );
+	assert ( !isAlpha( '/' ) );
+	assert ( !isAlpha( '?' ) );
+	assert ( !isAlpha( '*' ) );
+	assert ( !isAlpha( '-' ) );
+	assert ( !isAlpha( '.' ) );
+	assert ( !isAlpha( ',' ) );
+	assert ( !isAlpha( '(' ) );
+	assert ( !isAlpha( '}' ) );
+
+	cout << "testIsAlpha passed" << endl << endl;
+	}

+void testIsNum ( )
+	{
+	cout << "Testing isNum..." << endl;
+
+	assert ( !isNum( 'a' ) );
+	assert ( !isNum( 'A' ) );
+	assert ( !isNum( 'z' ) );
+	assert ( !isNum( 'Z' ) );
+	assert ( !isNum( 'g' ) );
+	assert ( !isNum( 'i' ) );
+	assert ( !isNum( 'P' ) );
+
+	assert ( isNum( '1' ) );
+	assert ( isNum( '0' ) );
+	assert ( isNum( '9' ) );
+	assert ( isNum( '5' ) );
+	assert ( isNum( '6' ) );
+
+	assert ( !isNum( ' ' ) );
+	assert ( !isNum( '!' ) );
+	assert ( !isNum( '/' ) );
+	assert ( !isNum( '?' ) );
+	assert ( !isNum( '*' ) );
+	assert ( !isNum( '-' ) );
+	assert ( !isNum( '.' ) );
+	assert ( !isNum( ',' ) );
+	assert ( !isNum( '(' ) );
+	assert ( !isNum( '}' ) );
+
+	cout << "testIsNum passed" << endl;
 	}
\ No newline at end of file
--- a/util/tests/tokenizerTest.cpp
+++ b/util/tests/tokenizerTest.cpp
-//
-// Created by Veronica Day on 2/13/18.
-//

 #include <string>
 #include <vector>
@@ -16,29 +13,29 @@ void testExecute ( string original );
 int main ( )
 	{

-	cout << "Beginning testing for TokenizerTest_unit" << endl << endl;
+	cout << "Beginning testing for TokenizerTest" << endl << endl;

 	string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. "
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";

-	testExecute ( original );
+	testExecute( original );

-	cout << "\nTests passed for TokenizerTest_unit :D" << endl;
+	cout << "\nTests passed for TokenizerTest :D" << endl;

 	}

 void testExecute ( string original )
 	{
 	Tokenizer myTokenizer;
-	myTokenizer.execute ( original );
+	myTokenizer.execute( original, 0 );

-	auto dict = myTokenizer.get ( );
+	auto dict = myTokenizer.get( );

-	for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
+	for ( auto it = dict->begin( ); it != dict->end( ); it++ )
 		{
 		cout << it->first << ':';
-		for ( int i = 0; i < it->second.size ( ); ++i )
+		for ( int i = 0; i < it->second.size( ); ++i )
 			{
 			cout << it->second[ i ] << " ";
 			}