Skip to content
Snippets Groups Projects
Commit a4133948 authored by jsclose's avatar jsclose
Browse files

Merge branch 'milestone1' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into milestone1

parents 1c16250a e73ff17c
No related branches found
No related tags found
No related merge requests found
//
// Created by Jake Close on 3/5/18.
//
#include "Parser.h"
......@@ -60,7 +55,7 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
* @param word
* @return
*/
string Parser::extract_url ( string word )
string Parser::extract_url ( string & word )
{
string url = "";
if ( *findStr ( "<a", word ) != '\0' )
......
//
// Created by Veronica Day on 1/28/18.
//
// keep a running count of the offset; if a word is a stop word, don't increment the offset and remove the stop word
// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
//
#pragma once
#include <string>
......@@ -65,7 +58,7 @@ private:
* @param word
* @return
*/
string extract_url ( string word );
string extract_url ( string & word );
/**
......
//
// Created by anvia on 2/6/2018.
//
#include <string>
#include <cassert>
......@@ -38,10 +35,10 @@ void testSimple ( )
assert ( dictionary != nullptr );
assert ( dictionary->size () == 2);
assert ( dictionary->find ( "cat" ) != dictionary->end () );
assert ( dictionary->find ( "title" ) != dictionary->end () );
assert ( dictionary->find ( "titl" ) != dictionary->end () );
assert ( dictionary->find ( "this" ) == dictionary->end () );
assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
assert ( dictionary->at ( "title" )[ 0 ] == 1 );
assert ( dictionary->at ( "titl" )[ 0 ] == 1 );
delete dictionary;
......@@ -52,37 +49,34 @@ void testComplex ( )
ProducerConsumerQueue < string > urlFrontierTest;
ifstream file("../tests/cats.html");
string temp;
char docString[10240];
strcpy(docString, "<title>Joe the Cat</title>\n");
strcat(docString, "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n");
while(std::getline(file, temp)) {
//strcat(docString, str(temp));
string docString = "<title>Joe the Cat</title>\n";
docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
while ( std::getline ( file, temp ) )
{
docString += temp;
}
ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html");
Document document ( url, docString );
char * writable = new char[docString.size( ) + 1];
std::copy(docString.begin( ), docString.end( ), writable);
writable[ docString.size( ) ] = '\0';
Document document ( url, writable );
Parser parser ( &urlFrontierTest );
auto dictionary = parser.execute ( &document );
// cout << dictionary->size () << endl;
// for (auto p : *dictionary)
// cout << p.first << endl;
assert ( dictionary != nullptr );
assert ( dictionary->size () == 3);
assert ( dictionary->find ( "cat" ) != dictionary->end () );
assert ( dictionary->find ( "story" ) != dictionary->end () );
assert ( dictionary->find ( "stori" ) != dictionary->end () );
assert ( dictionary->find ( "joe" ) != dictionary->end () );
assert ( dictionary->find ( "the" ) == dictionary->end () );
assert ( dictionary->find ( "of" ) == dictionary->end () );
// assert ( dictionary->at ( "cat" )[ 0 ] == 1 );
// assert ( dictionary->at ( "story" )[ 0 ] == 0 );
// cout << urlFrontierTest->Size () << endl;
// cout << urlFrontierTest->Pop () << endl;
delete dictionary;
delete[] writable;
}
\ No newline at end of file
//
// Created by Jake Close on 3/5/18.
//
#include "Document.h"
......
#include "Stemmer.h"
#include "stringProcessing.h"
/**
 * Default constructor.
 * The class declares no data members, so there is nothing to initialise.
 */
Stemmer::Stemmer ( ) = default;
/**
 * Returns the stem of a word.
 *
 * The word is passed through steps 1a, 1b, 1c, 2, 3, 4, 5a and 5b in that
 * order; each step receives the previous step's output.
 *
 * Words of at most two characters (including the empty string) are returned
 * unchanged. The individual steps index from the end of the word, so an
 * empty word would otherwise make step1a throw std::out_of_range; the
 * two-character cut-off follows Porter's reference implementation.
 *
 * @param word token to stem (the steps only recognise lowercase suffixes)
 * @return the stemmed word
 */
std::string Stemmer::execute ( std::string word )
    {
    if ( word.size( ) <= 2 )
        {
        return word;
        }

    word = step1a( word );
    word = step1b( word );
    word = step1c( word );
    word = step2( word );
    word = step3( word );
    word = step4( word );
    word = step5a( word );
    word = step5b( word );
    return word;
    }
/**
* Number of consonant sequences
*
* <c><v> -> 0
* <c>vc<v> -> 1
* <c>vcvc<v> -> 2
* <c>vcvcvc<v> -> 3
*
* @param word
* @return
*/
int Stemmer::measure ( std::string word )
{
int m = 0;
int begin = 0;
unsigned long end = word.size( ) - 1;
// Looking for CVC pattern
while ( begin <= end )
{
if ( !isConsonant( word.begin( ) + begin, word.begin( ) ) )
{
break;
}
begin += 1;
}
if ( begin > end )
{
return m;
}
begin += 1;
while ( begin <= end )
{
while ( begin <= end )
{
if ( isConsonant( word.begin( ) + begin, word.begin( ) ) )
{
break;
}
begin += 1;
}
if ( begin > end )
{
return m;
}
begin += 1;
m += 1;
while ( begin <= end )
{
if ( !isConsonant( word.begin( ) + begin, word.begin( ) ) )
{
break;
}
begin += 1;
}
if ( begin > end )
{
return m;
}
begin += 1;
}
return m;
}
/**
 * Checks whether the range [ wordBeg, wordEnd ) contains a vowel.
 *
 * wordBeg is treated as the first letter of the word when classifying 'y'
 * (every call site in this file passes word.begin( ) of the string that
 * wordEnd also points into).
 *
 * @param wordBeg start of the range; also used as the start of the word
 * @param wordEnd one past the last character to inspect
 * @param word    unused; kept so existing callers keep compiling. It is a
 *                by-value copy, so its iterators are unrelated to
 *                wordBeg / wordEnd and must not be mixed with them.
 * @return true if any character in the range is a vowel
 */
bool Stemmer::isVowelPresent ( string::iterator wordBeg, string::iterator wordEnd, string word )
    {
    ( void ) word;

    // Remember where the word starts before advancing. isConsonant stops
    // its backwards look-up for 'y' at this position; handing it the begin
    // of the copied `word` meant that position was never reached and a
    // leading 'y' read one character before the buffer.
    const string::iterator wordStart = wordBeg;

    for ( ; wordBeg != wordEnd; ++wordBeg )
        {
        if ( !isConsonant( wordBeg, wordStart ) )
            {
            return true;
            }
        }
    return false;
    }
/**
 * Tells whether the character at wordIt is a consonant.
 *
 * a, e, i, o and u are vowels. 'y' is a consonant when it is the first
 * letter of the word; elsewhere it is the opposite of the letter before it
 * (a 'y' after a consonant acts as a vowel). Everything else is a consonant.
 *
 * @param wordIt    position to classify
 * @param wordBegin first character of the same word
 * @return true if *wordIt is a consonant
 */
bool Stemmer::isConsonant ( string::iterator wordIt, string::iterator wordBegin )
    {
    switch ( *wordIt )
        {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
            return false;

        case 'y':
            // Short-circuit keeps us from stepping before the first letter.
            return wordIt == wordBegin || !isConsonant( wordIt - 1, wordBegin );

        default:
            return true;
        }
    }
/**
 * Returns true if an 'e' should be appended to the stem, i.e. the stem ends
 * in one of:
 *
 * AT -> ATE
 * BL -> BLE
 * IZ -> IZE
 *
 * The comparison is anchored to the last two characters, so stems shorter
 * than two characters simply return false.
 *
 * @param word stem left after removing "ed" / "ing"
 * @return true if the stem ends in "at", "bl" or "iz"
 */
bool Stemmer::addE ( string word )
    {
    if ( word.size( ) < 2 )
        {
        return false;
        }

    const string tail = word.substr( word.size( ) - 2 );
    return tail == "at" || tail == "bl" || tail == "iz";
    }
/**
 * Returns true if the word ends in a double consonant other than
 * LL, SS or ZZ.
 *
 * Words of two characters or fewer never qualify (same length limit as
 * before). The repeated letter must actually be a consonant, so a doubled
 * vowel such as the "ee" in "agree" is not reported.
 *
 * @param word stem to inspect
 * @return true if the last two letters are the same consonant and that
 *         consonant is not l, s or z
 */
bool Stemmer::doubleCon ( string word )
    {
    if ( word.size( ) <= 2 )
        {
        return false;
        }

    const auto last = word.end( ) - 1;
    if ( *last != *( last - 1 ) )
        {
        return false;
        }
    if ( !isConsonant( last, word.begin( ) ) )
        {
        return false;
        }
    return *last != 'l' && *last != 's' && *last != 'z';
    }
/**
 * Returns true if the three letters before the final character form a
 * consonant - vowel - consonant pattern whose second consonant is not
 * W, X or Y.
 *
 * The final character itself is ignored: the callers in this file pass
 * either `stem + 'e'` (step1b) or a word already known to end in 'e'
 * (step5a), so the pattern is tested on the stem in front of that 'e'.
 * Words of three characters or fewer return false.
 *
 * @param word stem followed by one trailing character
 * @return true if the stem ends c-v-c and the last c is not w, x or y
 */
bool Stemmer::endCVC ( std::string word )
    {
    if ( word.size( ) <= 3 )
        {
        return false;
        }

    const auto first = word.begin( );
    const auto secondC = word.end( ) - 2;      // last letter of the stem

    const bool cvc = isConsonant( secondC, first )
                     && !isConsonant( secondC - 1, first )
                     && isConsonant( secondC - 2, first );
    if ( !cvc )
        {
        return false;
        }

    // The exclusion applies to the second consonant; testing the vowel
    // position here made "snow" + 'e' look like a c-v-c ending.
    return *secondC != 'w' && *secondC != 'x' && *secondC != 'y';
    }
/**
 * Step 1a: stem plural endings.
 *
 * sses -> ss   caresses -> caress
 * ies  -> i    ponies   -> poni
 * ss   -> ss   caress   -> caress
 * s    ->      cats     -> cat
 *
 * Only the first matching rule (in the order above) is applied. Each suffix
 * is compared against the very end of the word, so empty and short words
 * are handled without indexing outside the string.
 *
 * @param word word to stem
 * @return word with its plural ending reduced
 */
std::string Stemmer::step1a ( std::string word )
    {
    const auto endsWith = [ &word ] ( const std::string & suffix )
        {
        return word.size( ) >= suffix.size( )
               && word.compare( word.size( ) - suffix.size( ), suffix.size( ), suffix ) == 0;
        };

    if ( endsWith( "sses" ) )
        {
        // drop the trailing "es"
        word.erase( word.size( ) - 2 );
        }
    else if ( endsWith( "ies" ) )
        {
        // drop the trailing "es", leaving the 'i'
        word.erase( word.size( ) - 2 );
        }
    else if ( endsWith( "ss" ) )
        {
        // "ss" is kept as is
        }
    else if ( endsWith( "s" ) )
        {
        word.pop_back( );
        }
    return word;
    }
/**
 * Step 1b: stem ED and ING.
 *
 * eed -> ee   when measure( word ) > 1     agreed -> agree, feed -> feed
 * ed  ->      when measure( word ) > 1 and the stem contains a vowel
 *                                          plastered -> plaster, bled -> bled
 * ing ->      when the stem contains a vowel
 *                                          motoring -> motor, sing -> sing
 *
 * After "ed" / "ing" is removed the stem is tidied up:
 *   - ends in at / bl / iz           -> append 'e'
 *   - ends in a double consonant     -> drop the last letter
 *   - measure 1 and ends c-v-c       -> append 'e'   (hop -> hope)
 *
 * @param word word to stem
 * @return stemmed word
 */
std::string Stemmer::step1b ( std::string word )
    {
    const auto endsWith = [ &word ] ( const std::string & suffix )
        {
        return word.size( ) >= suffix.size( )
               && word.compare( word.size( ) - suffix.size( ), suffix.size( ), suffix ) == 0;
        };

    // A word ending in "eed" is handled by the EED rule only; it never
    // falls through to the plain ED rule.
    if ( endsWith( "eed" ) )
        {
        if ( measure( word ) > 1 )
            {
            word.pop_back( );
            }
        return word;
        }

    size_t suffixLength = 0;
    if ( endsWith( "ed" ) && measure( word ) > 1 )
        {
        suffixLength = 2;
        }
    else if ( endsWith( "ing" ) )
        {
        suffixLength = 3;
        }
    if ( suffixLength == 0 )
        {
        return word;
        }

    std::string stem = word.substr( 0, word.size( ) - suffixLength );
    if ( !isVowelPresent( stem.begin( ), stem.end( ), stem ) )
        {
        return word;
        }

    if ( addE( stem ) )
        {
        stem += 'e';
        }
    else if ( doubleCon( stem ) )
        {
        stem.pop_back( );
        }
    // The measure is taken on the stem for both suffixes. The ED branch
    // used to test measure( word ) == 1 inside a branch that already
    // required measure( word ) > 1, so it could never restore the 'e'.
    else if ( measure( stem ) == 1 && endCVC( stem + 'e' ) )
        {
        stem += 'e';
        }
    return stem;
    }
/**
 * Step 1c: replace a final 'y' by 'i' when the rest of the word contains
 * a vowel.
 *
 * happy -> happi
 * sky   -> sky
 *
 * An empty word is returned unchanged.
 *
 * @param word word to stem
 * @return word with a trailing 'y' turned into 'i' where applicable
 */
string Stemmer::step1c ( string word )
    {
    if ( word.empty( ) || word.back( ) != 'y' )
        {
        return word;
        }

    if ( isVowelPresent( word.begin( ), word.end( ) - 1, word ) )
        {
        word.back( ) = 'i';
        }
    return word;
    }
/**
 * Step 2: map double suffixes to single ones.
 *
 * Nothing is done when the word is empty or measure( word ) == 0.
 * Otherwise the rules below are tried from top to bottom and the first
 * suffix found at the end of the word is replaced:
 *
 * ational -> ate    relational     -> relate
 * tional  -> tion   conditional    -> condition
 * enci    -> ence   valenci        -> valence
 * anci    -> ance   hesitanci      -> hesitance
 * izer    -> ize    digitizer      -> digitize
 * abli    -> able   conformabli    -> conformable
 * alli    -> al     radicalli      -> radical
 * entli   -> ent    differentli    -> different
 * eli     -> e      vileli         -> vile
 * ousli   -> ous    analogousli    -> analogous
 * ization -> ize    vietnamization -> vietnamize
 * ation   -> ate    predication    -> predicate
 * ator    -> ate    operator       -> operate
 * alism   -> al     feudalism      -> feudal
 * iveness -> ive    decisiveness   -> decisive
 * fulness -> ful    hopefulness    -> hopeful
 * ousness -> ous    callousness    -> callous
 * aliti   -> al     formaliti      -> formal
 * iviti   -> ive    sensitiviti    -> sensitive
 * biliti  -> ble    sensibiliti    -> sensible
 *
 * @param word word to stem
 * @return stemmed word
 */
string Stemmer::step2 ( std::string word )
    {
    if ( word.empty( ) || measure( word ) == 0 )
        {
        return word;
        }

    struct Rule
        {
        const char *suffix;
        const char *replacement;
        };
    static const Rule rules[ ] =
        {
            { "ational", "ate" },
            { "tional", "tion" },
            { "enci", "ence" },
            { "anci", "ance" },
            { "izer", "ize" },
            { "abli", "able" },
            { "alli", "al" },
            { "entli", "ent" },
            { "eli", "e" },
            { "ousli", "ous" },
            { "ization", "ize" },
            { "ation", "ate" },
            { "ator", "ate" },
            { "alism", "al" },
            { "iveness", "ive" },
            { "fulness", "ful" },
            { "ousness", "ous" },
            // This rule used to test the OUSNESS match again and so could
            // never fire.
            { "aliti", "al" },
            { "iviti", "ive" },
            { "biliti", "ble" },
        };

    for ( const Rule & rule : rules )
        {
        const std::string suffix( rule.suffix );
        if ( word.size( ) < suffix.size( )
             || word.compare( word.size( ) - suffix.size( ), suffix.size( ), suffix ) != 0 )
            {
            continue;
            }

        const size_t stemLength = word.size( ) - suffix.size( );

        // Kept from the previous implementation: "ational" is skipped when
        // exactly one letter precedes it, so "rational" is not turned into
        // "rate" and falls through to the "tional" rule instead.
        // NOTE(review): Porter's definition tests the measure of the stem
        // for every rule in this step, which would leave "rational"
        // untouched — confirm which behaviour is wanted.
        if ( suffix == "ational" && stemLength == 1 )
            {
            continue;
            }

        return word.substr( 0, stemLength ) + rule.replacement;
        }
    return word;
    }
/**
 * Step 3: shorten -ic-, -full, -ness and similar endings.
 *
 * Nothing is done when the word is empty or measure( word ) == 0.
 * Otherwise the first suffix (top to bottom) found at the end of the word
 * is replaced:
 *
 * icate -> ic    triplicate  -> triplic
 * ative ->       formative   -> form
 * alize -> al    formalize   -> formal
 * iciti -> ic    electriciti -> electric
 * ical  -> ic    electrical  -> electric
 * ful   ->       hopeful     -> hope
 * ness  ->       goodness    -> good
 *
 * @param word word to stem
 * @return stemmed word
 */
std::string Stemmer::step3 ( std::string word )
    {
    if ( word.empty( ) || measure( word ) == 0 )
        {
        return word;
        }

    struct Rule
        {
        const char *suffix;
        const char *replacement;
        };
    static const Rule rules[ ] =
        {
            { "icate", "ic" },
            { "ative", "" },
            { "alize", "al" },
            { "iciti", "ic" },
            { "ical", "ic" },
            { "ful", "" },
            { "ness", "" },
        };

    for ( const Rule & rule : rules )
        {
        const std::string suffix( rule.suffix );
        if ( word.size( ) >= suffix.size( )
             && word.compare( word.size( ) - suffix.size( ), suffix.size( ), suffix ) == 0 )
            {
            return word.substr( 0, word.size( ) - suffix.size( ) ) + rule.replacement;
            }
        }
    return word;
    }
/**
 * Step 4: remove residual suffixes.
 *
 * Nothing is done when the word is empty or measure( word ) <= 2.
 * Otherwise the first suffix (top to bottom) found at the end of the word
 * is removed:
 *
 * al     revival     -> reviv
 * ance   allowance   -> allow
 * ence   inference   -> infer
 * er     airliner    -> airlin
 * ic     gyroscopic  -> gyroscop
 * able   adjustable  -> adjust
 * ible   defensible  -> defens
 * ant    irritant    -> irrit
 * ement  replacement -> replac
 * ment   adjustment  -> adjust
 * ent    dependent   -> depend
 * ion    adoption    -> adopt     (only when preceded by 's' or 't')
 * ou     homologou   -> homolog
 * ism    communism   -> commun
 * ate    activate    -> activ
 * iti    angulariti  -> angular
 * ous    homologous  -> homolog
 * ive    effective   -> effect
 * ize    bowdlerize  -> bowdler
 *
 * @param word word to stem
 * @return stemmed word
 */
std::string Stemmer::step4 ( std::string word )
    {
    if ( word.empty( ) || measure( word ) <= 2 )
        {
        return word;
        }

    static const char *const suffixes[ ] =
        {
            "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment",
            "ent", "ion", "ou", "ism", "ate", "iti", "ous", "ive", "ize",
        };

    for ( const char *candidate : suffixes )
        {
        const std::string suffix( candidate );
        if ( word.size( ) < suffix.size( )
             || word.compare( word.size( ) - suffix.size( ), suffix.size( ), suffix ) != 0 )
            {
            continue;
            }

        const size_t stemLength = word.size( ) - suffix.size( );

        // "ion" is only removed after 's' or 't'. The stem-length check
        // keeps the look-behind inside the string.
        if ( suffix == "ion" )
            {
            const bool afterSOrT = stemLength > 0
                                   && ( word[ stemLength - 1 ] == 's' || word[ stemLength - 1 ] == 't' );
            if ( !afterSOrT )
                {
                continue;
                }
            }

        return word.substr( 0, stemLength );
        }
    return word;
    }
/**
 * Step 5a: remove a final 'e'.
 *
 * The 'e' is dropped when
 *   - measure( word ) > 1                      probate -> probat
 *   - measure( word ) == 1 and the letters in front of the 'e' do not end
 *     consonant - vowel - consonant            cease -> ceas
 *
 * Words that are empty or do not end in 'e' are returned unchanged.
 *
 * @param word word to stem
 * @return stemmed word
 */
std::string Stemmer::step5a ( std::string word )
    {
    if ( word.empty( ) || word.back( ) != 'e' )
        {
        return word;
        }

    const int m = measure( word );
    if ( m > 1 || ( m == 1 && !endCVC( word ) ) )
        {
        word.pop_back( );
        }
    return word;
    }
/**
 * Step 5b: reduce a final "ll" to "l".
 *
 * Applies only to words longer than two characters whose measure is
 * greater than 1.
 *
 * @param word word to stem
 * @return word with one trailing 'l' removed where the rule applies
 */
std::string Stemmer::step5b ( std::string word )
    {
    const size_t length = word.size( );
    const bool endsInDoubleL = length > 2
                               && word[ length - 1 ] == 'l'
                               && word[ length - 2 ] == 'l';

    if ( endsInDoubleL && measure( word ) > 1 )
        {
        word.pop_back( );
        }
    return word;
    }
\ No newline at end of file
//
// Created by Veronica Day on 2/22/18.
//
#pragma once
#include <string>
#include "stringProcessing.h"
/**
 * Word stemmer modeled after the Porter Stemmer algorithm
 * http://snowball.tartarus.org/algorithms/porter/stemmer.html
 *
 * The class has no data members; execute( ) is the only public operation.
 * All signatures spell out std::string so the header does not depend on a
 * using-directive pulled in by another include.
 */
class Stemmer
    {
public:

    /**
     * Stemmer Cstor
     */
    Stemmer ( );

    /**
     * Returns the stem of a word by running steps 1a through 5b in order.
     *
     * @param word word to stem
     * @return stemmed word
     */
    std::string execute ( std::string word );

private:

    /**
     * Number of vowel-consonant sequences in the word
     *
     * <c><v>       -> 0
     * <c>vc<v>     -> 1
     * <c>vcvc<v>   -> 2
     * <c>vcvcvc<v> -> 3
     *
     * @param word
     * @return measure m of the word
     */
    int measure ( std::string word );

    /**
     * Checks whether a vowel is present in [ wordBeg, wordEnd )
     *
     * @param wordBeg start of the range to inspect
     * @param wordEnd one past the end of the range
     * @param word    word the caller is working on
     * @return true if the range contains a vowel
     */
    bool isVowelPresent ( std::string::iterator wordBeg, std::string::iterator wordEnd, std::string word );

    /**
     * Returns true if wordIt points to a consonant
     *
     * @param wordIt    character to classify
     * @param wordBegin first character of the same word
     * @return true for a consonant, false for a vowel
     */
    bool isConsonant ( std::string::iterator wordIt, std::string::iterator wordBegin );

    /**
     * Returns true if an 'e' should be added to the end
     * (stem ends in AT, BL or IZ)
     *
     * @param word
     * @return
     */
    bool addE ( std::string word );

    /**
     * Returns true if the word ends in a double consonant
     * other than LL, SS, ZZ
     *
     * @param word
     * @return
     */
    bool doubleCon ( std::string word );

    /**
     * Returns true if a word ends in a
     * Consonant, Vowel, Consonant pattern,
     * except when the second C is W, X, or Y
     *
     * @param word
     * @return
     */
    bool endCVC ( std::string word );

    /**
     * Step 1a: stem plural words
     *
     * @param word
     * @return
     */
    std::string step1a ( std::string word );

    /**
     * Step 1b: stem ED and ING
     *
     * @param word
     * @return
     */
    std::string step1b ( std::string word );

    /**
     * Step 1c: checks for Y -> I
     *
     * @param word
     * @return
     */
    std::string step1c ( std::string word );

    /**
     * Step 2
     *
     * @param word
     * @return
     */
    std::string step2 ( std::string word );

    /**
     * Step 3
     *
     * @param word
     * @return
     */
    std::string step3 ( std::string word );

    /**
     * Step 4
     *
     * @param word
     * @return
     */
    std::string step4 ( std::string word );

    /**
     * Step 5a
     *
     * @param word
     * @return
     */
    std::string step5a ( std::string word );

    /**
     * Step 5b
     *
     * @param word
     * @return
     */
    std::string step5b ( std::string word );
    };
......
#include "Tokenizer.h"
/**
 * Tokenizer constructor.
 * Allocates the (initially empty) token -> offsets dictionary that
 * execute( ) fills and get( ) hands out.
 */
Tokenizer::Tokenizer ( )
        : docIndex( new unordered_map< string, vector< int > > )
    {
    }
/**
 * Accessor for the document dictionary built so far.
 *
 * @return the pointer stored in docIndex (token -> list of offsets)
 */
unordered_map< string, vector< int > > *Tokenizer::get ( ) const
    {
    return this->docIndex;
    }
/**
 * Executes the Tokenizer: splits the text on spaces and records every
 * remaining token in the dictionary.
 *
 * token -> [offsets]
 *
 * Each piece is case-folded, stripped to letters and digits, and skipped
 * when nothing is left or when it is a stop word. The survivors are stemmed
 * and stored with the current offset; the offset advances only for tokens
 * that are stored.
 *
 * @param originalText text to tokenize
 * @param offset       offset assigned to the first stored token
 */
void Tokenizer::execute ( string & originalText, int offset )
    {
    vector< string > splitText = splitStr( originalText, ' ' );

    for ( string & rawToken : splitText )
        {
        // case fold
        string processedString = toLower( rawToken );
        // strip all characters that are not letters or digits
        processedString = stripStr( processedString );

        // Pieces such as "--" or the gap between two spaces strip down to
        // nothing. They are not words, and the stemmer indexes from the end
        // of its input, so they must not be passed on.
        if ( processedString.empty( ) || isStopWord( processedString ) )
            {
            continue;
            }

        // stem word
        processedString = stem.execute( processedString );
        if ( processedString.empty( ) )
            {
            continue;
            }

        ( *docIndex )[ processedString ].push_back( offset );
        ++offset;
        }
    }
//
// Created by anvia on 1/31/2018.
//
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
#include "stringProcessing.h"
#include "Stemmer.h"
using namespace std;
/**
 * Splits text into tokens and builds a token -> offsets dictionary.
 *
 * The member functions are declared here only; their definitions live in
 * the Tokenizer source file.
 *
 * NOTE(review): docIndex is a raw pointer and this class declares no
 * destructor, so Tokenizer itself never frees the map. Code that obtains
 * the map through get( ) appears to be responsible for deleting it —
 * confirm against the callers.
 */
class Tokenizer
    {
public:

    /**
     * Tokenizer Cstor
     */
    Tokenizer ( );

    /**
     * Returns pointer to the docIndex dictionary
     *
     * @return pointer to unordered_map< string, vector< int>>
     */
    unordered_map< string, vector< int > > *get ( ) const;

    /**
     * Executes the Tokenizer
     * Sends tokens to dictionary
     *
     * token -> [offsets]
     * @param originalText
     * @param offset
     */
    void execute ( string & originalText, int offset );

private:
    // token -> offsets at which the token was seen
    unordered_map< string, vector< int > > *docIndex;
    // stemmer applied to every token before it is stored
    Stemmer stem;
    };
......@@ -3,158 +3,380 @@
//
#include "stringProcessing.h"
#include "Stemmer.h"
#include <cassert>
using namespace std;
/**
 * Finds the needle in the haystack and returns the position of the first
 * match.
 *
 * The haystack is received by value, so the result cannot point into the
 * caller's string. The haystack is therefore moved into a per-thread buffer
 * and the returned iterator points into that buffer; it stays valid until
 * the next call to findStr on the same thread. Use the result to test for a
 * match or to read the matched text, not to compute positions in the
 * caller's own string.
 *
 * When there is no match (or the needle is empty) the iterator refers to
 * the end of the buffer, which callers detect as `*result == '\0'`.
 *
 * NOTE(review): taking the haystack by reference would remove the need for
 * the buffer, but requires changing the declaration in the header as well.
 *
 * @param needle   text to search for
 * @param haystack text to search in
 * @return iterator to the first match, or to the end of the buffer
 */
std::string::iterator findStr ( std::string needle, std::string haystack )
    {
    static thread_local std::string retained;
    retained.swap( haystack );

    const auto hayEnd = retained.end( );
    if ( needle.empty( ) )
        {
        return hayEnd;
        }

    // Try every start position in turn. Resuming after a failed partial
    // match skipped over overlapping candidates ("aab" in "aaab").
    for ( auto start = retained.begin( ); start != hayEnd; ++start )
        {
        auto hay = start;
        auto ndl = needle.begin( );
        while ( hay != hayEnd && ndl != needle.end( ) && *hay == *ndl )
            {
            ++hay;
            ++ndl;
            }
        if ( ndl == needle.end( ) )
            {
            // this is pointing at the beginning of the match
            return start;
            }
        }
    return hayEnd;
    }
/**
 * Finds the next position of the needle in the string, starting at
 * haystackPointer.
 *
 * The haystack has no explicit end, so the search stops at the first '\0'
 * it reads (for a std::string, the terminator at end( )).
 *
 * @param needle          text to search for
 * @param haystackPointer position in the haystack to start searching from
 * @return iterator to the first match at or after haystackPointer; if there
 *         is none (or the needle is empty), an iterator to the terminating
 *         '\0', which callers detect as `*result == '\0'`
 */
std::string::iterator findNext ( std::string needle, std::string::iterator haystackPointer )
    {
    // Try every start position in turn. Resuming after a failed partial
    // match skipped over overlapping candidates ("aab" in "aaab").
    for ( ; *haystackPointer != '\0'; ++haystackPointer )
        {
        auto hay = haystackPointer;
        auto ndl = needle.begin( );
        while ( ndl != needle.end( ) && *hay != '\0' && *hay == *ndl )
            {
            ++hay;
            ++ndl;
            }
        if ( !needle.empty( ) && ndl == needle.end( ) )
            {
            // this is pointing at the beginning of the match
            return haystackPointer;
            }
        }
    return haystackPointer;
    }
/**
 * Finds the previous position of the needle in the string, searching
 * backwards.
 *
 * The scan starts at haystackPointer and moves towards haystackBeg,
 * matching the needle from its last character to its first. haystackBeg is
 * an exclusive lower bound for where a match may start its comparison, but
 * a match may finish exactly on it.
 *
 * When nothing is found the function returns an iterator to a function-
 * local static one-character string holding '\0', so that the callers'
 * `*result != '\0'` test reads valid memory. (Returning needle.end( ) would
 * refer to the by-value parameter, which is destroyed on return.)
 *
 * @param needle          text to search for
 * @param haystackPointer position to start scanning backwards from
 * @param haystackBeg     position at which the backwards scan stops
 * @return iterator to the first character of the match, or an iterator
 *         whose value is '\0' if there is no match or the needle is empty
 */
std::string::iterator findPrev ( std::string needle, std::string::iterator haystackPointer,
                                 std::string::iterator haystackBeg )
    {
    static std::string notFound( 1, '\0' );

    if ( needle.empty( ) )
        {
        return notFound.begin( );
        }

    const auto begNeedle = needle.begin( );
    auto endNeedle = begNeedle + ( needle.size( ) - 1 );

    while ( haystackPointer != haystackBeg )
        {
        //keep looking for instance of a match
        if ( *haystackPointer != *endNeedle )
            {
            --haystackPointer;
            continue;
            }

        /* want to keep the original iterator where it is so the scan can
           resume from where this attempt stopped */
        auto temp = haystackPointer;
        while ( *temp == *endNeedle )
            {
            //reaching the first needle character signifies an exact match
            if ( endNeedle == begNeedle )
                {
                //this is pointing at the beginning of the match
                return temp;
                }
            if ( temp != haystackBeg )
                {
                --temp;
                }
            --endNeedle;
            }

        //need to reset because still has to search rest of the string for a match
        endNeedle = begNeedle + ( needle.size( ) - 1 );
        //sets the original text pointer to where the last search left off
        haystackPointer = temp;
        }
    return notFound.begin( );
    }
/**
 * Returns a vector of strings from @originalText, split by @delim
 *
 * Consecutive or leading delimiters produce empty strings; a single
 * trailing delimiter does not produce a trailing empty string. An empty
 * text yields an empty vector.
 *
 * @param originalText text to split (not modified)
 * @param delim        delimiter character
 * @return vector < string >
 */
std::vector< std::string > splitStr ( std::string & originalText, char delim )
    {
    std::vector< std::string > splitWords;

    auto cursor = originalText.begin( );
    const auto textEnd = originalText.end( );

    while ( cursor != textEnd )
        {
        auto tokenEnd = cursor;
        while ( tokenEnd != textEnd && *tokenEnd != delim )
            {
            ++tokenEnd;
            }
        splitWords.push_back( std::string( cursor, tokenEnd ) );

        // Step over the delimiter only when there is one. Advancing
        // unconditionally moved the iterator past end( ) after the last
        // token and then dereferenced it.
        cursor = ( tokenEnd == textEnd ) ? textEnd : tokenEnd + 1;
        }
    return splitWords;
    }
bool isStopWord ( string word )
/**
* Returns true if @word is a stopword
*
* @param word
* @return bool
*/
bool isStopWord ( string & word )
{
return ( stopWords.find ( word ) != stopWords.end ( ) );
return ( stopWords.find( word ) != stopWords.end( ) );
}
/**
 * Returns lowercase @word
 *
 * Only the ASCII letters 'A'..'Z' are converted; every other character is
 * copied as is. The argument itself is not modified.
 *
 * @param word word to convert
 * @return string
 */
std::string toLower ( std::string & word )
    {
    std::string lowerWord;
    lowerWord.reserve( word.size( ) );

    // Iterate over the whole string rather than scanning for '\0'.
    for ( const char ch : word )
        {
        if ( ch >= 'A' && ch <= 'Z' )
            {
            lowerWord += static_cast< char >( ch + ( 'a' - 'A' ) );
            }
        else
            {
            lowerWord += ch;
            }
        }
    return lowerWord;
    }
/**
 * Stems @word with a freshly constructed Stemmer.
 *
 * The argument is overwritten with its stem, and a copy of the stem is
 * returned as well.
 *
 * @param word word to stem; replaced by the stemmed form
 * @return string
 */
string stemWord ( string & word )
    {
    word = Stemmer( ).execute( word );
    return word;
    }
/**
 * Returns the @len characters of @word starting at index @pos
 *
 * Characters are fetched with string::at, so reaching past the end of
 * @word throws std::out_of_range.
 *
 * @param word source string
 * @param pos  index of the first character to copy
 * @param len  number of characters to copy
 * @return string
 */
std::string subStr ( std::string & word, size_t pos, size_t len )
    {
    std::string piece;
    for ( size_t taken = 0; taken != len; ++taken )
        {
        piece.push_back( word.at( pos + taken ) );
        }
    return piece;
    }
/**
 * Returns a substring [ begin, end )
 *
 * @param begin iterator to the first character to copy
 * @param end   iterator one past the last character to copy
 * @return string
 */
std::string subStr ( std::string::iterator begin, std::string::iterator end )
    {
    return std::string( begin, end );
    }
/**
 * Removes the chars in vector from word
 *
 * Every character of @word that equals one of @chars is dropped; all other
 * characters are kept in their original order. @word is not modified.
 *
 * @param word  source string
 * @param chars characters to remove
 * @return string
 */
std::string stripStr ( std::string & word, std::vector< char > chars )
    {
    std::string wordStripped;

    for ( const char ch : word )
        {
        // The flag is decided afresh for each character. Keeping one flag
        // for the whole word meant that once a character had been stripped,
        // everything after it was dropped too.
        bool isSymbol = false;
        for ( const char unwanted : chars )
            {
            if ( ch == unwanted )
                {
                isSymbol = true;
                break;
                }
            }
        if ( !isSymbol )
            {
            wordStripped += ch;
            }
        }
    return wordStripped;
    }
/**
 * Keeps only the letters and digits of word
 *
 * Every character for which isAlpha or isNum is true is copied to the
 * result, in order; everything else is dropped. @word is not modified.
 *
 * @param word source string
 * @return string
 */
string stripStr ( string & word )
    {
    string kept = "";
    for ( auto it = word.begin( ); it != word.end( ); ++it )
        {
        const char ch = *it;
        if ( !isAlpha( ch ) && !isNum( ch ) )
            {
            continue;
            }
        kept += ch;
        }
    return kept;
    }
/**
 * Returns true if character is a letter
 *
 * Only the ASCII ranges 'A'-'Z' and 'a'-'z' count as letters.
 *
 * @param ch the character to classify
 * @return bool
 */
bool isAlpha ( char ch )
{
    const bool isUpper = ( 'A' <= ch && ch <= 'Z' );
    const bool isLower = ( 'a' <= ch && ch <= 'z' );
    return isUpper || isLower;
}
/**
 * Returns true if character is a number
 *
 * Only the ASCII digits '0'-'9' qualify.
 *
 * @param ch the character to classify
 * @return bool
 */
bool isNum ( char ch )
{
    return '0' <= ch && ch <= '9';
}
\ No newline at end of file
......@@ -14,57 +14,127 @@ using namespace std;
/**
 * Set of stopwords
 *
 * Entries are lowercase and carry no surrounding whitespace: lookups compare
 * the whole string, so an entry such as "many " would never match "many".
 */
static set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
                                   "for", "have", "he", "her", "here", "him", "his", "how",
                                   "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she",
                                   "some", "the", "their", "them", "there", "they", "that",
                                   "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
                                   "you", "your" };
/**
 * Finds the needle in the haystack
 * returns position of first match
 *
 * NOTE(review): haystack is taken by value, so an iterator into it would refer
 * to a copy that is gone once the call returns - confirm against the
 * definition and consider passing the haystack by reference.
 *
 * @param needle
 * @param haystack
 * @return string::iterator
 */
string::iterator findStr ( string needle, string haystack );

/**
 * Finds the next position of the needle in the string
 *
 * @param needle
 * @param haystackPointer
 * @return string::iterator
 */
string::iterator findNext ( string needle, string::iterator haystackPointer );

/**
 * Finds the previous position of the needle in the string
 *
 * @param needle
 * @param haystackPointer
 * @param haystackBeg
 * @return string::iterator
 */
string::iterator findPrev ( string needle, string::iterator haystackPointer, string::iterator haystackBeg );

/**
 * Returns a vector of strings from @originalText, split by @delim
 *
 * @param originalText
 * @param delim
 * @return vector< string >
 */
vector< string > splitStr ( string & originalText, char delim );

/**
 * Returns true if @word is a stopword
 *
 * @param word
 * @return bool
 */
bool isStopWord ( string & word );

/**
 * Returns lowercase @word
 *
 * @param word
 * @return string
 */
string toLower ( string & word );

/**
 * Returns stemmed @word
 *
 * @param word
 * @return string
 */
string stemWord ( string & word );

/**
 * Returns the substring [ pos, pos + len ) of @word
 *
 * @param word
 * @param pos
 * @param len
 * @return string
 */
string subStr ( string & word, size_t pos, size_t len );

/**
 * Returns a substring [ begin, end )
 *
 * @param begin
 * @param end
 * @return string
 */
string subStr ( string::iterator begin, string::iterator end );

/**
 * Removes the chars in vector from word
 *
 * @param word
 * @param chars
 * @return string
 */
string stripStr ( string & word, vector< char > chars );

/**
 * Removes every character that is not a letter or a digit from word
 *
 * @param word
 * @return string
 */
string stripStr ( string & word );

/**
 * Returns true if character is a letter
 *
 * @param ch
 * @return bool
 */
bool isAlpha ( char ch );

/**
 * Returns true if character is a number
 *
 * @param ch
 * @return bool
 */
bool isNum ( char ch );
\ No newline at end of file
//
// Created by Veronica Day on 2/22/18.
//
#include <string>
#include <vector>
#include "../Stemmer.h"
#include <iostream>
#include <cassert>
// Unit test driver for Stemmer: feeds single words to Stemmer::execute and
// asserts the exact stem that comes back. Each assertion checks the final
// result of one execute( ) call on one shared Stemmer instance.
// NOTE(review): many inputs resemble the worked examples from Porter's
// stemming algorithm - presumably that is what Stemmer implements; confirm.
// NOTE(review): cout / endl are used unqualified; presumably a using-directive
// arrives through Stemmer.h - verify.
int main ( )
{
cout << "Beginning testing for Stemmer" << endl;
Stemmer stem;
// Plural endings
assert ( stem.execute( "caresses" ) == "caress" );
assert ( stem.execute( "ponies" ) == "poni" );
assert ( stem.execute( "ties" ) == "ti" );
assert ( stem.execute( "caress" ) == "caress" );
assert ( stem.execute( "cats" ) == "cat" );
// -eed / -ed / -ing endings, including doubled consonants
assert ( stem.execute( "feed" ) == "feed" );
assert ( stem.execute( "agreed" ) == "agre" );
assert ( stem.execute( "plastered" ) == "plaster" );
assert ( stem.execute( "bled" ) == "bled" );
assert ( stem.execute( "motoring" ) == "motor" );
assert ( stem.execute( "conflated" ) == "conflat" );
assert ( stem.execute( "troubled" ) == "troubl" );
assert ( stem.execute( "sized" ) == "size" );
assert ( stem.execute( "hopping" ) == "hop" );
assert ( stem.execute( "tanning" ) == "tan" );
assert ( stem.execute( "tanned" ) == "tan" );
assert ( stem.execute( "falling" ) == "fall" );
assert ( stem.execute( "hissing" ) == "hiss" );
assert ( stem.execute( "fizzed" ) == "fizz" );
assert ( stem.execute( "failing" ) == "fail" );
assert ( stem.execute( "filing" ) == "file" );
// Trailing y
assert ( stem.execute( "happy" ) == "happi" );
assert ( stem.execute( "sky" ) == "sky" );
// Longer derivational suffixes (-ational, -izer, -ousli, -ization, -ness, -iti, ...)
assert ( stem.execute( "relational" ) == "relat" );
assert ( stem.execute( "conditional" ) == "condit" );
assert ( stem.execute( "rational" ) == "ration" );
assert ( stem.execute( "valenci" ) == "valenc" );
assert ( stem.execute( "hesitanci" ) == "hesit" );
assert ( stem.execute( "digitizer" ) == "digit" );
assert ( stem.execute( "conformabli" ) == "conform" );
assert ( stem.execute( "radicalli" ) == "radic" );
assert ( stem.execute( "differentli" ) == "differ" );
assert ( stem.execute( "vileli" ) == "vile" );
assert ( stem.execute( "analogousli" ) == "analog" );
assert ( stem.execute( "vietnamization" ) == "vietnam" );
assert ( stem.execute( "predication" ) == "predic" );
assert ( stem.execute( "operator" ) == "oper" );
assert ( stem.execute( "feudalism" ) == "feudal" );
assert ( stem.execute( "decisiveness" ) == "decis" );
assert ( stem.execute( "hopefulness" ) == "hope" );
assert ( stem.execute( "callousness" ) == "callous" );
assert ( stem.execute( "formaliti" ) == "formal" );
assert ( stem.execute( "sensitiviti" ) == "sensit" );
assert ( stem.execute( "sensibiliti" ) == "sensibl" );
// Suffixes such as -icate, -ative, -alize, -iciti, -ical, -ful, -ness
assert ( stem.execute( "triplicate" ) == "triplic" );
assert ( stem.execute( "formative" ) == "form" );
assert ( stem.execute( "formalize" ) == "formal" );
assert ( stem.execute( "electriciti" ) == "electr" );
assert ( stem.execute( "electrical" ) == "electr" );
assert ( stem.execute( "hopeful" ) == "hope" );
assert ( stem.execute( "goodness" ) == "good" );
// Suffixes such as -al, -ance, -ence, -er, -ic, -able, -ible, -ant, -ment, -ent, -ion, -ism, -ate, -ous, -ive, -ize
assert ( stem.execute( "revival" ) == "reviv" );
assert ( stem.execute( "allowance" ) == "allow" );
assert ( stem.execute( "inference" ) == "infer" );
assert ( stem.execute( "airliner" ) == "airlin" );
assert ( stem.execute( "gyroscopic" ) == "gyroscop" );
assert ( stem.execute( "adjustable" ) == "adjust" );
assert ( stem.execute( "defensible" ) == "defens" );
assert ( stem.execute( "irritant" ) == "irrit" );
assert ( stem.execute( "replacement" ) == "replac" );
assert ( stem.execute( "adjustment" ) == "adjust" );
assert ( stem.execute( "dependent" ) == "depend" );
assert ( stem.execute( "adoption" ) == "adopt" );
assert ( stem.execute( "homologou" ) == "homolog" );
assert ( stem.execute( "communism" ) == "commun" );
assert ( stem.execute( "activate" ) == "activ" );
assert ( stem.execute( "angulariti" ) == "angular" );
assert ( stem.execute( "homologous" ) == "homolog" );
assert ( stem.execute( "effective" ) == "effect" );
assert ( stem.execute( "bowdlerize" ) == "bowdler" );
// Final e and double l
assert ( stem.execute( "probate" ) == "probat" );
assert ( stem.execute( "cease" ) == "ceas" );
assert ( stem.execute( "controll" ) == "control" );
assert ( stem.execute( "roll" ) == "roll" );
// Assorted everyday words, including very short ones
assert ( stem.execute( "university" ) == "univers" );
assert ( stem.execute( "example" ) == "exampl" );
assert ( stem.execute( "do" ) == "do" );
assert ( stem.execute( "you" ) == "you" );
assert ( stem.execute( "really" ) == "real" );
assert ( stem.execute( "weakness" ) == "weak" );
assert ( stem.execute( "yields" ) == "yield" );
assert ( stem.execute( "temptation" ) == "temptat" );
assert ( stem.execute( "are" ) == "ar" );
assert ( stem.execute( "terrible" ) == "terribl" );
cout << "\nTests passed for Stemmer :D" << endl;
}
//
// Created by Veronica Day on 2/13/18.
//
#include <string>
#include <vector>
#include "../stringProcessing.h"
#include "../Stemmer.h"
#include <iostream>
#include <cassert>
......@@ -12,76 +10,163 @@ using namespace std;
// Forward declarations for the unit tests defined further down in this file.
void testFindStr ( string original );
void testFindNext ( );
void testFindPrev ( );
void testSplitStr ( string original );
void testIsStopWord ( );
void testToLower ( );
void testStemWord ( );
void testSubStr ( );
void testStripStr ( );
void testIsAlpha ( );
void testIsNum ( );
/**
 * Test driver for the string-processing helpers.
 *
 * Runs each unit test exactly once, in a fixed order; the individual tests
 * report failures through assert.
 */
int main ( )
{
    cout << "Beginning testing for StringProcessing" << endl << endl;

    // Shared fixture text used by the findStr and splitStr tests.
    string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. "
                      "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
                      "making it look like readable English. ";

    testFindStr( original );
    testFindNext( );
    testFindPrev( );
    testSplitStr( original );
    testIsStopWord( );
    testToLower( );
    testStemWord( );
    testSubStr( );
    testStripStr( );
    testIsAlpha( );
    testIsNum( );

    cout << "\nTests passed for StringProcessing :D" << endl;
}
/**
 * Tests findStr( needle, haystack ).
 *
 * The expectations encoded here: dereferencing the result yields the first
 * character of the match, or '\0' when the needle does not occur.
 *
 * NOTE(review): the loop compares findStr's result against title.end( ); that
 * is only meaningful if the returned iterator points into title itself rather
 * than into a copy - confirm against findStr's definition.
 *
 * @param original fixture text that contains "established" and "Lorem Ipsum"
 */
void testFindStr ( string original )
{
    cout << "Testing findStr..." << endl;

    assert( *findStr( "established", original ) == 'e' );
    assert( *findStr( "Lorem Ipsum", original ) == 'L' );

    // A needle at the very start of the haystack: the match and the haystack
    // must agree character by character from there on.
    string title = "<title> This is a test </title>";
    auto word = findStr( "<title>", title );
    assert( *word == '<' );
    auto titleIt = title.begin( );
    while ( word != title.end( ) && titleIt != title.end( ) )
    {
        assert( *word == *titleIt );
        ++word;
        ++titleIt;
    }

    auto word1 = findStr( "</title>", title );
    assert( *word1 == '<' && *( word1 + 1 ) == '/' );

    // Needles that are absent from the haystack
    assert( *findStr( "</title>", original ) == '\0' );
    assert( *findStr( "orange", original ) == '\0' );
    assert( *findStr( "orange", "apple" ) == '\0' );

    // The haystack starts with a partial match ("bi") before the real one
    auto word2 = findStr( "bird", "bigbird" );
    assert( *word2 == 'b' && *( word2 + 1 ) == 'i' && *( word2 + 2 ) == 'r' );

    cout << "testFindStr passed" << endl << endl;
}
// Tests findNext( needle, haystackPointer ). The expectations encoded here:
// dereferencing the result gives the first character of the match, or '\0'
// when the needle is not found from the given position onward.
void testFindNext ( )
{
cout << "Testing findNext..." << endl;
string racecar = "racecar";
string hello = "hello";
string blank = "";
assert ( *findNext( "race", racecar.begin( ) ) == 'r' );
// racecar.begin( ) + 4 is the second 'c': "race" no longer lies ahead, "car" does
assert ( *findNext( "race", racecar.begin( ) + 4 ) == '\0' );
assert ( *findNext( "car", racecar.begin( ) + 4 ) == 'c' );
assert ( *findNext( "hello", hello.begin( ) ) == 'h' );
assert ( *findNext( "ello", hello.begin( ) ) == 'e' );
// hello.begin( ) + 2 is already past the start of "ello"
assert ( *findNext( "ello", hello.begin( ) + 2 ) == '\0' );
// Empty needle in an empty haystack
assert ( *findNext( "", blank.begin( ) ) == '\0' );
cout << "testFindNext passed" << endl << endl;
}
// Tests findPrev( needle, haystackPointer, haystackBeg ). The expectations
// encoded here: a match is reported (first character of the needle) only when
// the whole needle lies before haystackPointer; otherwise the result
// dereferences to '\0'.
void testFindPrev ( )
{
cout << "Testing findPrev..." << endl;
string racecar = "racecar";
string hello = "hello";
string blank = "";
// Pointer at the very beginning: nothing lies before it
assert ( *findPrev( "race", racecar.begin( ), racecar.begin( ) ) == '\0' );
// "race" occupies indices 0-3, so it lies entirely before index 4
assert ( *findPrev( "race", racecar.begin( ) + 4, racecar.begin( ) ) == 'r' );
// "car" occupies indices 4-6: not before index 4, but before index 7
assert ( *findPrev( "car", racecar.begin( ) + 4, racecar.begin( ) ) == '\0' );
assert ( *findPrev( "car", racecar.begin( ) + 7, racecar.begin( ) ) == 'c' );
assert ( *findPrev( "hello", hello.begin( ), hello.begin( ) ) == '\0' );
// "ello" occupies indices 1-4: only partly before index 3, fully before index 5
assert ( *findPrev( "ello", hello.begin( ) + 3, hello.begin( ) ) == '\0' );
assert ( *findPrev( "ello", hello.begin( ) + 5, hello.begin( ) ) == 'e' );
// Empty needle in an empty haystack
assert ( *findPrev( "", blank.begin( ), blank.begin( ) ) == '\0' );
// Needle that shares only its last letter with the text before the pointer
string fall = "fall";
assert ( *findPrev( "bl", fall.begin( ) + 3, fall.begin( ) ) == '\0' );
cout << "testFindPrev passed" << endl << endl;
}
/**
 * Tests splitStr( originalText, delim ).
 *
 * @param original fixture text; the test expects 53 space-separated pieces
 */
void testSplitStr ( string original )
{
    cout << "Testing splitStr..." << endl;

    vector< string > vec = splitStr( original, ' ' );
    assert( vec.size( ) == 53 );

    // A delimiter other than space
    string word = "hello\ngoodbye";
    vec = splitStr( word, '\n' );
    assert( vec.size( ) == 2 );
    assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );

    cout << "testSplitStr passed" << endl << endl;
}
// Tests isStopWord with two words that are in the stopword set ("is", "none")
// and three inputs that are not ("Hello", the empty string, a single space).
void testIsStopWord ( )
{
cout << "Testing isStopWord..." << endl;
// isStopWord takes a non-const reference, so every input needs a named string
string is = "is";
string hello = "Hello";
string none = "none";
string blank = "";
string blank2 = " ";
assert ( isStopWord( is ) );
assert ( !isStopWord( hello ) );
assert ( isStopWord( none ) );
// Empty and whitespace-only input must not count as stopwords
assert ( !isStopWord( blank ) );
assert ( !isStopWord( blank2 ) );
cout << "testIsStopWord passed" << endl << endl;
}
void testToLower ( )
{
......@@ -93,11 +178,11 @@ void testToLower ( )
string word4 = "";
string word5 = " ";
string test = toLower ( word );
string test2 = toLower ( word2 );
string test3 = toLower ( word3 );
string test4 = toLower ( word4 );
string test5 = toLower ( word5 );
string test = toLower( word );
string test2 = toLower( word2 );
string test3 = toLower( word3 );
string test4 = toLower( word4 );
string test5 = toLower( word5 );
assert ( test == "hello" );
assert ( test2 == "hello" );
......@@ -105,26 +190,144 @@ void testToLower ( )
assert ( test4 == "" );
assert ( test5 == " " );
cout << "testToLower passed" << endl;
cout << "testToLower passed" << endl << endl;
}
void testStemWord ( )
{
cout << "Testing stemWord..." << endl;
Stemmer stem;
void testIsStopWord ( )
assert ( stem.execute( "cats" ) == "cat" );
assert ( stem.execute( "wilde" ) == "wild" );
assert( stem.execute( "zoo" ) == "zoo" );
assert( stem.execute( "troublesome" ) == "troublesom" );
cout << "testStemWord passed" << endl << endl;
}
/**
 * Tests both subStr overloads: ( word, pos, len ) and ( begin, end ).
 */
void testSubStr ( )
{
    cout << "Testing subStr..." << endl;

    string hello = "hello";
    string goodbye = "goodbye";
    string blank = " ";
    string blank2 = "";

    // Position / length overload
    assert ( subStr( hello, 1, 4 ) == "ello" );
    assert ( subStr( hello, 0, 5 ) == "hello" );
    assert ( subStr( hello, 0, 1 ) == "h" );
    assert ( subStr( hello, 1, 2 ) == "el" );
    assert ( subStr( goodbye, 0, 4 ) == "good" );
    assert ( subStr( goodbye, 4, 3 ) == "bye" );
    assert ( subStr( goodbye, 1, 0 ) == "" );
    assert ( subStr( goodbye, 0, 7 ) == "goodbye" );
    assert ( subStr( blank, 0, 1 ) == " " );
    assert ( subStr( blank, 0, 0 ) == "" );
    assert ( subStr( blank2, 0, 0 ) == "" );

    // Iterator-range overload
    assert ( subStr( hello.begin( ), hello.end( ) ) == "hello" );
    assert ( subStr( hello.begin( ) + 4, hello.begin( ) + 5 ) == "o" );
    assert ( subStr( hello.begin( ), hello.begin( ) + 1 ) == "h" );
    assert ( subStr( goodbye.begin( ) + 1, goodbye.begin( ) + 3 ) == "oo" );

    cout << "testSubStr passed" << endl << endl;
}
/**
 * Tests both stripStr overloads: the one that keeps only letters and digits,
 * and the one that removes an explicit list of characters.
 */
void testStripStr ( )
{
    cout << "Testing stripStr..." << endl;

    // Characters removed by the two-argument overload; note that '!' and '"'
    // are deliberately not in the list.
    char arr[] = { ',', '.', '*', '&', '^', '%', ';', ' ' };
    vector< char > chars( arr, arr + sizeof( arr ) / sizeof( arr[ 0 ] ) );

    string hello = "!hello!";
    string allSym = "\"*&^%;";
    string comma = "comma,";
    string period = "period.";
    string blank = " ";

    // One-argument overload: everything that is not alphanumeric goes
    assert ( stripStr( hello ) == "hello" );
    assert ( stripStr( allSym ) == "" );
    assert ( stripStr( comma ) == "comma" );
    assert ( stripStr( period ) == "period" );
    assert ( stripStr( blank ) == "" );

    // Two-argument overload: only the listed characters go
    assert ( stripStr( hello, chars ) == "!hello!" );
    assert ( stripStr( allSym, chars ) == "\"" );
    assert ( stripStr( comma, chars ) == "comma" );
    assert ( stripStr( period, chars ) == "period" );
    assert ( stripStr( blank, chars ) == "" );

    cout << "testStripStr passed" << endl << endl;
}
// Tests isAlpha: letters of either case are accepted; digits, whitespace and
// punctuation are rejected.
void testIsAlpha ( )
{
cout << "Testing isAlpha..." << endl;
// Letters, including both ends of each range
assert ( isAlpha( 'a' ) );
assert ( isAlpha( 'A' ) );
assert ( isAlpha( 'z' ) );
assert ( isAlpha( 'Z' ) );
assert ( isAlpha( 'g' ) );
assert ( isAlpha( 'i' ) );
assert ( isAlpha( 'P' ) );
// Digits
assert ( !isAlpha( '1' ) );
assert ( !isAlpha( '0' ) );
assert ( !isAlpha( '9' ) );
assert ( !isAlpha( '5' ) );
assert ( !isAlpha( '6' ) );
// Whitespace and punctuation
assert ( !isAlpha( ' ' ) );
assert ( !isAlpha( '!' ) );
assert ( !isAlpha( '/' ) );
assert ( !isAlpha( '?' ) );
assert ( !isAlpha( '*' ) );
assert ( !isAlpha( '-' ) );
assert ( !isAlpha( '.' ) );
assert ( !isAlpha( ',' ) );
assert ( !isAlpha( '(' ) );
assert ( !isAlpha( '}' ) );
cout << "testIsAlpha passed" << endl << endl;
}
// Tests isNum: only the digits are accepted; letters, whitespace and
// punctuation are rejected.
void testIsNum ( )
{
cout << "Testing isNum..." << endl;
// Letters of either case
assert ( !isNum( 'a' ) );
assert ( !isNum( 'A' ) );
assert ( !isNum( 'z' ) );
assert ( !isNum( 'Z' ) );
assert ( !isNum( 'g' ) );
assert ( !isNum( 'i' ) );
assert ( !isNum( 'P' ) );
// Digits, including both ends of the range
assert ( isNum( '1' ) );
assert ( isNum( '0' ) );
assert ( isNum( '9' ) );
assert ( isNum( '5' ) );
assert ( isNum( '6' ) );
// Whitespace and punctuation
assert ( !isNum( ' ' ) );
assert ( !isNum( '!' ) );
assert ( !isNum( '/' ) );
assert ( !isNum( '?' ) );
assert ( !isNum( '*' ) );
assert ( !isNum( '-' ) );
assert ( !isNum( '.' ) );
assert ( !isNum( ',' ) );
assert ( !isNum( '(' ) );
assert ( !isNum( '}' ) );
cout << "testIsNum passed" << endl;
}
\ No newline at end of file
//
// Created by Veronica Day on 2/13/18.
//
#include <string>
#include <vector>
......@@ -16,29 +13,29 @@ void testExecute ( string original );
/**
 * Test driver for the Tokenizer: runs testExecute once on a fixed paragraph.
 */
int main ( )
{
    cout << "Beginning testing for TokenizerTest" << endl << endl;

    // Fixture paragraph handed to the tokenizer test.
    string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. "
                      "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
                      "making it look like readable English. ";

    testExecute( original );

    cout << "\nTests passed for TokenizerTest :D" << endl;
}
void testExecute ( string original )
{
Tokenizer myTokenizer;
myTokenizer.execute ( original );
myTokenizer.execute( original, 0 );
auto dict = myTokenizer.get ( );
auto dict = myTokenizer.get( );
for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
cout << it->first << ':';
for ( int i = 0; i < it->second.size ( ); ++i )
for ( int i = 0; i < it->second.size( ); ++i )
{
cout << it->second[ i ] << " ";
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment