Skip to content
Snippets Groups Projects
stringProcessing.cpp 8.18 KiB
Newer Older
  • Learn to ignore specific revisions
  • benbergk's avatar
    benbergk committed
    //
    // Created by Ben Bergkamp on 3/6/18.
    //
    
    #include "stringProcessing.h"
    
    vcday's avatar
    vcday committed
    #include "Stemmer.h"
    
    vcday's avatar
    vcday committed
    #include <cassert>
    
    vcday's avatar
    vcday committed
    #include <string>
    
    vcday's avatar
    vcday committed
    #include <iostream>
    
    benbergk's avatar
    benbergk committed
    using namespace std;
    
    
    vcday's avatar
    vcday committed
    /**
     * Finds the first occurrence of needle in haystack.
     *
     * @param needle    substring to look for
     * @param haystack  text that is searched
     * @return index of the first character of the first match, or
     *         haystack.size( ) when needle is empty or does not occur
     */
    unsigned long findStr ( std::string needle, std::string haystack )
    	{
    	// "Not found" is reported as haystack.size( ), never as npos.
    	const unsigned long notFound = haystack.size( );
    	if ( needle.empty( ) )
    		{
    		return notFound;
    		}
    
    	// find( ) retries from the very next character after a failed candidate,
    	// so a match that overlaps a partial match ( "aab" in "aaab" ) is not skipped.
    	const std::string::size_type matchPos = haystack.find( needle );
    	if ( matchPos == std::string::npos )
    		{
    		return notFound;
    		}
    	return matchPos;
    	}
    
    vcday's avatar
    vcday committed
    /**
     * Finds the first occurrence of needle in haystack at or after haystackIt.
     *
     * @param needle      substring to look for
     * @param haystackIt  index in haystack where the search starts
     * @param haystack    text that is searched
     * @return index of the first character of the match; haystack.size( ) when
     *         needle is empty or does not occur from haystackIt onwards;
     *         haystackIt itself when it is already at or past the end of haystack
     */
    unsigned long findNext ( std::string needle, unsigned long haystackIt, std::string haystack )
    	{
    	// Nothing left to scan: hand the start position back unchanged.
    	if ( haystackIt >= haystack.size( ) )
    		{
    		return haystackIt;
    		}
    	if ( needle.empty( ) )
    		{
    		return haystack.size( );
    		}
    
    	// find( ) advances one character at a time after a failed candidate, so
    	// overlapping candidates ( "aab" in "aaab" ) are still considered.
    	const std::string::size_type matchPos = haystack.find( needle, haystackIt );
    	if ( matchPos == std::string::npos )
    		{
    		return haystack.size( );
    		}
    	return matchPos;
    	}
    
    vcday's avatar
    vcday committed
    /**
     * Finds the closest occurrence of needle that lies at or before haystackIt.
     * The search runs backwards: a match only counts when its last character
     * sits at an index <= haystackIt.
     *
     * @param needle      substring to look for
     * @param haystackIt  index in haystack where the backwards search starts
     * @param haystack    text that is searched
     * @return index of the first character of the match, or haystack.size( )
     *         when needle is empty or no such match exists
     */
    unsigned long findPrev ( std::string needle, unsigned long haystackIt, std::string haystack )
    	{
    	const unsigned long notFound = haystack.size( );
    	if ( needle.empty( ) )
    		{
    		return notFound;
    		}
    
    	// A match ending at haystackIt starts needle.size( ) - 1 characters earlier;
    	// if that would be before index 0 there is no room for a match.
    	const unsigned long tailOffset = needle.size( ) - 1;
    	if ( tailOffset > haystackIt )
    		{
    		return notFound;
    		}
    
    	// rfind( ) takes the last index at which a match may BEGIN and clamps it
    	// to the string, so a start position past the end is handled safely.
    	const std::string::size_type matchPos = haystack.rfind( needle, haystackIt - tailOffset );
    	if ( matchPos == std::string::npos )
    		{
    		return notFound;
    		}
    	return matchPos;
    	}
    
    benbergk's avatar
    benbergk committed
    
    
    vcday's avatar
    vcday committed
    /**
     * Returns a vector of strings from @originalText, split by @delim
    
    vcday's avatar
    vcday committed
     * Will remove symbols if bool is set
    
    vcday's avatar
    vcday committed
     *
     * @param originalText
     * @param delim
    
    vcday's avatar
    vcday committed
     * @param removeChars
    
    vcday's avatar
    vcday committed
     * @return vector < string >
    
    vcday's avatar
    vcday committed
     */
    
    vcday's avatar
    vcday committed
    vector< string > splitStr ( string originalText, char delim, bool removeSyms )
    
    benbergk's avatar
    benbergk committed
    	{
    	vector< string > splitWords;
    
    vcday's avatar
    vcday committed
    	char begin;
    	int i = 0;
    	while ( i < originalText.size( ) )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		begin = originalText[ i ];
    
    benbergk's avatar
    benbergk committed
    		string word = "";
    
    vcday's avatar
    vcday committed
    		while ( begin != delim && i < originalText.size( ) )
    
    vcday's avatar
    vcday committed
    			{
    
    			if ( removeSyms)
    				{
    				if( isAlpha( begin ) || isNum( begin ) )
    					{
    					word.push_back( begin );
    					}
    				}
    			else
    
    vcday's avatar
    vcday committed
    				{
    
    vcday's avatar
    vcday committed
    				word.push_back( begin );
    
    vcday's avatar
    vcday committed
    				}
    
    vcday's avatar
    vcday committed
    			++i;
    			begin = originalText[ i ];
    
    vcday's avatar
    vcday committed
    			}
    
    benbergk's avatar
    benbergk committed
    
    
    vcday's avatar
    vcday committed
    		if ( word != "" && word != " " && word[ 0 ] != delim )
    
    vcday's avatar
    vcday committed
    			{
    			splitWords.push_back( word );
    			}
    
    vcday's avatar
    vcday committed
    		++i;
    
    vcday's avatar
    vcday committed
    		}
    
    benbergk's avatar
    benbergk committed
    
    	return splitWords;
    
    	}
    
    
    vcday's avatar
    vcday committed
    /**
     * Splits string by multiple delimiters
     *
     * @param originalText
     * @param delims
     * @param removeSyms
     * @return
     */
    
    vcday's avatar
    vcday committed
    vector< string > splitStr ( string originalText, set< char > delims, bool removeSyms )
    
    vcday's avatar
    vcday committed
    	{
    	vector< string > splitWords;
    	char begin;
    
    vcday's avatar
    vcday committed
    	for ( int i = 0; i < originalText.size( ); ++i )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		begin = originalText[ i ];
    
    vcday's avatar
    vcday committed
    		string word = "";
    
    vcday's avatar
    vcday committed
    		while ( delims.find( begin ) == delims.end( ) && i < originalText.size( ) )
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			if ( removeSyms && ( isAlpha( begin ) || isNum( begin ) ) )
    
    vcday's avatar
    vcday committed
    				{
    
    vcday's avatar
    vcday committed
    				word.push_back( begin );
    
    vcday's avatar
    vcday committed
    				}
    
    zldunn's avatar
    zldunn committed
    			else if ( !removeSyms )
    				{
    				word.push_back( begin );
    				}
    
    vcday's avatar
    vcday committed
    			++i;
    
    vcday's avatar
    vcday committed
    			begin = originalText[ i ];
    
    vcday's avatar
    vcday committed
    		if ( word != "" && word != " " )
    
    vcday's avatar
    vcday committed
    			{
    			splitWords.push_back( word );
    			}
    		}
    
    	return splitWords;
    
    	}
    
    
    
    vcday's avatar
    vcday committed
    /**
     * Returns true if @word is a stopword
     *
     * @param word
    
    vcday's avatar
    vcday committed
     * @return bool
    
    vcday's avatar
    vcday committed
     */
    
    vcday's avatar
    vcday committed
    bool isStopWord ( string word )
    
    benbergk's avatar
    benbergk committed
    	{
    
    vcday's avatar
    vcday committed
    	return ( stopWords.find( word ) != stopWords.end( ) );
    
    vcday's avatar
    vcday committed
    /**
     * Returns a copy of @word with the letters 'A'..'Z' replaced by 'a'..'z'.
     * Every other character is left untouched.
     *
     * @param word  text to convert
     * @return string
     */
    std::string toLower ( std::string word )
    	{
    	// distance from an uppercase letter to its lowercase counterpart
    	const char caseOffset = 'a' - 'A';
    
    	// word is a by-value copy, so it can be converted in place
    	for ( std::string::size_type i = 0; i < word.size( ); ++i )
    		{
    		if ( word[ i ] >= 'A' && word[ i ] <= 'Z' )
    			{
    			word[ i ] = static_cast< char >( word[ i ] + caseOffset );
    			}
    		}
    
    	return word;
    	}
    
    
    vcday's avatar
    vcday committed
    /**
     * Returns stemmed @word
     *
     * @param word
    
    vcday's avatar
    vcday committed
     * @return string
    
    vcday's avatar
    vcday committed
     */
    
    vcday's avatar
    vcday committed
    string stemWord ( string word )
    
    benbergk's avatar
    benbergk committed
    	{
    
    vcday's avatar
    vcday committed
    	Stemmer stemmer;
    
    vcday's avatar
    vcday committed
    	word = stemmer.execute( word );
    
    vcday's avatar
    vcday committed
    	return word;
    
    benbergk's avatar
    benbergk committed
    	}
    
    vcday's avatar
    vcday committed
    
    /**
     * Returns the substring of @word that starts at index pos and is at most
     * len characters long, i.e. the range [ pos, pos + len ) clipped to the
     * end of the word.
     *
     * @param word  source text
     * @param pos   index of the first character to copy
     * @param len   maximum number of characters to copy
     * @return the requested substring; empty when pos is at or past the end
     */
    std::string subStr ( std::string word, unsigned long pos, unsigned long len )
    	{
    	// guard first: std::string::substr throws when pos is past the end
    	if ( pos >= word.size( ) )
    		{
    		return std::string( );
    		}
    
    	// substr( ) itself clips len to the characters that are available
    	return word.substr( pos, len );
    	}
    
    /**
     * Removes the chars in vector from word
     *
     * @param word   text to filter
     * @param chars  characters that must not appear in the result
     * @return string containing the characters of word, in order, that are
     *         not listed in chars
     */
    std::string stripStr ( std::string word, std::vector< char > chars )
    	{
    	std::string wordStripped;
    
    	for ( std::string::size_type j = 0; j < word.size( ); ++j )
    		{
    		// decided afresh for every character, so one stripped character
    		// does not cause the rest of the word to be dropped
    		bool isSymbol = false;
    		for ( std::vector< char >::size_type i = 0; i < chars.size( ); ++i )
    			{
    			if ( word[ j ] == chars[ i ] )
    				{
    				isSymbol = true;
    				break;
    				}
    			}
    
    		if ( !isSymbol )
    			{
    			wordStripped.push_back( word[ j ] );
    			}
    		}
    
    	return wordStripped;
    	}
    
    /**
     * Removes all chars from word
     * Assumes word is lowercase
     *
     * @param word
     * @param chars
     * @return string
     */
    
    vcday's avatar
    vcday committed
    string stripStr ( string word )
    
    vcday's avatar
    vcday committed
    	{
    	string wordStripped = "";
    
    vcday's avatar
    vcday committed
    	int i = 0;
    	while ( i < word.size( ) )
    
    vcday's avatar
    vcday committed
    		{
    
    vcday's avatar
    vcday committed
    		if ( isAlpha( word[ i ] ) || isNum( word[ i ] ) )
    
    vcday's avatar
    vcday committed
    			{
    
    vcday's avatar
    vcday committed
    			wordStripped.push_back( word[ i ] );
    
    vcday's avatar
    vcday committed
    			}
    
    vcday's avatar
    vcday committed
    		++i;
    
    vcday's avatar
    vcday committed
    		}
    	return wordStripped;
    	}
    
    /**
     * Tells whether a character is one of the letters 'A'..'Z' or 'a'..'z'.
     *
     * @param ch  character to classify
     * @return bool  true for a letter, false for anything else
     */
    bool isAlpha ( char ch )
    	{
    	const bool upperCase = ( 'A' <= ch && ch <= 'Z' );
    	const bool lowerCase = ( 'a' <= ch && ch <= 'z' );
    	return upperCase || lowerCase;
    	}
    
    /**
     * Returns true if character is one of the digits '0'..'9'
     *
     * @param ch  character to classify
     * @return bool
     */
    bool isNum ( char ch )
    	{
    	return ( ch >= '0' && ch <= '9' );
    	}
    
    /**
     * Returns last n characters in string
     *
     * @param input  source text
     * @param n      number of trailing characters wanted
     * @return the last n characters of input; an empty string when n <= 0 or
     *         when input has n or fewer characters
     */
    std::string lastN ( std::string input, int n )
    	{
    	if ( n <= 0 )
    		{
    		return "";
    		}
    
    	// n is positive here, so the conversion to an unsigned size is lossless
    	const std::string::size_type wanted = static_cast< std::string::size_type >( n );
    
    	// NOTE(review): an input of exactly n characters yields "" rather than the
    	// whole input; kept as-is because callers may rely on it - confirm intent
    	if ( input.size( ) <= wanted )
    		{
    		return "";
    		}
    
    	return input.substr( input.size( ) - wanted );
    	}