#include "Tokenizer.h"

/**
 * Tokenizer Cstor
 */
Tokenizer::Tokenizer ( )
	{
	docIndex = new unordered_map< string, vector< int>>;
	}

/**
 * Returns pointer to the docIndex dictionary
 *
 * @return pointer to unordered_map< string, vector< int>>
 */
unordered_map< string, vector< int>> *Tokenizer::get ( ) const
	{
	return docIndex;
	}

/**
 * Executes the Tokenizer
 * Sends tokens to dictionary
 *
 * token -> [offsets]
 * @param originalText
 * @param offset
 */
void Tokenizer::execute ( string & originalText, int offset )
	{
	vector< string > splitText = splitStr( originalText, ' ' );
	string processedString = "";
	for ( int i = 0; i < splitText.size( ); ++i )
		{
		// case fold
		processedString = toLower( splitText[ i ] );
		//strip all characters
		processedString = stripStr( processedString );

		if ( !isStopWord( processedString ) )
			{
			// stem word
			processedString = stem.execute( processedString );
			( *docIndex )[ processedString ].push_back( offset );
			++offset;
			}
		}
	}