#include "Tokenizer.h" #include <iostream> /** * Tokenizer Cstor */ Tokenizer::Tokenizer ( ) { docIndex = new unordered_map< string, vector< unsigned long > >; } /** * Returns pointer to the docIndex dictionary * * @return pointer to unordered_map< string, vector< int>> */ unordered_map< string, vector< unsigned long > > *Tokenizer::get ( ) const { return docIndex; } /** * Executes the Tokenizer * Sends tokens to dictionary * * @param originalText * @param offset * @param decorator */ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, char decorator ) { // split by symbols if ( decorator == Tokenizer::URL ) { vector < char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='}; return tokenize( splitStr( originalText, split, true ), offset, decorator ); } // split by spaces else { return tokenize( splitStr( originalText, ' ', true ), offset, decorator ); } } /** * Tokenizes text (titles, body text) * * @param originalText * @param offset * @param decorator */ unsigned long Tokenizer::tokenize ( vector< string > splitText , unsigned long offset, char decorator ) { string processedString = ""; for ( int i = 0; i < splitText.size( ); ++i ) { // case fold processedString = toLower( splitText[ i ] ); //strip all characters if ( !isStopWord( processedString ) ) { // stem word processedString = stem.execute( processedString ); if ( decorator != '\0' ) { processedString = decorator + processedString; } ( *docIndex )[ processedString ].push_back( offset ); ++offset; } } return offset; }