//
// Created by anvia on 1/31/2018.
//

#pragma once

#include <string>
#include <unordered_map>
#include <vector>

#include "stringProcessing.h"

using namespace std;

struct wordData
{
    int offset;
    int frequency = 0;
    //total num words/unique words??
};

class Tokenizer
{
public:
    Tokenizer ( )
    {
        docIndex = new unordered_map< string, vector< wordData > >;
    }

    //Tokenizer owns docIndex, so the dtor releases it;
    //callers of get ( ) must not delete the returned pointer
    ~Tokenizer ( )
    {
        delete docIndex;
    }

    //return type has to match docIndex: vector< wordData >, not vector< int >
    unordered_map< string, vector< wordData > > *get ( ) const
    {
        return docIndex;
    }

    //add type of word parameter, ie paragraph, url etc
    void execute ( const string &originalText, int offset )
    {
        vector< string > splitText = splitStr ( originalText, ' ' );
        string lowerString = "";
        for ( size_t i = 0; i < splitText.size ( ); ++i )
        {
            lowerString = toLower ( splitText[ i ] );
            if ( !isStopWord ( lowerString ) )
            {
                //build the entry on the stack; push_back copies it into the
                //vector, so nothing has to be newed or deleted here
                wordData currentWord;
                currentWord.offset = offset;
                ( *docIndex )[ lowerString ].push_back ( currentWord );
                //the new element is the last one; indexing with the
                //pre-push size minus one pointed at the previous element
                //(and at [ -1 ] on the first insertion)
                ( *docIndex )[ lowerString ].back ( ).frequency += 1;
                ++offset;
            }
        }
    }

private:
    unordered_map< string, vector< wordData > > *docIndex;
};
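
// A minimal usage sketch, assuming splitStr, toLower, and isStopWord from
// stringProcessing.h behave as their names suggest, and that "the" is on the
// stop-word list (both are assumptions, not guarantees from this header):
//
//     Tokenizer tokenizer;
//     tokenizer.execute ( "The quick brown fox", 0 );
//     auto *index = tokenizer.get ( );
//     // "the" never reaches the index; ( *index )[ "quick" ] holds one
//     // wordData with offset 0 and frequency 1, since offset only
//     // advances past words that survive the stop-word filter.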