    //
    // Tokenizer.h
    // Created by anvia on 1/31/2018.
    //
    
    #include <string>
    #include <unordered_map>
    #include <vector>
    
    // Project-local header; presumably declares splitStr, toLower and isStopWord used below -- confirm.
    #include "stringProcessing.h"
    
    
    // NOTE(review): a using-directive at header scope applies to every file that includes this header.
    using namespace std;
    
    class Tokenizer
    
    vcday's avatar
    vcday committed
    	{
    
    aanvi's avatar
    aanvi committed
    public:
    
    vcday's avatar
    vcday committed
    	Tokenizer ( )
    		{
    
    vcday's avatar
    vcday committed
    		docIndex = new unordered_map< string, vector< int>>;
    
    vcday's avatar
    vcday committed
    		}
    
    aanvi's avatar
    aanvi committed
    
    
    vcday's avatar
    vcday committed
    	unordered_map< string, vector< int>> *get ( ) const
    		{
    
    vcday's avatar
    vcday committed
    		return docIndex;
    
    vcday's avatar
    vcday committed
    		}
    
    aanvi's avatar
    aanvi committed
    
    
    vcday's avatar
    vcday committed
    	void execute ( string originalText, int offset )
    
    vcday's avatar
    vcday committed
    		{
    		vector< string > splitText = splitStr ( originalText, ' ' );
    		string lowerString = "";
    		for ( int i = 0; i < splitText.size ( ); ++i )
    			{
    			lowerString = toLower ( splitText[ i ] );
    			if ( !isStopWord ( lowerString ) )
    				{
    
    vcday's avatar
    vcday committed
    				( *docIndex )[ lowerString ].push_back ( offset );
    
    vcday's avatar
    vcday committed
    				++offset;
    				}
    			}
    		}
    
    aanvi's avatar
    aanvi committed
    
    private:
    
    vcday's avatar
    vcday committed
    	unordered_map< string, vector< int>> *docIndex;
    
    vcday's avatar
    vcday committed
    	};