#pragma once

#include <string>
#include <unordered_map>
#include <vector>
#include "stringProcessing.h"
#include "Stemmer.h"

using namespace std;

class Tokenizer
{

public:

	// decorators
	static const char TITLE = '#';
	static const char ANCHOR = '@';
	static const char URL = '$';
	static const char BODY = '%';
	static const char HOST = '=';

	/**
 	* Tokenizer Cstor
 	*/
	Tokenizer ( );


	/**
 	* Returns pointer to the docIndex dictionary
	 *
 	* @return pointer to unordered_map< string, vector< int>>
 	*/
	const unordered_map< string, vector< unsigned long > > *get ( ) const;

	/**
	 * Executes the Tokenizer
	 * Sends tokens to dictionary
	 *
	 *
	 * @param originalText
	 * @param offset
	 * @param decorator
	 */
	unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );

private:

    unordered_map< string, vector< unsigned long > > *docIndex;
    Stemmer stem;

    /**
     * Tokenizes text (titles, body text)
     *
     * @param originalText
     * @param offset
     * @param decorator
     */
    unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );

};