// Parser.h

//
// Created by Veronica Day on 1/28/18.
//

// Keep a running count of the offset; if a token is a stop word, don't increment the offset and remove the stop word.
// The tokenizer returns a pointer to the document dictionary; the parser puts it on the indexer's queue.
//


#include <string>
#include <functional>
#include <queue>
#include <iostream>
#include <fstream>
#include "../util/Tokenizer.h"
#include "../util/stringProcessing.h"
#include "../shared/Document.h"
#include "../shared/ProducerConsumerQueue.h"

using namespace std;

/**
 * This class uses the Doc object from the Crawler to parse the text
 * Returns a pointer to a dictionary that contains the tokenized input
 */
class Parser
	{

public:

	Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
		{
		urlFrontier = urlFrontierIn;
		}


	/**
	 * Parser
	 * @return
	 */
	const unordered_map< string, vector< int>> * execute ( Document* document)
		{
		Tokenizer tokenizer;
		parse ( document->DocToString (), &tokenizer );
		return tokenizer.get ( );
		}


private:
	ProducerConsumerQueue < string >* urlFrontier;

	/**
	 * Parses file
	 * @param inFile
	 * @return
	 */
	//TODO instead of grabbing each line, look to see if beginning of
	// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
	void parse ( string html, Tokenizer *tokenizer )
		{
		auto htmlIt = html.begin();
		int offset = 0;
		while (htmlIt != html.end())
			{
			// if open bracket
			if ( *htmlIt == '<' )
				{
				auto begCloseTag = findNext ("</", htmlIt);
				auto endCloseTag = findNext ( ">", begCloseTag);
				string line (htmlIt, endCloseTag + 1);
				htmlIt = endCloseTag + 2;

				// check if line is url
				string url = extract_url ( line );
				if (url != "")
					{
					urlFrontier->Push ( url );
					}
				// check if line is title
				else
					{
					string title = extract_title ( line );
					if (title != "")
						{
						tokenizer->execute ( title, offset );
						}
					}
				//TODO fix offset?
				offset = htmlIt - html.begin();
				}
			else
				{
				++htmlIt;
				}
			}


		}

	/**
	 * Returns a url, or "" if none
	 * @param word
	 * @return
	 */
	string extract_url ( string word )
		{
		string url = "";
		if ( *findStr ( "<a", word ) != '\0' )
			{
			auto foundHref = findStr ( "href", word );
			auto foundHttp = findNext ( "http", foundHref );
			if ( *foundHttp != '\0' )
				{
				url = "";
				auto closeTag = findNext ( ">", word.begin ( ) );
				while ( *foundHttp != *closeTag )
					{
					url += *foundHttp;
					++foundHttp;
					}
				}
			}

		return url;
		}

	/**
	 * Returns a title, or "" if none
	 * @param word
	 * @return
	 */
	string extract_title ( string & word )
		{
		string title = "";
		char end = '<';
		auto pos = findStr ( "<title>", word );
		if ( *pos != '\0')
			{
			pos += 7;
			while ( *pos != end )
				{
				title += *pos;
				++pos;
				}
			}
		return title;
		}

	};