//
// Created by Jake Close on 3/5/18.
//



#include "Parser.h"


/**
 * Scans an HTML document for links and for the page title.
 *
 * At every '<' the text up to and including the '>' of the next closing tag
 * ( "</...>" ) is cut out as one span. If extract_url finds a link in the span
 * it is pushed onto urlFrontier; otherwise, if extract_title finds a title, the
 * title is handed to the tokenizer. Scanning then resumes directly after the span.
 *
 * Scanning stops as soon as no complete closing tag is left in the document,
 * since nothing after that point can form a full span.
 *
 * @param html       text of the document to scan
 * @param tokenizer  receives each title via execute ( title, offset ), where offset is
 *                   the index in html at which the previously processed span ended
 *                   ( 0 for the first span )
 */
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void Parser::parse ( string html, Tokenizer *tokenizer )
	{
	const auto htmlEnd = html.end ( );
	auto htmlIt = html.begin ( );
	int offset = 0;
	while ( htmlIt != htmlEnd )
		{
		// anything that does not open a tag is skipped one character at a time
		if ( *htmlIt != '<' )
			{
			++htmlIt;
			continue;
			}

		// locate the next closing tag; without a complete one there is nothing
		// left to extract, and stepping past it would leave the string
		auto begCloseTag = findNext ( "</", htmlIt );
		if ( begCloseTag == htmlEnd )
			{
			break;
			}
		auto endCloseTag = findNext ( ">", begCloseTag );
		if ( endCloseTag == htmlEnd )
			{
			break;
			}

		// endCloseTag is a valid position here, so one past it is at most end ( )
		string line ( htmlIt, endCloseTag + 1 );

		// resume directly after the '>' so that an immediately following '<' is
		// still seen, and so the iterator can never move beyond end ( )
		htmlIt = endCloseTag + 1;

		// check if line is url
		string url = extract_url ( line );
		if ( url != "" )
			{
			urlFrontier->Push ( url );
			}
		// check if line is title
		else
			{
			string title = extract_title ( line );
			if ( title != "" )
				{
				tokenizer->execute ( title, offset );
				}
			}
		//TODO fix offset?
		offset = static_cast< int > ( htmlIt - html.begin ( ) );
		}
	}

/**
 * Returns the absolute url referenced by an anchor tag, or "" if none.
 *
 * The fragment must contain "<a" followed by "href", and the text "http" must
 * occur after that "href" and before the '>' that ends the opening tag. The url
 * runs from "http" up to, but not including, the first quote, whitespace
 * character or '>' ( or the end of the fragment ), so neither the closing quote
 * of the attribute nor any following attributes become part of the result.
 * Matching is case-sensitive.
 *
 * @param word  html fragment to inspect
 * @return      the url, or "" when the fragment holds no such link
 */
string Parser::extract_url ( string word )
	{
	const auto anchorPos = word.find ( "<a" );
	if ( anchorPos == string::npos )
		{
		return "";
		}

	const auto hrefPos = word.find ( "href", anchorPos );
	if ( hrefPos == string::npos )
		{
		return "";
		}

	// the opening tag ends at the first '>' after href; an unterminated tag
	// is treated as running to the end of the fragment
	auto tagEnd = word.find ( '>', hrefPos );
	if ( tagEnd == string::npos )
		{
		tagEnd = word.size ( );
		}

	// only accept "http" that lies inside the opening tag, not in the link text
	const auto urlBegin = word.find ( "http", hrefPos );
	if ( urlBegin == string::npos || urlBegin >= tagEnd )
		{
		return "";
		}

	// stop at whatever terminates the attribute value; all positions stay
	// inside the string, so the scan cannot run past its end
	auto urlEnd = word.find_first_of ( "\"' \t\r\n>", urlBegin );
	if ( urlEnd == string::npos )
		{
		urlEnd = word.size ( );
		}

	return word.substr ( urlBegin, urlEnd - urlBegin );
	}

/**
 * Returns the text of a title element, or "" if none.
 *
 * Looks for the literal "<title>" ( case-sensitive ) and returns everything
 * between it and the next '<'. When no '<' follows, the remainder of the
 * fragment is returned instead of reading beyond the end of the string.
 *
 * @param word  html fragment to inspect; it is not modified
 * @return      the title text, or "" when the fragment has no "<title>"
 */
string Parser::extract_title ( string & word )
	{
	const string openTag = "<title>";

	const auto tagPos = word.find ( openTag );
	if ( tagPos == string::npos )
		{
		return "";
		}

	// the title text starts right behind the opening tag
	const auto textBegin = tagPos + openTag.size ( );
	const auto textEnd = word.find ( '<', textBegin );
	if ( textEnd == string::npos )
		{
		// unterminated title: take what is there
		return word.substr ( textBegin );
		}

	return word.substr ( textBegin, textEnd - textBegin );
	}