From 22d29b60cfe8c04098f7e31c3e19e5261c95921a Mon Sep 17 00:00:00 2001
From: aanvi <aanvi@umich.edu>
Date: Tue, 20 Mar 2018 01:31:14 -0400
Subject: [PATCH] Added body parsing

---
 parser/Parser.cpp           | 526 +++++++++++++++++++++---------------
 parser/Parser.h             | 151 +++++------
 parser/tests/parserTest.cpp | 106 +++++++-
 util/Tokenizer.h            | 137 +++-------
 4 files changed, 514 insertions(+), 406 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 654b6c4..64cd93c 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -6,9 +6,9 @@
  * @param urlFrontierIn
  */
 Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
-	{
-	urlFrontier = urlFrontierIn;
-	}
+{
+    urlFrontier = urlFrontierIn;
+}
 
 
 /**
@@ -16,215 +16,160 @@ Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
  * @return
  */
 const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document )
-	{
-	Tokenizer tokenizer;
-	parse( document->DocToString( ), document->getUrl( ), &tokenizer );
-	return tokenizer.get( );
-	}
+{
+    Tokenizer tokenizer;
+    parse( document->DocToString( ), document->getUrl( ), &tokenizer );
+    return tokenizer.get( );
+}
 
 /**
  * Parses file
  * @param inFile
  * @return
  */
-<<<<<<< HEAD
-// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
-// TODO different counts: frequency, total num unique words, etc
-//TODO flag different types of words - determine if we want to do this in key of dict or value (in wordData struct)
-/*
- * Anchor text = #
- * Title = *
- * Url = @
- * Body = %
- */
+void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
+{
 
-void Parser::parse ( string html, Tokenizer *tokenizer )
-	{
+    unsigned long htmlIt = 0;
+    unsigned long offsetTitle = 0;
+    unsigned long offsetBody = 0;
+    unsigned long offsetURL = 0;
 
-	//maybe add some code to read in stream and add chars to string as they come in
-	auto htmlIt = html.begin();
-	int offset = 0;
-	while (htmlIt != html.end())
-=======
-void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
-	{
+    // tokenize url
+    string host = "";
+    host.assign( currentUrl.Host );
+    string path = "";
+    path.assign( currentUrl.Path );
+    string urlCurrent = host + "/" + path;
 
-	unsigned long htmlIt = 0;
-	unsigned long offsetTitle = 0;
-	unsigned long offsetURL = 0;
+    offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
 
-	// tokenize url
-	string host = "";
-	host.assign( currentUrl.Host );
-	string path = "";
-	path.assign( currentUrl.Path );
-	string urlCurrent = host + "/" + path;
+    while ( htmlIt < html.size( ) )
+    {
+        unsigned long begCloseTag = 0;
+        bool isParagraph = false;
+        unsigned long savePosition = htmlIt;
+        // if open bracket
+        if ( html[ htmlIt ] == '<' )
+        {
 
-	offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
+            if (  html[ htmlIt + 1 ] == 'p' && ( ( html[htmlIt + 2]) == '>' || ( html[ htmlIt + 2 ] == ' ') ) )
+            {
+                begCloseTag = findNext( "</p>", htmlIt, html );
+                isParagraph = true;
+            }
+            else
+            {
+                begCloseTag = findNext( "</", htmlIt, html );
+            }
 
-	while ( htmlIt < html.size( ) )
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-		{
-		// if open bracket
-		if ( html[ htmlIt ] == '<' )
-			{
-<<<<<<< HEAD
-			// TODO have to put a conditional that ensures the opening and closing tags are the same type
-			auto begCloseTag = findNext ("</", htmlIt);
-			auto endCloseTag = findNext ( ">", begCloseTag);
-			string line (htmlIt, endCloseTag + 1);
-=======
-			unsigned long begCloseTag = findNext( "</", htmlIt, html );
-			unsigned long endCloseTag = findNext( ">", begCloseTag, html );
-			string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-			htmlIt = endCloseTag + 2;
+            unsigned long endCloseTag = findNext( ">", begCloseTag, html );
+            string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
+            htmlIt = endCloseTag + 2;
 
-			//check if line is a script
-			if ( isScript( line ) )
-				{
-				// DO NOTHING
-				}
-			// check if line is url
-<<<<<<< HEAD
-			else if ( url = extract_url( line ) != "" )
-				{
-				//where is urlFrontier defined?
-				urlFrontier->push ( url );
-				}
-			// check if line is title
-			else if ( title = extract_title( line ) != "" )
-				{
-				tokenizer->execute ( title, offset );
-=======
-			string url = extract_url( line );
-			if ( url != "" )
-				{
-				if ( isLocal( url ) )
-					{
-					string completeUrl = "";
-					completeUrl.assign( currentUrl.CompleteUrl );
-					url = completeUrl + url;
-					}
-				if ( isValid( url ) && url != urlCurrent )
-					{
-					// TODO ParsedUrl with anchor text
-					ParsedUrl pUrl = ParsedUrl( url );
-					urlFrontier->Push( pUrl );
-					cout << url << endl;
-					}
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-				}
-            else if ( body = extract_body( line ) != "")
+            // check if line is url
+            string title = extract_title( line );
+            string url = extract_url( line );
+            string header = extract_header( line );
+            //checking if html line is script
+            if ( isTag( line, "script" ) )
+            {
+                //DO NOTHING
+            }
+                //checking for p tag
+            else if ( isParagraph )
+            {
+                string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
+                offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
+            }
+                //if html line is url, parses accordingly (frontier push currently disabled below)
+            else if ( url != "" )
+            {
+                if ( isLocal( url ) )
                 {
-                tokenizer->execute( body, offset );
+                    string completeUrl = "";
+                    completeUrl.assign( currentUrl.CompleteUrl );
+                    url = completeUrl + url;
                 }
-			else
-				{
-<<<<<<< HEAD
-				//DO NOTHING
-=======
-				string title = extract_title( line );
-				if ( title != "" )
-					{
-					offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
-					}
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-				}
-			}
-		else
-			{
-			++htmlIt;
-			}
-		}
-	}
-
+                if ( isValid( url ) && url != urlCurrent )
+                {
+                    // TODO ParsedUrl with anchor text
+                    ParsedUrl pUrl = ParsedUrl( url );
+                    // urlFrontier->Push( pUrl );
+                    cout << url << endl;
+                }
+            }
+                //check if line is header; classifies as body text
+            else if ( header != "")
+            {
+                offsetBody = tokenizer->execute( header, offsetBody, Tokenizer::BODY );
+            }
+                // check if line is title
+            else if ( title != "")
+            {
+                offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
+            }
 
-/*
- * Returns true if script tag, false if not
-*/
-bool Parser::isScript ( string & word )
-	{
-	if ( *findStr ( "<script", word ) != '\0' )
-		{
-			return true;
-		}
-	return false;
-	}
-/*
- * Returns body text if p tags, empty string if not
- * If there's no closing tag, stops at the first opening tag or when it hits end of file
-*/
-string Parser::extract_body( string & word, int & offset )
-    {
-    string body = "";
-    auto foundBody = findStr("<p", word) != '\0';
-    if ( *foundBody != '\0' )
-        {
-        while ( *findStr != '<' )
+            else
             {
-            body += *findStr;
-			if ( *findStr == ' ')
-				{
-				count += 1;
-				}
+                //DO NOTHING
             }
         }
-    return body;
+        else
+        {
+            ++htmlIt;
+        }
     }
+}
+
 
 /**
  * Returns a url, or "" if none
  * @param word
  * @return
  */
-<<<<<<< HEAD
-
-string Parser::extract_url ( string & word )
-=======
 string Parser::extract_url ( string html )
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-	{
-	string url = "";
-	if ( findStr( "<a", html ) != html.size( ) )
-		{
-		unsigned long foundHref = findStr( "href", html );
-		unsigned long foundHttp = findNext( "http", foundHref, html );
-		if ( foundHttp < html.size( ) )
-			{
-			url = "";
-			unsigned long closeTag = findNext( ">", foundHref, html );
-			unsigned long closeSpace = findNext( " ", foundHref, html );
-			unsigned long closeUrl = 0;
-			// end == ' >'
-			if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
-				{
-				if ( html[ closeSpace - 1 ] == '\"' )
-					{
-					closeSpace -= 1;
-					}
-				closeUrl = closeSpace;
-				}
-			// end == '>'
-			else if ( closeTag < html.size( ) )
-				{
-				if ( html[ closeTag - 1 ] == '\"' )
-					{
-					closeTag -= 1;
-					}
-				closeUrl = closeTag;
-				}
+{
+    string url = "";
+    if ( findStr( "<a", html ) != html.size( ) )
+    {
+        unsigned long foundHref = findStr( "href", html );
+        unsigned long foundHttp = findNext( "http", foundHref, html );
+        if ( foundHttp < html.size( ) )
+        {
+            url = "";
+            unsigned long closeTag = findNext( ">", foundHref, html );
+            unsigned long closeSpace = findNext( " ", foundHref, html );
+            unsigned long closeUrl = 0;
+            // end == ' >'
+            if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
+            {
+                if ( html[ closeSpace - 1 ] == '\"' )
+                {
+                    closeSpace -= 1;
+                }
+                closeUrl = closeSpace;
+            }
+                // end == '>'
+            else if ( closeTag < html.size( ) )
+            {
+                if ( html[ closeTag - 1 ] == '\"' )
+                {
+                    closeTag -= 1;
+                }
+                closeUrl = closeTag;
+            }
 
-			while ( foundHttp != closeUrl && html[ foundHttp ] != '\n')
-				{
-				url.push_back( html[ foundHttp ] );
-				++foundHttp;
-				}
-			}
-		}
+            while ( foundHttp != closeUrl && html[ foundHttp ] != '\n')
+            {
+                url.push_back( html[ foundHttp ] );
+                ++foundHttp;
+            }
+        }
+    }
 
-	return url;
-	}
+    return url;
+}
 
 /**
  * Returns a title, or "" if none
@@ -232,21 +177,21 @@ string Parser::extract_url ( string html )
  * @return
  */
 string Parser::extract_title ( string html )
-	{
-	string title = "";
-	char end = '<';
-	auto pos = findStr( "<title>", html );
-	if ( pos < html.size( ) )
-		{
-		pos += 7;
-		while ( html[ pos ] != end )
-			{
-			title += html[ pos ];
-			++pos;
-			}
-		}
-	return title;
-	}
+{
+    string title = "";
+    char end = '<';
+    auto pos = findStr( "<title>", html );
+    if ( pos < html.size( ) )
+    {
+        pos += 7;
+        while ( html[ pos ] != end )
+        {
+            title += html[ pos ];
+            ++pos;
+        }
+    }
+    return title;
+}
 
 /**
  * Will return true if local url
@@ -255,9 +200,9 @@ string Parser::extract_title ( string html )
  * @return
  */
 bool Parser::isLocal ( string url )
-	{
-	return ( url[ 0 ] == '/' );
-	}
+{
+    return ( url[ 0 ] == '/' );
+}
 
 /**
  * Returns false if the link is an invalid type
@@ -266,28 +211,163 @@ bool Parser::isLocal ( string url )
  * @return
  */
 bool Parser::isValid ( string url )
-	{
-	unsigned long size = url.size( );
+{
+    unsigned long size = url.size( );
+
+    string lastFive = lastN( url, 5 );
+    string lastFour = lastN( url, 4 );
+
+    // .html
+    if ( lastFive == ".html" )
+    {
+        return true;
+    }
+
+    // png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico
+    if ( lastFour == ".png" ||  lastFour == ".jpg" || lastFour == ".css" ||  lastFour == ".gif"
+         || lastFour == ".pdf" ||  lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" || lastFour == ".ico" )
+    {
+        return false;
+    }
+    //jpeg
+    if ( lastFive == ".jpeg" )
+    {
+        return false;
+    }
+    return true;
+}
+
+//TODO delete?? may not need
+void Parser::remove_tag( string & html, unsigned long & htmlIt, unsigned long savePosition, string tag)
+{
+    unsigned long openTag = findStr( "<" + tag + ">", html );
+    unsigned long closeTag = findNext( "</" + tag + ">", openTag, html );
+    //TODO write erase functions??
+    html.erase( closeTag, tag.length( ) + 2 );
+    html.erase( openTag, tag.length( ) + 3 );
+
+    htmlIt = savePosition;
+}
+void Parser::extract_all ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
+                           ParsedUrl & currentUrl, string & urlCurrent  )
+{
+    // check if line is url
+    string title = extract_title( line );
+    string url = extract_url( line );
+    //checking if html line is script
+    if ( isTag( line, "script" ) )
+    {
+        //DO NOTHING
+    }
+        //TODO delete this conditional if keeping whats in main right now
+    else if ( isParagraph )
+    {
+        string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
+        offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
+    }
+
+    else if ( url != "" )
+    {
+        if ( isLocal( url ) )
+        {
+            string completeUrl = "";
+            completeUrl.assign( currentUrl.CompleteUrl );
+            url = completeUrl + url;
+        }
+        if ( isValid( url ) && url != urlCurrent )
+        {
+            // TODO ParsedUrl with anchor text
+            ParsedUrl pUrl = ParsedUrl( url );
+            // urlFrontier->Push( pUrl );
+            cout << url << endl;
+        }
+    }
+        // check if line is title
+
+    else if ( title != "")
+    {
+        offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
+    }
+
+    else
+    {
+        //DO NOTHING
+    }
+}
+/**
+ * Returns true if tag is in html, false if not
+ * @param html
+ * @return
+ */
+bool Parser::isTag( string html, string tag )
+{
+    string findTag = "<" + tag;
+    if ( findStr( findTag, html ) != html.size( ) )
+    {
+        return true;
+    }
+    return false;
+}
 
-	string lastFive = lastN( url, 5 );
-	string lastFour = lastN( url, 4 );
+string Parser::extract_body( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
+                             ParsedUrl & currentUrl, string & urlCurrent )
+{
+    string body = "";
+    unsigned long startParTag = findNext( "<p>", 0, html );
+    unsigned long closeParTag = findNext( "</p>", startParTag, html );
+    unsigned long nextCloseTag = findNext( "</", startParTag, html );
+    startParTag += 3;
+    while ( nextCloseTag != startParTag )
+    {
+        if ( closeParTag == nextCloseTag )
+        {
+            while ( startParTag != closeParTag )
+            {
+                body += html[ startParTag ];
+                ++startParTag;
+                if ( startParTag >= html.size( ))
+                {
+                    return body;
+                }
+            }
+        }
+        else
+        {
+            unsigned long newHtmlStart = findNext ( "<", startParTag, html );
+            char a = html[ newHtmlStart ];
+            unsigned long closeNewHtml = findNext ( ">", newHtmlStart, html );
+            char b = html[ closeNewHtml ];
+            unsigned long newHtmlTagLength = closeNewHtml - newHtmlStart;
+
+            while ( startParTag != newHtmlStart )
+            {
+                body += html[ startParTag ];
+                ++startParTag;
+            }
+
+            string newHtml = subStr(html, newHtmlStart, nextCloseTag - newHtmlStart + newHtmlTagLength + 2);
+            extract_all( newHtml, offsetTitle, offsetBody, false, tokenizer, currentUrl, urlCurrent);
+            startParTag = nextCloseTag + newHtmlTagLength + 2;
+            nextCloseTag = findNext( "</", startParTag, html );
+        }
+    }
 
-	// .html
-	if ( lastFive == ".html" )
-		{
-		return true;
-		}
+    return body;
+}
 
-	// png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico
-	if ( lastFour == ".png" ||  lastFour == ".jpg" || lastFour == ".css" ||  lastFour == ".gif"
-	     || lastFour == ".pdf" ||  lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" || lastFour == ".ico" )
-		{
-		return false;
-		}
-	//jpeg
-	if ( lastFive == ".jpeg" )
-		{
-		return false;
-		}
-	return true;
-	}
+string Parser::extract_header( string html )
+{
+    string header = "";
+    unsigned long startHeader = findStr( "<h", html );
+    if ( startHeader !=  html.size( ) && ( html[ startHeader + 1] >= '1' && html[ startHeader + 1 ] <= '6' ) )
+    {
+        unsigned long endHeader = findNext( "</h", startHeader, html );
+        startHeader += 4;
+        while ( startHeader != endHeader )
+        {
+            header += html[ startHeader ];
+            ++startHeader;
+        }
+    }
+    return header;
+}
diff --git a/parser/Parser.h b/parser/Parser.h
index 06c5854..3c76734 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -19,92 +19,79 @@ using namespace std;
  * Returns a pointer to a dictionary that contains the tokenized input
  */
 class Parser
-	{
+{
 
 public:
 
-<<<<<<< HEAD
-
-	Parser ( ProducerConsumerQueue < string > * urlFrontierIn)
-		{
-		urlFrontier = urlFrontierIn;
-		}
-=======
-	/**
-	 * Parser Cstor
-	 * @param urlFrontierIn
-	 */
-	Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn );
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-
-
-	/**
-	 * Executes the Parser
-	 * @return
-	 */
-<<<<<<< HEAD
-	// TODO need to change vector type to word data, change where struct is declared
-	const unordered_map< string, vector< Tokenizer::wordData>> * execute ( Document* document)
-		{
-		Tokenizer tokenizer;
-		parse ( document->DocToString (), &tokenizer );
-		return tokenizer.get ( );
-		}
-=======
-	const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
+    /**
+     * Parser Cstor
+     * @param urlFrontierIn
+     */
+    Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn );
+
+
+    /**
+     * Executes the Parser
+     * @return
+     */
+    const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
 
 
 private:
-	ProducerConsumerQueue< ParsedUrl > *urlFrontier;
-
-	/**
-	 * Parses file
-	 * @param inFile
-	 * @return
-	 */
-<<<<<<< HEAD
-	void parse ( string html, Tokenizer *tokenizer );
-=======
-	void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-
-
-	/**
-	 * Returns a url, or "" if none
-	 * @param html
-	 * @return
-	 */
-	string extract_url ( string html );
-
-
-	/**
-	 * Returns a title, or "" if none
-	 * @param html
-	 * @return
-	 */
-	string extract_title ( string html );
-
-<<<<<<< HEAD
-	bool isScript ( string & word );
-
-	string extract_body( string & word );
-=======
-	/**
-	 * Will return true if local url
-	 *
-	 * @param url
-	 * @return
-	 */
-	bool isLocal ( string url );
-
-	/**
-	 * Returns true is url is valid
-	 *
-	 * @param url
-	 * @return
-	 */
-	bool isValid ( string url );
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-	};
+    ProducerConsumerQueue< ParsedUrl > *urlFrontier;
+
+    /**
+     * Parses file
+     * @param inFile
+     * @return
+     */
+    void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
+
+
+    /**
+     * Returns a url, or "" if none
+     * @param html
+     * @return
+     */
+    string extract_url ( string html );
+
+
+    /**
+     * Returns a title, or "" if none
+     * @param html
+     * @return
+     */
+    string extract_title ( string html );
+
+    /**
+     * Will return true if local url
+     *
+     * @param url
+     * @return
+     */
+    bool isLocal ( string url );
+
+    /**
+     * Returns true if url is valid
+     *
+     * @param url
+     * @return
+     */
+    bool isValid ( string url );
+
+    bool isTag( string html, string tag );
+
+    string extract_body( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
+                         ParsedUrl & currentUrl, string & urlCurrent );
+
+    void extract_all ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer * tokenizer,
+                       ParsedUrl & currentUrl, string & urlCurrent );
+
+    //TODO delete?? may not need
+    void remove_tag( string & html, unsigned long & htmlIt, unsigned long savePosition, string tag);
+
+    string extract_header( string html );
+
+
+};
 
diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp
index 766aa46..fb55c13 100644
--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
@@ -1,5 +1,6 @@
 
 #include <string>
+#include <cstring>
 #include <cassert>
 #include <iostream>
 #include "../Parser.h"
@@ -14,6 +15,10 @@ void testComplex ( );
 
 void testURL ( );
 
+void testExtractBody( );
+
+void testBody( );
+
 int main ( )
 	{
 	cout << "Testing Parser ... " << endl << endl;
@@ -26,11 +31,13 @@ int main ( )
 	cout << "Testing Complex: " << endl;
 	testComplex( );
 	cout << "Complex Test Passed!" << endl;
-	cout << "Parser Tests Passed! :D" << endl;
-
-	}
+    cout << "Testing BODY: " << endl;
+    testExtractBody( );
+    testBody( );
+    cout << "Parser Tests Passed! :D" << endl;
+    }
 
-void testSimple ( )
+void testSimple( )
 	{
 
 	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
@@ -143,4 +150,93 @@ void testURL ( )
 
 	delete dictionary;
 	dictionary = nullptr;
-	}
\ No newline at end of file
+	}
+
+void testBody( )
+{
+    ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+    ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
+    char docString[1024];
+    strcpy( docString, "<!DOCTYPE html>\n"
+            "<html>\n"
+            "<head>\n"
+            "<!-- HTML Codes by Quackit.com -->\n"
+            "<title>\n"
+            "Story of Cat</title>\n"
+            "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n"
+            "<meta name=\"keywords\" content=\"cat story\">\n"
+            "<meta name=\"description\" content=\"This is the tale of a cat names joe\">\n"
+            "<style>\n"
+            "body {background-color:#ffffff;background-repeat:no-repeat;background-position:top left;background-attachment:fixed;}\n"
+            "h1{font-family:Arial, sans-serif;color:#000000;background-color:#ffffff;}\n"
+            "p {font-family:Georgia, serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;}\n"
+            "</style>\n"
+            "</head>\n"
+            "<body>\n"
+            "<h1>Joe the cat</h1>\n"
+            "<p>On Saturday, joe the cat went to the store. He climbed up a mountain? It was weird. The store was called Food Store</p>\n"
+            "</body>\n"
+            "</html>" );
+    Document document( url, docString );
+
+    Parser parser( &urlFrontierTest );
+    auto dictionary = parser.execute( &document );
+    cout << dictionary->size( ) << endl;
+    //assert( dictionary->size( ) == 4);
+    for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+    {
+        cout << it->first << ':';
+        for ( int i = 0; i < it->second.size( ); ++i )
+        {
+            cout << it->second[ i ] << " ";
+        }
+        cout << std::endl;
+    }
+}
+void testExtractBody ( )
+{
+    ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+    ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
+    char docString[1024];
+    strcpy( docString, "<title>Paragraph body text hello</title>" );
+    Document document( url, docString );
+
+    Parser parser( &urlFrontierTest );
+    auto dictionary = parser.execute( &document );
+    cout << dictionary->size( ) << endl;
+    for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+    {
+        cout << it->first << ':';
+        for ( int i = 0; i < it->second.size( ); ++i )
+        {
+            cout << it->second[ i ] << " ";
+        }
+        cout << std::endl;
+    }
+    cout << endl << endl;
+    assert( dictionary->size( ) == 6);
+
+    char docString2[1024];
+    strcpy( docString2, "<p>Paragraph body text hello <title>Specific title</title> more body words</p>" );
+    Document document2( url, docString2 );
+    Parser parser2 ( &urlFrontierTest );
+    dictionary = parser.execute( &document2 );
+    cout << "Dictionary 2 size " << dictionary->size( ) << endl;
+    for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
+    {
+        cout << it->first << ':';
+        for ( int i = 0; i < it->second.size( ); ++i )
+        {
+            cout << it->second[ i ] << " ";
+        }
+        cout << std::endl;
+    }
+    assert( dictionary->size( ) == 10);
+    assert( dictionary->at( "#specif" )[0] == 0);
+    assert( dictionary->at("%paragraph")[0] == 0);
+    assert( dictionary->at("%bodi")[1] == 5);
+
+
+}
+
+
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index 7b04615..c94395b 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -1,4 +1,3 @@
-
 #pragma once
 
 #include <string>
@@ -6,111 +5,57 @@
 #include <vector>
 #include "stringProcessing.h"
 #include "Stemmer.h"
-#include "../parser/Parser.h"
 
 using namespace std;
 
-
 class Tokenizer
-	{
+{
 
 public:
-	struct wordData {
-		int frequency = 0;
-		int offset;
-	};
-	
-	Tokenizer ( )
-		{
-		docIndex = new unordered_map< string, vector<wordData>>;
-		}
-
-	unordered_map< string, vector< wordData>> *get ( ) const
-		{
-		return docIndex;
-		}
-	//add type of word parameter, ie paragraph, url etc
-
-    void execute ( string & originalText, int offset )
-		{
-		vector< string > splitText = splitStr ( originalText, ' ' );
-        string processedString = "";
-		int vectorLength = 0;
-        for ( int i = 0; i < splitText.size( ); ++i )
-            {
-             // case fold
-             processedString = toLower( splitText[ i ] );
-             //strip all characters
-             processedString = stripStr( processedString );
-
-             if ( !isStopWord ( lowerString ) )
-				{
-                // stem word
-                processedString = stem.execute( processedString );
-
-                wordData currentWord;
-				currentWord.offset = offset;
-				vectorLength = ( *docIndex )[ lowerString ].size( );
-				( *docIndex )[ lowerString ].push_back ( currentWord );
-                //incrementing frequency value of the current word
-				( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
-				++offset;
-				}
-			}
-		}
-
-
-
-	// decorators
-	static const char TITLE = '#';
-	static const char ANCHOR = '@';
-	static const char URL = '$';
-
-	/**
- 	* Tokenizer Cstor
- 	*/
-	Tokenizer ( );
 
-	/**
- 	* Returns pointer to the docIndex dictionary
-	 *
- 	* @return pointer to unordered_map< string, vector< int>>
- 	*/
-<<<<<<< HEAD
-	unordered_map< string, vector<wordData>> *get ( ) const;
-=======
-	unordered_map< string, vector< unsigned long > > *get ( ) const;
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
 
-	/**
-	 * Executes the Tokenizer
-	 * Sends tokens to dictionary
-	 *
-	 *
-	 * @param originalText
-	 * @param offset
-	 * @param decorator
-	 */
-	unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
+// decorators
+    static const char TITLE = '#';
+    static const char ANCHOR = '@';
+    static const char URL = '$';
+    static const char BODY = '%';
+
+
+    /**
+     * Tokenizer Cstor
+     */
+    Tokenizer ( );
+
+    /**
+     * Returns pointer to the docIndex dictionary
+     *
+     * @return pointer to unordered_map< string, vector< int>>
+     */
+    unordered_map< string, vector< unsigned long > > *get ( ) const;
+
+    /**
+     * Executes the Tokenizer
+     * Sends tokens to dictionary
+     *
+     *
+     * @param originalText
+     * @param offset
+     * @param decorator
+     */
+    unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );
 
 private:
 
-	unordered_map< string, vector< unsigned long > > *docIndex;
-	Stemmer stem;
+    unordered_map< string, vector< unsigned long > > *docIndex;
+    Stemmer stem;
 
-	/**
-	 * Tokenizes text (titles, body text)
-	 *
-	 * @param originalText
-	 * @param offset
-	 * @param decorator
-	 */
-	unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
+    /**
+     * Tokenizes text (titles, body text)
+     *
+     * @param originalText
+     * @param offset
+     * @param decorator
+     */
+    unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
 
-<<<<<<< HEAD
-	private:
-        unordered_map< string, vector<wordData>> *docIndex;
-		Stemmer stem;
-=======
->>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
-	};
+};
-- 
GitLab