From 20743e12416d98a5024e7a9e727a1199b53465fd Mon Sep 17 00:00:00 2001
From: aanvi <aanvi@umich.edu>
Date: Tue, 13 Mar 2018 15:32:50 -0400
Subject: [PATCH] Added script conditional, changed dictionary value to a
 vector of structs instead of just offsets

---
 parser/Parser.cpp | 42 +++++++++++++++++++++++++++++++++---------
 util/Tokenizer.h  | 24 ++++++++++++++++++++----
 2 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index e412e36..9a37a91 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -14,8 +14,15 @@
  */
 //TODO instead of grabbing each line, look to see if beginning of
 // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
+// TODO eventually read the input in as a stream of chars - cat into a string?
+// TODO track different counts: frequency, total number of unique words, etc.
+// TODO handle bad HTML style (i.e. no closing <p> tag)
+// TODO flag different types of words - decide whether that belongs in the dict key or the value (the wordData struct)
+
 void Parser::parse ( string html, Tokenizer *tokenizer )
 	{
+
+	// maybe read the input as a stream and append chars to the string as they arrive
 	auto htmlIt = html.begin();
 	int offset = 0;
 	while (htmlIt != html.end())
@@ -23,25 +30,32 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 		// if open bracket
 		if ( *htmlIt == '<' )
 			{
+			// TODO ensure that the opening and closing tags are the same type
 			auto begCloseTag = findNext ("</", htmlIt);
 			auto endCloseTag = findNext ( ">", begCloseTag);
 			string line (htmlIt, endCloseTag + 1);
 			htmlIt = endCloseTag + 2;
 
-			// check if line is url
 			string url = extract_url ( line );
-			if (url != "")
+			string title = extract_title ( line );
+			// check if line is a script
+			if ( isScript ( line ) )
+				{
+				// DO NOTHING - skip script content
+				}
+			// check if line is url
+			else if ( url != "" )
 				{
+				// TODO where is urlFrontier defined?
 				urlFrontier->Push ( url );
 				}
 			// check if line is title
+			else if ( title != "" )
 				{
-				string title = extract_title ( line );
-				if (title != "")
-					{
-					tokenizer->execute ( title, offset );
-					}
+				tokenizer->execute ( title, offset );
 				}
 			//TODO fix offset?
 			offset = htmlIt - html.begin();
@@ -60,6 +74,16 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
  * @param word
  * @return
  */
+
+bool Parser::isScript ( string word )
+	{
+	// a chunk of html is script content if it contains a <script tag
+	return *findStr ( "<script", word ) != '\0';
+	}
+
 string Parser::extract_url ( string word )
 	{
 	string url = "";
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index ba27e43..2b16020 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -9,34 +9,50 @@
 
 using namespace std;
 
+// one record per occurrence of a word in the document
+struct wordData
+	{
+	int offset;
+	int frequency = 0;
+	// TODO also track total number of words / unique words?
+	};
 
 class Tokenizer
 	{
 public:
 	Tokenizer ( )
 		{
-		docIndex = new unordered_map< string, vector< int>>;
+		docIndex = new unordered_map< string, vector< wordData>>;
 		}
 
-	unordered_map< string, vector< int>> *get ( ) const
+	unordered_map< string, vector< wordData>> *get ( ) const
 		{
 		return docIndex;
 		}
-
+	// TODO add a word-type parameter, ie paragraph, url, etc
 	void execute ( string originalText, int offset )
 		{
 		vector< string > splitText = splitStr ( originalText, ' ' );
 		string lowerString = "";
 		for ( int i = 0; i < splitText.size ( ); ++i )
 			{
 			lowerString = toLower ( splitText[ i ] );
 			if ( !isStopWord ( lowerString ) )
 				{
-				( *docIndex )[ lowerString ].push_back ( offset );
+				// build the record on the stack; the vector stores a copy,
+				// so nothing here needs to be deleted off the heap
+				// (docIndex itself is still new'd in the ctor - when would a dtor come into play?)
+				wordData currentWord;
+				currentWord.offset = offset;
+				currentWord.frequency = 1;
+				( *docIndex )[ lowerString ].push_back ( currentWord );
 				++offset;
 				}
 			}
 		}
 
 private:
-	unordered_map< string, vector< int>> *docIndex;
+	unordered_map< string, vector< wordData>> *docIndex;
 	};
-- 
GitLab
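
Reviewer note: a minimal, self-contained sketch of how a caller might consume the new index shape (word -> vector of wordData) that this patch introduces. The struct mirrors util/Tokenizer.h; main ( ), the sample word, and the printing loop are illustrative only and not part of the commit:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

// mirrors the struct added in util/Tokenizer.h
struct wordData
	{
	int offset;
	int frequency = 0;
	};

int main ( )
	{
	// stand-in for what *tokenizer->get ( ) would hold after a parse
	unordered_map< string, vector< wordData>> docIndex;

	wordData currentWord;
	currentWord.offset = 3;
	currentWord.frequency = 1;
	docIndex[ "engine" ].push_back ( currentWord );
	currentWord.offset = 17;
	docIndex[ "engine" ].push_back ( currentWord );

	// each occurrence of a word carries its own offset and frequency,
	// instead of the bare int offsets stored before this patch
	for ( const auto &entry : docIndex )
		{
		cout << entry.first << " (" << entry.second.size ( ) << " occurrence(s)):";
		for ( const wordData &word : entry.second )
			cout << ' ' << word.offset;
		cout << '\n';
		}
	return 0;
	}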
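
The "when would a dtor come into play?" comment concerns the map that the constructor allocates with new and never frees. One hypothetical answer, sketched outside this commit: give Tokenizer a matching destructor (or hold the map by value and drop the new entirely). Names mirror the patch; the destructor and deleted copy operations are assumptions, not something this commit adds:

#include <string>
#include <unordered_map>
#include <vector>

using namespace std;

struct wordData
	{
	int offset;
	int frequency = 0;
	};

class Tokenizer
	{
public:
	Tokenizer ( )
		{
		docIndex = new unordered_map< string, vector< wordData>>;
		}

	// sketch only: pairs with the new in the ctor so the crawler
	// never has to delete the index by hand
	~Tokenizer ( )
		{
		delete docIndex;
		}

	// with a raw owning pointer, copying should also be disabled
	Tokenizer ( const Tokenizer & ) = delete;
	Tokenizer &operator= ( const Tokenizer & ) = delete;

private:
	unordered_map< string, vector< wordData>> *docIndex;
	};

int main ( )
	{
	Tokenizer tokenizer; // docIndex is freed when tokenizer leaves scope
	return 0;
	}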