From b9acd359ec6621ca63c0a918af457babd335196b Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Mon, 5 Mar 2018 00:20:44 -0500
Subject: [PATCH] parse logic improved

---
 parser/Parser.h             | 68 ++++++++++++++++++++-----------------
 parser/tests/parserTest.cpp | 53 +++++++++++++++++++++++++++--
 util/Tokenizer.h            | 11 +++---
 util/stringProcessing.h     | 52 +++++++++++++++++++++++++++-
 4 files changed, 144 insertions(+), 40 deletions(-)

diff --git a/parser/Parser.h b/parser/Parser.h
index 4f7203a..5d1c166 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -54,39 +54,46 @@ private:
 	 * @param inFile
 	 * @return
 	 */
+	// TODO: instead of grabbing each line, look for the beginning of a
+	// title/url/anchor-text tag, then continue until its close tag and add the text to the tokenizer once the end of the tag is found
 	void parse ( string html, Tokenizer *tokenizer )
 		{
-
-		string tokenizerInput = "";
-		string currentTerm = "";
-		int index = 0;
-		while (index != html.size())
+		auto htmlIt = html.begin();
+		int offset = 0;
+		while (htmlIt != html.end())
 			{
-			currentTerm = "";
-			while ( html.at( index ) != '\n' )
-				{
-				currentTerm += html[ index ];
-				++index;
-				}
-			++index;
-
-
-			string url = extract_url ( currentTerm );
-			if (url != "")
+			// if open bracket
+			if ( *htmlIt == '<' )
 				{
-				urlFrontier->Push (url);
+				auto begCloseTag = findNext ("</", htmlIt);
+				auto endCloseTag = findNext ( ">", begCloseTag);
+				string line (htmlIt, endCloseTag + 1);
+				htmlIt = endCloseTag + 2;
+
+				// check if line is url
+				string url = extract_url ( line );
+				if (url != "")
+					{
+					urlFrontier->Push ( url );
+					}
+				// check if line is title
+				else
+					{
+					string title = extract_title ( line );
+					if (title != "")
+						{
+						tokenizer->execute ( title, offset );
+						}
+					}
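+				//TODO fix offset? It is set to the character index of htmlIt, while Tokenizer::execute advances it once per non-stop word. NOTE(review): htmlIt was moved to endCloseTag + 2, which is past html.end() when '>' is the last character of html - confirm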
+				offset = htmlIt - html.begin();
 				}
 			else
 				{
-				string title = extract_title ( currentTerm );
-				if (title != "")
-					{
-					tokenizerInput += title;
-					}
+				++htmlIt;
 				}
-
 			}
-		tokenizer->execute ( tokenizerInput );
+
 
 		}
 
@@ -98,16 +105,15 @@ private:
 	string extract_url ( string word )
 		{
 		string url = "";
-
-		if ( *findStr ( word, "<a" ) != '\0' )
+		if ( *findStr ( "<a", word ) != '\0' )
 			{
-			auto foundHttp = findStr ( word, "href=http" );
+			auto foundHref = findStr ( "href", word );
+			auto foundHttp = findNext ( "http", foundHref );
 			if ( *foundHttp != '\0' )
 				{
-				url = "http";
-				foundHttp += 9;
-
-				while ( *foundHttp != *findStr ( word, "\">" ) )
+				url = "";
+				auto closeTag = findNext ( ">", word.begin ( ) );
+				while ( *foundHttp != *closeTag )
 					{
 					url += *foundHttp;
 					++foundHttp;
diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp
index d160c4c..50c7469 100644
--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
@@ -11,11 +11,23 @@
 
 using namespace std;
 
+void testSimple ( );
+void testComplex ( );
+
 int main ( )
 	{
 	cout << "Testing Parser ... " << endl << endl;
+	testSimple ();
+	testComplex ();
+	cout << "Parser Tests Passed! :D" << endl;
+
+	}
+
+void testSimple ( )
+	{
+
 	ProducerConsumerQueue < string > * urlFrontierTest;
-	Document document ( "<title>This Cat Title Cat</title>\n" );
+	Document document ( "<title>This Cat Title Cat</title>" );
 
 	Parser parser ( urlFrontierTest );
 	auto dictionary = parser.execute ( &document );
@@ -28,7 +40,44 @@ int main ( )
 	assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
 	assert ( dictionary->at ( "title" )[ 0 ] == 1 );
 
-	cout << "Parser Tests Passed! :D" << endl;
+	delete dictionary;
 
 	}
+void testComplex ( )
+	{
+	// Parses an inline title and anchor followed by the contents of ../tests/cats.html, then checks which words reach the dictionary.
+	ProducerConsumerQueue < string > * urlFrontierTest; // NOTE(review): never initialized before being handed to Parser below - confirm the parser does not dereference it
+	ifstream file("../tests/cats.html"); // NOTE(review): the open is not checked; if it fails the loop below adds nothing
+	string temp;
+	string docString = "<title>Joe the Cat</title>\n";
+	docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
+	while(std::getline(file, temp)) { // getline drops the line terminator, so the file's lines are joined with no separator
+		docString += temp;
+		}
+
+	Document document ( docString );
+
+	Parser parser ( urlFrontierTest );
+	auto dictionary = parser.execute ( &document );
+
+//	cout << dictionary->size () << endl;
+//	for (auto p : *dictionary)
+//		cout << p.first << endl;
+
+	assert ( dictionary != nullptr );
+	assert ( dictionary->size () == 3); // presumably "joe", "cat" and "story" are the only indexed words - confirm against cats.html
+
+	assert ( dictionary->find ( "cat" ) != dictionary->end () );
+	assert ( dictionary->find ( "story" ) != dictionary->end () );
+	assert ( dictionary->find ( "joe" ) != dictionary->end () );
+
+	assert ( dictionary->find ( "the" ) == dictionary->end () );
+	assert ( dictionary->find ( "of" ) == dictionary->end () );
+
+//	assert ( dictionary->at ( "cat" )[ 0 ] == 1 );
+//	assert ( dictionary->at ( "story" )[ 0 ] == 0 );
+//	cout << urlFrontierTest->Size () << endl;
+//	cout << urlFrontierTest->Pop () << endl;
+	delete dictionary;
+
+	}
\ No newline at end of file
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index 3e28002..3de99f5 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -14,17 +14,16 @@ class Tokenizer
 public:
 	Tokenizer ( )
 		{
-		doc_index = new unordered_map< string, vector< int>>;
+		docIndex = new unordered_map< string, vector< int>>;
 		}
 
 	unordered_map< string, vector< int>> *get ( ) const
 		{
-		return doc_index;
+		return docIndex;
 		}
 
-	void execute ( string originalText )
+	void execute ( string originalText, int offset )
 		{
-		int offset = 0;
 		vector< string > splitText = splitStr ( originalText, ' ' );
 		string lowerString = "";
 		for ( int i = 0; i < splitText.size ( ); ++i )
@@ -32,12 +31,12 @@ public:
 			lowerString = toLower ( splitText[ i ] );
 			if ( !isStopWord ( lowerString ) )
 				{
-				( *doc_index )[ lowerString ].push_back ( offset );
+				( *docIndex )[ lowerString ].push_back ( offset );
 				++offset;
 				}
 			}
 		}
 
 private:
-	unordered_map< string, vector< int>> *doc_index;
+	unordered_map< string, vector< int>> *docIndex;
 	};
diff --git a/util/stringProcessing.h b/util/stringProcessing.h
index ed54713..40056e0 100644
--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -24,9 +24,10 @@ set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "
                             "you", "your" };
 /**
  * Finds the needle in the haystack
+ * returns position of first match
  * @param haystack
  * @param needle
- * @return
+ * @return string::iterator
  */
 string::iterator findStr (string needle, string haystack )
 	{
@@ -75,6 +76,55 @@ string::iterator findStr (string needle, string haystack )
 
 	}
 
+/**
+ * Searches for needle starting at haystackPointer and stops at the first '\0' read. NOTE(review): relies on both the haystack and the needle dereferencing to '\0' at their end - presumably the std::string terminator; confirm
+ * @param needle text to search for
+ * @param haystackPointer iterator to the haystack position where the search starts
+ * @return string::iterator to the first character of the match found, or to the '\0' where the scan stopped when nothing matched
+ */
+string::iterator findNext (string needle, string::iterator haystackPointer )
+	{
+	auto beginNeedle = needle.begin ( );
+	auto beginHaystack = haystackPointer;
+	while ( *beginHaystack != '\0' )
+		{
+		//no match can start on this character, so step to the next one
+		if ( *beginHaystack != *beginNeedle )
+			{
+			++beginHaystack;
+			}
+
+		else if ( *beginHaystack == *beginNeedle )
+			{
+			/* walk a copy so beginHaystack stays on the first matched character
+				and can be returned if the whole needle matches */
+			auto temp = beginHaystack;
+			while ( *temp == *beginNeedle )
+				{
+				++temp;
+				++beginNeedle;
+				//a '\0' here is taken as the end of the needle, i.e. every character matched
+				if ( *beginNeedle == '\0' )
+					{
+					//beginHaystack is still pointing at the beginning of the match
+					return beginHaystack;
+					}
+
+				}
+			//partial match failed: rewind the needle before searching the rest of the haystack
+			beginNeedle = needle.begin ( );
+			//resume at the mismatching character (temp), not at beginHaystack + 1. NOTE(review): a match that begins inside the partially matched run is skipped, e.g. needle "aab" in "aaab" - confirm this is acceptable
+			beginHaystack = temp;
+			}
+
+		else
+			{
+			//DO NOTHING (unreachable: the two conditions above are exact opposites)
+			}
+		}
+
+	return beginHaystack;
+	}
 
 
 /**
-- 
GitLab