From acee5aa5a35f078c161f34fff118c576ee536602 Mon Sep 17 00:00:00 2001
From: aanvi <aanvi@umich.edu>
Date: Thu, 15 Mar 2018 23:47:47 -0400
Subject: [PATCH] Added functionalities

---
 parser/Parser.cpp | 37 ++++++++++++++++++++++++-------------
 parser/Parser.h   |  5 +++--
 util/Tokenizer.h  | 24 ++++++++++++++++--------
 3 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 2cfbd52..19a8542 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -7,12 +7,15 @@
  * @param inFile
  * @return
  */
-//TODO instead of grabbing each line, look to see if beginning of
 // TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
-// TODO have to read input in as a stream of chars eventually - cat into string?
 // TODO different counts: frequency, total num unique words, etc
-// TODO handle bad html style (ie no closing p tag)
 //TODO flag different types of words - determine if we want to do this in key of dict or value (in wordData struct)
+/*
+ * Anchor text = #
+ * Title = *
+ * Url = @
+ * Body = %
+ */
 
 void Parser::parse ( string html, Tokenizer *tokenizer )
 	{
@@ -63,17 +66,13 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 			++htmlIt;
 			}
 		}
-
-
 	}
 
-/**
- * Returns a url, or "" if none
- * @param word
- * @return
- */
 
-bool Parser::isScript ( string word )
+/*
+ * Returns true if word contains a "<script" tag, false otherwise
+*/
+bool Parser::isScript ( string & word )
 	{
 	if ( *findStr ( "<script", word ) != '\0' )
 		{
@@ -81,8 +80,11 @@ bool Parser::isScript ( string word )
 		}
 	return false;
 	}
-
-string Parser::extract_body( string word )
+/*
+ * Returns the body text if word contains a "<p" tag, or an empty string if it does not.
+ * If there is no closing tag, stops at the first opening tag or at the end of the file.
+*/
+string Parser::extract_body( string & word, int & offset )
     {
     string body = "";
     auto foundBody = findStr("<p", word) != '\0';
@@ -91,11 +93,20 @@ string Parser::extract_body( string word )
         while ( *findStr != '<' )
             {
             body += *findStr;
+			if ( *findStr == ' ')
+				{
+				count += 1;
+				}
             }
         }
     return body;
     }
 
+/**
+ * Returns a url, or "" if none
+ * @param word
+ * @return
+ */
 
 string Parser::extract_url ( string & word )
 	{
diff --git a/parser/Parser.h b/parser/Parser.h
index da60d76..c19fff2 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -32,6 +32,7 @@ public:
 	 * Parser
 	 * @return
 	 */
+	// TODO: change the vector's element type to wordData, and change where that struct is declared
 	const unordered_map< string, vector< int>> * execute ( Document* document)
 		{
 		Tokenizer tokenizer;
@@ -48,8 +49,6 @@ private:
 	 * @param inFile
 	 * @return
 	 */
-	//TODO instead of grabbing each line, look to see if beginning of
-	// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
 	void parse ( string html, Tokenizer *tokenizer );
 
 
@@ -68,6 +67,8 @@ private:
 	 */
 	string extract_title ( string & word );
 
+	bool isScript ( string & word );
 
+	string extract_body( string & word );
 	};
 
diff --git a/util/Tokenizer.h b/util/Tokenizer.h
index 931448b..bf867b5 100644
--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -27,21 +27,29 @@ public:
 		return docIndex;
 		}
 	//add type of word parameter, ie paragraph, url etc
-	void execute ( string originalText, int offset )
-		{
 
+    void execute ( string & originalText, int offset )
+		{
 		vector< string > splitText = splitStr ( originalText, ' ' );
-		string lowerString = "";
+        string processedString = "";
 		int vectorLength = 0;
-		for ( int i = 0; i < splitText.size ( ); ++i )
-			{
-			lowerString = toLower ( splitText[ i ] );
-			if ( !isStopWord ( lowerString ) )
+        for ( int i = 0; i < splitText.size( ); ++i )
+            {
+             // case fold
+             processedString = toLower( splitText[ i ] );
+             //strip all characters
+             processedString = stripStr( processedString );
+
+             if ( !isStopWord ( lowerString ) )
 				{
-				wordData currentWord;
+                // stem word
+                processedString = stem.execute( processedString );
+
+                wordData currentWord;
 				currentWord.offset = offset;
 				vectorLength = ( *docIndex )[ lowerString ].size( );
 				( *docIndex )[ lowerString ].push_back ( currentWord );
+                //incrementing frequency value of the current word
 				( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
 				++offset;
 				}
-- 
GitLab