Skip to content
Snippets Groups Projects
Commit acee5aa5 authored by aanvi's avatar aanvi
Browse files

Added functionalities

parent a33c0ba5
No related branches found
No related tags found
1 merge request: !2 WIP: Crawler parser 2 — merge into duplicate url-crawler
......@@ -7,12 +7,15 @@
* @param inFile
* @return
*/
// TODO: instead of grabbing each line, look for the beginning of a
//       title/url/anchor-text tag, etc. Then continue until the close tag and add to the tokenizer after the end tag is found.
// TODO: have to read input in as a stream of chars eventually - cat into string?
// TODO: different counts: frequency, total number of unique words, etc.
// TODO: handle bad HTML style (i.e. no closing p tag)
// TODO: flag different types of words - determine if we want to do this in the key of the dict or in the value (in the wordData struct)
/*
* Anchor text = #
* Title = *
* Url = @
* Body = %
*/
void Parser::parse ( string html, Tokenizer *tokenizer )
{
......@@ -63,17 +66,13 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
++htmlIt;
}
}
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
bool Parser::isScript ( string word )
/*
* Returns true if script tag, false if not
*/
bool Parser::isScript ( string & word )
{
if ( *findStr ( "<script", word ) != '\0' )
{
......@@ -81,8 +80,11 @@ bool Parser::isScript ( string word )
}
return false;
}
string Parser::extract_body( string word )
/*
 * Returns the body text if <p> tags are present, or an empty string if not.
 * If there is no closing tag, stops at the first opening tag or at end of file.
 */
string Parser::extract_body( string & word, int & offset )
{
string body = "";
auto foundBody = findStr("<p", word) != '\0';
......@@ -91,11 +93,20 @@ string Parser::extract_body( string word )
while ( *findStr != '<' )
{
body += *findStr;
if ( *findStr == ' ')
{
count += 1;
}
}
}
return body;
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
string Parser::extract_url ( string & word )
{
......
......@@ -32,6 +32,7 @@ public:
* Parser
* @return
*/
// TODO need to change vector type to word data, change where struct is declared
const unordered_map< string, vector< int>> * execute ( Document* document)
{
Tokenizer tokenizer;
......@@ -48,8 +49,6 @@ private:
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void parse ( string html, Tokenizer *tokenizer );
......@@ -68,6 +67,8 @@ private:
*/
string extract_title ( string & word );
bool isScript ( string & word );
string extract_body( string & word );
};
......@@ -27,21 +27,29 @@ public:
return docIndex;
}
//add type of word parameter, ie paragraph, url etc
void execute ( string originalText, int offset )
{
void execute ( string & originalText, int offset )
{
vector< string > splitText = splitStr ( originalText, ' ' );
string lowerString = "";
string processedString = "";
int vectorLength = 0;
for ( int i = 0; i < splitText.size ( ); ++i )
{
lowerString = toLower ( splitText[ i ] );
if ( !isStopWord ( lowerString ) )
for ( int i = 0; i < splitText.size( ); ++i )
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
processedString = stripStr( processedString );
if ( !isStopWord ( lowerString ) )
{
wordData currentWord;
// stem word
processedString = stem.execute( processedString );
wordData currentWord;
currentWord.offset = offset;
vectorLength = ( *docIndex )[ lowerString ].size( );
( *docIndex )[ lowerString ].push_back ( currentWord );
//incrementing frequency value of the current word
( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
++offset;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment