Added functions to parser; changed tokenizer data struct

16a17aa6 · aanvi · de512c34 · 16a17aa6 · 16a17aa6
Commit 16a17aa6 authored 7 years ago by aanvi
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
-//
-// Created by Jake Close on 3/5/18.
-//
 #include "Parser.h"
@@ -42,17 +37,20 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
 				// DO NOTHING
 				}
 			// check if line is url
-			else if ( extract_url( line ) != "" )
+			else if ( url = extract_url( line ) != "" )
 				{
 				//where is urlFrontier defined?
 				urlFrontier->push ( url );
 				}
 			// check if line is title
-			else if ( extract_title( line ) != "" )
+			else if ( title = extract_title( line ) != "" )
 				{
 				tokenizer->execute ( title, offset );
 				}
+            else if ( body = extract_body( line ) != "")
+                {
+                tokenizer->execute( body, offset );
+                }
 			else
 				{
 				//DO NOTHING
@@ -84,6 +82,20 @@ bool Parser::isScript ( string word )
 	return false;
 	}
+// Returns the text that follows the first "<p...>" opening tag in `word`, up to
+// (but not including) the next '<' or the end of the string.
+// Returns "" when `word` contains no "<p" or the opening tag is never closed
+// with '>'.
+// NOTE(review): the "<p" needle also matches tags such as "<pre"; tighten the
+// match if only paragraph tags should count.
+string Parser::extract_body( string word )
+    {
+    string body = "";
+    const auto tagStart = word.find( "<p" );
+    if ( tagStart == string::npos )
+        {
+        return body;
+        }
+    // Skip the remainder of the opening tag (attributes included) so that the
+    // copy starts at the text rather than at the tag's own '<'.
+    const auto tagEnd = word.find( '>', tagStart );
+    if ( tagEnd == string::npos )
+        {
+        return body;
+        }
+    const auto textStart = tagEnd + 1;
+    // find yields npos when no further '<' exists; substr then takes the rest.
+    const auto textEnd = word.find( '<', textStart );
+    body = word.substr( textStart,
+                        textEnd == string::npos ? string::npos : textEnd - textStart );
+    return body;
+    }
 string Parser::extract_url ( string word )
 	{
 	string url = "";

--- a/util/Tokenizer.h
+++ b/util/Tokenizer.h
@@ -38,19 +38,14 @@ public:
 			lowerString = toLower ( splitText[ i ] );
 			if ( !isStopWord ( lowerString ) )
 				{
-				//crawler will have to delete these off the heap as well
+				wordData currentWord;
-				//when would a dtor come into play here?
+				currentWord.offset = offset;
-				wordData *currentWord = new wordData;
-				currentWord -> offset = offset;
 				vectorLength = ( *docIndex )[ lowerString ].size( );
-				( *docIndex )[ lowerString ].push_back ( *currentWord );
+				( *docIndex )[ lowerString ].push_back ( currentWord );
 				( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
-				//I don't know if this is good practice or not
-				delete currentWord;
 				++offset;
 				}
 			}
-			currentWord = nullptr;
 		}
 private: