Skip to content
Snippets Groups Projects
Commit 16a17aa6 authored by aanvi
Browse files

Added functions to parser; changed tokenizer data struct

parent de512c34
No related branches found
No related tags found
1 merge request: !2 WIP: Crawler parser 2 merge into duplicate url-crawler
//
// Created by Jake Close on 3/5/18.
//
#include "Parser.h" #include "Parser.h"
...@@ -42,17 +37,20 @@ void Parser::parse ( string html, Tokenizer *tokenizer ) ...@@ -42,17 +37,20 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
// DO NOTHING // DO NOTHING
} }
// check if line is url // check if line is url
else if ( extract_url( line ) != "" ) else if ( url = extract_url( line ) != "" )
{ {
//where is urlFrontier defined? //where is urlFrontier defined?
urlFrontier->push ( url ); urlFrontier->push ( url );
} }
// check if line is title // check if line is title
else if ( extract_title( line ) != "" ) else if ( title = extract_title( line ) != "" )
{ {
tokenizer->execute ( title, offset ); tokenizer->execute ( title, offset );
} }
else if ( body = extract_body( line ) != "")
{
tokenizer->execute( body, offset );
}
else else
{ {
//DO NOTHING //DO NOTHING
...@@ -84,6 +82,20 @@ bool Parser::isScript ( string word ) ...@@ -84,6 +82,20 @@ bool Parser::isScript ( string word )
return false; return false;
} }
// Extracts the text that follows the first paragraph tag in `word`.
//
// Finds the first occurrence of "<p", skips the remainder of that opening
// tag (through its closing '>'), and returns the characters up to, but not
// including, the next '<' or the end of the string if no further tag exists.
//
// Returns an empty string when `word` contains no "<p", when the opening tag
// is never closed with '>', or when no text follows the tag.
//
// NOTE(review): "<p" also matches other tags that start with p (e.g. <pre>),
// as the original search string did; tighten the match if that is unwanted.
string Parser::extract_body( string word )
{
    const string openTag = "<p";

    const size_t tagStart = word.find( openTag );
    if ( tagStart == string::npos )
    {
        return "";
    }

    // Skip any attributes inside the opening tag, e.g. <p class="x">.
    const size_t tagEnd = word.find( '>', tagStart + openTag.size( ) );
    if ( tagEnd == string::npos )
    {
        return "";
    }

    const size_t textStart = tagEnd + 1;
    const size_t textEnd = word.find( '<', textStart );
    if ( textEnd == string::npos )
    {
        // No closing tag on this line: take everything that remains.
        return word.substr( textStart );
    }
    return word.substr( textStart, textEnd - textStart );
}
string Parser::extract_url ( string word ) string Parser::extract_url ( string word )
{ {
string url = ""; string url = "";
......
...@@ -38,19 +38,14 @@ public: ...@@ -38,19 +38,14 @@ public:
lowerString = toLower ( splitText[ i ] ); lowerString = toLower ( splitText[ i ] );
if ( !isStopWord ( lowerString ) ) if ( !isStopWord ( lowerString ) )
{ {
//crawler will have to delete these off the heap as well wordData currentWord;
//when would a dtor come into play here? currentWord.offset = offset;
wordData *currentWord = new wordData;
currentWord -> offset = offset;
vectorLength = ( *docIndex )[ lowerString ].size( ); vectorLength = ( *docIndex )[ lowerString ].size( );
( *docIndex )[ lowerString ].push_back ( *currentWord ); ( *docIndex )[ lowerString ].push_back ( currentWord );
( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1; ( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
//I don't know if this is good practice or not
delete currentWord;
++offset; ++offset;
} }
} }
currentWord = nullptr;
} }
private: private:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment