Skip to content
Snippets Groups Projects
Commit 16a17aa6 authored by aanvi's avatar aanvi
Browse files

Added functions to parser; changed tokenizer data struct

parent de512c34
No related branches found
No related tags found
1 merge request: !2 WIP: Crawler parser 2 merge into duplicate url-crawler
//
// Created by Jake Close on 3/5/18.
//
#include "Parser.h"
......@@ -42,17 +37,20 @@ void Parser::parse ( string html, Tokenizer *tokenizer )
// DO NOTHING
}
// check if line is url
else if ( extract_url( line ) != "" )
else if ( url = extract_url( line ) != "" )
{
//where is urlFrontier defined?
urlFrontier->push ( url );
}
// check if line is title
else if ( extract_title( line ) != "" )
else if ( title = extract_title( line ) != "" )
{
tokenizer->execute ( title, offset );
}
else if ( body = extract_body( line ) != "")
{
tokenizer->execute( body, offset );
}
else
{
//DO NOTHING
......@@ -84,6 +82,20 @@ bool Parser::isScript ( string word )
return false;
}
/**
 * Extracts the text content of the first paragraph tag found in `word`.
 *
 * Looks for an opening `<p>` / `<p ...>` tag, skips to the end of that tag,
 * and returns every character up to (but not including) the next '<'.
 * If that next '<' is missing, the rest of the string is returned.
 *
 * @param word  A chunk of HTML to scan.
 * @return The paragraph text, or "" when no complete opening `<p` tag is
 *         present in `word`.
 */
string Parser::extract_body( string word )
{
    string body = "";

    // Locate an opening paragraph tag. A bare search for "<p" would also hit
    // tags such as <pre> or <param>, so require the tag name to end right
    // after the 'p' (either '>' or whitespace introducing attributes).
    string::size_type tagStart = word.find( "<p" );
    while ( tagStart != string::npos )
    {
        const string::size_type afterName = tagStart + 2;
        if ( afterName >= word.size( ) )
        {
            // "<p" is the last thing in the string: the tag is not closed here.
            return body;
        }
        const char next = word[ afterName ];
        if ( next == '>' || next == ' ' || next == '\t' )
        {
            break;
        }
        tagStart = word.find( "<p", afterName );
    }
    if ( tagStart == string::npos )
    {
        return body;
    }

    // Skip any attributes: the text starts after the tag's closing '>'.
    const string::size_type tagEnd = word.find( '>', tagStart + 2 );
    if ( tagEnd == string::npos )
    {
        return body;
    }

    // Copy everything up to the next tag; substr clamps to the end of the
    // string when no further '<' exists.
    const string::size_type textBegin = tagEnd + 1;
    const string::size_type textEnd = word.find( '<', textBegin );
    const string::size_type length =
            ( textEnd == string::npos ) ? string::npos : textEnd - textBegin;
    body = word.substr( textBegin, length );

    return body;
}
string Parser::extract_url ( string word )
{
string url = "";
......
......@@ -38,19 +38,14 @@ public:
lowerString = toLower ( splitText[ i ] );
if ( !isStopWord ( lowerString ) )
{
//crawler will have to delete these off the heap as well
//when would a dtor come into play here?
wordData *currentWord = new wordData;
currentWord -> offset = offset;
wordData currentWord;
currentWord.offset = offset;
vectorLength = ( *docIndex )[ lowerString ].size( );
( *docIndex )[ lowerString ].push_back ( *currentWord );
( *docIndex )[ lowerString ].push_back ( currentWord );
( *docIndex )[ lowerString ][ vectorLength - 1 ].frequency += 1;
//I don't know if this is good practice or not
delete currentWord;
++offset;
}
}
currentWord = nullptr;
}
private:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment