//
// Created by Veronica Day on 1/28/18.
//
// Keep a running count of the offset; if a word is a stop word, don't increment
// the count and remove the stop word. The tokenizer returns a pointer to the
// document dictionary, and the parser puts it on the indexer's queue.
//
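// A sketch of the offset rule described above (inferred from the comment, not
// from code in this fragment): stop words are removed and do not advance the
// offset, so a line like "the quick fox" would tokenize roughly as
//
//     { "quick" -> [ 0 ], "fox" -> [ 1 ] }
//
// with "the" dropped entirely.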
#include <string>
#include <functional>
#include <queue>
#include <iostream>
#include <fstream>
#include <unordered_map>
#include <vector>
#include "../shared/Document.h"
#include "../shared/ProducerConsumerQueue.h"
// Assumed path: a Tokenizer is used below, but its header is not shown in this fragment.
#include "../shared/Tokenizer.h"

using namespace std;
/**
 * This class uses the Document object from the Crawler to parse the text.
 * Returns a pointer to a dictionary that contains the tokenized input.
 */
class Parser
    {
public:

    Parser ( ProducerConsumerQueue< string > * urlFrontierIn )
        : urlFrontier ( urlFrontierIn )
        {
        }

    const unordered_map< string, vector< int > > * execute ( Document * document )
        {
        // TODO: instead of grabbing each line, look to see if it is the beginning
        // of a title/url/anchor text, etc. Then continue until the close tag is
        // found, and hand the text to the tokenizer once the end of the tag is reached.
        // The Document interface is not shown in this fragment; getHTML ( ) is an
        // assumed accessor that returns the raw html as a string.
        string html = document->getHTML ( );

        auto htmlIt = html.begin ( );
        int offset = 0;
        while ( htmlIt != html.end ( ) )
            {
            // grab everything up through the next closing tag, e.g. "<title>...</title>"
            auto begCloseTag = findNext ( "</", htmlIt );
            auto endCloseTag = findNext ( ">", begCloseTag );
            string line ( htmlIt, endCloseTag + 1 );
            // skip past the '>' and the character after it (assumed to be a newline)
            htmlIt = endCloseTag + 2;

            // check if line is a url
            string url = extract_url ( line );
            if ( url != "" )
                {
                urlFrontier->Push ( url );
                }
            // check if line is a title
            else
                {
                string title = extract_title ( line );
                if ( title != "" )
                    {
                    tokenizer->execute ( title, offset );
                    }
                }
            // TODO: fix offset?
            offset = htmlIt - html.begin ( );
            }

        // Assumed accessor: hand back the dictionary the tokenizer built, as
        // described in the header comment.
        return tokenizer->get ( );
        }

private:

    ProducerConsumerQueue< string > * urlFrontier;
    // Assumed member: execute ( ) above calls tokenizer->execute ( ... )
    Tokenizer * tokenizer;

    // Called above; its definition is not part of this fragment.
    string extract_title ( string & line );
    /**
     * Returns a url, or "" if none
     * @param word
     * @return
     */
    string extract_url ( string & word )
        {
        string url = "";
        auto foundHref = findStr ( "href", word );
        auto foundHttp = findNext ( "http", foundHref );
        auto closeTag = findNext ( ">", word.begin ( ) );
        // collect the url character by character until the closing '>' is reached
        while ( *foundHttp != *closeTag )
            url += *foundHttp++;
        return url;
        }
    };
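// Example usage (a sketch, not part of the original file): drive the parser by
// hand with a url frontier queue and a crawled Document. The returned dictionary
// maps each token to the offsets at which it appeared.
//
//     ProducerConsumerQueue< string > urlFrontier;
//     Parser parser ( &urlFrontier );
//
//     Document * document = /* produced by the Crawler */;
//     const unordered_map< string, vector< int > > * dictionary =
//             parser.execute ( document );
//
//     for ( auto const & entry : *dictionary )
//         cout << entry.first << " : " << entry.second.size ( )
//              << " occurrence(s)" << endl;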