Skip to content
Snippets Groups Projects
Commit 93aef450 authored by vcday's avatar vcday
Browse files

added parser.cpp

parent 5d2dea33
No related branches found
No related tags found
No related merge requests found
.idea/
CMakeLists.txt
cmake-build-debug/
//
// Created by Veronica Day on 1/28/18.
//
#include <string>
#include <functional>
#include <queue>
#include <iostream>
#include <fstream>
using namespace std;
// Doc Id
std::priority_queue<int> DOCID_PQ;
std::priority_queue<string> URL_PQ;
string PATH = "/doc";
//TODO
// get doc id from DocIDqueue (sent from crawler)
// go to disk and get the HTML file
// parse the html file
// if find url; send to crawler
// if find title send string to tokenizer
class Parser
{
public:
/**
* Parser
* @return
*/
string execute()
{
ifstream inFile;
string docid = to_string(DOCID_PQ.top());
inFile.open(PATH+docid);
// error checking
if (!inFile)
cerr << "Unable to open file datafile.txt";
parse(inFile);
//TODO
// close file and remove
}
private:
/**
* Parses file
* @param inFile
* @return
*/
string parse(ifstream inFile)
{
string word = "";
while (!inFile.eof())
{
inFile >> word;
// checks for url
check_url(word);
// checks for title tags
Tokenizer.execute(check_title(word));
}
}
/**
* Checks for url in string word
* @param word
*/
void check_url(string &word)
{
if (char* pos = strstr("href", word))
{
while (pos != "\"" && pos != "\'")
++pos;
// take everything until next quote
string url = "";
++pos;
while (pos != "\"" && pos != "\'")
{
url += *pos;
}
// send it back to the crawler
URL_PQ.push(url);
}
}
/**
* <title >AJF</title>
* @param word
*/
string check_title(string &word)
{
if (char* pos = strstr("<title>", word))
{
pos += 6;
end_pos = strstr("</title>", word);
string title = "";
while (pos != end_pos)
{
++pos;
title += *pos;
}
return title;
}
}
};
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment