Skip to content
Snippets Groups Projects
Commit eb55e2f3 authored by vcday's avatar vcday
Browse files

parser changes

parent fbbf2bf0
Branches
No related tags found
No related merge requests found
......@@ -12,7 +12,8 @@
#include <queue>
#include <iostream>
#include <fstream>
#include <stringProcessing.h>
#include "Tokenizer.h"
#include "stringProcessing.h"
using namespace std;
......@@ -49,15 +50,13 @@ public:
*/
// input: object with char* and URL string
//
string execute()
const unordered_map<string, vector<int>> execute()
{
Tokenizer tokenizer();
Tokenizer tokenizer;
//TEMP - until we get real input from crawler
raw_data data("url", "html");
parse(data.html_data, &tokenizer);
return tokenzier.get();
return tokenizer.get();
}
......@@ -72,22 +71,17 @@ private:
string parse(string &html_data, Tokenizer *tokenizer)
{
//figure out file handle syntax - pointer to file
tokenizerInput = "";
currentTerm = "";
string tokenizerInput = "";
string currentTerm = "";
for (int i = 0; i < html_data.size(); ++i) {
while (html_data[i] != ' ') {
currentTerm += html_data[i];
}
//one method that directly adds urls onto frontier instead of checking for them
if (!check_title(currentTerm)) {
add_urls(current_term);
}
else {
tokenizerInput += currentTerm;
//can also pass titles individually through tokenizer instead of concatenating them
}
add_urls(currentTerm);
check_title(currentTerm);
tokenizerInput += currentTerm;
}
tokenizer->execute(tokenizerInput);
......@@ -104,12 +98,12 @@ private:
string http_end_tag = ">";
auto word_iter = word.begin();
url = "";
string url = "";
word_iter = findStr(word_iter, a_tag);
if (word_iter) {
if (word_iter != nullptr) {
auto found_http = findStr(word_iter, http_start);
if (found_http) {
url = "http";
if (found_http != nullptr) {
url = "http";
found_http += 9;
auto end_http = findStr(word_iter, http_end_tag);
while (found_http != end_http) {
......@@ -129,36 +123,6 @@ private:
}
/**
* Checks for url in string word
* @param word
*/
bool check_url(string &word)
{
//need to add string processing function where you check in a specified range of positions
if (char* pos = strstr("href", word))
{
while (pos != "\"" && pos != "\'")
++pos;
// take everything until next quote
string url = "";
++pos;
while (pos != "\"" && pos != "\'")
{
//filter out everything except http, https
url += *pos;
}
// send it back to the crawler
URL_PQ.push(url);
return true;
}
return false;
}
/**
* <title >AJF</title>
* @param word
......@@ -166,10 +130,10 @@ private:
bool check_title(string &word)
{
/*if (char* pos = strstr("<title>", word))
if (char* pos = strstr("<title>", word))
{
pos += 6;
end_pos = strstr("</title>", word);
auto end_pos = strstr("</title>", word);
string title = "";
while (pos != end_pos)
{
......@@ -179,105 +143,13 @@ private:
}
return title;
}*/
}
begin_title = "<title>";
auto word_begin = word.begin();
auto word_iter = findStr();
// string begin_title = "<title>";
// auto word_begin = word.begin();
// auto word_iter = findStr(word_begin, begin_title);
}
<<<<<<< HEAD
/**
* Checks for url in string word
* @param word
*/
bool check_url(string &word)
{
if (char* pos = strstr("href", word))
{
while (pos != "\"" && pos != "\'")
++pos;
// take everything until next quote
string url = "";
++pos;
while (pos != "\"" && pos != "\'")
{
//filter out everything except http, https
url += *pos;
}
// send it back to the crawler
URL_PQ.push(url);
return true;
}
return false;
}
/**
 * Collects the text of every <title>...</title> pair in word.
 *
 * Example: "<title>AJF</title>" yields "AJF". When several titles are
 * present their texts are concatenated in order with no separator.
 *
 * @param word text to scan (not modified)
 * @return the concatenated title texts; empty if no opening tag is found.
 *         If an opening tag has no closing tag, the rest of word after the
 *         opening tag is taken as the title text and scanning stops.
 */
std::string check_title_handle(std::string &word)
{
    const std::string titleTag = "<title>";
    const std::string closeTitleTag = "</title>";

    std::string allTitles;
    std::string::size_type cursor = 0;

    while (true)
    {
        const std::string::size_type openPos = word.find(titleTag, cursor);
        if (openPos == std::string::npos)
            break;

        // first character past the opening title tag
        const std::string::size_type textBegin = openPos + titleTag.size();
        const std::string::size_type textEnd = word.find(closeTitleTag, textBegin);

        if (textEnd == std::string::npos)
        {
            // unterminated title: keep what is there and stop scanning
            allTitles.append(word, textBegin, std::string::npos);
            break;
        }

        allTitles.append(word, textBegin, textEnd - textBegin);

        // resume just past the closing title tag
        cursor = textEnd + closeTitleTag.size();
    }

    return allTitles;
}
/**
 * Extracts the text of the first <title>...</title> pair in word.
 *
 * @param word text to scan (not modified)
 * @return the characters strictly between the first "<title>" and the next
 *         "</title>"; an empty string if there is no opening tag. If the
 *         closing tag is missing, the rest of word after the opening tag is
 *         returned.
 */
std::string check_title(std::string &word)
{
    const std::string openTag = "<title>";
    const std::string closeTag = "</title>";

    const std::string::size_type openPos = word.find(openTag);
    if (openPos == std::string::npos)
        return std::string();

    // first character past the opening tag
    const std::string::size_type textBegin = openPos + openTag.size();
    const std::string::size_type textEnd = word.find(closeTag, textBegin);

    if (textEnd == std::string::npos)
        return word.substr(textBegin);

    // stop before the '<' of the closing tag
    return word.substr(textBegin, textEnd - textBegin);
}
//TODO
};
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment