Commit f96179eb authored by jsclose

trying to integrate parser and crawler

parent 0dc421db
......@@ -3,16 +3,23 @@ project(eecs398_search)
set(CMAKE_CXX_STANDARD 11)
add_executable(eecs398_search
add_executable(search
main.cpp
shared/ProducerConsumerQueue.cpp
shared/ProducerConsumerQueue.h
shared/ProducerConsumerQueue_test.cpp parser/Parser.h
parser/Parser.h
parser/Parser.h
util/Tokenizer.h
util/stringProcessing.h
util/Stemmer.h)
add_executable(crawl main.cpp shared/ProducerConsumerQueue.h shared/ThreadClass.h shared/url.h crawler/crawler.cpp crawler/SocketReader.cpp crawler/StreamReader.h crawler/spider.cpp util/util.h crawler/LocalReader.h crawler/StreamReader.h parser/Parser.h shared/Document.cpp parser/Parser.cpp)
add_executable(test1 main.cpp shared/ProducerConsumerQueue.h
shared/ThreadClass.h crawler/crawler.cpp crawler/spider.cpp shared/url.h crawler/StreamReader.h util/util.cpp crawler/SocketReader.cpp crawler/SocketReader.h crawler/LocalReader.h )
add_executable(StringProcessingTest
util/stringProcessing.h
util/Stemmer.h
......@@ -32,3 +39,10 @@ add_executable(ParserEndToEndTest
shared/url.h
parser/tests/parserTest.cpp)
find_package(OpenSSL REQUIRED)
target_link_libraries(crawl OpenSSL::SSL)
target_link_libraries(test1 OpenSSL::SSL)
\ No newline at end of file
File added
......@@ -6,18 +6,18 @@
#include "spider.h"
#include "../parser/Parser.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "../util/util.h"
#include "LocalReader.h"
#include "SocketReader.h"
#include "../shared/Document.h"
#include "../util/util.h"
size_t Spider::hash ( const char * s )
{
// http://www.cse.yorku.ca/~oz/hash.html
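The rest of the hash body is elided by this hunk; the linked page describes djb2, so a minimal sketch of that algorithm, consistent with the signature above (an assumption, not necessarily the committed body):
// size_t h = 5381;
// int c;
// while ( ( c = *s++ ) != 0 )
//     h = h * 33 + c; //djb2: hash = hash * 33 + c
// return h;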
......@@ -62,7 +62,11 @@ void Spider::FuncToRun()
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + to_string(docID)+ ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
//parser.parse(reader);
/*
Document document ( currentUrl, reader->buffer );
auto dictionary = parser.execute ( &document );
*/
cond = true;
}
else
......@@ -87,7 +91,10 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
*/
/*
 * Takes in a parsed url, creates a document object, and writes information about the document to disk.
 * Returns the beginning position of the document on disk and stores it in the in-memory lookup hash table.
 */
bool Spider::writeDocToDisk(ParsedUrl url)
{
Document d(url);
......@@ -103,6 +110,12 @@ bool Spider::writeDocToDisk(ParsedUrl url)
return true;
}
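The middle of writeDocToDisk is elided above; given Document::WriteToDocMap in the diff below, the elided portion plausibly resembles this sketch (an assumption, not the committed code):
// locationOnDisk = d.WriteToDocMap ( );                  //append to the docMap, get the offset back
// if ( locationOnDisk == -1 )
//     return false;                                      //could not write the document
// ( *docMapLookup )[ url.CompleteUrl ] = locationOnDisk; //remember where it lives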
/*
 * Takes a parsed url and checks the local in-memory hash table of documents:
 * if the url is already present (and was crawled recently enough), returns false;
 * if the url was crawled too long ago, needs reindexing, or does not exist yet,
 * indexes the doc and returns true.
 */
bool Spider::shouldURLbeCrawled( ParsedUrl url )
......
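The body of shouldURLbeCrawled is elided in this hunk. Given the members declared in spider.h below (docMapLookup, locationOnDisk, writeDocToDisk), a hedged sketch of the contract described in the comment above — illustrative only, not the committed code:

bool Spider::shouldURLbeCrawled ( ParsedUrl url )
	{
	//already indexed? then skip it
	auto entry = docMapLookup->find ( url.CompleteUrl );
	if ( entry != docMapLookup->end ( ) )
		return false;
	//new url: index the document and remember its offset on disk
	if ( writeDocToDisk ( url ) )
		( *docMapLookup )[ url.CompleteUrl ] = locationOnDisk;
	return true;
	}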
......@@ -9,6 +9,8 @@
#include<iostream>
#include <unordered_map>
#include "StreamReader.h"
#include "../parser/Parser.h"
#include "../util/util.h"
using namespace std;
......@@ -20,8 +22,9 @@ public:
Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in,
unordered_map < string, int > *doc_map_lookup_in )
: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in )
{ };
: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( Parser( url_q_in))
{
};
//Takes a url off of the url frontier
......@@ -45,6 +48,7 @@ public:
private:
int locationOnDisk;
Parser parser;
ProducerConsumerQueue < string > *urlFrontier;
string mode;
unordered_map < string, int > *docMapLookup;
......
//
// Created by Jake Close on 3/5/18.
//
#include "Parser.h"
/**
 * Parses an html string: pushes any urls found onto the url frontier
 * and sends any titles to the tokenizer
 * @param html
 * @param tokenizer
 */
// TODO: instead of grabbing each line, look for the beginning of title/url/anchor text, etc.,
// then continue until the close tag, and add to the tokenizer after the end of the tag is found
void Parser::parse ( string html, Tokenizer *tokenizer )
{
auto htmlIt = html.begin();
int offset = 0;
while (htmlIt != html.end())
{
// if open bracket
if ( *htmlIt == '<' )
{
auto begCloseTag = findNext ("</", htmlIt);
auto endCloseTag = findNext ( ">", begCloseTag);
string line (htmlIt, endCloseTag + 1);
htmlIt = endCloseTag + 1; //advance just past the closing '>'
// check if line is url
string url = extract_url ( line );
if (url != "")
{
urlFrontier->Push ( url );
}
// check if line is title
else
{
string title = extract_title ( line );
if (title != "")
{
tokenizer->execute ( title, offset );
}
}
//TODO fix offset?
offset = htmlIt - html.begin();
}
else
{
++htmlIt;
}
}
}
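For example, a call like the following would hand the title to the tokenizer and push the link onto the frontier (assuming Tokenizer is default-constructible, which this diff does not show):
// Parser parser ( &urlFrontier );
// Tokenizer tokenizer;
// parser.parse ( "<title>Joe the Cat</title><a href=\"http://x.com\">x</a>", &tokenizer );
// //"Joe the Cat" goes to the tokenizer; "http://x.com\"" (with the stray quote) is pushed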
/**
* Returns a url, or "" if none
* @param word
* @return
*/
string Parser::extract_url ( string word )
{
string url = "";
if ( *findStr ( "<a", word ) != '\0' )
{
auto foundHref = findStr ( "href", word );
auto foundHttp = findNext ( "http", foundHref );
if ( *foundHttp != '\0' )
{
url = "";
auto closeTag = findNext ( ">", word.begin ( ) );
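//copy from "http" up to the first '>' in the tag; note a closing quote, if present, comes along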
while ( *foundHttp != *closeTag )
{
url += *foundHttp;
++foundHttp;
}
}
}
return url;
}
/**
* Returns a title, or "" if none
* @param word
* @return
*/
string Parser::extract_title ( string & word )
{
string title = "";
char end = '<';
auto pos = findStr ( "<title>", word );
if ( *pos != '\0')
{
pos += 7;
while ( *pos != end )
{
title += *pos;
++pos;
}
}
return title;
}
......@@ -6,7 +6,7 @@
// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
//
#pragma once
#include <string>
#include <functional>
#include <queue>
......@@ -17,6 +17,7 @@
#include "../shared/Document.h"
#include "../shared/ProducerConsumerQueue.h"
using namespace std;
/**
......@@ -56,95 +57,24 @@ private:
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void parse ( string html, Tokenizer *tokenizer )
{
auto htmlIt = html.begin();
int offset = 0;
while (htmlIt != html.end())
{
// if open bracket
if ( *htmlIt == '<' )
{
auto begCloseTag = findNext ("</", htmlIt);
auto endCloseTag = findNext ( ">", begCloseTag);
string line (htmlIt, endCloseTag + 1);
htmlIt = endCloseTag + 2;
// check if line is url
string url = extract_url ( line );
if (url != "")
{
urlFrontier->Push ( url );
}
// check if line is title
else
{
string title = extract_title ( line );
if (title != "")
{
tokenizer->execute ( title, offset );
}
}
//TODO fix offset?
offset = htmlIt - html.begin();
}
else
{
++htmlIt;
}
}
void parse ( string html, Tokenizer *tokenizer );
}
/**
* Returns a url, or "" if none
* @param word
* @return
*/
string extract_url ( string word )
{
string url = "";
if ( *findStr ( "<a", word ) != '\0' )
{
auto foundHref = findStr ( "href", word );
auto foundHttp = findNext ( "http", foundHref );
if ( *foundHttp != '\0' )
{
url = "";
auto closeTag = findNext ( ">", word.begin ( ) );
while ( *foundHttp != *closeTag )
{
url += *foundHttp;
++foundHttp;
}
}
}
return url;
}
string extract_url ( string word );
/**
* Returns a title, or "" if none
* @param word
* @return
*/
string extract_title ( string & word )
{
string title = "";
char end = '<';
auto pos = findStr ( "<title>", word );
if ( *pos != '\0')
{
pos += 7;
while ( *pos != end )
{
title += *pos;
++pos;
}
}
return title;
}
string extract_title ( string & word );
};
......@@ -27,7 +27,10 @@ void testSimple ( )
{
ProducerConsumerQueue < string > urlFrontierTest;
Document document ( "<title>This Cat Title Cat</title>" );
ParsedUrl url = ParsedUrl("testurl.com");
char docString[10240];
strcpy(docString, "<title>This Cat Title Cat</title>");
Document document ( url, docString);
Parser parser ( &urlFrontierTest );
auto dictionary = parser.execute ( &document );
......@@ -49,13 +52,15 @@ void testComplex ( )
ProducerConsumerQueue < string > urlFrontierTest;
ifstream file("../tests/cats.html");
string temp;
string docString = "<title>Joe the Cat</title>\n";
docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";
char docString[10240];
strcpy(docString, "<title>Joe the Cat</title>\n");
strcat(docString, "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n");
while(std::getline(file, temp)) {
strcat(docString, temp.c_str()); //docString is now a char buffer, so append with strcat
}
Document document ( docString );
ParsedUrl url = ParsedUrl("https://www.w3schools.com/tests/cats.html");
Document document ( url, docString );
Parser parser ( &urlFrontierTest );
auto dictionary = parser.execute ( &document );
......
//
// Created by Jake Close on 3/5/18.
//
#include "Document.h"
string Document::DocToString ( )
{
return string ( docString, strlen ( docString ) ) + "\n";
}
int Document::WriteToDocMap ( )
{
pthread_mutex_lock ( &docMap_mutex );
//for now just write url
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "W" );
off_t resultPosition = 0;
try
{
//check if it's available
if ( file == -1 )
{
throw ( "error opening docMap" );
}
else
{
//get the current size of the docMap
size_t seekPosition = util::FileSize ( file );
//seek to the end of the file
resultPosition = lseek ( file, seekPosition, SEEK_SET );
if ( resultPosition == -1 )
{
throw ( "Could not seek" );
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
string docStr = this->DocToString ( );
ssize_t success = write ( file, docStr.c_str ( ), docStr.length ( ) );
if ( success == -1 )
{
throw ( "Error writing document object to document map" );
}
}
}
catch ( const char *str )
{
cerr << str << endl;
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return -1;
}
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return resultPosition;
}
void Document::PrintDocMap ( string url, int location )
{
pthread_mutex_lock ( &docMap_mutex );
std::cout << url << " is " << location;
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "R" );
//check if it's available (getFileDescriptor returns -1 on failure)
if ( file != -1 )
{
off_t resultPosition = lseek ( file, ( size_t ) location, SEEK_SET );
int bytes = 14; //fixed number of bytes to read back, for now
if ( bytes > 0 )
{
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( ( bytesRead = read ( file, buffer, bytes ) ) > 0 )
write ( 1, buffer, bytesRead );
else
{
cerr << "Could not read " << bytes << " bytes at position " <<
resultPosition << ", error = " << errno;
delete[] buffer;
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return;
}
delete[] buffer;
}
close ( file );
}
pthread_mutex_unlock ( &docMap_mutex );
return;
}
\ No newline at end of file
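Taken together, a hypothetical round trip through the new Document API (illustrative only; the url is made up):
// Document d ( ParsedUrl ( "http://example.com" ) );
// int pos = d.WriteToDocMap ( );                       //append the url to the docMap, get its offset
// Document::PrintDocMap ( "http://example.com", pos ); //seek to that offset and echo the record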
......@@ -8,6 +8,7 @@
#include <string>
#include <vector>
#include <pthread.h>
#include "../util/util.h"
using namespace std;
......@@ -23,107 +24,26 @@ class Document
{
private:
ParsedUrl url;
char *docString;
long docID;
bool lastCrawlStatus;
int lastCrawlDate;
int lastCrawlPageCount;
int lastCrawlWordCount;
//add more info fields here
public:
Document ( string url_in ) : url ( ParsedUrl ( url_in ) )
{ }
string DocToString ( )
{
return string ( url.CompleteUrl, strlen ( url.CompleteUrl ) ) + "\n";
}
int WriteToDocMap ( )
{
pthread_mutex_lock ( &docMap_mutex );
//for now just write url
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "W" );
off_t resultPosition = 0;
try
{
//check if its available
if ( file == -1 )
{
throw ( "error opening docMap" );
}
else
{
//get the current size of the docMap
size_t seekPosition = util::FileSize ( file );
//seek to the end of the file
resultPosition = lseek ( file, seekPosition, SEEK_SET );
if ( resultPosition == -1 )
{
throw ( "Could not seek" );
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
size_t success = write ( file, this->DocToString ( ).c_str ( ),
strlen ( this->DocToString ( ).c_str ( ) ) );
if ( success == -1 )
{
throw ( "Error writing document object to document map" );
}
}
}
catch ( const char *str )
{
cerr << str << endl;
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return -1;
}
close ( file );
pthread_mutex_unlock ( &docMap_mutex );
return resultPosition;
}
static void PrintDocMap ( string url, int location )
{
pthread_mutex_lock ( &docMap_mutex );
Document( ParsedUrl url_in ) : url((url_in)), docString( nullptr )
{ }
std::cout << url << " is " << location;
Document( ParsedUrl url_in, char *docStringIn ) : url((url_in)), docString( docStringIn )
{ }
string loc = util::GetCurrentWorkingDir ( ) + filepath::DOC_MAP;
int file = util::getFileDescriptor ( loc.c_str ( ), "R" );
string DocToString();
int WriteToDocMap();
//check if its available
if ( file )
{
off_t resultPosition = lseek ( file, ( size_t ) location, SEEK_SET );
int bytes = 14;
if ( bytes > 0 )
{
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( bytesRead = read ( file, buffer, bytes ) )
write ( 1, buffer, bytesRead );
else
{
cerr << "Could not read " << bytes << " bytes at position " <<
resultPosition << ", error = " << errno;
pthread_mutex_unlock ( &docMap_mutex );
return;
}
}
}
pthread_mutex_unlock ( &docMap_mutex );
return;
}
static void PrintDocMap( string url, int location );
};
\ No newline at end of file
......@@ -2,9 +2,8 @@
// Created by Veronica Day on 2/22/18.
//
#ifndef EECS398_SEARCH_STEMMER_H
#define EECS398_SEARCH_STEMMER_H
#pragma once
class Stemmer
{
......@@ -12,4 +11,3 @@ class Stemmer
};
#endif //EECS398_SEARCH_STEMMER_H
//
// Created by anvia on 1/31/2018.
//
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
......
......@@ -2,9 +2,7 @@
// Created by anvia on 1/31/2018.
//
#ifndef EECS398_SEARCH_STRINGPROCESSING_H
#define EECS398_SEARCH_STRINGPROCESSING_H
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
......@@ -201,4 +199,3 @@ string stemWord(string word)
return "";
}
#endif //EECS398_SEARCH_STRINGPROCESSING_H