Commit 3cc58b8e authored by jsclose's avatar jsclose

removed the on-disk doc map lookup stuff, and created an isolated crawler test

parent 8b26f280
......@@ -46,6 +46,27 @@ add_executable(crawler-parser-indexer-test
util/stringProcessing.cpp
indexer/Indexer.cpp)
add_executable(isolated-integration
crawler/tests/crawlerTest.cpp
shared/ProducerConsumerQueue.h
shared/ThreadClass.h
shared/url.h
crawler/crawler.cpp
crawler/Readers/StreamReader.h
crawler/Readers/HttpReader.cpp
crawler/Readers/HttpsReader.cpp
crawler/Readers/LocalReader.cpp
crawler/spider.cpp
util/util.cpp
shared/Document.cpp
parser/Parser.cpp
util/Stemmer.cpp
util/Tokenizer.cpp
util/stringProcessing.cpp
indexer/Indexer.cpp)
add_executable(StringProcessingTest
util/stringProcessing.cpp
util/Stemmer.cpp
......@@ -92,6 +113,8 @@ find_package(OpenSSL REQUIRED)
target_link_libraries(ParserTest OpenSSL::SSL)
target_link_libraries(isolated-integration OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-test OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-indexer-test OpenSSL::SSL pthread)
......
......@@ -4,18 +4,32 @@
#include "crawler.h"
void Crawler::SpawnSpiders ( size_t num_spiders, unordered_map< string, int > *docMapLookup,
unordered_map< size_t, int > *duplicateUrlMap )
/*
 * @param num_spiders  number of spiders to create
 *
 * Creates num_spiders spiders and starts a new thread for each one.
 */
void Crawler::SpawnSpiders ( size_t num_spiders,
unordered_map< size_t, int > *duplicateUrlMap
)
{
for ( size_t i = 0; i < num_spiders; i++ )
{
Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap, this->IndexerQueue );
Spider *temp = new Spider( this->mode, this->urlFrontier, duplicateUrlMap, this->IndexerQueue );
temp->StartThread( );
this->spiders.push_back( temp );
}
}
/*
 * Waits for all of the spider threads to finish running.
 */
void Crawler::WaitOnAllSpiders ( )
{
cout << "Waiting for spiders to finish...\n";
......
......@@ -17,13 +17,16 @@ class Crawler
{
public:
Crawler ( string mode_in, ProducerConsumerQueue< ParsedUrl > *url_q_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
: IndexerQueue( doc_index_queue_in ), mode( mode_in ), urlFrontier( url_q_in )
Crawler ( string mode_in,
ProducerConsumerQueue< ParsedUrl > *url_q_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
: IndexerQueue( doc_index_queue_in ),
mode( mode_in ),
urlFrontier( url_q_in )
{ };
//Spawns a number of worker spiders
void SpawnSpiders ( size_t num_spiders, unordered_map< string, int > *docMapLookup,
void SpawnSpiders ( size_t num_spiders,
unordered_map< size_t, int > *duplicateUrlMap );
//Creates a housekeeping thread
......
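Taken together, the constructor and SpawnSpiders changes mean callers never build or pass a doc-map lookup any more. Below is a minimal sketch of the new call pattern; it mirrors the isolated crawler test added later in this commit, and the "web" mode string, single spider, and repo-root include paths are assumptions borrowed from that test rather than requirements of the API.

#include <string>
#include <unordered_map>
#include <vector>
#include "shared/ProducerConsumerQueue.h"
#include "shared/url.h"
#include "crawler/crawler.h"
#include "indexer/Indexer.h"

using namespace std;
using DocIndex = const unordered_map< string, vector< unsigned long > >;

int main ( )
    {
    // Only the duplicate-URL map survives; the on-disk doc map lookup is gone.
    unordered_map< size_t, int > *duplicateUrlMap = new unordered_map< size_t, int >( );
    ProducerConsumerQueue< ParsedUrl > *urlFrontier = new ProducerConsumerQueue< ParsedUrl >( );
    ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );

    Indexer indexer( IndexerQueue );
    indexer.StartThread( );

    Crawler crawler( "web", urlFrontier, IndexerQueue );
    crawler.SpawnSpiders( 1, duplicateUrlMap );   // no docMapLookup argument any more
    crawler.WaitOnAllSpiders( );
    indexer.WaitForFinish( );

    delete urlFrontier;
    return 0;
    }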
......@@ -79,7 +79,7 @@ void Spider::run ( )
std::cout << "Spider is crawling" << endl;
int cond = 0;
while ( cond < 25 )
while ( cond < 1 )
{
ParsedUrl currentUrl = getUrl( );
size_t docID = hash( currentUrl.CompleteUrl );
......@@ -122,8 +122,11 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
* Takes in a parsed url, creates a document object, and writes information about the document to disk.
* Returns the beginning position of the document on disk and stores it in the in-memory lookup hash table.
*/
bool Spider::writeDocToDisk ( ParsedUrl url )
{
/*
Document d( url );
int resultPosition = d.WriteToDocMap( );
if ( resultPosition == -1 )
......@@ -136,7 +139,7 @@ bool Spider::writeDocToDisk ( ParsedUrl url )
{
std::cout << it->first << " => " << it->second << '\n';
}
*/
return true;
}
......
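With the on-disk doc map gone, the spider's only duplicate detection is the in-memory duplicateUrlMap keyed by the size_t hash of the complete URL (the docID computed above). The code that actually consults the map falls outside this hunk, so the helper below is a hypothetical sketch of that check; the name markUrlIfNew and the use of std::hash stand in for whatever spider.cpp really does.

#include <functional>
#include <string>
#include <unordered_map>

// Hypothetical helper: records a URL's hash the first time it is seen and
// reports whether it was new. Not the committed spider.cpp code.
static bool markUrlIfNew ( const std::string &completeUrl,
                           std::unordered_map< size_t, int > *duplicateUrlMap )
    {
    size_t docID = std::hash< std::string >{ }( completeUrl );
    if ( duplicateUrlMap->count( docID ) )
        return false;                      // duplicate: already crawled or queued
    ( *duplicateUrlMap )[ docID ] = 1;     // first sighting: remember and crawl it
    return true;
    }

Keying on a size_t hash keeps the map to one entry per seen URL and avoids the per-URL disk lookups the old doc map implied.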
......@@ -24,13 +24,11 @@ public:
Spider ( string mode_in,
ProducerConsumerQueue< ParsedUrl > *url_q_in,
unordered_map< string, int > *doc_map_lookup_in,
unordered_map< size_t, int > *duplicate_url_map_in,
ProducerConsumerQueue< DocIndex * > *doc_index_queue_in
)
: mode( mode_in ),
urlFrontier( url_q_in ),
docMapLookup( doc_map_lookup_in ),
parser( url_q_in ),
duplicateUrlMap( duplicate_url_map_in ),
IndexerQueue( doc_index_queue_in )
......@@ -60,7 +58,6 @@ private:
ProducerConsumerQueue< DocIndex * > *IndexerQueue;
unordered_map< size_t, int > *duplicateUrlMap;
string mode;
unordered_map< string, int > *docMapLookup;
Parser parser;
};
\ No newline at end of file
//
// Created by Jake Close on 3/21/18.
//
#include "../crawler.h"
#include "../spider.h"
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <queue>
#include "../../crawler/crawler.h"
#include <openssl/ssl.h>
#include <string>
#include <unordered_map>
#include "../../util/util.h"
#include <getopt.h>
#include "../../indexer/Indexer.h"
using DocIndex = const unordered_map< string, vector< unsigned long > >;
using namespace std;
int main ( int argc, char *argv[] )
{
int numberOfSpiders = 1;
unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );
ProducerConsumerQueue < ParsedUrl > *urlFrontier = new ProducerConsumerQueue < ParsedUrl >( );
ProducerConsumerQueue < DocIndex * > *IndexerQueue = new ProducerConsumerQueue < DocIndex * >( );
Indexer indexer( IndexerQueue );
indexer.StartThread( );
string mode = "web";
Crawler crawler( mode, urlFrontier, IndexerQueue );
crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap );
crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( );
auto f = urlFrontier->Pop( );
int x = 0;
delete urlFrontier;
return 0;
}
\ No newline at end of file
File added
......@@ -132,7 +132,6 @@ int main ( int argc, char *argv[] )
ParsedUrl url = ParsedUrl( testFile );
urlFrontier->Push( url );
}
unordered_map< string, int > *docMapLookUp = new unordered_map< string, int >( );
Indexer indexer( IndexerQueue );
......@@ -140,7 +139,7 @@ int main ( int argc, char *argv[] )
Crawler crawler( mode, urlFrontier, IndexerQueue );
crawler.SpawnSpiders( numberOfSpiders, docMapLookUp, duplicateUrlMap );
crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap );
crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( );
......
https://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:cars
https://wikipedia.org/wiki/71st_British_Academy_Film_Awards
http://www.bbc.com/
https://www.eecs.umich.edu/
......