diff --git a/crawler/crawler.h b/crawler/crawler.h
index f4c04a7655226ec173c4ac2d6747b24199fdd2cf..b8aac95b514594c1180c873fbd9fa9c57b314df2 100644
--- a/crawler/crawler.h
+++ b/crawler/crawler.h
@@ -1,26 +1,45 @@
+#pragma once
-
+#include <vector>
+#include <string>
+#include <ProducerConsumerQueue.h>
-/*
- *
- * Must provide - Robustness:
-The Web contains servers that create spider traps, which are generators of web pages that mislead crawlers into getting stuck fetching an infinite number of pages in a particular domain. Crawlers must be designed to be resilient to such traps. Not all such traps are malicious; some are the inadvertent side-effect of faulty website development.
-Politeness:
-Web servers have both implicit and explicit policies regulating the rate at which a crawler can visit them. These politeness policies must be respected.
- */
-
+using namespace std;
+
+// Forward declaration: spider.h includes this header, so including spider.h
+// back here would create a circular include.
+class Spider;
+
 class Crawler {
-    //robots.txt cache
+    vector<Spider*> spiders;
+
 public:
+    string mode;
+
+    ProducerConsumerQueue *urlFrontier;
+    ProducerConsumerQueue *fileQueue;
+
+    // Spawns a number of worker spiders; defined at the bottom of spider.h,
+    // where Spider is a complete type.
+    void spawnSpiders(size_t numberOfSpiders);
+
+    // Creates a housekeeping thread
+    void houseKeeper();
+
+    Crawler(string mode_in, ProducerConsumerQueue *url_q_in, ProducerConsumerQueue *html_q_in)
+        : mode(mode_in), urlFrontier(url_q_in), fileQueue(html_q_in) {}
+};
diff --git a/crawler/spider.h b/crawler/spider.h
new file mode 100644
index 0000000000000000000000000000000000000000..84c22a8c610bf4fe13484e22e65f09da840a7015
--- /dev/null
+++ b/crawler/spider.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <pthread.h>
+#include "crawler.h"
+
+using namespace std;
+
+class Spider : public Crawler {
+
+private:
+    int locationOnDisk;
+    pthread_t runningThread;
+
+public:
+
+    // Takes a url off of the url frontier
+    string getUrl()
+    {
+        return urlFrontier->Pop();
+    }
+
+    // pthread_create cannot call a non-static member function directly, so
+    // this static trampoline casts the argument back to Spider* and runs it.
+    static void *start(void *arg)
+    {
+        return static_cast<Spider *>(arg)->run();
+    }
+
+    void *run()
+    {
+        while (true)
+        {
+            string currentUrl = getUrl();
+            if (request(currentUrl))
+            {
+                // markURLSeen( currentUrl );
+                // writeHTMLtoDisk( );
+                // addHTMLToQueue( );
+            }
+            else
+            {
+                cerr << "Error connecting" << endl;
+            }
+        }
+        return nullptr;
+    }
+
+    // Makes a request to the given url.
+    // If successful, writes the file to disk and stores its location in a member;
+    // otherwise returns false with error information, retrying if necessary.
+    bool request(string url)
+    {
+        if (mode == "local")
+        {
+            ifstream inFile;
+            string in;
+            inFile.open(url);
+            if (!inFile)
+            {
+                cerr << "Unable to open file" << endl;
+                return false;
+            }
+
+            while (inFile >> in)
+            {
+                cout << in << endl;
+            }
+
+            inFile.close();
+            return true;
+        }
+        // Web mode is not implemented yet.
+        return false;
+    }
+
+    // Where to write to disk? What type of data are we reading in?
+    void writeHTMLtoDisk();
+
+    // Adds location
+    void addHTMLToQueue();
+
+    void markURLSeen(string URL);
+
+    Spider(string mode_in, ProducerConsumerQueue *url_q_in, ProducerConsumerQueue *html_q_in)
+        : Crawler(mode_in, url_q_in, html_q_in)
+    {
+        cout << "SPAWNING NEW SPIDER " << endl;
+        pthread_create(&runningThread, NULL, Spider::start, this);
+    }
+};
+
+// Defined here rather than in crawler.h so that Spider is a complete type.
+inline void Crawler::spawnSpiders(size_t numberOfSpiders)
+{
+    for (size_t i = 0; i < numberOfSpiders; i++)
+    {
+        Spider *temp = new Spider(this->mode, this->urlFrontier, this->fileQueue);
+        this->spiders.push_back(temp);
+    }
+}
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 1ab0f90f62551dc9fac8d28103e35f503dd30aee..ba561eb14eb10531f03eb1a78dd73c7bb92c7789 100644
--- a/main.cpp
+++ b/main.cpp
@@ -2,12 +2,71 @@
 //  main.cpp
 //
+
+
 #include <iostream>
 #include <stdlib.h>
 #include <pthread.h>
 #include <queue>
+#include "crawler/crawler.h"
+#include "crawler/spider.h"
+#include <string>
+#include <ProducerConsumerQueue.h>
+#include <ProducerConsumerQueue.cpp>
+
+
+#define PATH_TO_BLACKLIST "/bin/blacklist.txt"
+#define PATH_TO_VISITED_URL "bin/urls.txt"
+#define PATH_TO_HTML_DIR "bin/html/"
+#define PATH_TO_INDEX "bin/index/wordIDX"
+#define PATH_TO_DOC_INDEX "bin/index/docIDX"
+
+
+using namespace std;
+
 int main(int argc, const char * argv[]) {
+    /*
+     * Settings flags to control the program on start up,
+     * read in via the command line with default settings:
+     *
+     *   string : mode            : getting content from the web vs local files
+     *   string : seed            : filename of the list of starting urls
+     *   int    : numberOfSpiders : # of spiders the crawler spawns
+     *   int    : numberOfParsers : # of parsers spawned
+     *   bool   : restoreFromLog  : whether the program should load from saved state
+     */
+
+    string mode = "local";
+    // Seed urls?
+    string seed;
+    int numberOfSpiders;
+    int numberOfParsers;
+    bool restoreFromLog;
+
+    ProducerConsumerQueue urlFrontier;
+    ProducerConsumerQueue fileQueue;
+
+    urlFrontier.Push("tests/cats.html");
+
+    Crawler crawler(mode, &urlFrontier, &fileQueue);
+
+    crawler.spawnSpiders(1);
+
+    // crawler.houseKeeper();
+
+    return 0;
 }
\ No newline at end of file
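
Note for reviewers: this change relies on a thread-safe ProducerConsumerQueue (Push from the producer side, a blocking Pop on the consumer side) pulled in via <ProducerConsumerQueue.h>, but that class is not part of the diff. A minimal sketch of the interface the crawler and spiders assume is shown below; the pthread-based implementation and member names are illustrative assumptions, not the project's actual header.

    // ProducerConsumerQueue.h (hypothetical sketch, not the project's real header)
    #pragma once

    #include <queue>
    #include <string>
    #include <pthread.h>

    // Thread-safe FIFO shared by the crawler, spiders, and parsers:
    // Push() enqueues an item, Pop() blocks until one is available.
    class ProducerConsumerQueue {
    public:
        ProducerConsumerQueue() {
            pthread_mutex_init(&lock, nullptr);
            pthread_cond_init(&notEmpty, nullptr);
        }

        void Push(std::string item) {
            pthread_mutex_lock(&lock);
            items.push(item);
            pthread_cond_signal(&notEmpty);   // wake one waiting consumer
            pthread_mutex_unlock(&lock);
        }

        std::string Pop() {
            pthread_mutex_lock(&lock);
            while (items.empty())             // guard against spurious wakeups
                pthread_cond_wait(&notEmpty, &lock);
            std::string front = items.front();
            items.pop();
            pthread_mutex_unlock(&lock);
            return front;
        }

    private:
        std::queue<std::string> items;
        pthread_mutex_t lock;
        pthread_cond_t notEmpty;
    };

Because Pop() waits on a condition variable, each spider thread sleeps until the url frontier has work, which matches how Spider::run() loops on getUrl().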