// crawler.h
#pragma once

#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>

#include "../shared/ProducerConsumerQueue.h"
#include "UrlFrontier.h"
#include "spider.h"

//#include "CrawlerStatistics.h"
using namespace std;
using DocIndex = const unordered_map< string, vector< unsigned long > >;

class Crawler
	{

public:
	Crawler ( string mode_in,
				 UrlFrontier *url_q_in,
				 ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
			: urlFrontier( url_q_in ),
			  IndexerQueue( doc_index_queue_in ),
			  mode( mode_in )
		{ }

	// Spawns num_spiders worker threads ("spiders") that fetch URLs
	// (see the usage sketch after the class definition).
	void SpawnSpiders ( size_t num_spiders, atomic_bool *alive );

	// Creates the housekeeping thread (see the notes at the end of this file).
	void houseKeeper ( );

	void KillAllSpiders ( );

	void WaitOnAllSpiders ( );
	UrlFrontier  *urlFrontier;


private:
	vector< Spider * > spiders;
	//UrlFrontier  *urlFrontier;
	ProducerConsumerQueue< DocIndex * > *IndexerQueue;
	//CrawlerStatistics housekeeper;
	string mode;

	};
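// A minimal usage sketch (illustration only, excluded from the build with #if 0).
// It assumes UrlFrontier and ProducerConsumerQueue are default-constructible and
// that "web" is an accepted mode string; adjust to the real project wiring.
#if 0
#include <atomic>

int main ( )
	{
	UrlFrontier frontier;
	ProducerConsumerQueue< DocIndex * > indexerQueue;
	atomic_bool alive( true );

	Crawler crawler( "web", &frontier, &indexerQueue );

	crawler.SpawnSpiders( 8, &alive );    // start eight worker threads
	crawler.houseKeeper( );               // start the stats / checkpoint thread

	// ... run until some termination condition clears `alive` ...

	crawler.WaitOnAllSpiders( );          // join the workers before shutting down
	return 0;
	}
#endif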


// spiders : worker threads that do the actual fetching of URLs.
// houseKeeper : this thread is generally quiescent except that it wakes up once every
//               few seconds to log crawl progress statistics (URLs crawled, frontier
//               size, etc.), decide whether to terminate the crawl, or (once every few
//               hours of crawling) checkpoint the crawl. In checkpointing, a snapshot
//               of the crawler's state (say, the URL frontier) is committed to disk.
//               In the event of a catastrophic crawler failure, the crawl is restarted
//               from the most recent checkpoint.
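
// A rough sketch of that housekeeping loop (illustration only, excluded from the
// build with #if 0). The frontier accessors Size( ) and WriteDataToDisk( ), the
// 5-second wake-up, and the 3-hour checkpoint interval are assumptions, not part
// of this header's interface.
#if 0
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

void HousekeeperLoop ( Crawler *crawler, atomic_bool *alive )
	{
	using namespace std::chrono;
	auto lastCheckpoint = steady_clock::now( );

	while ( alive->load( ) )
		{
		this_thread::sleep_for( seconds( 5 ) );               // stay quiescent between wake-ups

		size_t frontierSize = crawler->urlFrontier->Size( );  // assumed accessor
		cerr << "frontier size: " << frontierSize << endl;    // log crawl progress

		if ( frontierSize == 0 )                              // example termination test
			alive->store( false );

		if ( steady_clock::now( ) - lastCheckpoint > hours( 3 ) )
			{
			crawler->urlFrontier->WriteDataToDisk( );         // assumed checkpoint hook
			lastCheckpoint = steady_clock::now( );
			}
		}
	}
#endif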