// crawler.h — crawl driver: spawns Spider worker threads and a housekeeping thread.
#pragma once

#include <string>
#include <utility>
#include <vector>

#include "../ProducerConsumerQueue.h"
#include "CrawlerStatistics.h"
#include "spider.h"
/*
 *
 */
using namespace std;

class Crawler {

public:
    Crawler(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<int>* html_q_in)
            : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in) {  } ;

    //spawns a number of works
    void SpawnSpiders(size_t num_spiders);

    //Creates a housekeeping thread
    void houseKeeper();

    void WaitOnAllSpiders();

private:
    vector<Spider*> spiders;
    ProducerConsumerQueue<string> *urlFrontier;
    ProducerConsumerQueue<int> *fileQueue;
    CrawlerStatistics housekeeper;
    string mode;

};


//spiders : threads doing work of fetching urls
//houseKeeper : This thread is generally quiescent except that it wakes up once every few seconds to log crawl progress statistics
// (URLs crawled, frontier size, etc.), decide whether to terminate the crawl, or (once every few hours of crawling) checkpoint the crawl. In checkpointing, a snapshot of the crawler's state (say, the URL frontier) is committed to disk. In the event of a catastrophic crawler failure, the crawl is restarted from the most recent checkpoint.