Skip to content
Snippets Groups Projects
crawler.h 1.44 KiB
Newer Older
  • Learn to ignore specific revisions
  • #pragma once
    
    jsclose's avatar
    jsclose committed
    
    
    #include<vector>
    #include "spider.h"
    #include<string>
    
    jsclose's avatar
    jsclose committed
    #include "../shared/ProducerConsumerQueue.h"
    
    #include <unordered_map>
    
    jsclose's avatar
    jsclose committed
    #include "UrlFrontier.h"
    
    jsclose's avatar
    jsclose committed
    //#include "CrawlerStatistics.h"
    
    jsclose's avatar
    jsclose committed
    /*
     *
     */
    
    // NOTE(review): this using-directive sits at header scope, so it is pulled into
    // every translation unit that includes this file.
    using namespace std;

    // DocIndex: a const unordered_map from a string key to a vector of unsigned long values.
    // Because the alias itself is const, a `DocIndex *` is a pointer to a read-only map.
    // NOTE(review): presumably maps a term to its positions within one crawled document — confirm against the indexer.
    using DocIndex = const unordered_map< string, vector< unsigned long > >;
    
    jsclose's avatar
    jsclose committed
    
    
    jsclose's avatar
    jsclose committed
    class Crawler
    	{
    
    jsclose's avatar
    jsclose committed
    
    public:
    
    jsclose's avatar
    jsclose committed
    				 UrlFrontier  *url_q_in,
    
    				 ProducerConsumerQueue< DocIndex * > *doc_index_queue_in )
    			: IndexerQueue( doc_index_queue_in ),
    			  mode( mode_in ),
    			  urlFrontier( url_q_in )
    
    jsclose's avatar
    jsclose committed
    		{ };
    
    jsclose's avatar
    jsclose committed
    	//spawns a number of works
    
    	void SpawnSpiders ( size_t num_spiders, atomic_bool* alive );
    
    jsclose's avatar
    jsclose committed
    	//Creates a housekeeping thread
    
    vcday's avatar
    vcday committed
    	void houseKeeper ( );
    
    vcday's avatar
    vcday committed
    	void KillAllSpiders ( );
    
    	void WaitOnAllSpiders ( );
    
    	UrlFrontier  *urlFrontier;
    
    
    jsclose's avatar
    jsclose committed
    
    
    benbergk's avatar
    benbergk committed
    private:
    
    vcday's avatar
    vcday committed
    	vector< Spider * > spiders;
    
    	//UrlFrontier  *urlFrontier;
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue< DocIndex * > *IndexerQueue;
    
    jsclose's avatar
    jsclose committed
    	//CrawlerStatistics housekeeper;
    	string mode;
    
    jsclose's avatar
    jsclose committed
    
    
    jsclose's avatar
    jsclose committed
    	};
    
    jsclose's avatar
    jsclose committed
    
    
    // spiders     : worker threads that do the work of fetching URLs.
    // houseKeeper : a thread that is generally quiescent, except that it wakes up once every few seconds to log crawl progress statistics
    // (URLs crawled, frontier size, etc.), decide whether to terminate the crawl, or (once every few hours of crawling) checkpoint the crawl. In checkpointing, a snapshot of the crawler's state (say, the URL frontier) is committed to disk. In the event of a catastrophic crawler failure, the crawl is restarted from the most recent checkpoint.