// spider.h
    #pragma once

    #include <pthread.h>

    #include <atomic>
    #include <cstddef>
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    #include "../shared/ProducerConsumerQueue.h"
    #include "../shared/ThreadClass.h"
    #include "Readers/StreamReader.h"
    #include "../util/util.h"
    #include "../parser/Parser.h"
    
    
    // NOTE: a header-scope using-directive leaks every std name into each file
    // that includes this header. It is kept because the declarations in this
    // header are written with unqualified std names (string, vector, ...).
    using namespace std;


    // Read-only map from a string key to a list of unsigned long values.
    // The alias itself is const-qualified, so a DocIndex cannot be modified
    // through this type.
    // NOTE(review): presumably word -> positions for one document; confirm
    // against the code that fills the map.
    using DocIndex = const unordered_map< string, vector< unsigned long > >;

    class Spider : public ThreadClass
    	{
    
    vcday's avatar
    vcday committed
    	Spider ( string mode_in,
    
    jsclose's avatar
    jsclose committed
    				UrlFrontier  *url_q_in,
    
    	         ProducerConsumerQueue< DocIndex * > *doc_index_queue_in,
    				atomic_bool * bool_in
    
    vcday's avatar
    vcday committed
    	)
    
    			: mode( mode_in ),
    			  urlFrontier( url_q_in ),
    
    vcday's avatar
    vcday committed
    			  parser( url_q_in ),
    
    			  IndexerQueue( doc_index_queue_in ),
    			  alive( bool_in )
    
    jsclose's avatar
    jsclose committed
    	//Takes a url off of the url frontier
    
    jsclose's avatar
    jsclose committed
    	ParsedUrl * getUrl ( );
    
    vcday's avatar
    vcday committed
    
    	virtual void run ( );
    
    	bool writeDocToDisk ( ParsedUrl url );
    
    vcday's avatar
    vcday committed
    	bool shouldURLbeCrawled ( size_t docId );
    
    vcday's avatar
    vcday committed
    	size_t hash ( const char *s );
    
    benbergk's avatar
    benbergk committed
    	//int getRobots(ParsedUrl url );
    
    vcday's avatar
    vcday committed
    	bool checkRobots ( ParsedUrl url );
    
    benbergk's avatar
    benbergk committed
    private:
    
    jsclose's avatar
    jsclose committed
    	int locationOnDisk;
    
    jsclose's avatar
    jsclose committed
    	UrlFrontier *urlFrontier;
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue< DocIndex * > *IndexerQueue;
    
    jsclose's avatar
    jsclose committed
    	string mode;
    
    vcday's avatar
    vcday committed
    	Parser parser;
    
    jsclose's avatar
    jsclose committed
    	};