// spider.h - declaration of the Spider class.
#pragma once


#include<string>
#include <pthread.h>
#include <fstream>
#include "../shared/ProducerConsumerQueue.h"
#include "../shared/ThreadClass.h"
#include <iostream>
#include <unordered_map>
#include "Readers/StreamReader.h"
#include "../util/util.h"
#include "../parser/Parser.h"




using namespace std;


class Spider : public ThreadClass
	{

public:

	Spider( string mode_in, ProducerConsumerQueue < ParsedUrl > *url_q_in,
			  unordered_map < string, int > *doc_map_lookup_in, unordered_map < size_t, int > *duplicate_url_map_in )
			: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( url_q_in), duplicateUrlMap(duplicate_url_map_in)
		{
		};


	//Takes a url off of the url frontier
	ParsedUrl getUrl();

	virtual void FuncToRun();

	bool writeDocToDisk(ParsedUrl url);

	bool shouldURLbeCrawled( size_t docId );
	size_t hash(const char * s);
	//int getRobots(ParsedUrl url );
	bool checkRobots(ParsedUrl url);

private:

	int locationOnDisk;
	ProducerConsumerQueue < ParsedUrl > *urlFrontier;
	unordered_map < size_t, int > *duplicateUrlMap;
	string mode;
	unordered_map < string, int > *docMapLookup;
	Parser parser;

	};