Commit 5d939592 authored by jsclose

initial spider + crawler classes

parent dfda4202
Branches: front-end
#pragma once

#include <vector>
#include <string>
#include <ProducerConsumerQueue.h>
// NOTE: spider.h includes crawler.h back (Spider inherits from Crawler), so
// these two headers are circular as written; breaking the cycle (for example
// by moving spawnSpiders' body into a .cpp) is left for a later commit.
#include "spider.h"
/*
 * Must provide:
 *
 * Robustness:
 * The Web contains servers that create spider traps, which are generators of
 * web pages that mislead crawlers into getting stuck fetching an infinite
 * number of pages in a particular domain. Crawlers must be designed to be
 * resilient to such traps. Not all such traps are malicious; some are the
 * inadvertent side effect of faulty website development.
 *
 * Politeness:
 * Web servers have both implicit and explicit policies regulating the rate at
 * which a crawler can visit them. These politeness policies must be respected.
 */
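// A minimal sketch (not part of the Crawler/Spider classes in this commit) of
// one way the two requirements above could be enforced: cap the number of
// pages fetched per host (trap resilience) and require a fixed delay between
// requests to the same host (politeness). The class name, member names, and
// limits below are placeholders, not an API used elsewhere in this commit.
#include <unordered_map>
#include <ctime>

class PolitenessPolicy {
public:
    // Returns true and records the visit if fetching 'host' right now would
    // respect both the per-host page budget and the per-host crawl delay.
    bool mayFetch( const std::string &host )
    {
        if ( pagesFetched[ host ] >= maxPagesPerHost )
            return false;                                   // likely a spider trap
        std::time_t now = std::time( nullptr );
        auto last = lastFetch.find( host );
        if ( last != lastFetch.end( ) && now - last->second < crawlDelaySeconds )
            return false;                                   // too soon; be polite
        lastFetch[ host ] = now;
        pagesFetched[ host ]++;
        return true;
    }

private:
    std::unordered_map< std::string, std::time_t > lastFetch;
    std::unordered_map< std::string, int > pagesFetched;
    int maxPagesPerHost = 10000;
    int crawlDelaySeconds = 1;
};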
using namespace std;
class Crawler {
    // robots.txt cache
    vector< Spider * > spiders;

public:
    string mode;
    ProducerConsumerQueue *urlFrontier;
    ProducerConsumerQueue *fileQueue;
    // Spawns the given number of worker spiders, each sharing the crawler's
    // mode and queues and running on its own thread.
    void spawnSpiders( size_t numberOfSpiders )
    {
        for ( size_t i = 0; i < numberOfSpiders; i++ )
        {
            Spider *temp = new Spider( mode, urlFrontier, fileQueue );
            this->spiders.push_back( temp );
        }
    }
    // Creates a housekeeping thread
    void houseKeeper();

    Crawler( string mode_in, ProducerConsumerQueue *url_q_in, ProducerConsumerQueue *html_q_in )
            : mode( mode_in ), urlFrontier( url_q_in ), fileQueue( html_q_in ) { }
......
#pragma once

#include <string>
#include <iostream>
#include <fstream>
#include <pthread.h>
#include "crawler.h"

using namespace std;
class Spider : public Crawler {
private:
    int locationOnDisk;
    pthread_t runningThread;

public:
    // Takes a url off of the url frontier
    string getUrl( )
    {
        return urlFrontier->Pop( );
    }
    // Main crawl loop: pull a url, fetch it, and (eventually) record and
    // hand off the result. Runs forever on the spider's thread.
    void *run( )
    {
        while ( true )
        {
            string currentUrl = getUrl( );
            if ( request( currentUrl ) )
            {
                // markURLSeen( currentUrl );
                // writeHTMLtoDisk( );
                // addHTMLToQueue( );
            }
            else
            {
                cerr << "Error connecting to " << currentUrl << endl;
            }
        }
    }

    // pthread entry point: pthread_create cannot call a non-static member
    // function directly, so this static wrapper forwards to run().
    static void *runEntry( void *arg )
    {
        return static_cast< Spider * >( arg )->run( );
    }
    // Makes a request to the given url.
    // If successful, writes the file to disk, stores the location in a member
    // value, and returns true; else returns false (retry if necessary).
    bool request( string url )
    {
        if ( mode == "local" )
        {
            // Local mode: the "url" is a path to a file on disk.
            ifstream inFile;
            string in;
            inFile.open( url );
            if ( !inFile )
            {
                cerr << "Unable to open file: " << url << endl;
                return false;
            }
            while ( inFile >> in )
            {
                cout << in << endl;
            }
            inFile.close( );
            return true;
        }
        // Web mode is not implemented yet.
        return false;
    }
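    // A possible retry wrapper around request(), sketched here because the
    // comment above mentions retrying on failure; the attempt count is an
    // arbitrary placeholder and nothing else in this commit calls this helper.
    bool requestWithRetry( string url, int maxAttempts = 3 )
    {
        for ( int attempt = 0; attempt < maxAttempts; attempt++ )
        {
            if ( request( url ) )
                return true;
        }
        return false;
    }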
    // Where to write to disk? What type of data are we reading in?
    void writeHTMLtoDisk( );

    // Adds the location of the written file to the file queue
    void addHTMLToQueue( );

    void markURLSeen( string URL );
    // Each spider is constructed with the crawler's mode and shared queues,
    // and immediately starts its own crawl thread.
    Spider( string mode_in, ProducerConsumerQueue *url_q_in, ProducerConsumerQueue *html_q_in )
            : Crawler( mode_in, url_q_in, html_q_in )
    {
        cout << "SPAWNING NEW SPIDER " << endl;
        pthread_create( &runningThread, NULL, runEntry, this );
    }
};
\ No newline at end of file
@@ -2,12 +2,71 @@
//  main.cpp
//
#include <iostream>
#include <stdlib.h>
#include <pthread.h>
#include <queue>
#include "crawler/crawler.h"
#include <string>
#include <ProducerConsumerQueue.h>
#include <ProducerConsumerQueue.cpp>
#define PATH_TO_BLACKLIST "/bin/blacklist.txt"
#define PATH_TO_VISITED_URL "bin/urls.txt"
#define PATH_TO_HTML_DIR "bin/html/"
#define PATH_TO_INDEX "bin/index/wordIDX"
#define PATH_TO_DOC_INDEX "bin/index/docIDX"
using namespace std;
int main( int argc, const char *argv[] )
{
    /*
     * Settings flags to control the program on startup,
     * to be read in via command line with default settings:
     *
     * string : mode            : getting content from the web vs. local files
     * string : seed            : filename of the list of starting urls
     * int    : numberOfSpiders : # of spiders the crawler spawns
     * int    : numberOfParsers : # of parsers spawned
     * bool   : restoreFromLog  : whether the program should load from saved state
     */
    string mode = "local";

    // Seed urls?
    string seed;

    int numberOfSpiders;
    int numberOfParsers;
    bool restoreFromLog;
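    // A minimal sketch of reading the flags described above from argv,
    // assuming a simple "--flag value" convention; the flag spellings and
    // defaults here are placeholders, and nothing below depends on them yet.
    numberOfSpiders = 1;
    numberOfParsers = 1;
    restoreFromLog = false;
    for ( int i = 1; i + 1 < argc; i += 2 )
    {
        string flag = argv[ i ];
        string value = argv[ i + 1 ];
        if ( flag == "--mode" )
            mode = value;
        else if ( flag == "--seed" )
            seed = value;
        else if ( flag == "--spiders" )
            numberOfSpiders = atoi( value.c_str( ) );
        else if ( flag == "--parsers" )
            numberOfParsers = atoi( value.c_str( ) );
        else if ( flag == "--restore" )
            restoreFromLog = ( value == "true" );
    }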
    ProducerConsumerQueue urlFrontier;
    ProducerConsumerQueue fileQueue;

    urlFrontier.Push( "tests/cats.html" );

    Crawler crawler( mode, &urlFrontier, &fileQueue );
    crawler.spawnSpiders( 1 );
    // crawler.houseKeeper();
}
\ No newline at end of file