Commit e6bbbee1 authored by benbergk

Made thread class, fixed bugs

parent 80b2b69f
@@ -29,5 +29,7 @@ public:
 };
+//Necessary because this class is templated
+#include "ProducerConsumerQueue.cpp"
 #endif //EECS398_SEARCH_PRODUCERCONSUMERQUEUE_H
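The rest of this commit only relies on Push and Pop from ProducerConsumerQueue; the queue's implementation itself is not part of the diff. As a rough sketch of the shape being assumed here (the pthread mutex/condition-variable details below are an assumption, not the repository's actual code), a minimal producer/consumer queue with that interface could look like:

#include <pthread.h>
#include <queue>

template <typename T>
class ProducerConsumerQueue
{
public:
    ProducerConsumerQueue()
    {
        pthread_mutex_init(&lock, NULL);
        pthread_cond_init(&notEmpty, NULL);
    }

    //Enqueue an item and wake one waiting consumer
    void Push(T item)
    {
        pthread_mutex_lock(&lock);
        items.push(item);
        pthread_cond_signal(&notEmpty);
        pthread_mutex_unlock(&lock);
    }

    //Block until an item is available, then dequeue and return it
    T Pop()
    {
        pthread_mutex_lock(&lock);
        while (items.empty())
            pthread_cond_wait(&notEmpty, &lock);
        T front = items.front();
        items.pop();
        pthread_mutex_unlock(&lock);
        return front;
    }

private:
    std::queue<T> items;
    pthread_mutex_t lock;
    pthread_cond_t notEmpty;
};

Because the class is templated, its member definitions have to be visible to every translation unit that instantiates it, which is why the real header #includes ProducerConsumerQueue.cpp at the bottom.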
//
// Created by Ben Bergkamp on 1/31/18.
//
#ifndef EECS398_SEARCH_THREADCLASS_H
#define EECS398_SEARCH_THREADCLASS_H
#include <pthread.h>
class ThreadClass
{
public:
    ThreadClass() { }
    virtual ~ThreadClass() { }

    //Returns true if thread was created successfully
    bool StartThread()
    {
        return (pthread_create(&thread, NULL, StaticFuncToRun, this) == 0);
    }

    //Blocks until thread finishes
    void WaitForFinish()
    {
        pthread_join(thread, NULL);
    }

protected:
    //IMPLEMENT THIS METHOD IN YOUR SUBCLASS WITH CODE YOU WANT YOUR THREAD TO RUN
    virtual void FuncToRun() = 0;

private:
    //pthread entry point: forwards to the subclass's FuncToRun()
    static void * StaticFuncToRun(void * This)
    {
        ((ThreadClass *)This)->FuncToRun();
        return nullptr;
    }

    pthread_t thread;
};
#endif //EECS398_SEARCH_THREADCLASS_H
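ThreadClass is used by deriving from it and overriding FuncToRun(): StartThread() launches a pthread that reaches FuncToRun() through the static trampoline, and WaitForFinish() joins it. A minimal illustrative subclass (hypothetical, not part of this commit) could look like:

#include <iostream>
#include "ThreadClass.h"

//Hypothetical example subclass: the thread body goes in FuncToRun()
class PrinterThread : public ThreadClass
{
public:
    virtual void FuncToRun()
    {
        std::cout << "running on a worker thread" << std::endl;
    }
};

int main()
{
    PrinterThread t;
    if ( t.StartThread() )  //spawns the pthread, which calls FuncToRun()
        t.WaitForFinish();  //blocks until FuncToRun() returns
    return 0;
}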
//
// Created by Ben Bergkamp on 1/31/18.
//
#include "crawler.h"
void Crawler::SpawnSpiders(size_t num_spiders)
{
    for ( size_t i = 0; i < num_spiders; i++ )
    {
        Spider *temp = new Spider( this->mode, this->urlFrontier, this->fileQueue );
        temp->StartThread();
        this->spiders.push_back(temp);
    }
}

void Crawler::WaitOnAllSpiders()
{
    for ( Spider* spider : spiders )
    {
        spider->WaitForFinish();
    }
}
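Together with ThreadClass, each spider created here runs Spider::FuncToRun() on its own pthread, and WaitOnAllSpiders() joins them one by one. The driver in main.cpp (later in this diff) exercises it roughly as follows; note that since Spider::FuncToRun() currently loops forever, the join only returns once that loop is given an exit condition:

ProducerConsumerQueue<string> urlFrontier;
ProducerConsumerQueue<string> fileQueue;

urlFrontier.Push("tests/cats.html");

Crawler crawler("local", &urlFrontier, &fileQueue);  //"local" mode makes spiders read urls as files on disk
crawler.SpawnSpiders(1);     //starts one pthread per spider
crawler.WaitOnAllSpiders();  //pthread_join on every spawned spider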
@@ -3,7 +3,6 @@
 #include<vector>
 #include "spider.h"
 #include<string>
-//#include "../ProducerConsumerQueue.cpp"
 #include "../ProducerConsumerQueue.h"
 /*
  *
@@ -12,43 +11,23 @@ using namespace std;
 class Crawler {
-    vector<*Spider> spiders;
 public:
-    string mode;
-    ProducerConsumerQueue<string> *urlFrontier;
-    ProducerConsumerQueue<string> *fileQueue;
+    Crawler(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in)
+        : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in) { } ;
     //spawns a number of works
-    void spawnSpiders(size_t numberOfSpiders)
-    {
-        for( size_t i = 0 ; i < numberOfSpiders; i++)
-        {
-            Spider *temp = new Spider( this->mode , this->urlFrontier, this->fileQueue);
-            this->spiders.push_back(temp);
-        }
-    }
+    void SpawnSpiders(size_t num_spiders);
     //Creates a housekeeping thread
     void houseKeeper();
+    void WaitOnAllSpiders();
 private:
-    Crawler(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in) : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in) { } ;
+    vector<Spider*> spiders;
+    ProducerConsumerQueue<string> *urlFrontier;
+    ProducerConsumerQueue<string> *fileQueue;
+    string mode;
 };
//
// Created by Ben Bergkamp on 1/31/18.
//
#include "spider.h"
string Spider::getUrl()
{
    return urlFrontier->Pop();
}

void Spider::FuncToRun()
{
    std::cout << "Spider is crawling" << endl;
    bool cond = true;
    while ( cond )
    {
        string currentUrl = getUrl();
        if ( request( currentUrl ) )
        {
            // markURLSeen( currentUrl );
            // writeHTMLtoDisk( );
            // addHTMLToQueue( );
        }
        else
        {
            cerr << "Error connecting";
        }
    }
}

bool Spider::request( string url )
{
    if ( this->mode == "local" )
    {
        ifstream inFile;
        string in;
        inFile.open(url);
        if ( !inFile )
        {
            cout << "Unable to open file";
            exit(1); // terminate with error
        }
        while (inFile >> in)
        {
            cout << in << endl;
        }
        inFile.close();
        return true;
    }
    return false;
}
\ No newline at end of file
 #pragma once
 #include<string>
 #include <pthread.h>
-#include "crawler.h"
 #include <fstream>
-//#include "../ProducerConsumerQueue.cpp"
 #include "../ProducerConsumerQueue.h"
+#include "../ThreadClass.h"
+#include<iostream>
 using namespace std;
-class Spider {
-private:
-    int locationOnDisk;
-    pthread_t runningThread;
-    string mode;
-    ProducerConsumerQueue<string> *urlFrontier;
-    ProducerConsumerQueue<string> *fileQueue;
+class Spider : public ThreadClass{
 public:
-    //Takes a url off of the url frontier
-    string getUrl( )
-    {
-        return urlFrontier->Pop();
-    };
-    void* run(void * arg){
-        cout << "Spider is crawling" << endl;
-        while( true )
-        {
-            string currentUrl = getUrl();
-            if ( request( currentUrl ) )
-            {
-                // markURLSeen( currentUrl );
-                // writeHTMLtoDisk( );
-                // addHTMLToQueue( );
-            }
-            else
-            {
-                cerr << "Error connecting" ;
-            }
-        }
-    }
+    Spider(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in)
+        : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in) {};
+    //Takes a url off of the url frontier
+    string getUrl();
+    virtual void FuncToRun();
     //Makes request to given url
     // if successful, writes file to disk, stores location to memeber value
     // else return false and error information, retry if necessary
-    bool request( string url )
-    {
-        if ( this->mode == "local" )
-        {
-            ifstream inFile;
-            string in;
-            inFile.open(url);
-            if ( !inFile ) {
-                cout << "Unable to open file";
-                exit(1); // terminate with error
-            }
-            while (inFile >> in) {
-                cout << in << endl;
-            }
-            inFile.close();
-            return true;
-        }
-        return false;
-    }
+    bool request( string url );
     //Where to write to disk? What type of data are we reading in?
     void writeHTMLtoDisk( );
@@ -92,17 +37,11 @@ public:
     void markURLSeen( string URL );
 private:
-    Spider(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in) : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in)
-    {
-        cout << "SPAWNING NEW SPIDER " << endl;
-        pthread_create(&runningThread, NULL, run, nullptr);
-    };
+    int locationOnDisk;
+    ProducerConsumerQueue<string> *urlFrontier;
+    ProducerConsumerQueue<string> *fileQueue;
+    string mode;
 };
\ No newline at end of file
@@ -11,7 +11,7 @@
 #include "crawler/crawler.h"
 #include <string>
 #include "ProducerConsumerQueue.h"
-//#include "ProducerConsumerQueue.cpp"
+#include "crawler/spider.h"
 #define PATH_TO_BLACKLIST = '/bin/blacklist.txt'
@@ -55,18 +55,15 @@ int main(int argc, const char * argv[])
     ProducerConsumerQueue<string> urlFrontier;
     ProducerConsumerQueue<string> fileQueue;
-    cout << "Pushed File\n";
     urlFrontier.Push("tests/cats.html");
     Crawler crawler(mode, &urlFrontier, &fileQueue );
-    crawler.spawnSpiders(1);
+    crawler.SpawnSpiders(1);
+    // crawler.houseKeeper();
+    crawler.WaitOnAllSpiders();
 }
\ No newline at end of file
all:
	g++ -std=c++11 main.cpp crawler/crawler.cpp crawler/spider.cpp -o crawler.exe -lpthread
\ No newline at end of file