Skip to content
Snippets Groups Projects
Commit e6bbbee1 authored by benbergk's avatar benbergk
Browse files

Made thread class, fixed bugs

parent 80b2b69f
No related branches found
No related tags found
No related merge requests found
......@@ -29,5 +29,7 @@ public:
};
//Necessary because this class is templated
#include"ProducerConsumerQueue.cpp"
#endif //EECS398_SEARCH_PRODUCERCONSUMERQUEUE_H
//
// Created by Ben Bergkamp on 1/31/18.
//
#ifndef EECS398_SEARCH_THREADCLASS_H
#define EECS398_SEARCH_THREADCLASS_H
#include <pthread.h>
// Minimal pthread wrapper: subclass, implement FuncToRun() with the code the
// thread should execute, then call StartThread() and later WaitForFinish().
class ThreadClass
{
public:
	ThreadClass() { }
	virtual ~ThreadClass() { }

	// Launches the thread.
	// Returns true if thread was created successfully.
	bool StartThread()
	{
		return pthread_create(&thread, nullptr, StaticFuncToRun, this) == 0;
	}

	// Blocks until thread finishes.
	// NOTE(review): the pthread_join result is ignored, and joining a thread
	// that was never successfully started is undefined per POSIX — callers
	// should only call this after StartThread() returned true.
	void WaitForFinish()
	{
		pthread_join(thread, nullptr);
	}

protected:
	// IMPLEMENT THIS METHOD IN YOUR SUB CLASS WITH CODE YOU WANT YOUR THREAD TO RUN
	virtual void FuncToRun() = 0;

private:
	// Trampoline handed to pthread_create: recovers the object from the void*
	// and dispatches to the virtual FuncToRun(). static_cast (not a C-style
	// cast) is safe because StartThread always passes `this`.
	static void *StaticFuncToRun(void *This)
	{
		static_cast<ThreadClass *>(This)->FuncToRun();
		return nullptr;
	}

	pthread_t thread{}; // value-initialized; only meaningful after StartThread()
};
#endif //EECS398_SEARCH_THREADCLASS_H
//
// Created by Ben Bergkamp on 1/31/18.
//
#include "crawler.h"
void Crawler::SpawnSpiders(size_t num_spiders)
{
for( size_t i = 0 ; i < num_spiders; i++)
{
Spider *temp = new Spider( this->mode , this->urlFrontier, this->fileQueue);
temp->StartThread();
this->spiders.push_back(temp);
}
}
void Crawler::WaitOnAllSpiders()
{
for (Spider* spider : spiders)
{
spider->WaitForFinish();
}
}
......@@ -3,7 +3,6 @@
#include<vector>
#include "spider.h"
#include<string>
//#include "../ProducerConsumerQueue.cpp"
#include "../ProducerConsumerQueue.h"
/*
*
......@@ -12,43 +11,23 @@ using namespace std;
// Owns a pool of Spider worker threads that consume URLs from the shared
// url frontier and push fetched files onto the file queue.
//
// NOTE(review): this span was a stripped diff containing BOTH the pre- and
// post-commit class (duplicate constructor and members, the removed inline
// spawnSpiders, and the invalid `vector<*Spider>` line). Reconstructed here
// as the post-commit declaration.
class Crawler {
public:
	// mode_in selects the fetch strategy ("local" reads urls as files on
	// disk — see Spider::request); the queues are shared with the spiders.
	Crawler(string mode_in, ProducerConsumerQueue<string>* url_q_in, ProducerConsumerQueue<string>* html_q_in)
		: mode(mode_in), urlFrontier(url_q_in), fileQueue(html_q_in) { }

	// Spawns a number of worker spiders and starts their threads.
	void SpawnSpiders(size_t num_spiders);

	// Creates a housekeeping thread.
	void houseKeeper();

	// Joins every spawned spider thread.
	void WaitOnAllSpiders();

private:
	vector<Spider*> spiders; // owned; NOTE(review): never deleted — leak
	ProducerConsumerQueue<string> *urlFrontier;
	ProducerConsumerQueue<string> *fileQueue;
	string mode;
};
......
//
// Created by Ben Bergkamp on 1/31/18.
//
#include "spider.h"
// Takes the next url to crawl off of the shared url frontier.
string Spider::getUrl()
{
	string nextUrl = urlFrontier->Pop();
	return nextUrl;
}
void Spider::FuncToRun()
{
std::cout << "Spider is crawling" << endl;
bool cond = true;
while( cond )
{
string currentUrl = getUrl();
if ( request( currentUrl ) )
{
// markURLSeen( currentUrl );
// writeHTMLtoDisk( );
// addHTMLToQueue( );
}
else
{
cerr << "Error connecting" ;
}
}
}
// Makes a request to the given url.
// In "local" mode the url is a path on disk: the file is read and echoed to
// stdout. Returns true on success, false on failure so the caller can retry
// or log. Non-local (network) fetching is not implemented yet and returns
// false.
//
// Fixed: a missing file previously called exit(1), terminating the entire
// multi-threaded process even though this function's contract is to report
// failure via its return value. Now reports on cerr and returns false.
bool Spider::request( string url )
{
	if ( this->mode == "local" )
	{
		ifstream inFile(url); // closed automatically by the destructor (RAII)
		if ( !inFile )
		{
			cerr << "Unable to open file";
			return false;
		}
		string in;
		while (inFile >> in)
		{
			cout << in << endl;
		}
		return true;
	}
	return false;
}
\ No newline at end of file
#pragma once
#include<string>
#include <pthread.h>
#include "crawler.h"
#include <fstream>
//#include "../ProducerConsumerQueue.cpp"
#include "../ProducerConsumerQueue.h"
#include "../ThreadClass.h"
#include<iostream>
using namespace std;
class Spider {
private:
int locationOnDisk;
pthread_t runningThread;
string mode;
ProducerConsumerQueue<string> *urlFrontier;
ProducerConsumerQueue<string> *fileQueue;
class Spider : public ThreadClass{
public:
Spider(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in)
: mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in) {};
//Takes a url off of the url frontier
string getUrl( )
{
return urlFrontier->Pop();
};
void* run(void * arg){
cout << "Spider is crawling" << endl;
while( true )
{
string currentUrl = getUrl();
if ( request( currentUrl ) )
{
// markURLSeen( currentUrl );
// writeHTMLtoDisk( );
// addHTMLToQueue( );
}
else
{
cerr << "Error connecting" ;
}
}
//Takes a url off of the url frontier
string getUrl();
}
virtual void FuncToRun();
//Makes request to given url
// if successful, writes file to disk, stores location to memeber value
// else return false and error information, retry if necessary
bool request( string url )
{
if ( this->mode == "local" )
{
ifstream inFile;
string in;
inFile.open(url);
if ( !inFile ) {
cout << "Unable to open file";
exit(1); // terminate with error
}
while (inFile >> in) {
cout << in << endl;
}
inFile.close();
return true;
}
return false;
}
bool request( string url );
//Where to write to disk? What type of data are we reading in?
void writeHTMLtoDisk( );
......@@ -92,17 +37,11 @@ public:
void markURLSeen( string URL );
private:
Spider(string mode_in, ProducerConsumerQueue<string>* url_q_in , ProducerConsumerQueue<string>* html_q_in) : mode( mode_in ), urlFrontier(url_q_in) , fileQueue(html_q_in)
{
cout << "SPAWNING NEW SPIDER " << endl;
pthread_create(&runningThread, NULL, run, nullptr);
};
int locationOnDisk;
ProducerConsumerQueue<string> *urlFrontier;
ProducerConsumerQueue<string> *fileQueue;
string mode;
};
\ No newline at end of file
......@@ -11,7 +11,7 @@
#include "crawler/crawler.h"
#include <string>
#include "ProducerConsumerQueue.h"
//#include "ProducerConsumerQueue.cpp"
#include "crawler/spider.h"
#define PATH_TO_BLACKLIST = '/bin/blacklist.txt'
......@@ -55,18 +55,15 @@ int main(int argc, const char * argv[])
ProducerConsumerQueue<string> urlFrontier;
ProducerConsumerQueue<string> fileQueue;
cout << "Pushed File\n";
urlFrontier.Push("tests/cats.html");
Crawler crawler(mode, &urlFrontier, &fileQueue );
crawler.spawnSpiders(1);
// crawler.houseKeeper();
crawler.SpawnSpiders(1);
crawler.WaitOnAllSpiders();
}
\ No newline at end of file
# Build the crawler. -pthread (rather than link-only -lpthread) both defines
# the threading preprocessor macros at compile time and links libpthread.
all:
	g++ -std=c++11 main.cpp crawler/crawler.cpp crawler/spider.cpp -o crawler.exe -pthread
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment