Commit 70e42436 authored by jsclose

housekeeping thread to write urls in queue to disk

parent 7b25b094
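In outline: the commit adds a HouseKeeper thread that wakes every 30 seconds and asks the crawler's UrlFrontier to write its URL queue to disk. Below is a minimal, self-contained sketch of that pattern; the Frontier type, its members, and the output path are illustrative stand-ins for the repo's API, and unlike the commit's version (which drains the queue as it writes) this sketch copies the queue under the lock and writes the copy.

// Sketch only: periodic background snapshot of a shared URL queue.
// Frontier, houseKeeper, and "savedQueue.txt" are illustrative names,
// not the repo's actual API.
#include <chrono>
#include <fstream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

struct Frontier
{
	std::mutex m;
	std::queue< std::string > urls;

	void writeDataToDisk( const std::string &path )
	{
		// Copy under the lock so crawling threads are not blocked
		// for the duration of the file write.
		std::queue< std::string > snapshot;
		{
			std::lock_guard< std::mutex > lock( m );
			snapshot = urls;
		}
		std::ofstream out( path, std::ios::trunc );
		while ( !snapshot.empty( ) )
		{
			out << snapshot.front( ) << "\n";
			snapshot.pop( );
		}
	}
};

void houseKeeper( Frontier *frontier )
{
	while ( true )
	{
		std::this_thread::sleep_for( std::chrono::seconds( 30 ) );
		frontier->writeDataToDisk( "savedQueue.txt" );
	}
}

// Usage: std::thread t( houseKeeper, &frontier ); t.detach( );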
@@ -35,6 +35,7 @@ add_executable(crawler-parser-indexer-test
         shared/url.h
         crawler/crawler.cpp
         crawler/UrlFrontier.cpp
+        crawler/HouseKeeper.cpp
         crawler/Readers/StreamReader.h
         crawler/Readers/HttpReader.cpp
         crawler/Readers/HttpsReader.cpp
No preview for this file type
 //
-// Created by Ben Bergkamp on 2/1/18.
+// Created by Jake Close on 2/1/18.
 //
+#include <thread>    // std::this_thread::sleep_for
+#include <chrono>    // std::chrono::seconds
 #include "HouseKeeper.h"
-void HouseKeeper::FuncToRun ( )
-{
+void HouseKeeper::run(){
 	//Sleep(3 minutes)
 	//Gather data
+	cout << "SAVING STATE OF URL FRONTIER " << endl;
+	while(true)
+	{
+		std::this_thread::sleep_for (std::chrono::seconds(30));
+		crawler->urlFrontier->writeDataToDisk();
+	}
 }
\ No newline at end of file
@@ -8,20 +8,20 @@
 #include<string>
 #include <pthread.h>
 #include <iostream>
+#include "crawler.h"
 class HouseKeeper : public ThreadClass
 {
 public:
-	HouseKeeper ( )
+	HouseKeeper ( Crawler * crawler_in ) : crawler(crawler_in)
 	{ };
-	virtual void FuncToRun ( );
+	void run( );
 private:
 	//members
+	Crawler* crawler;
 };
 #endif //EECS398_SEARCH_CRAWLERSTATISTICS_H
@@ -105,6 +105,19 @@ size_t UrlFrontier::Size ( )
 	return size;
 }
+// Get current date/time, format is YYYY-MM-DD.HH:mm:ss
+const std::string currentDateTime() {
+	time_t now = time(0);
+	struct tm tstruct;
+	char buf[80];
+	tstruct = *localtime(&now);
+	// Visit http://en.cppreference.com/w/cpp/chrono/c/strftime
+	// for more information about date/time format
+	strftime(buf, sizeof(buf), "%Y-%m-%d.%X", &tstruct);
+	return buf;
+}
 void UrlFrontier::writeDataToDisk( )
 {
@@ -113,7 +126,17 @@ void UrlFrontier::writeDataToDisk( )
 	cout << "Writing queue to disk" << endl;
 	string fileName = util::GetCurrentWorkingDir( ) + "/crawler/savedQueue.txt";
-	int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
+	if( remove( fileName.c_str() ) != 0 )
+		perror( "Error deleting file" );
+	else
+		puts( "File successfully deleted" );
+	int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
+	pthread_mutex_lock( &m );
+	string currentTime = currentDateTime();
+	write( file, currentTime.c_str( ), strlen( currentTime.c_str( ) ) );
 	while(! queue.empty() )
 	{
 		ParsedUrl * url = queue.top( );
@@ -122,6 +145,8 @@ void UrlFrontier::writeDataToDisk( )
 		url = 0;
 		delete url;
 	}
+	pthread_mutex_unlock( &m );
 	close( file );
 	return;
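A note on the hunk above: it removes savedQueue.txt and then rewrites it in place, so a crash between the remove() and the final close() leaves no intact snapshot (and the localtime() call inside currentDateTime() is not thread-safe; localtime_r is the reentrant POSIX variant). A common alternative, sketched here under the assumption that a whole-file rewrite is acceptable, is to write a temporary file and rename it over the old snapshot; rename(2) is atomic on POSIX, so readers always see either the old file or the complete new one. Path names are illustrative.

// Sketch: crash-safe snapshot replacement via write-to-temp + rename.
#include <cstdio>
#include <fstream>
#include <string>

bool writeSnapshotAtomically( const std::string &path,
                              const std::string &contents )
{
	std::string tmp = path + ".tmp";	// hypothetical temp-file name
	std::ofstream out( tmp, std::ios::trunc );
	out << contents;
	out.close( );
	if ( !out )	// the write or close failed; keep the old snapshot
		return false;
	return std::rename( tmp.c_str( ), path.c_str( ) ) == 0;
}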
@@ -21,6 +21,7 @@ void Crawler::SpawnSpiders ( size_t num_spiders )
 		this->spiders.push_back( temp );
 	}
 }
+/*
@@ -35,10 +35,12 @@ public:
 	void KillAllSpiders ( );
 	void WaitOnAllSpiders ( );
+	UrlFrontier *urlFrontier;
 private:
 	vector< Spider * > spiders;
-	UrlFrontier *urlFrontier;
+	//UrlFrontier *urlFrontier;
 	ProducerConsumerQueue< DocIndex * > *IndexerQueue;
 	//CrawlerStatistics housekeeper;
 	string mode;
File moved
@@ -63,6 +63,7 @@ int main ( int argc, char *argv[] )
 	string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1";
 	string bad_url2 ="http-equiv=Content-Type";
 	string bad_url3 = "\"http-equiv=\\\"refresh\\\" content=\\\"1;url=/2.73.0/static/unsupp.html\\\" /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>\";
 	//ParsedUrl url = ParsedUrl(bad_url);
+	ParsedUrl url1 = ParsedUrl(bad_url);
 	ParsedUrl url2 = ParsedUrl(bad_url2);
@@ -23,7 +23,7 @@
 #include <chrono>
 #include <future>
 #include <ctime>
+#include "crawler/HouseKeeper.h"
 using DocIndex = const unordered_map< string, vector< unsigned long > >;
 using namespace std;
@@ -171,10 +171,12 @@ int main ( int argc, char *argv[] )
 	Indexer indexer( IndexerQueue );
 	indexer.StartThread( );
-	Crawler crawler( mode, urlFrontier, IndexerQueue );
-	crawler.SpawnSpiders( numberOfSpiders );
+	Crawler *crawler = new Crawler( mode, urlFrontier, IndexerQueue );
+	crawler->SpawnSpiders( numberOfSpiders );
+	HouseKeeper logger( crawler );
+	logger.StartThread( );
 	string input;
 	while(true)
@@ -186,8 +188,8 @@ int main ( int argc, char *argv[] )
 	{
 		cout << "Shutting down the indexer " << endl ;
-		crawler.KillAllSpiders();
-		crawler.WaitOnAllSpiders( );
+		crawler->KillAllSpiders();
+		crawler->WaitOnAllSpiders( );
 		indexer.Kill();
 		indexer.WaitForFinish( );
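One last observation: the shutdown path above kills the spiders and the indexer, but the HouseKeeper's run() loops forever and is never stopped or joined. If a clean stop is wanted, the usual trick is a condition variable that the sleep waits on, so Stop() interrupts the wait immediately instead of waiting out the 30-second sleep. The class below is an illustrative sketch, independent of the repo's ThreadClass API.

// Sketch: a stoppable periodic worker. Stop() wakes the thread at once
// rather than waiting for the current sleep interval to elapse.
#include <chrono>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <thread>

class PeriodicTask
{
public:
	PeriodicTask( std::function< void( ) > task, std::chrono::seconds period )
			: task( std::move( task ) ), period( period ) { }

	void Start( ) { worker = std::thread( [ this ] { loop( ); } ); }

	void Stop( )
	{
		{
			std::lock_guard< std::mutex > lock( m );
			stopping = true;
		}
		cv.notify_one( );
		if ( worker.joinable( ) )
			worker.join( );
	}

private:
	void loop( )
	{
		std::unique_lock< std::mutex > lock( m );
		// wait_for returns false on timeout (run the task again) and
		// true as soon as Stop() sets `stopping`.
		while ( !cv.wait_for( lock, period, [ this ] { return stopping; } ) )
			task( );
	}

	std::function< void( ) > task;
	std::chrono::seconds period;
	std::thread worker;
	std::mutex m;
	std::condition_variable cv;
	bool stopping = false;
};

// Usage: PeriodicTask logger( [ & ] { urlFrontier->writeDataToDisk( ); },
//                             std::chrono::seconds( 30 ) );
//        logger.Start( );  ...  logger.Stop( );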