Skip to content
Snippets Groups Projects
UrlFrontier.cpp 3.19 KiB
Newer Older
//
// Created by Jake Close on 3/26/18.
//

jsclose's avatar
jsclose committed
#include "urlFrontier.h"


jsclose's avatar
jsclose committed
// Checks the given url to decide how it should be crawled.
// First, checks whether the exact url has already been seen;
// if so, returns early (the url should not be added to the frontier).
// Then checks whether the host has been seen before.
// If it has, it computes how long ago that was and uses the
// difference to update the url's score so that it
// moves back in the queue.
// Otherwise the host is added to the host map; in both cases the url is
// then added to the url map.

jsclose's avatar
jsclose committed

/// Records `url` in the frontier's bookkeeping and adjusts its score
/// according to how recently its host was seen.
///
/// - If the complete url is already in `duplicateUrlMap`, nothing is changed.
/// - If the host is already in `domainMap`, the elapsed seconds since the
///   stored timestamp are turned into a score delta (5 when the elapsed time
///   is exactly 0, otherwise elapsed / 10) and passed to `url->updateScore`.
/// - If the host is new, it is inserted into `domainMap` with the current time.
/// - Finally the complete url is inserted into `duplicateUrlMap`.
///
/// This function does no locking of its own.
///
/// @param url  Url to examine; must not be null.
void UrlFrontier::checkUrl(ParsedUrl* url)
	{
	// Exact url already recorded: leave every map and the score untouched.
	if ( this->duplicateUrlMap->find( url->getCompleteUrl( ) ) != this->duplicateUrlMap->end( ) )
		return;

	time_t now;
	time( &now );

	// Single lookup of the host instead of find() followed by at().
	auto hostEntry = this->domainMap->find( url->getHost( ) );
	if ( hostEntry != this->domainMap->end( ) )
		{
		// Host seen before: derive the score delta from the elapsed time.
		const time_t lastSeen = hostEntry->second;
		double difference = difftime( now, lastSeen );
		if ( difference == 0 )
			difference = 5;
		else
			difference = difference / 10;
		url->updateScore( difference );
		// NOTE(review): the stored timestamp is not refreshed here, so it stays
		// at the time the host was first inserted -- confirm this is intended.
		}
	else
		{
		// First time this host is seen: remember the current time.
		this->domainMap->insert( std::make_pair( url->getHost( ), now ) );
		}

	// Remember the url so later calls treat it as a duplicate.
	this->duplicateUrlMap->insert( url->getCompleteUrl( ) );
	}


/// Adds `url` to the frontier queue unless its complete url was already seen.
///
/// The duplicate test, the bookkeeping done by `checkUrl`, and the queue
/// insertion all happen while holding mutex `m`, so the maps and the queue
/// are not modified concurrently by several callers of Push.
/// Threads waiting on `consumer_cv` are woken when the queue goes from empty
/// to holding one element.
///
/// @param url  Url to enqueue. A null pointer is ignored.
void UrlFrontier::Push( ParsedUrl * url )
	{
	if ( url == nullptr )
		return;

	pthread_mutex_lock( &m );

	// checkUrl returns nothing, so the duplicate test has to be made here;
	// otherwise an already-seen url would still be queued.
	if ( duplicateUrlMap->find( url->getCompleteUrl( ) ) != duplicateUrlMap->end( ) )
		{
		// NOTE(review): the rejected url is not deleted here because its
		// ownership is not visible from this function -- confirm with callers.
		pthread_mutex_unlock( &m );
		return;
		}

	// Records the url/host and updates the url's score before it is queued.
	checkUrl( url );

	queue.push( url );

	// Only the empty -> non-empty transition can have consumers waiting.
	if ( queue.size( ) == 1 )
		{
		pthread_cond_broadcast( &consumer_cv );
		}

	pthread_mutex_unlock( &m );
	}



jsclose's avatar
jsclose committed
/// Removes and returns the url at the top of the queue.
///
/// Blocks on `consumer_cv` while the queue is empty. The queue is accessed
/// only while holding mutex `m`.
///
/// @return The pointer that was at the top of the queue.
ParsedUrl * UrlFrontier::Pop()
	{
	pthread_mutex_lock( &m );

	// Re-test after every wake-up: the wait may return while the queue is
	// still (or again) empty.
	while ( queue.empty( ) )
		pthread_cond_wait( &consumer_cv, &m );

	ParsedUrl * next = queue.top( );
	queue.pop( );

	pthread_mutex_unlock( &m );
	return next;
	}

/// Returns the number of urls currently in the queue.
///
/// The count is read while holding mutex `m`; it may be out of date by the
/// time the caller uses it.
size_t UrlFrontier::Size ( )
	{
	pthread_mutex_lock( &m );
	const size_t count = queue.size( );
	pthread_mutex_unlock( &m );

	return count;
	}
// Get the current local date/time, formatted as "%Y-%m-%d.%X"
// (YYYY-MM-DD.HH:mm:ss in the default "C" locale).
//
// Uses localtime_r so the broken-down time lives in a local struct rather
// than in the static storage shared by all callers of localtime().
// Returns an empty string if the time cannot be converted or formatted.
const std::string currentDateTime() {
	const time_t now = time(0);

	struct tm  tstruct;
	if (localtime_r(&now, &tstruct) == 0)
		return std::string();

	// Visit http://en.cppreference.com/w/cpp/chrono/c/strftime
	// for more information about date/time format
	char buf[80];
	const size_t written = strftime(buf, sizeof(buf), "%Y-%m-%d.%X", &tstruct);
	if (written == 0)
		return std::string();   // buf contents are unspecified on failure

	return std::string(buf, written);
	}


void UrlFrontier::writeDataToDisk( )
	{


	cout << "Writing queue to disk" << endl;

	string fileName = util::GetCurrentWorkingDir( )  + "/crawler/savedQueue.txt";

	if( remove( fileName.c_str() ) != 0 )
		perror( "Error deleting file" );
	else
		puts( "File successfully deleted" );
		int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );

	pthread_mutex_lock( &m );
	string currentTime = currentDateTime();
	write( file, currentTime.c_str( ), strlen( currentTime.c_str( ) ) );

	while(! queue.empty() )
		{
		ParsedUrl * url = queue.top( );
		queue.pop( );
		write( file, url->getCompleteUrl().c_str( ), strlen( url->getCompleteUrl().c_str( ) ) );
		url = 0;
		delete url;
		}
	pthread_mutex_unlock( &m );