//
//  main.cpp
//

#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <queue>
#include "crawler/crawler.h"
#include <openssl/ssl.h>
#include <string>
//#include "crawler/CrawlerStatistics.h"
#include <unordered_map>
#include "util/util.h"
#include <getopt.h>
#include "indexer/Indexer.h"
#include "crawler/UrlFrontier.h"
#include <csignal>
#include <chrono>
#include <future>
#include <ctime>
using namespace std;

using DocIndex = const unordered_map< string, vector< unsigned long > >;

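// Shared liveness flag handed to the spider threads: the SIGINT handler
// below clears it so workers can wind down instead of dying mid-write.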
atomic_bool *alive = new atomic_bool(true);


void signalHandler( int signum )
	{
	cout << "Interrupt signal (" << signum << ") received.\n";
	cout << "Ending the Index build" << endl;
	// Clear the shared flag rather than exit(), so every thread shuts down cleanly.
	(*alive) = false;
	}

int main ( int argc, char *argv[] )
	{
	/*
	 * Flags controlling the program at start-up, read from the command
	 * line with the defaults set below:
	 *
	 * string : mode            : get content from the web ("web") or from local files ("local")
	 * string : seed            : filename of the list of starting URLs
	 * int    : numberOfSpiders : number of spider threads the crawler spawns
	 * int    : numberOfParsers : number of parser threads spawned
	 * bool   : restart         : whether the program should restore saved state from disk
	 */
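	/*
	 * Example invocation (the binary name is illustrative; the flag names
	 * come from the getopt_long table below):
	 *
	 *     ./search-engine --mode=web --num_crawlers=8 --docsToCrawl=1000
	 */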
	signal(SIGINT, signalHandler);

	string mode = "web";
	int numberOfSpiders = 1;  // '-c' flag; default value assumed here
	int DocsToCrawl = 0;      // '-d' flag; 0 means crawl until told to quit (default assumed)
	bool restart = false;
	opterr = true;
	int choice;
	int option_index = 0;
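	// getopt_long option table; the all-zero entry terminates the array. In the
	// short-option string "m:c:d:r", a ':' marks an option that takes an argument.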
	option long_options[] = {
			{ "mode",         optional_argument, nullptr, 'm' },
			{ "num_crawlers", optional_argument, nullptr, 'c' },
			{ "docsToCrawl",  optional_argument, nullptr, 'd' },
			{ "from_restart", optional_argument, nullptr, 'r' },
			{ nullptr,        0,                 nullptr,  0  }
	};

	while ( ( choice = getopt_long( argc, argv, "m:c:d:r", long_options, &option_index ) ) != -1 )
		{
		switch ( choice )
			{
			case 'm':
				mode = optarg;
				if ( mode != "web" && mode != "local" )
					{
					cerr << "Unknown mode: expected 'web' or 'local'" << endl;
					exit( 1 );
					}
				break;

			case 'c':
				numberOfSpiders = atoi( optarg );
				if ( numberOfSpiders > 100 )
					{
					cerr << "Too many crawlers!" << endl;
					exit( 1 );
					}
				break;
			case 'd':
				DocsToCrawl = atoi( optarg );
				break;

			case 'r':
				restart = true;
				break;
			default:
				cerr << "Unknown input option" << endl;
				exit( 1 );
			}
		}

	UrlFrontier *urlFrontier = new UrlFrontier( );
	ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
	ProducerConsumerQueue< unordered_map< string, DocIndex * > > *AnchorQueue = new ProducerConsumerQueue< unordered_map< string, DocIndex * > >( );

	// Memory-mapped seed list; one URL per line.
	const char *seeds;
	if ( mode == "local" )
		seeds = util::getFileMap( "/tests/localSeed.txt" );
	else
		{
		seeds = util::getFileMap( "/tests/webSeed.txt" );
		SSL_library_init( );
		}

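	// Fresh start: split the seed buffer into URLs and prime the frontier.
	// On restart, the frontier is restored from its on-disk snapshot instead.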
	if ( !restart )
		{
		string testFile;
		while ( *seeds )
			{
			if ( *seeds == '\n' )
				{

				ParsedUrl * url = new ParsedUrl( testFile );
				cout << "Pushing: " << testFile << " to queue\n";
				urlFrontier->Push( url );
				testFile = "";
				}
			else
				testFile.push_back( *seeds );
			++seeds;
			}
		if ( testFile != "" )
			{
			cout << "Pushing: " << testFile << " to queue\n";
			ParsedUrl * url = new ParsedUrl( testFile );
			urlFrontier->Push( url );
			}
		}
	else
		urlFrontier->readDataFromDisk();
	Indexer indexer( IndexerQueue, AnchorQueue );
	indexer.StartThread( );
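	// Pipeline: spider threads (producers) push one DocIndex per fetched page
	// onto IndexerQueue and anchor-text maps onto AnchorQueue; the indexer
	// thread (consumer) drains both into the index.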
	Crawler *crawler = new Crawler( mode, urlFrontier, IndexerQueue, AnchorQueue );

	clock_t start = clock( );  // build timer, read at shutdown
	crawler->SpawnSpiders( numberOfSpiders, alive );
	// Batch mode: wait for a bounded crawl to finish, then shut everything down.
	if ( DocsToCrawl > 0 )
		{
		cout << "Crawling " << DocsToCrawl << " documents for each spider" << endl;
		crawler->WaitOnAllSpiders( );
		crawler->passAnchorTextToIndex( );
		indexer.Kill();
		indexer.WaitForFinish( );
		urlFrontier->writeDataToDisk();
		delete urlFrontier;
		delete IndexerQueue;

		cout << "Indexer has finished running " << endl;
		clock_t end = clock();
		cout << "Time to complete build: " << (end - start) / (double) CLOCKS_PER_SEC << endl;
	while(true)
		{
		cout << "press enter to quit\n" << std::endl ;
		//getline (cin, input);
		cin >> input;
		if(input == "q")
			{
			cout << "Shutting down the indexer" << endl;
			crawler->KillAllSpiders( );
			crawler->WaitOnAllSpiders( );
			indexer.Kill( );
			indexer.WaitForFinish( );
			urlFrontier->writeDataToDisk( );

			delete urlFrontier;
			delete IndexerQueue;

			cout << "Indexer has finished running" << endl;
			clock_t end = clock( );
			cout << "Time to complete build: " << ( end - start ) / (double) CLOCKS_PER_SEC << endl;
			return 0;
			}
		}
	}