//
//  main.cpp
//

#include <getopt.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include <openssl/ssl.h>

#include "crawler/crawler.h"
//#include "crawler/CrawlerStatistics.h"
#include "util/util.h"
#include "indexer/Indexer.h"

// Filesystem paths, exposed as string-literal macros.
// An object-like macro takes no '=' and a C++ string needs double quotes: the
// previous form expanded to `= '...'` (an '=' token followed by a
// multi-character char literal), which is unusable wherever a path is expected.
// NOTE(review): PATH_TO_BLACKLIST is the only path with a leading '/'; the text
// is kept exactly as it was — confirm whether an absolute path is intended.
#define PATH_TO_BLACKLIST "/bin/blacklist.txt"
#define PATH_TO_VISITED_URL "bin/urls.txt"
#define PATH_TO_HTML_DIR "bin/html/"
#define PATH_TO_INDEX "bin/index/wordIDX"
#define PATH_TO_DOC_INDEX "bin/index/docIDX"

// Read-only map from a string key to a list of unsigned long values. Pointers
// to DocIndex are the element type of the queue that main() hands to both the
// Crawler and the Indexer.
// NOTE(review): presumably word -> occurrence positions; confirm against Indexer.
// The names are std-qualified because this alias precedes the using-directive
// below and must not rely on a project header leaking `using namespace std`.
using DocIndex = const std::unordered_map< std::string, std::vector< unsigned long > >;

// Kept because the rest of this file refers to standard names unqualified.
using namespace std;

int main( int argc, char *argv[] )
	{
	/*
	 *
	 * Settings Flags to control program on start up
	 * to be read in via command line with default settings
	 *
	 * string :  Mode : Getting content from the web vs local
	 *
	 * string : Seed : filename of list of starting urls
	 *
	 * int  : numberOfSpiders: # of spiders crawler spawns
	 *
	 * int  : numberOfParsers:  # of parsers  spawned
	 *
	 * bool : restoreFromLog: bool represeting if the program should load from saved state
	 */
jsclose's avatar
jsclose committed
	string mode = "web";
benbergk's avatar
benbergk committed

	opterr = true;
	int choice;
	int option_index = 0;
	option long_options[] = {
			{"mode",         optional_argument, nullptr, 'm'},
benbergk's avatar
benbergk committed
			{"num_crawlers", optional_argument, nullptr, 'c'}

	};

	while ((choice = getopt_long(argc, argv, "m:c:", long_options, &option_index)) != -1) {
		switch (choice) {
			case 'm':

				mode = optarg;
				if (mode != "web" && mode != "local") {
					cerr << "Unknown input option";
					exit(1);
				}
				break;

			case 'c':

				numberOfSpiders = atoi(optarg);
				if (numberOfSpiders > 100) {
					cerr << "Too many crawlers!";
					exit(1);
				}
				break;

			default:
				cerr << "Unknown input option";
				exit(1);

		}
	}
	bool restoreFromLog;
jsclose's avatar
jsclose committed
	unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );

	ProducerConsumerQueue<ParsedUrl> *urlFrontier = new ProducerConsumerQueue<ParsedUrl>();
	ProducerConsumerQueue< DocIndex* > *IndexerQueue = new ProducerConsumerQueue<DocIndex*>();



	if (mode == "local")
		seeds = util::getFileMap("/tests/localSeed.txt");
		seeds = util::getFileMap("/tests/webSeed.txt");
	while (*seeds) {
		if (*seeds == '\n') {
jsclose's avatar
jsclose committed
			ParsedUrl url = ParsedUrl(testFile);
			cout << "Pushing: " << testFile << " to queue\n";
			urlFrontier->Push(url);
			testFile.push_back(*seeds);
		++seeds;
	}
	if (testFile != "") {
		cout << "Pushing: " << testFile << " to queue\n";
		ParsedUrl url = ParsedUrl(testFile);
		urlFrontier->Push(url);
	}
unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( );

Indexer indexer(IndexerQueue);
	indexer.StartThread();

Crawler crawler( mode, urlFrontier, IndexerQueue );
jsclose's avatar
jsclose committed
crawler.SpawnSpiders(numberOfSpiders , docMapLookUp, duplicateUrlMap);
crawler.WaitOnAllSpiders();
	indexer.WaitForFinish();
	int x = 0;
	delete urlFrontier;
benbergk's avatar
benbergk committed
}