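// Assumed include set: the original file's includes are not shown in this
// excerpt, so the standard headers below are inferred from what the code uses.
// The project headers declaring ProducerConsumerQueue, ParsedUrl, Crawler,
// Indexer, and util::getFileMap are also required, but their paths are not
// shown here and are therefore omitted.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include <cstdlib>
#include <getopt.h>
#include <openssl/ssl.h>

using namespace std;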
#define PATH_TO_BLACKLIST "/bin/blacklist.txt"
#define PATH_TO_VISITED_URL "bin/urls.txt"
#define PATH_TO_HTML_DIR "bin/html/"
#define PATH_TO_INDEX "bin/index/wordIDX"
#define PATH_TO_DOC_INDEX "bin/index/docIDX"
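// DocIndex presumably maps each word to the list of positions at which it
// appears in a parsed document; the spiders produce these and the Indexer
// thread consumes them off of IndexerQueue.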
using DocIndex = const unordered_map< string, vector< unsigned long > >;
int main( int argc, char *argv[] )
{
/*
 * Settings flags to control the program on startup,
 * read in via the command line with default values.
 *
 * string : Mode            : getting content from the web vs. local files
 * string : Seed            : filename of the list of starting URLs
 * int    : numberOfSpiders : number of spiders the crawler spawns
 * int    : numberOfParsers : number of parsers spawned
 * bool   : restoreFromLog  : whether the program should load from a saved state
 */
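// Example invocation (the binary name here is hypothetical):
//     ./crawler --mode=local --num_crawlers=4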
string mode = "web";        // crawl mode; "web" assumed as the default, overridden by --mode
int numberOfSpiders = 1;
opterr = true;
int choice;
int option_index = 0;
option long_options[] = {
    {"mode",         required_argument, nullptr, 'm'},
    {"num_crawlers", required_argument, nullptr, 'c'},
    {nullptr,        0,                 nullptr,  0 }   // terminating entry required by getopt_long
};
while ((choice = getopt_long(argc, argv, "m:c:", long_options, &option_index)) != -1) {
switch (choice) {
case 'm':
mode = optarg;
if (mode != "web" && mode != "local") {
    cerr << "Unknown mode: expected 'web' or 'local'\n";
    exit(1);
}
break;
case 'c':
numberOfSpiders = atoi(optarg);
if (numberOfSpiders < 1 || numberOfSpiders > 100) {
    cerr << "Number of crawlers must be between 1 and 100\n";
    exit(1);
}
break;
default:
cerr << "Unknown input option\n";
exit(1);
}
}
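// Shared state handed to the worker threads: a hash-keyed map for detecting
// duplicate URLs, the URL frontier the spiders pull work from, and the queue
// of parsed documents that feeds the Indexer. ProducerConsumerQueue is
// presumably a thread-safe blocking queue exposing Push() and Pop().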
unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );
ProducerConsumerQueue<ParsedUrl> *urlFrontier = new ProducerConsumerQueue<ParsedUrl>();
ProducerConsumerQueue< DocIndex* > *IndexerQueue = new ProducerConsumerQueue<DocIndex*>();
// getFileMap presumably returns the seed file's contents as a NUL-terminated
// character buffer that can be scanned line by line.
char *seeds;
if (mode == "local") {
    seeds = util::getFileMap("/tests/localSeed.txt");
} else {
    seeds = util::getFileMap("/tests/webSeed.txt");
    // OpenSSL only needs to be initialized when fetching pages over the network
    SSL_library_init();
}
// Walk the seed file character by character, pushing each URL onto the frontier.
string testFile = "";
while (*seeds) {
    if (*seeds == '\n') {
        // a complete URL has been read; wrap it and hand it to the frontier
        cout << "Pushing: " << testFile << " to queue\n";
        ParsedUrl url = ParsedUrl(testFile);
        urlFrontier->Push(url);
        testFile = "";
    } else {
        testFile.push_back(*seeds);
    }
    ++seeds;
}

// push the final URL if the seed file does not end with a newline
if (testFile != "") {
    cout << "Pushing: " << testFile << " to queue\n";
    ParsedUrl url = ParsedUrl(testFile);
    urlFrontier->Push(url);
}
unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( );
Indexer indexer(IndexerQueue);
indexer.StartThread();
Crawler crawler( mode, urlFrontier, IndexerQueue );
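// SpawnSpiders presumably launches numberOfSpiders threads that pop URLs from
// urlFrontier, fetch and parse the pages, and push the resulting DocIndex
// structures onto IndexerQueue, sharing docMapLookUp and duplicateUrlMap to
// avoid revisiting documents.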
crawler.SpawnSpiders(numberOfSpiders, docMapLookUp, duplicateUrlMap);
// Pop presumably blocks until a spider has pushed at least one URL back onto
// the frontier; after that the queue is torn down and main exits.
auto f = urlFrontier->Pop();
delete urlFrontier;

return 0;
}