    //
    //  main.cpp
    //
    
    
    #include <iostream>
    #include <stdlib.h>
    
    #include <unistd.h>
    
    #include <pthread.h>
    #include <queue>
    
    #include "crawler/crawler.h"
    #include <string>
    
    //#include "crawler/CrawlerStatistics.h"
    
    #include <unordered_map>
    
    #include "util/util.h"
    
    #include <getopt.h>
    
    
    
    // Filesystem locations used by the crawler/indexer (not referenced in this
    // file yet). The previous form — `#define NAME = '...'` — was broken twice
    // over: the `=` became part of the macro expansion (a syntax error at any
    // use site), and single quotes produce multi-character literals of type
    // int, not strings. Proper object-like macros with string literals:
    #define PATH_TO_BLACKLIST "/bin/blacklist.txt"
    #define PATH_TO_VISITED_URL "bin/urls.txt"
    #define PATH_TO_HTML_DIR "bin/html/"
    #define PATH_TO_INDEX "bin/index/wordIDX"
    #define PATH_TO_DOC_INDEX "bin/index/docIDX"
    
    
    using namespace std;
    
    
    
    
    int main( int argc, char *argv[] )
    
    	{
    	/*
    	 *
    	 * Settings Flags to control program on start up
    	 * to be read in via command line with default settings
    	 *
    	 * string :  Mode : Getting content from the web vs local
    	 *
    	 * string : Seed : filename of list of starting urls
    	 *
    	 * int  : numberOfSpiders: # of spiders crawler spawns
    	 *
    	 * int  : numberOfParsers:  # of parsers  spawned
    	 *
    	 * bool : restoreFromLog: bool represeting if the program should load from saved state
    	 */
    
    jsclose's avatar
    jsclose committed
    	string mode = "web";
    
    benbergk's avatar
    benbergk committed
    	int numberOfSpiders = 1;
    
    	opterr = true;
    	int choice;
    	int option_index = 0;
    	option long_options[] = {
    			{"mode", optional_argument, nullptr, 'm'},
    			{"num_crawlers", optional_argument, nullptr, 'c'}
    
    	};
    
    	while ((choice = getopt_long(argc, argv, "m:c:", long_options, &option_index)) != -1) {
    		switch (choice) {
    			case 'm':
    
    				mode = optarg;
    				if (mode != "web" && mode != "local") {
    					cerr << "Unknown input option";
    					exit(1);
    				}
    				break;
    
    			case 'c':
    
    				numberOfSpiders = atoi(optarg);
    				if (numberOfSpiders > 100) {
    					cerr << "Too many crawlers!";
    					exit(1);
    				}
    				break;
    
    			default:
    				cerr << "Unknown input option";
    				exit(1);
    
    		}
    	}
    
    	bool restoreFromLog;
    
    	ProducerConsumerQueue < string > urlFrontier;
    
    	cout << "Pushed File\n";
    	char *seeds;
    	if ( mode == "local" )
    		seeds = util::getFileMap( "/tests/localSeed.txt" );
    	else
    		seeds = util::getFileMap( "/tests/webSeed.txt" );
    
    	string testFile;
    	while ( *seeds )
    		{
    		if ( *seeds == '\n')
    			{
    
    vcday's avatar
    vcday committed
    			cout << "Pushing to Url Frontier..." << endl;
    
    			urlFrontier.Push(testFile);
    			testFile = "";
    			}
    
    		else
    			testFile.push_back(*seeds);
    		++seeds;
    	}
    
    vcday's avatar
    vcday committed
    	cout << "Pushing to Url Frontier..." << endl;
    
    	urlFrontier.Push(testFile);
    //urlFrontier.Push("tests/store.html");
    
    unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( );
    
    Crawler crawler( mode, &urlFrontier );
    
    benbergk's avatar
    benbergk committed
    crawler.SpawnSpiders(numberOfSpiders , docMapLookUp);
    
    WaitOnAllSpiders();
    
    benbergk's avatar
    benbergk committed
    }