Skip to content
Snippets Groups Projects
main.cpp 3.04 KiB
Newer Older
  • Learn to ignore specific revisions
  • benbergk's avatar
    benbergk committed
    //
    //  main.cpp
    //
    
    
    benbergk's avatar
    benbergk committed
    #include <iostream>
    #include <stdlib.h>
    
    benbergk's avatar
    benbergk committed
    #include <unistd.h>
    
    benbergk's avatar
    benbergk committed
    #include <pthread.h>
    #include <queue>
    
    #include "crawler/crawler.h"
    
    #include <openssl/ssl.h>
    
    #include <string>
    
    jsclose's avatar
    jsclose committed
    //#include "crawler/CrawlerStatistics.h"
    
    #include <unordered_map>
    
    #include "util/util.h"
    
    benbergk's avatar
    benbergk committed
    #include <getopt.h>
    
    #include "indexer/Indexer.h"
    
    
    // Filesystem locations used by the crawler and indexer. Each macro expands
    // to a string literal so it can be used directly wherever a path is expected.
    // NOTE(review): PATH_TO_BLACKLIST is the only path with a leading '/', i.e.
    // absolute rather than relative like the others — confirm this is intended.
    #define PATH_TO_BLACKLIST "/bin/blacklist.txt"
    #define PATH_TO_VISITED_URL "bin/urls.txt"
    #define PATH_TO_HTML_DIR "bin/html/"
    #define PATH_TO_INDEX "bin/index/wordIDX"
    #define PATH_TO_DOC_INDEX "bin/index/docIDX"
    
    
    // Read-only map from a string key to a list of unsigned long values; pointers
    // to it are what main( ) hands to the indexer queue. Names are std::-qualified
    // so the alias does not depend on a using-directive appearing earlier.
    using DocIndex = const std::unordered_map< std::string, std::vector< unsigned long > >;
    
    
    using namespace std;
    
    
    benbergk's avatar
    benbergk committed
    
    
    vcday's avatar
    vcday committed
    int main ( int argc, char *argv[] )
    
    	{
    	/*
    	 *
    	 * Settings Flags to control program on start up
    	 * to be read in via command line with default settings
    	 *
    	 * string :  Mode : Getting content from the web vs local
    	 *
    	 * string : Seed : filename of list of starting urls
    	 *
    	 * int  : numberOfSpiders: # of spiders crawler spawns
    	 *
    	 * int  : numberOfParsers:  # of parsers  spawned
    	 *
    	 * bool : restoreFromLog: bool represeting if the program should load from saved state
    	 */
    
    jsclose's avatar
    jsclose committed
    	string mode = "web";
    
    benbergk's avatar
    benbergk committed
    
    	opterr = true;
    	int choice;
    	int option_index = 0;
    	option long_options[] = {
    
    vcday's avatar
    vcday committed
    			{ "mode",         optional_argument, nullptr, 'm' },
    			{ "num_crawlers", optional_argument, nullptr, 'c' }
    
    vcday's avatar
    vcday committed
    	while ( ( choice = getopt_long( argc, argv, "m:c:", long_options, &option_index ) ) != -1 )
    		{
    		switch ( choice )
    			{
    
    benbergk's avatar
    benbergk committed
    			case 'm':
    
    				mode = optarg;
    
    vcday's avatar
    vcday committed
    				if ( mode != "web" && mode != "local" )
    					{
    
    benbergk's avatar
    benbergk committed
    					cerr << "Unknown input option";
    
    vcday's avatar
    vcday committed
    					exit( 1 );
    					}
    
    benbergk's avatar
    benbergk committed
    				break;
    
    			case 'c':
    
    
    vcday's avatar
    vcday committed
    				numberOfSpiders = atoi( optarg );
    				if ( numberOfSpiders > 100 )
    					{
    
    benbergk's avatar
    benbergk committed
    					cerr << "Too many crawlers!";
    
    vcday's avatar
    vcday committed
    					exit( 1 );
    					}
    
    benbergk's avatar
    benbergk committed
    				break;
    
    			default:
    				cerr << "Unknown input option";
    
    vcday's avatar
    vcday committed
    				exit( 1 );
    
    benbergk's avatar
    benbergk committed
    
    
    vcday's avatar
    vcday committed
    			}
    
    benbergk's avatar
    benbergk committed
    		}
    
    	bool restoreFromLog;
    
    vcday's avatar
    vcday committed
    	unordered_map< size_t, int > *duplicateUrlMap = new unordered_map< size_t, int >( );
    
    vcday's avatar
    vcday committed
    	ProducerConsumerQueue< ParsedUrl > *urlFrontier = new ProducerConsumerQueue< ParsedUrl >( );
    	ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
    
    vcday's avatar
    vcday committed
    	if ( mode == "local" )
    		seeds = util::getFileMap( "/tests/localSeed.txt" );
    	else
    		{
    		seeds = util::getFileMap( "/tests/webSeed.txt" );
    
    vcday's avatar
    vcday committed
    		}
    
    vcday's avatar
    vcday committed
    	while ( *seeds )
    		{
    		if ( *seeds == '\n' )
    			{
    
    vcday's avatar
    vcday committed
    			ParsedUrl url = ParsedUrl( testFile );
    
    			cout << "Pushing: " << testFile << " to queue\n";
    
    vcday's avatar
    vcday committed
    			urlFrontier->Push( url );
    
    vcday's avatar
    vcday committed
    			}
    		else
    			testFile.push_back( *seeds );
    
    vcday's avatar
    vcday committed
    		}
    	if ( testFile != "" )
    		{
    
    		cout << "Pushing: " << testFile << " to queue\n";
    
    vcday's avatar
    vcday committed
    		ParsedUrl url = ParsedUrl( testFile );
    		urlFrontier->Push( url );
    		}
    
    vcday's avatar
    vcday committed
    	Indexer indexer( IndexerQueue );
    	indexer.StartThread( );
    
    vcday's avatar
    vcday committed
    	Crawler crawler( mode, urlFrontier, IndexerQueue );
    
    	crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap );
    
    vcday's avatar
    vcday committed
    	crawler.WaitOnAllSpiders( );
    	indexer.WaitForFinish( );
    
    vcday's avatar
    vcday committed
    	auto f = urlFrontier->Pop( );
    
    	int x = 0;
    	delete urlFrontier;
    
    vcday's avatar
    vcday committed
    	}