Skip to content
Snippets Groups Projects
spider.cpp 5.09 KiB
Newer Older
  • Learn to ignore specific revisions
  • benbergk's avatar
    benbergk committed
    #include "spider.h"
    
    benbergk's avatar
    benbergk committed
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    
    vcday's avatar
    vcday committed
    #include "../shared/Document.h"
    
    benbergk's avatar
    benbergk committed
    #include "../parser/Parser.h"
    
    benbergk's avatar
    benbergk committed
    #include "Readers/HttpsReader.h"
    #include "Readers/HttpReader.h"
    #include "Readers/LocalReader.h"
    
    #include "../parser/Parser.h"
    
    jsclose's avatar
    jsclose committed
    #include "UrlFrontier.h"
    
    
    
    using DocIndex = const unordered_map< string, vector< unsigned long > >;
    
    benbergk's avatar
    benbergk committed
    
    
    benbergk's avatar
    benbergk committed
    // FIND A BETTER PLACE TO PUT THIS FUNCTION
    
    
    jsclose's avatar
    jsclose committed
    // FIND A BETTER PLACE TO PUT THIS FUNCTION

    /**
     * Factory: builds the StreamReader matching the crawl mode and URL scheme.
     *
     * @param url  parsed URL to read from (not owned; must outlive the reader)
     * @param mode "local" => LocalReader over the file path,
     *             "web"   => HttpReader / HttpsReader chosen by url->getService()
     * @return heap-allocated reader — CALLER OWNS and must delete —
     *         or nullptr if the mode / service type is unrecognized.
     */
    StreamReader *SR_factory ( ParsedUrl * url, string mode )
    	{
    	StreamReader *newReader = nullptr;

    	if ( mode == "local" )
    		{
    		newReader = new LocalReader( url->getCompleteUrl() );
    		}
    	else if ( mode == "web" )
    		{
    		if ( url->getService() == "http" )
    			{
    			newReader = new HttpReader( url );
    			}
    		else if ( url->getService() == "https" )
    			{
    			newReader = new HttpsReader( url );
    			}
    		else
    			{
    			// Unknown scheme: report and fall through returning nullptr.
    			cerr << "Error reading service type\n";
    			cerr << "Service Type: " << url->getService() << "\n";
    			}
    		}

    	return newReader;
    	}
    
    
    vcday's avatar
    vcday committed
    /**
     * Debug helper: dumps a document index to stdout,
     * one "word : pos1 pos2 ..." line per entry.
     *
     * @param dict word -> list of positions map (not owned, read-only)
     */
    void printDocIndex ( DocIndex *dict )
    	{
    	for ( auto it = dict->begin( ); it != dict->end( ); ++it )
    		{
    		cout << it->first << " : ";
    		// size_t index: it->second.size() is unsigned — avoids
    		// the signed/unsigned comparison the original int loop had.
    		for ( size_t i = 0; i < it->second.size( ); ++i )
    			{
    			cout << it->second[ i ] << " ";
    			}
    		cout << std::endl;
    		}
    	cout << std::endl;
    	}
    
    benbergk's avatar
    benbergk committed
    
    
    benbergk's avatar
    benbergk committed
    
    
    benbergk's avatar
    benbergk committed
    
    
    vcday's avatar
    vcday committed
    /**
     * djb2 string hash (http://www.cse.yorku.ca/~oz/hash.html).
     * Used to derive a document ID from a complete URL.
     *
     * @param s NUL-terminated string to hash
     * @return 64-bit-ish hash value (size_t)
     */
    size_t Spider::hash ( const char *s )
    	{
    	size_t hashValue = 5381;
    	// h = h * 33 + c for every character, same recurrence as
    	// the classic ((h << 5) + h) + c formulation.
    	for ( int ch = *s; ch != 0; ch = *++s )
    		hashValue = hashValue * 33 + ch;
    	return hashValue;
    	}
    
    
    jsclose's avatar
    jsclose committed
    /**
     * Pops the next URL to crawl off the shared frontier queue.
     *
     * @return next ParsedUrl* from the frontier (ownership per UrlFrontier::Pop)
     */
    ParsedUrl * Spider::getUrl ( )
    	{
    	ParsedUrl *next = urlFrontier->Pop( );
    	return next;
    	}
    
    jsclose's avatar
    jsclose committed
    
    
    vcday's avatar
    vcday committed
    void Spider::run ( )
    
    jsclose's avatar
    jsclose committed
    	{
    	std::cout << "Spider is crawling" << endl;
    
    	int cond = 0;
    
    jsclose's avatar
    jsclose committed
    		ParsedUrl * currentUrl = getUrl( );
    		size_t docID = hash( currentUrl->getCompleteUrl().c_str() );
    
    vcday's avatar
    vcday committed
    		if ( shouldURLbeCrawled( docID ) )
    
    			StreamReader *reader = SR_factory( currentUrl, this->mode );
    
    jsclose's avatar
    jsclose committed
    			if(reader)
    
    jsclose's avatar
    jsclose committed
    				bool success = reader->request( );
    				if ( success )
    
    jsclose's avatar
    jsclose committed
    					cout << "Parsing " << currentUrl->getCompleteUrl();
    
    jsclose's avatar
    jsclose committed
    					DocIndex *dict = parser.execute( reader );
    					IndexerQueue->Push( dict );
    
    
    jsclose's avatar
    jsclose committed
    					//printDocIndex(dict);
    
    jsclose's avatar
    jsclose committed
    					reader->closeReader( );
    					//delete dict;
    
    					cond++;
    
    jsclose's avatar
    jsclose committed
    
    
    			delete reader;
    
    	cout << "Spider has finished running " << endl;
    	return;
    
    /**
     * Shuts the spider down.
     *
     * NOTE(review): the body and closing brace were lost in this capture;
     * left as an explicit empty stub so the file parses — confirm the
     * intended shutdown logic (e.g. signalling the crawl loop to stop).
     */
    void Spider::kill()
    	{
    	}
    /*
    Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, check file on disk to see if its been crawled successfully
     if it has been indexed, (check last time index, maybe reindex?) return false (ignore this url)
    
     if its not in the docMapLookup, get the current size of the docMap on disk, then calculate new location for this docObjec
     create a doc object, find a new place, write the document contents to spot. Write the spot to the cache. Return true
    
    */
    
    
    /*
     * Takes in a parsed url,  creates a document object, writes information about the document to disk
     *  returns the begining position of the document on disk, stores that into the in memory lookup hash table
    */
    
    vcday's avatar
    vcday committed
    /**
     * Creates a Document for the url, appends it to the on-disk doc map,
     * and records its disk offset in the in-memory docMapLookup table.
     *
     * NOTE(review): opening brace, failure branch, and final return were
     * lost in this capture; the false/true returns below follow the
     * surrounding comments' description — confirm against repo history.
     *
     * @param url the parsed URL to persist
     * @return false if WriteToDocMap failed (-1), true on success
     */
    bool Spider::writeDocToDisk ( ParsedUrl url )
    	{
    	Document d( url );
    	int resultPosition = d.WriteToDocMap( );
    	if ( resultPosition == -1 )
    		{
    		return false;
    		}
    	this->docMapLookup->insert( std::pair< string, int >( url.CompleteUrl, resultPosition ) );

    	// Debug: dump the whole lookup table after each insert.
    	for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
    		std::cout << it->first << " => " << it->second << '\n';

    	return true;
    	}
    
    /*
     *
     * Takes a parsed url, checks if its in the local in memory hash table of documents return false
     * If url was crawled but past a certain point, reindexs or does not exist , indexes the doc
     * and returns true
     */
    
    vcday's avatar
    vcday committed
    /**
     * Decides whether a document (by hashed docID) should be crawled.
     *
     * The duplicate-detection logic is currently disabled (kept below as a
     * comment, as in the original): every URL is crawled unconditionally.
     *
     * @param docID hash of the complete URL
     * @return always true while dedup is disabled
     */
    bool Spider::shouldURLbeCrawled ( size_t docID )
    	{
    	/*
    	if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ) )
    		return false;          // already seen — skip
    	this->duplicateUrlMap->insert( std::make_pair( docID, 1 ) );
    	 */
    	return true;
    	}
    
    benbergk's avatar
    benbergk committed
    /*
    
    //check if path in url is in the robots txt
    
    vcday's avatar
    vcday committed
    bool Spider::checkRobots ( ParsedUrl url )
    
    vcday's avatar
    vcday committed
    	string pathToRobots = util::GetCurrentWorkingDir( ) + "/robots/" + string( url.Host, strlen( url.Host ) ) + ".txt";
    	int robotsFileD = util::getFileDescriptor( pathToRobots, "R" );
    
    	//File does not exist yet
    
    vcday's avatar
    vcday committed
    	if ( robotsFileD == -1 )
    
    vcday's avatar
    vcday committed
    		robotsFileD = getRobots( url );
    
    vcday's avatar
    vcday committed
    	char *robotsTXT = util::getFileMap( robotsFileD );
    
    	return 1;
    	}
    
    
    //Makes request to get a new robots txt file, returns the file pointer
    
    vcday's avatar
    vcday committed
    int Spider::getRobots ( ParsedUrl url )
    
    vcday's avatar
    vcday committed
    	string pathToDiskRobots =
    			util::GetCurrentWorkingDir( ) + "/robots/" + string( url.Host, strlen( url.Host ) ) + ".txt";
    	string pathToWebRobots = "https://" + string( url.Host, strlen( url.Host ) ) + "/robots.txt";
    
    	//string(url.Service, strlen(url.Service))+
    
    vcday's avatar
    vcday committed
    	SocketReader *reader = new SocketReader( pathToWebRobots );
    	reader->fillBuffer( );
    
    vcday's avatar
    vcday committed
    	if ( reader->buffer != NULL )
    
    vcday's avatar
    vcday committed
    		int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots );
    		if ( fd == -1 )
    
    jsclose's avatar
    jsclose committed
    			cerr << "Error getting Robots.txt file " << endl;
    
    		return fd;
    
    jsclose's avatar
    jsclose committed
    	cerr << "issue filling buffer from robots.txt" << endl;
    	return -1;
    
    
    benbergk's avatar
    benbergk committed
    */
    
    //request function that handles sending over get request via socket or trying to open file
    
    //Error handling
    
    
    
    jsclose's avatar
    jsclose committed