#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "Readers/HttpsReader.h"
#include "Readers/HttpReader.h"
#include "Readers/LocalReader.h"
using DocIndex = const unordered_map< string, vector< unsigned long > >;
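// DocIndex appears to map each word to the list of positions at which it
// occurs in a parsed document. A small illustrative sketch of building and
// reading one; the sample words and positions are made up, not from the
// project:
void DocIndexExample( )
    {
    unordered_map< string, vector< unsigned long > > raw;
    unsigned long pos = 0;
    for ( const string &word : { "cat", "dog", "cat" } )
        raw[ word ].push_back( pos++ );
    DocIndex *dict = &raw;    // raw["cat"] == { 0, 2 }, raw["dog"] == { 1 }
    ( void ) dict;
    }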
cerr << "Error reading service type\n";
for ( auto it = dict->begin( ); it != dict->end( ); ++it )
    {
    cout << it->first << " : ";
    for ( size_t i = 0; i < it->second.size( ); ++i )
        cout << it->second[ i ] << " ";
    cout << "\n";
    }
// djb2 string hash: http://www.cse.yorku.ca/~oz/hash.html
size_t hash( const char *s )
    {
    size_t h = 5381;
    int c;
    while ( ( c = *s++ ) )
        h = ( ( h << 5 ) + h ) + c;    // h * 33 + c
    return h;
    }
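// Quick usage sketch (the URL is hypothetical): equal strings always hash
// equally, which is what the duplicate-url check below relies on; distinct
// URLs can still collide, so a hash match is a strong hint, not proof of a
// duplicate.
size_t a = hash( "https://example.com/" );
size_t b = hash( "https://example.com/" );    // a == b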
while ( *alive )
    {
    ParsedUrl *currentUrl = getUrl( );
    size_t docID = hash( currentUrl->getCompleteUrl( ).c_str( ) );    // feeds the duplicate-url check
    StreamReader *reader = SR_factory( currentUrl, this->mode );
    DocIndex *dict = parser.execute( reader );
    IndexerQueue->Push( dict );
    }
cout << "Spider has finished running" << endl;
return;
// Kill switch, set from another thread: clears the shared flag so the crawl
// loop above exits after its current iteration
*( this->alive ) = false;
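// A minimal sketch of how the shared alive flag might be wired to a spider
// thread; the Spider constructor arguments and run method are assumptions
// pieced together from the fields used above, and urlFrontier is a
// hypothetical name (needs <thread>):
bool alive = true;
Spider spider( mode, urlFrontier, IndexerQueue, &alive );
thread t( &Spider::run, &spider );
// ... later, from the main thread:
alive = false;    // spider exits after finishing its current iteration
t.join( );
// A std::atomic< bool > would avoid the data race a plain bool has when it
// is written by one thread and read by another.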
/*
Takes a URL and hashes it. Checks if the url is in the docMapLookup; if it is, checks the file on disk to see whether it has been crawled successfully.
If it has been indexed (check last index time, maybe reindex?), returns false (ignore this url).
If it's not in the docMapLookup, gets the current size of the docMap on disk, then calculates a new location for this docObject:
creates a doc object, finds a new place, writes the document contents to that spot, writes the spot to the cache, and returns true.
*/
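// A skeleton of the decision flow this comment describes. The function name
// and the stubbed disk/staleness checks are assumptions; only hash,
// duplicateUrlMap, and docMapLookup appear in the fragments here:
bool Spider::shouldURLbeCrawled( ParsedUrl url )
    {
    size_t docID = hash( url.getCompleteUrl( ).c_str( ) );
    if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ) )
        return false;    // already seen during this crawl
    if ( this->docMapLookup->find( url.getCompleteUrl( ) ) != this->docMapLookup->end( ) )
        {
        // Known from a previous crawl: a full version would check the on-disk
        // docMap entry for crawl success and index age, maybe re-indexing.
        return false;
        }
    // New url: write the document to the docMap and record its offset in
    // docMapLookup (the commented-out block below sketches that step).
    return true;
    }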
/*
 * Takes in a parsed url, creates a document object, and writes information about the document to disk.
 * Returns the beginning position of the document on disk and stores it in the in-memory lookup hash table.
 */
/*
Document d( url );
int resultPosition = d.WriteToDocMap( );
if ( resultPosition == -1 )
    return;    // write failed; nothing to record
this->docMapLookup->insert( std::pair< string, int >( url.CompleteUrl, resultPosition ) );
for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
    cout << it->first << " : " << it->second << "\n";
*/
/*
 * Takes a parsed url; if it's already in the local in-memory hash table of documents, returns false.
 * If the url was crawled longer ago than a certain threshold, re-indexes it; if it does not exist yet, indexes the doc.
 * Either way it then returns true.
 */
if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ) )
    {
    return false;
    }
else
    {
    // Check whether the path in the url is disallowed by the host's robots.txt
    string pathToRobots = util::GetCurrentWorkingDir( ) + "/robots/" + string( url.Host, strlen( url.Host ) ) + ".txt";
    int robotsFileD = util::getFileDescriptor( pathToRobots, "R" );
    char *robotsTXT = util::getFileMap( robotsFileD );
    // robotsTXT is mapped but never scanned here; the sketch below shows the
    // missing disallow check.
    return true;
    }
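// A minimal sketch of the missing check: scan the mapped robots.txt for
// "Disallow:" prefixes and compare them against the url's path. It ignores
// User-agent sections, Allow rules, wildcards, and CRLF line endings, so it
// simplifies real robots.txt semantics; pathDisallowed is a hypothetical
// helper, not part of the project (needs <sstream>):
bool pathDisallowed( const char *robotsTXT, const string &path )
    {
    istringstream in( robotsTXT );
    string line;
    const string key = "Disallow:";
    while ( getline( in, line ) )
        {
        if ( line.compare( 0, key.size( ), key ) != 0 )
            continue;
        string prefix = line.substr( key.size( ) );
        size_t start = prefix.find_first_not_of( " \t" );
        if ( start == string::npos )
            continue;    // "Disallow:" with an empty path disallows nothing
        prefix = prefix.substr( start );
        if ( path.compare( 0, prefix.size( ), prefix ) == 0 )
            return true;    // path falls under a disallowed prefix
        }
    return false;
    }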
// Makes a request for a fresh robots.txt file; returns the descriptor of the
// copy written to disk, or -1 on failure (assumes fillBuffer signals failure
// by returning -1)
string pathToDiskRobots =
        util::GetCurrentWorkingDir( ) + "/robots/" + string( url.Host, strlen( url.Host ) ) + ".txt";
string pathToWebRobots = "https://" + string( url.Host, strlen( url.Host ) ) + "/robots.txt";
SocketReader *reader = new SocketReader( pathToWebRobots );
if ( reader->fillBuffer( ) != -1 )
    {
    int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots );
    if ( fd == -1 )
        cerr << "Error getting Robots.txt file" << endl;
    return fd;
    }
cerr << "Issue filling buffer from robots.txt" << endl;
return -1;
// Request function that handles sending a GET request over a socket or opening a local file
// Error handling
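// A minimal sketch of such a request function, dispatching between a network
// fetch and a local file via the factory above. Everything here is an
// assumption layered on the fragments in this file (SR_factory, fillBuffer,
// and the -1 failure convention), not the project's actual implementation:
StreamReader *requestDocument( ParsedUrl *url, string mode )
    {
    StreamReader *reader = SR_factory( url, mode );
    if ( reader == nullptr )
        {
        cerr << "Unknown service type, cannot request document\n";
        return nullptr;
        }
    if ( reader->fillBuffer( ) == -1 )
        {
        cerr << "Could not fill buffer for " << url->getCompleteUrl( ) << "\n";
        delete reader;
        return nullptr;
        }
    return reader;    // caller reads reader->buffer, then deletes the reader
    }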