Commit 32fcdac0 authored by jsclose

working on shared docMap and docMapLookup

parent 1cddffb7
@@ -3,13 +3,13 @@
//
#include "crawler.h"
#include <unordered_map>

- void Crawler::SpawnSpiders( size_t num_spiders )
+ void Crawler::SpawnSpiders( size_t num_spiders, unordered_map<string, int> *docMapLookup )
{
	for ( size_t i = 0; i < num_spiders; i++ )
	{
-		Spider *temp = new Spider( this->mode, this->urlFrontier);
+		Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup);
		temp->StartThread( );
		this->spiders.push_back( temp );
	}
......
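A note on what this change shares: every Spider thread now receives the same unordered_map through the docMapLookup pointer, but std::unordered_map is not safe for concurrent readers and writers, and nothing in this commit synchronizes access yet. A minimal sketch of one way to guard the shared lookup, assuming a std::mutex; the DocMapLookup wrapper and its method names are hypothetical, not part of this commit:

#include <mutex>
#include <string>
#include <unordered_map>

// Hypothetical thread-safe wrapper around the shared url -> offset lookup.
class DocMapLookup
{
public:
	// Returns true and fills location if the URL is already known.
	bool Find( const std::string &url, int &location )
	{
		std::lock_guard<std::mutex> guard( lock );
		auto it = map.find( url );
		if ( it == map.end( ) )
			return false;
		location = it->second;
		return true;
	}

	// Records the on-disk location of a newly written document.
	void Insert( const std::string &url, int location )
	{
		std::lock_guard<std::mutex> guard( lock );
		map[ url ] = location;
	}

private:
	std::mutex lock;
	std::unordered_map<std::string, int> map;
};

Passing a pointer to one such wrapper into each Spider keeps the sharing model of this commit while making lookups and insertions atomic.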
@@ -4,6 +4,8 @@
#include "spider.h"
#include <string>
#include "../shared/ProducerConsumerQueue.h"
#include <unordered_map>
//#include "CrawlerStatistics.h"

/*
 *
@@ -19,7 +21,7 @@ public:
	{ };

	// spawns a number of worker spiders
-	void SpawnSpiders( size_t num_spiders );
+	void SpawnSpiders( size_t num_spiders, unordered_map<string, int> *docMapLookup );

	// creates a housekeeping thread
	void houseKeeper();
......
@@ -28,6 +28,9 @@ void Spider::FuncToRun()
	string currentUrl = getUrl( );
	char *fileMap;

-	if ( request( currentUrl, fileMap ))
+	// only fetch and process the page if the docMap check says to
+	if ( shouldURLbeCrawled( currentUrl ) && request( currentUrl, fileMap ))
	{
		// markURLSeen( currentUrl );
@@ -41,7 +44,65 @@ void Spider::FuncToRun()
	}
}
/*
 * Takes a URL and hashes it, then checks whether the URL is in the docMapLookup.
 * If it is, check the file on disk to see whether it has been crawled
 * successfully; if it has already been indexed (check the time of the last
 * index, maybe reindex?), return false and ignore this URL.
 * If it is not in the docMapLookup, get the current size of the docMap on
 * disk, calculate a new location for this doc object, create the doc object,
 * write the document contents to that spot, record the spot in the cache,
 * and return true.
 */
bool Spider::shouldURLbeCrawled( string url )
{
	auto locationOnDisk = this->docMapLookup->find( url );

	if ( locationOnDisk == this->docMapLookup->end( ) )
	{
		cerr << "Url not found in cache lookup" << endl;

		// FIXME: hardcoded path to the docMap on disk
		int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt" );
		if ( file != -1 )   // open( ) returns -1 on failure, so testing the fd for truth lets errors through
		{
			size_t seekPosition = FileSize( file );
			off_t resultPosition = lseek( file, seekPosition, SEEK_SET );
			if ( resultPosition == -1 )
			{
				cerr << "Could not seek to " << seekPosition
						<< ", error = " << errno << endl;
				return false;   // a nonzero errno would convert to true, so fail explicitly
			}
			cout << "Current docMap position on disk: " << resultPosition << endl;
			return true;   // new URL: crawl it
		}
		else
		{
			cerr << "Error opening docMap" << endl;
			return false;
		}
	}

	cout << locationOnDisk->first << " is " << locationOnDisk->second << endl;
	return false;   // already in the lookup: skip this URL
}
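As the function stands, the not-found branch only seeks to the end of the docMap; the steps the comment above promises (creating the doc object, writing it at the new spot, and recording that spot in the cache) are still missing. A rough sketch of that tail under two stated assumptions: the one-line-per-URL record is a stand-in for real Document serialization, and the file must be opened for appending, since getFileDescriptor( ) opens O_RDONLY:

#include <fcntl.h>
#include <string>
#include <unistd.h>
#include <unordered_map>
using namespace std;

// Hypothetical helper: append a new entry to the docMap on disk and record
// its offset in the shared lookup.  Returns true if the URL is new and
// should now be crawled.
bool addToDocMap( const string &url, const string &docMapPath,
		unordered_map<string, int> *docMapLookup )
{
	int fd = open( docMapPath.c_str( ), O_WRONLY | O_APPEND );
	if ( fd == -1 )
		return false;

	off_t offset = lseek( fd, 0, SEEK_END );   // where this entry will start
	string entry = url + '\n';                 // stand-in for a serialized Document
	bool ok = write( fd, entry.c_str( ), entry.size( ) )
			== ( ssize_t )entry.size( );
	if ( ok )
		( *docMapLookup )[ url ] = ( int )offset;   // cache the on-disk spot
	close( fd );
	return ok;
}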
/*
 * Returns true if the fileMap was created, otherwise false.
 * Sets fileMap to point at the contents of the file fetched for the URL.
 */
bool Spider::request( string url, char *&fileMap )   // by reference, so the caller sees the mapping
{
	string localFile;
......
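The body of request( ) is truncated here, but its contract is to hand the page contents back through fileMap. For the local-file mode hinted at by the localFile variable, the shared getFileMap( ) helper presumably does something similar to the sketch below, which maps a file into memory with mmap; the mapLocalFile name is hypothetical:

#include <fcntl.h>
#include <string>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
using namespace std;

// Hypothetical sketch of the local-file path through request( ): map the
// file into memory and return the mapping through the reference parameter.
bool mapLocalFile( const string &localFile, char *&fileMap )
{
	int fd = open( localFile.c_str( ), O_RDONLY );
	if ( fd == -1 )
		return false;

	struct stat st;
	if ( fstat( fd, &st ) == -1 || st.st_size == 0 )
	{
		close( fd );
		return false;
	}

	void *map = mmap( nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0 );
	close( fd );   // the mapping stays valid after the descriptor is closed
	if ( map == MAP_FAILED )
		return false;

	fileMap = static_cast<char *>( map );
	return true;
}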
@@ -7,7 +7,7 @@
#include "../shared/ProducerConsumerQueue.h"
#include "../shared/ThreadClass.h"
#include <iostream>
#include <unordered_map>

using namespace std;

class Spider : public ThreadClass
@@ -15,8 +15,8 @@ class Spider : public ThreadClass
public:

-	Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in )
-			: mode( mode_in ), urlFrontier( url_q_in )
+	Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in, unordered_map<string, int> *doc_map_lookup_in )
+			: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in )
	{ };

@@ -39,11 +39,15 @@ public:
	void markURLSeen( string URL );

	bool shouldURLbeCrawled( string URL );

private:
	int locationOnDisk;
	ProducerConsumerQueue < string > *urlFrontier;
	string mode;
	unordered_map<string, int> *docMapLookup;
};
\ No newline at end of file
@@ -12,7 +12,7 @@
#include "crawler/crawler.h"
#include <string>
//#include "crawler/CrawlerStatistics.h"
#include <unordered_map>

#define PATH_TO_BLACKLIST = '/bin/blacklist.txt'
#define PATH_TO_VISITED_URL = 'bin/urls.txt'
@@ -58,10 +58,13 @@ int main(int argc, const char * argv[])
	cout << "Pushed File\n";
	urlFrontier.Push("tests/cats.html");

	unordered_map<string, int> *docMapLookUp = new unordered_map<string, int>();

	Crawler crawler(mode, &urlFrontier);

-	crawler.SpawnSpiders(1);
+	crawler.SpawnSpiders(1, docMapLookUp);

	crawler.WaitOnAllSpiders();
......
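One wrinkle in this hunk: docMapLookUp is allocated with new and never deleted. Since main( ) owns the map for the crawler's whole lifetime, a stack object would work just as well and cannot leak; a minimal alternative, assuming SpawnSpiders keeps its pointer parameter:

unordered_map<string, int> docMapLookUp;    // stack-owned, freed automatically

Crawler crawler(mode, &urlFrontier);
crawler.SpawnSpiders(1, &docMapLookUp);     // pass the address instead
crawler.WaitOnAllSpiders();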
@@ -12,8 +12,8 @@ class Document
{
public:
	Url url;
-	long docID;
	string domain;
+	long docID;

	bool lastCrawlStatus;
	int lastCrawlDate;
	int lastCrawlPageCount;
@@ -30,6 +30,15 @@ size_t FileSize( int f )
 *
 */
int getFileDescriptor( string fileName )
{
	return open( fileName.c_str( ), O_RDONLY );
}

char *getFileMap( string fileName )
{
@@ -92,4 +101,7 @@ int writeToNewFileToLocation( char *fileContents, string locationOnDisk )
	return fd;
-}
\ No newline at end of file
+}