Skip to content
Snippets Groups Projects
Commit 32fcdac0 authored by jsclose's avatar jsclose
Browse files

working on shared docMap and docMapLookup

parent 1cddffb7
Branches front-end
No related tags found
No related merge requests found
...@@ -3,13 +3,13 @@ ...@@ -3,13 +3,13 @@
// //
#include "crawler.h" #include "crawler.h"
#include <unordered_map>
void Crawler::SpawnSpiders( size_t num_spiders, unordered_map<string, int> *docMapLookup )
void Crawler::SpawnSpiders( size_t num_spiders )
{ {
for ( size_t i = 0; i < num_spiders; i++ ) for ( size_t i = 0; i < num_spiders; i++ )
{ {
Spider *temp = new Spider( this->mode, this->urlFrontier); Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup);
temp->StartThread( ); temp->StartThread( );
this->spiders.push_back( temp ); this->spiders.push_back( temp );
} }
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include "spider.h" #include "spider.h"
#include<string> #include<string>
#include "../shared/ProducerConsumerQueue.h" #include "../shared/ProducerConsumerQueue.h"
#include <unordered_map>
//#include "CrawlerStatistics.h" //#include "CrawlerStatistics.h"
/* /*
* *
...@@ -19,7 +21,7 @@ public: ...@@ -19,7 +21,7 @@ public:
{ }; { };
//spawns a number of worker spiders //spawns a number of worker spiders
void SpawnSpiders( size_t num_spiders ); void SpawnSpiders( size_t num_spiders, unordered_map<string, int> *docMapLookup);
//Creates a housekeeping thread //Creates a housekeeping thread
void houseKeeper(); void houseKeeper();
......
...@@ -28,6 +28,9 @@ void Spider::FuncToRun() ...@@ -28,6 +28,9 @@ void Spider::FuncToRun()
string currentUrl = getUrl( ); string currentUrl = getUrl( );
char *fileMap; char *fileMap;
shouldURLbeCrawled( currentUrl );
if ( request( currentUrl, fileMap )) if ( request( currentUrl, fileMap ))
{ {
// markURLSeen( currentUrl ); // markURLSeen( currentUrl );
...@@ -41,7 +44,65 @@ void Spider::FuncToRun() ...@@ -41,7 +44,65 @@ void Spider::FuncToRun()
} }
} }
/*
 * Decides whether `url` should be crawled.
 *
 * Looks the URL up in the in-memory docMapLookup cache:
 *   - miss: opens the on-disk docMap, seeks to its end to find where a new
 *     document entry would be written, reports that position, and returns
 *     true (unseen URL — crawl it).
 *   - hit: the URL is already tracked; prints its cached location and
 *     returns false (skip it).
 *
 * NOTE(review): the docMap path is hard-coded to a developer machine —
 * it should come from configuration.
 */
bool Spider::shouldURLbeCrawled( string url )
	{
	auto locationOnDisk = this->docMapLookup->find( url );

	if ( locationOnDisk == this->docMapLookup->end( ) )
		{
		cerr << "Url Not Found In Cache Lookup";

		int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt" );

		// open(2) returns -1 on failure; the previous `if ( file )` check
		// treated -1 as success because it is non-zero.
		if ( file == -1 )
			{
			cerr << "Error opening docMap" << endl;
			return false;
			}

		size_t seekPosition = FileSize( file );
		off_t resultPosition = lseek( file, seekPosition, SEEK_SET );

		if ( resultPosition == -1 )
			{
			// Previously `return errno;` — a truthy garbage value in a
			// bool function. Report and fail instead.
			cerr << "Could not seek to " << seekPosition <<
				", error = " << errno;
			close( file );   // don't leak the descriptor on the error path
			return false;
			}

		cout << "Current docMap position on disk" << endl;
		cout << resultPosition << endl;

		close( file );   // original leaked this descriptor
		return true;     // unseen URL — should be crawled
		}

	// Cache hit: URL already tracked; skip re-crawling it.
	// (Previously this path fell off the end of a bool function — UB.)
	std::cout << locationOnDisk->first << " is " << locationOnDisk->second;
	return false;
	}
/*
returns true if fileMap was created, otherwise false
On success, points fileMap at an in-memory copy of the document fetched from the given URL
*/
bool Spider::request( string url, char *fileMap ) bool Spider::request( string url, char *fileMap )
{ {
string localFile; string localFile;
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
#include "../shared/ProducerConsumerQueue.h" #include "../shared/ProducerConsumerQueue.h"
#include "../shared/ThreadClass.h" #include "../shared/ThreadClass.h"
#include<iostream> #include<iostream>
#include <unordered_map>
using namespace std; using namespace std;
class Spider : public ThreadClass class Spider : public ThreadClass
...@@ -15,8 +15,8 @@ class Spider : public ThreadClass ...@@ -15,8 +15,8 @@ class Spider : public ThreadClass
public: public:
Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in ) Spider( string mode_in, ProducerConsumerQueue < string > *url_q_in , unordered_map<string, int> *doc_map_lookup_in)
: mode( mode_in ), urlFrontier( url_q_in ) : mode( mode_in ), urlFrontier( url_q_in ) , docMapLookup(doc_map_lookup_in)
{ }; { };
...@@ -39,11 +39,15 @@ public: ...@@ -39,11 +39,15 @@ public:
void markURLSeen( string URL ); void markURLSeen( string URL );
bool shouldURLbeCrawled( string URL );
private: private:
int locationOnDisk; int locationOnDisk;
ProducerConsumerQueue < string > *urlFrontier; ProducerConsumerQueue < string > *urlFrontier;
string mode; string mode;
unordered_map<string, int> *docMapLookup;
}; };
\ No newline at end of file
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "crawler/crawler.h" #include "crawler/crawler.h"
#include <string> #include <string>
//#include "crawler/CrawlerStatistics.h" //#include "crawler/CrawlerStatistics.h"
#include <unordered_map>
#define PATH_TO_BLACKLIST = '/bin/blacklist.txt' #define PATH_TO_BLACKLIST = '/bin/blacklist.txt'
#define PATH_TO_VISITED_URL = 'bin/urls.txt' #define PATH_TO_VISITED_URL = 'bin/urls.txt'
...@@ -58,10 +58,13 @@ int main(int argc, const char * argv[]) ...@@ -58,10 +58,13 @@ int main(int argc, const char * argv[])
cout << "Pushed File\n"; cout << "Pushed File\n";
urlFrontier.Push("tests/cats.html"); urlFrontier.Push("tests/cats.html");
unordered_map<string, int>* docMapLookUp = new unordered_map<string, int>();
Crawler crawler(mode, &urlFrontier); Crawler crawler(mode, &urlFrontier);
crawler.SpawnSpiders(1); crawler.SpawnSpiders(1 , docMapLookUp);
crawler.WaitOnAllSpiders(); crawler.WaitOnAllSpiders();
......
...@@ -12,8 +12,8 @@ class Document ...@@ -12,8 +12,8 @@ class Document
{ {
public: public:
Url url; Url url;
long docID;
string domain; string domain;
long docID;
bool lastCrawlStatus; bool lastCrawlStatus;
int lastCrawlDate; int lastCrawlDate;
int lastCrawlPageCount; int lastCrawlPageCount;
......
...@@ -30,6 +30,15 @@ size_t FileSize( int f ) ...@@ -30,6 +30,15 @@ size_t FileSize( int f )
* *
*/ */
// Opens `fileName` read-only and hands back the raw descriptor.
// Mirrors open(2): the result is -1 when the file cannot be opened.
int getFileDescriptor( string fileName )
{
	const char *path = fileName.c_str( );
	const int fd = open( path, O_RDONLY );
	return fd;
}
char *getFileMap( string fileName ) char *getFileMap( string fileName )
{ {
...@@ -92,4 +101,7 @@ int writeToNewFileToLocation( char *fileContents, string locationOnDisk ) ...@@ -92,4 +101,7 @@ int writeToNewFileToLocation( char *fileContents, string locationOnDisk )
return fd; return fd;
} }
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment