Commit 8364f3db authored by jsclose

modifying duplicate url
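Thread a shared duplicate-URL map, keyed by the hash of each complete URL, from main() through Crawler::SpawnSpiders into every Spider, so a spider skips any URL whose hash it has already seen.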

parent 8b53e797
@@ -4,11 +4,11 @@
 #include "crawler.h"
-void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup )
+void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup, unordered_map < size_t, int > *duplicateUrlMap )
 	{
 	for ( size_t i = 0; i < num_spiders; i++ )
 		{
-		Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup );
+		Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap );
 		temp->StartThread( );
 		this->spiders.push_back( temp );
 		}
@@ -21,7 +21,7 @@ public:
 	{ };
 	//spawns a number of workers
-	void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup );
+	void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup, unordered_map < size_t, int > *duplicateUrlMap );
 	//Creates a housekeeping thread
 	void houseKeeper();
@@ -60,6 +60,15 @@ void Spider::FuncToRun()
 	StreamReader *reader = request( currentUrl );
+	size_t docID = hash( currentUrl.CompleteUrl );
+	if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ) )
+		{
+		continue;
+		}
+	else
+		{
+		this->duplicateUrlMap->insert( std::make_pair( docID, 1 ) );
+		}
 	string localPath = util::GetCurrentWorkingDir( );
 	// don't include debug in file path
 	auto debug = findPrev( "cmake-build-debug", localPath.begin( ) + localPath.size( ) - 1, localPath.begin( ) );
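The hunk above hashes the complete URL into a docID and consults the shared duplicateUrlMap: a hit means the URL was already fetched and the spider moves on to the next URL; a miss records the hash before crawling. Below is a minimal, self-contained sketch of that pattern, assuming hash(...) is std::hash<string> over the URL string; the project's Spider and ParsedUrl types are omitted, and alreadyCrawled is an illustrative name, not the project's API. One consequence of keying on the hash alone: two distinct URLs whose hashes collide would be treated as duplicates, which keying the map on the URL string itself would avoid at the cost of memory.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

using namespace std;

// Sketch of the duplicate check: hash the full URL and consult a
// shared seen-map before fetching. Returns true when the URL's hash
// has been recorded before.
bool alreadyCrawled( const string &completeUrl,
                     unordered_map< size_t, int > *duplicateUrlMap )
	{
	size_t docID = hash< string >( )( completeUrl );
	if ( duplicateUrlMap->find( docID ) != duplicateUrlMap->end( ) )
		return true;                                   // seen before: skip
	duplicateUrlMap->insert( make_pair( docID, 1 ) );  // first visit: record
	return false;
	}

int main( )
	{
	unordered_map< size_t, int > seen;
	cout << alreadyCrawled( "http://example.com/a", &seen ) << endl;  // 0 (new)
	cout << alreadyCrawled( "http://example.com/a", &seen ) << endl;  // 1 (duplicate)
	return 0;
	}
```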
@@ -75,6 +84,8 @@ void Spider::FuncToRun()
 	auto dict = parser.execute ( &document );
+	cout << "docID: " << docID << endl;
 	for ( auto it = dict->begin( ); it != dict->end( ); it++ )
 		{
 		cout << it->first << " : ";
@@ -141,6 +152,8 @@ bool Spider::writeDocToDisk(ParsedUrl url)
 bool Spider::shouldURLbeCrawled( ParsedUrl url )
 	{
 	//search for url in doc cache
 	auto locationOnDisk = this->docMapLookup->find( url.CompleteUrl );
 	//bool protectedByRobots = checkRobots( url );
@@ -24,8 +24,8 @@ class Spider : public ThreadClass
 public:
 	Spider( string mode_in, ProducerConsumerQueue < ParsedUrl > *url_q_in,
-			unordered_map < string, int > *doc_map_lookup_in )
-			: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( url_q_in )
+			unordered_map < string, int > *doc_map_lookup_in, unordered_map < size_t, int > *duplicate_url_map_in )
+			: mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( url_q_in ), duplicateUrlMap( duplicate_url_map_in )
 	{ };
@@ -52,6 +52,7 @@ private:
 	int locationOnDisk;
 	ProducerConsumerQueue < ParsedUrl > *urlFrontier;
+	unordered_map < size_t, int > *duplicateUrlMap;
 	string mode;
 	unordered_map < string, int > *docMapLookup;
 	Parser parser;
@@ -90,6 +90,8 @@ int main( int argc, char *argv[] )
 	ProducerConsumerQueue < ParsedUrl > urlFrontier;
+	unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );
 	cout << "Pushed File\n";
 	char *seeds;
@@ -123,7 +125,7 @@ unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( );
 	Crawler crawler( mode, &urlFrontier );
-	crawler.SpawnSpiders( numberOfSpiders, docMapLookUp );
+	crawler.SpawnSpiders( numberOfSpiders, docMapLookUp, duplicateUrlMap );
 	crawler.
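One caveat: every Spider thread shares this single duplicateUrlMap, and std::unordered_map does not support concurrent modification, so unsynchronized inserts from multiple spiders are a data race. Below is a hedged sketch of one way to guard the map with a mutex; the DedupGuard name and checkAndMark method are illustrative, not part of this codebase.

```cpp
#include <functional>
#include <mutex>
#include <string>
#include <unordered_map>

using namespace std;

// Illustrative guard around the shared seen-map: every spider thread
// calls checkAndMark() before fetching a URL. Returns true the first
// time a URL hash is seen, false on any later sighting.
class DedupGuard
	{
public:
	bool checkAndMark( const string &completeUrl )
		{
		size_t docID = hash< string >( )( completeUrl );
		lock_guard< mutex > guard( m );
		// insert() is a no-op when the key already exists; .second
		// reports whether the insert happened, i.e. the URL is new.
		return seen.insert( make_pair( docID, 1 ) ).second;
		}

private:
	mutex m;
	unordered_map< size_t, int > seen;
	};
```

A spider would then replace the find/insert pair with a single checkAndMark call, which also closes the check-then-act window between the lookup and the insert.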