diff --git a/crawler-parser-indexer-test b/crawler-parser-indexer-test index ce21248c3c1380ecdec7b889687283388a7482e3..3d6c5705e1f7c3c4d34e408f0ce3fd422b5b3187 100755 Binary files a/crawler-parser-indexer-test and b/crawler-parser-indexer-test differ diff --git a/crawler/UrlFrontier.cpp b/crawler/UrlFrontier.cpp index 8f3a75b0e299187e8c36f95e5e70a66c8e2236eb..f86d0b9029d71b6e105d878078734d9576e9d507 100644 --- a/crawler/UrlFrontier.cpp +++ b/crawler/UrlFrontier.cpp @@ -20,7 +20,15 @@ void UrlFrontier::checkUrl(ParsedUrl* url) //Looks to see if the complete url already exists, if so return if ( this->duplicateUrlMap->find( url->getCompleteUrl() ) != this->duplicateUrlMap->end( ) ) + { + //update the anchor text + pthread_mutex_lock( &m ); + (*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()]++; + pthread_mutex_unlock( &m ); + //add the new return ; + } + else { @@ -41,11 +49,21 @@ void UrlFrontier::checkUrl(ParsedUrl* url) } else + { + pthread_mutex_lock( &m ); this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time + pthread_mutex_unlock( &m ); + + + + } //add url to the duplicate url map - this->duplicateUrlMap->insert( url->getCompleteUrl( ) ); + pthread_mutex_lock( &m ); + (*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()] = 1; + pthread_mutex_unlock( &m ); + return; } } @@ -54,23 +72,26 @@ void UrlFrontier::checkUrl(ParsedUrl* url) void UrlFrontier::Push( ParsedUrl * url ) { //if the url has been seen? if so, dont add it + if(url->isValid) + { - checkUrl(url); + checkUrl(url); - //set the value of the last time the domain was seen to score - //url.setTime(difference); - //url.setScore(); - pthread_mutex_lock( &m ); + //set the value of the last time the domain was seen to score + //url.setTime(difference); + //url.setScore(); + pthread_mutex_lock( &m ); - queue.push( url ); + queue.push( url ); - if ( queue.size( ) == 1 ) - { - pthread_cond_broadcast( &consumer_cv ); - } + if ( queue.size( ) == 1 ) + { + pthread_cond_broadcast( &consumer_cv ); + } - pthread_mutex_unlock( &m ); + pthread_mutex_unlock( &m ); + } } diff --git a/crawler/UrlFrontier.h b/crawler/UrlFrontier.h index c8a48dee6c6bfa0508b959387c55d2d17e5b5a02..5e7390fab2157a147b151444fe5dcad891919d70 100644 --- a/crawler/UrlFrontier.h +++ b/crawler/UrlFrontier.h @@ -11,6 +11,9 @@ using namespace std; +typedef unordered_map<string, int> anchorToCountMap ; +typedef unordered_map<string , anchorToCountMap> urlMap; + class ComparisonClass { public: bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) { @@ -41,7 +44,7 @@ class UrlFrontier private: - set< string > *duplicateUrlMap = new set< string>( ); + urlMap *duplicateUrlMap = new urlMap; unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( ); }; diff --git a/shared/url.h b/shared/url.h index 14f0b34d6790c2f840f83d959d37f58372103add..0defb58bc3fb290303bffb453ca666c58e92df1f 100644 --- a/shared/url.h +++ b/shared/url.h @@ -32,6 +32,7 @@ private: public: + ParsedUrl() {} ParsedUrl ( string input_url ) @@ -129,6 +130,9 @@ public: setScore( ); } + else + isValid = false; + } @@ -237,6 +241,7 @@ public: delete[] pathBuffer; } + bool isValid = true; private: char *pathBuffer; }; \ No newline at end of file