diff --git a/crawler-parser-indexer-test b/crawler-parser-indexer-test index 7da485f92206770d76f92362597fe6f3e9d1fce9..0b43ea7e52c9f0f361ff84e888815ca11945a899 100755 Binary files a/crawler-parser-indexer-test and b/crawler-parser-indexer-test differ diff --git a/crawler/UrlFrontier.cpp b/crawler/UrlFrontier.cpp index d19db50e701593bf9cd80ec57b0ffe19e0604cc7..de1f137b584e489f019f9b6c82dd390ae6c6617a 100644 --- a/crawler/UrlFrontier.cpp +++ b/crawler/UrlFrontier.cpp @@ -32,11 +32,11 @@ void UrlFrontier::checkUrl(ParsedUrl* url) { //get the last time it was seen and find the time difference time_t lastSeen = this->domainMap->at( url->getHost( )); - difference = difftime( now ,lastSeen); + difference = difftime( now , lastSeen); if(difference == 0) - difference = 5; + difference = 5 ; else - difference = 1/difference; + difference = difference/10; url->updateScore( difference ); } @@ -45,7 +45,7 @@ void UrlFrontier::checkUrl(ParsedUrl* url) //add url to the duplicate url map - this->duplicateUrlMap->insert( std::make_pair( url->getCompleteUrl( ), 1 )); + this->duplicateUrlMap->insert( url->getCompleteUrl( ) ); return; } } diff --git a/crawler/UrlFrontier.h b/crawler/UrlFrontier.h index 1b2fe3c133a0f9454ac9a661b82a34879234a093..c8a48dee6c6bfa0508b959387c55d2d17e5b5a02 100644 --- a/crawler/UrlFrontier.h +++ b/crawler/UrlFrontier.h @@ -7,6 +7,7 @@ #include "../shared/url.h" #include <time.h> #include <unordered_map> +#include <set> using namespace std; @@ -40,7 +41,7 @@ class UrlFrontier private: - unordered_map< string , bool > *duplicateUrlMap = new unordered_map< string, bool >( ); + set< string > *duplicateUrlMap = new set< string>( ); unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( ); }; diff --git a/crawler/savedQueue.txt b/crawler/savedQueue.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a9adca9644199683ba695128a29cad8991d536ba 100755 --- a/crawler/savedQueue.txt +++ b/crawler/savedQueue.txt @@ -0,0 +1 @@ +2018-03-29.11:05:59 \ No newline at end of file diff --git a/crawler/tests/crawlerTest.cpp b/crawler/tests/crawlerTest.cpp index e0928810890a1f055ae5848a03e3c529d8076e05..a173ab39f5a482eed0d8c828c09086993cd46fe3 100644 --- a/crawler/tests/crawlerTest.cpp +++ b/crawler/tests/crawlerTest.cpp @@ -63,9 +63,9 @@ int main ( int argc, char *argv[] ) string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1"; string bad_url2 ="http-equiv=Content-Type"; - string bad_url3 = "\"http-equiv=\\\"refresh\\\" content=\\\"1;url=/2.73.0/static/unsupp.html\\\" /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>\"; + string bad_url3 = "http-equiv=refresh content=1;url=/2.73.0/static/unsupp.html /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>"; //ParsedUrl url = ParsedUrl(bad_url); - ParsedUrl url1 = ParsedUrl(bad_url); + ParsedUrl url1 = ParsedUrl(bad_url3); ParsedUrl url2 = ParsedUrl(bad_url2); urlFrontier->Push(&url1); diff --git a/isolated-integration b/isolated-integration new file mode 100755 index 0000000000000000000000000000000000000000..4c37366b7d485188078c3919541e0661abfc07f9 Binary files /dev/null and b/isolated-integration differ diff --git a/main.cpp b/main.cpp index 20b1c87d1bb29de91377cea2817f3c0a9619f365..c2620a2845950e874abb2f51595f780e50c61b81 100644 --- a/main.cpp +++ b/main.cpp @@ -176,7 +176,7 @@ int main ( int argc, char *argv[] ) crawler->SpawnSpiders( numberOfSpiders ); HouseKeeper logger( crawler ); - logger.StartThread( ); + //logger.StartThread( ); string input; while(true) diff --git a/shared/url.h b/shared/url.h index fb4d4fe55faccbd1b828b73f5bde9efecc116762..8bc51d819905c14c16f5b70f985b6d0b0d2dc2d3 100644 --- a/shared/url.h +++ b/shared/url.h @@ -90,7 +90,7 @@ public: //char * domainBuffer = new char[ 20 ]; //get the domain: char *i = temp_Host; - //temp_Domain = null; + temp_Domain = nullptr; if(i) { for ( ; *i; i++ ) @@ -116,7 +116,7 @@ public: CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); Service = string(temp_Service, strlen(temp_Service)); Host = string(temp_Host, strlen(temp_Host)); - if( sizeof( temp_Domain) > 0 ) + if( temp_Domain != nullptr ) Domain = string(temp_Domain, strlen(temp_Domain)); Path = string(temp_Path, strlen(temp_Path)); @@ -211,7 +211,7 @@ public: void updateScore( double time ) { - Score += time; + Score += 3 * time; } std::string getAnchorText ( ) diff --git a/tests/webSeed.txt b/tests/webSeed.txt index 9ba6be04f21fc2426fff6dc8d76ac528199949c7..e54cf8e29710dfeb0a9cdaa4ee1a1ee124a5e100 100644 --- a/tests/webSeed.txt +++ b/tests/webSeed.txt @@ -1,10 +1,8 @@ -https://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:cars http://www.fastcompany.com/ http://www.bbc.com/ https://www.eecs.umich.edu/ https://www.nytimes.com/ http://www.bostonglobe.com/ -https://www.huffingtonpost.com/2015/01/14/strangest-wikipedia-entries_n_6463488.html https://www.wired.com/ http://www.espn.com/ http://fivethirtyeight.com/