Skip to content
Snippets Groups Projects
Commit 24f56373 authored by jsclose's avatar jsclose
Browse files

modifying queue rate

parent 70e42436
No related branches found
No related tags found
No related merge requests found
No preview for this file type
...@@ -32,11 +32,11 @@ void UrlFrontier::checkUrl(ParsedUrl* url) ...@@ -32,11 +32,11 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
{ {
//get the last time it was seen and find the time difference //get the last time it was seen and find the time difference
time_t lastSeen = this->domainMap->at( url->getHost( )); time_t lastSeen = this->domainMap->at( url->getHost( ));
difference = difftime( now ,lastSeen); difference = difftime( now , lastSeen);
if(difference == 0) if(difference == 0)
difference = 5; difference = 5 ;
else else
difference = 1/difference; difference = difference/10;
url->updateScore( difference ); url->updateScore( difference );
} }
...@@ -45,7 +45,7 @@ void UrlFrontier::checkUrl(ParsedUrl* url) ...@@ -45,7 +45,7 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
//add url to the duplicate url map //add url to the duplicate url map
this->duplicateUrlMap->insert( std::make_pair( url->getCompleteUrl( ), 1 )); this->duplicateUrlMap->insert( url->getCompleteUrl( ) );
return; return;
} }
} }
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "../shared/url.h" #include "../shared/url.h"
#include <time.h> #include <time.h>
#include <unordered_map> #include <unordered_map>
#include <set>
using namespace std; using namespace std;
...@@ -40,7 +41,7 @@ class UrlFrontier ...@@ -40,7 +41,7 @@ class UrlFrontier
private: private:
unordered_map< string , bool > *duplicateUrlMap = new unordered_map< string, bool >( ); set< string > *duplicateUrlMap = new set< string>( );
unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( ); unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( );
}; };
......
2018-03-29.11:05:59
\ No newline at end of file
...@@ -63,9 +63,9 @@ int main ( int argc, char *argv[] ) ...@@ -63,9 +63,9 @@ int main ( int argc, char *argv[] )
string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1"; string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1";
string bad_url2 ="http-equiv=Content-Type"; string bad_url2 ="http-equiv=Content-Type";
string bad_url3 = "\"http-equiv=\\\"refresh\\\" content=\\\"1;url=/2.73.0/static/unsupp.html\\\" /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>\"; string bad_url3 = "http-equiv=refresh content=1;url=/2.73.0/static/unsupp.html /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>";
//ParsedUrl url = ParsedUrl(bad_url); //ParsedUrl url = ParsedUrl(bad_url);
ParsedUrl url1 = ParsedUrl(bad_url); ParsedUrl url1 = ParsedUrl(bad_url3);
ParsedUrl url2 = ParsedUrl(bad_url2); ParsedUrl url2 = ParsedUrl(bad_url2);
urlFrontier->Push(&url1); urlFrontier->Push(&url1);
......
File added
...@@ -176,7 +176,7 @@ int main ( int argc, char *argv[] ) ...@@ -176,7 +176,7 @@ int main ( int argc, char *argv[] )
crawler->SpawnSpiders( numberOfSpiders ); crawler->SpawnSpiders( numberOfSpiders );
HouseKeeper logger( crawler ); HouseKeeper logger( crawler );
logger.StartThread( ); //logger.StartThread( );
string input; string input;
while(true) while(true)
......
...@@ -90,7 +90,7 @@ public: ...@@ -90,7 +90,7 @@ public:
//char * domainBuffer = new char[ 20 ]; //char * domainBuffer = new char[ 20 ];
//get the domain: //get the domain:
char *i = temp_Host; char *i = temp_Host;
//temp_Domain = null; temp_Domain = nullptr;
if(i) if(i)
{ {
for ( ; *i; i++ ) for ( ; *i; i++ )
...@@ -116,7 +116,7 @@ public: ...@@ -116,7 +116,7 @@ public:
CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl));
Service = string(temp_Service, strlen(temp_Service)); Service = string(temp_Service, strlen(temp_Service));
Host = string(temp_Host, strlen(temp_Host)); Host = string(temp_Host, strlen(temp_Host));
if( sizeof( temp_Domain) > 0 ) if( temp_Domain != nullptr )
Domain = string(temp_Domain, strlen(temp_Domain)); Domain = string(temp_Domain, strlen(temp_Domain));
Path = string(temp_Path, strlen(temp_Path)); Path = string(temp_Path, strlen(temp_Path));
...@@ -211,7 +211,7 @@ public: ...@@ -211,7 +211,7 @@ public:
void updateScore( double time ) void updateScore( double time )
{ {
Score += time; Score += 3 * time;
} }
std::string getAnchorText ( ) std::string getAnchorText ( )
......
https://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:cars
http://www.fastcompany.com/ http://www.fastcompany.com/
http://www.bbc.com/ http://www.bbc.com/
https://www.eecs.umich.edu/ https://www.eecs.umich.edu/
https://www.nytimes.com/ https://www.nytimes.com/
http://www.bostonglobe.com/ http://www.bostonglobe.com/
https://www.huffingtonpost.com/2015/01/14/strangest-wikipedia-entries_n_6463488.html
https://www.wired.com/ https://www.wired.com/
http://www.espn.com/ http://www.espn.com/
http://fivethirtyeight.com/ http://fivethirtyeight.com/
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment