diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp index e669924c76794367fb991c092f4556eb66ed2889..2ffe3e421d14df170aa85f74b091e7d03a4773f4 100644 --- a/crawler/SocketReader.cpp +++ b/crawler/SocketReader.cpp @@ -29,6 +29,8 @@ void SocketReader::fillBuffer() // Send a GET message for the desired page. + cout << "Socket Reader is pulling from the web" << endl; + string getMessage = "GET "; getMessage += url.CompleteUrl; getMessage += " HTTP/1.1\r\nHost: "; @@ -43,8 +45,10 @@ void SocketReader::fillBuffer() char buffer[ 10240 ]; int bytes; + while ( ( bytes = recv( s, buffer, sizeof( buffer ), 0 ) ) > 0 ) write( 1, buffer, bytes ); close( s ); + return; } \ No newline at end of file diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 5aa9ba21e0dd28fc2d26a1a66a845bf2dd0d958e..8e59c22e68e1a8cf67759da951ac7544dd640f03 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -47,9 +47,7 @@ void Spider::FuncToRun() { if ( cond ) { - // markURLSeen( currentUrl ); - //StreamReader* reader = request( currentUrl ); StreamReader *reader = request( currentUrl ); diff --git a/docMap.txt b/docMap.txt index 318dc5f5753bbbc781f081777db0e0a9677752cc..3b0c8c71437350f5607a5b3e314e572dd1066281 100644 Binary files a/docMap.txt and b/docMap.txt differ diff --git a/main.cpp b/main.cpp index a158cda61fad0d9cc8923a498c76ec34eebd33ff..4e0eac013232e6b0e16c00f58800f0ebdcf085c0 100644 --- a/main.cpp +++ b/main.cpp @@ -84,7 +84,7 @@ unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( Crawler crawler( mode, &urlFrontier ); -crawler.SpawnSpiders(1 , docMapLookUp); +crawler.SpawnSpiders(3 , docMapLookUp); crawler. diff --git a/tests/webSeed.txt b/tests/webSeed.txt index 8c509b32f6215baadb7e2a3f9b3d329b6a0ce7f6..8e3e8ddeca543f723708ac9052731c26e6c8af01 100644 --- a/tests/webSeed.txt +++ b/tests/webSeed.txt @@ -1,2 +1,3 @@ http://www.dailymail.co.uk/ushome/index.html http://www.bbc.com/ +http://www.bbc.co.uk/news/business-42959138 \ No newline at end of file