From 1a4a31490b4cbd13f8553cca0a4204a3c2f52337 Mon Sep 17 00:00:00 2001 From: jsclose <jsclose@umich.edu> Date: Wed, 14 Feb 2018 15:16:30 -0500 Subject: [PATCH] trying with 3 spiders --- crawler/SocketReader.cpp | 4 ++++ crawler/spider.cpp | 2 -- docMap.txt | Bin 182 -> 336 bytes main.cpp | 2 +- tests/webSeed.txt | 1 + 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp index e669924..2ffe3e4 100644 --- a/crawler/SocketReader.cpp +++ b/crawler/SocketReader.cpp @@ -29,6 +29,8 @@ void SocketReader::fillBuffer() // Send a GET message for the desired page. + cout << "Socket Reader is pulling from the web" << endl; + string getMessage = "GET "; getMessage += url.CompleteUrl; getMessage += " HTTP/1.1\r\nHost: "; @@ -43,8 +45,10 @@ void SocketReader::fillBuffer() char buffer[ 10240 ]; int bytes; + while ( ( bytes = recv( s, buffer, sizeof( buffer ), 0 ) ) > 0 ) write( 1, buffer, bytes ); close( s ); + return; } \ No newline at end of file diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 5aa9ba2..8e59c22 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -47,9 +47,7 @@ void Spider::FuncToRun() { if ( cond ) { - // markURLSeen( currentUrl ); - //StreamReader* reader = request( currentUrl ); StreamReader *reader = request( currentUrl ); diff --git a/docMap.txt b/docMap.txt index 318dc5f5753bbbc781f081777db0e0a9677752cc..3b0c8c71437350f5607a5b3e314e572dd1066281 100644 GIT binary patch delta 15 TcmdnSc!6ocwuuF9V8jOiHjM^t delta 6 Ncmcb>w2g7XHUJ5s0?+^e diff --git a/main.cpp b/main.cpp index a158cda..4e0eac0 100644 --- a/main.cpp +++ b/main.cpp @@ -84,7 +84,7 @@ unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( Crawler crawler( mode, &urlFrontier ); -crawler.SpawnSpiders(1 , docMapLookUp); +crawler.SpawnSpiders(3 , docMapLookUp); crawler. diff --git a/tests/webSeed.txt b/tests/webSeed.txt index 8c509b3..8e3e8dd 100644 --- a/tests/webSeed.txt +++ b/tests/webSeed.txt @@ -1,2 +1,3 @@ http://www.dailymail.co.uk/ushome/index.html http://www.bbc.com/ +http://www.bbc.co.uk/news/business-42959138 \ No newline at end of file -- GitLab