diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp
index 655fc2b9f3031ee911e8f5405d4790efc274a8c9..ee6e7b717dc03c84c664aebeccc36a1692041fab 100644
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -107,13 +107,13 @@ void SocketReader::httpsRequest(){
    // Read from the SSL until there's no more data.
-   char buffer[ 10240 ];
+   char * SSLBuffer = new char[ 10240 ];
    int bytes;
-
-   while ( ( bytes = SSL_read( ssl, buffer,
-                               sizeof( buffer ) ) ) > 0 )
-      write( 1, buffer, bytes );
-
+   while ( ( bytes = SSL_read( ssl, SSLBuffer,
+                               10240 ) ) > 0 )
+      write( 1, SSLBuffer, bytes );
+
+   buffer = SSLBuffer;
    SSL_shutdown( ssl );
    SSL_free( ssl );
    SSL_CTX_free( ctx );
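
Note on the SocketReader::httpsRequest hunk above: each call to SSL_read overwrites SSLBuffer, so buffer = SSLBuffer hands the caller only the final chunk of the response. A minimal sketch of an accumulating loop, assuming an initialized SSL *ssl; the readAll name and the std::string return type are illustrative choices, not this repo's API:

#include <openssl/ssl.h>
#include <string>

// Sketch: collect every chunk SSL_read produces so the full response
// survives for the parser, instead of only the last chunk read.
std::string readAll( SSL *ssl )
   {
   std::string response;
   char chunk[ 10240 ];   // a real array, so sizeof( chunk ) is 10240
   int bytes;
   while ( ( bytes = SSL_read( ssl, chunk, sizeof( chunk ) ) ) > 0 )
      response.append( chunk, bytes );
   return response;
   }

The accumulated result could then be copied once into the existing char * buffer member after the loop finishes.
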
diff --git a/crawler/spider.cpp b/crawler/spider.cpp
index ebd63278e9a4b86815eeaf1b78c2aca763f43935..0569993135557ff1b93411cb988fcc35ba34e19c 100644
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -47,7 +47,9 @@ void Spider::FuncToRun()
 	StreamReader *reader = request( currentUrl );
-
+	string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + string(url.Host, strlen(url.Host)) + ".txt";
+	int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
+	//parser.parse(reader);
 	cond = true;
 }
@@ -110,12 +112,12 @@ bool Spider::shouldURLbeCrawled( string url )
 	return false;
 }
 
-/*
+
 //check if path in url is in the robots txt
 bool Spider::checkRobots(string url_in)
 {
 	ParsedUrl url = ParsedUrl(url_in);
-	string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host));
+	string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt";
 	int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
 	//File does not exist yet
 	if(robotsFileD == -1)
@@ -123,7 +125,7 @@ bool Spider::checkRobots(string url_in)
 		robotsFileD = getRobots(url);
 	}
 
-	//char* robotsTXT = util::getFileMap(robotsFileD);
+	char* robotsTXT = util::getFileMap(robotsFileD);
 	return 1;
 }
@@ -134,23 +136,28 @@ int Spider::getRobots(ParsedUrl url )
 {
-	string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host));
-	string pathToWebRobots = "http://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
+	string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt";
+	string pathToWebRobots = "https://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
 	//string(url.Service, strlen(url.Service))+
 	SocketReader *reader = new SocketReader(pathToWebRobots);
 	reader->fillBuffer();
-	int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
-	if( fd == -1)
+	if(reader->buffer != NULL)
 	{
-		cerr << "Error getting Robots.txt file " << endl;
+		int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
+		if( fd == -1)
+			cerr << "Error getting Robots.txt file " << endl;
+
+		return fd;
 	}
-	return fd;
-	return 1;
+	cerr << "issue filling buffer from robots.txt" << endl;
+	return -1;
+
+
 };
-*/
+
 /*
 returns true if fileMap was created, otherwise false
 Modifies the filemap to be a char* of the file of the url passed
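
The checkRobots change above maps the cached robots file into memory but still returns 1 unconditionally, so no rule is enforced yet. A rough sketch of the missing comparison, assuming the mapped text is available as a std::string; pathAllowed is a hypothetical helper, and it deliberately ignores User-agent sections, Allow lines, and wildcards:

#include <sstream>
#include <string>

// Sketch: false when any Disallow rule is a prefix of the request path.
// robotsTXT is assumed to hold the raw text of the host's robots file.
bool pathAllowed( const std::string &robotsTXT, const std::string &path )
   {
   static const std::string key = "Disallow: ";
   std::istringstream lines( robotsTXT );
   std::string line;
   while ( std::getline( lines, line ) )
      {
      if ( line.compare( 0, key.size( ), key ) != 0 )
         continue;
      std::string rule = line.substr( key.size( ) );
      if ( !rule.empty( ) && rule.back( ) == '\r' )   // tolerate CRLF files
         rule.pop_back( );
      if ( !rule.empty( ) && path.compare( 0, rule.size( ), rule ) == 0 )
         return false;   // path begins with a disallowed prefix
      }
   return true;
   }

Prefix matching covers entries such as Disallow: /search/, which is how most rules in the files committed below are written.
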
diff --git a/docMap.txt b/docMap.txt
index ece49798ba7e2549c46d70a86c453ae28e77b6f4..4806d35acdf8ff1ace8ffd6197e89ca69b84fe24 100644
Binary files a/docMap.txt and b/docMap.txt differ
diff --git a/robots/en.wikipedia.org.txt b/robots/en.wikipedia.org.txt
new file mode 100755
index 0000000000000000000000000000000000000000..293fb2b9d9074e75218f03d47cafbde9e2a18942
--- /dev/null
+++ b/robots/en.wikipedia.org.txt
@@ -0,0 +1,35 @@
+ki/Wikipedia%3AEditor_review
+Disallow: /wiki/Wikipedia_talk:Editor_review
+Disallow: /wiki/Wikipedia_talk%3AEditor_review
+#
+Disallow: /wiki/Wikipedia:Article_Incubator
+Disallow: /wiki/Wikipedia%3AArticle_Incubator
+Disallow: /wiki/Wikipedia_talk:Article_Incubator
+Disallow: /wiki/Wikipedia_talk%3AArticle_Incubator
+#
+Disallow: /wiki/Category:Noindexed_pages
+Disallow: /wiki/Category%3ANoindexed_pages
+#
+# </pre>lk:Arbitration_Committee_Elections
+Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee_Elections
+#
+Disallow: /wiki/Wikipedia:Mediation_Committee
+Disallow: /wiki/Wikipedia%3AMediation_Committee
+Disallow: /wiki/Wikipedia_talk:Mediation_Committee
+Disallow: /wiki/Wikipedia_talk%3AMediation_Committee
+#
+Disallow: /wiki/Wikipedia:Mediation_Cabal/Cases
+Disallow: /wiki/Wikipedia%3AMediation_Cabal/Cases
+#
+Disallow: /wiki/Wikipedia:Requests_for_bureaucratship
+Disallow: /wiki/Wikipedia%3ARequests_for_bureaucratship
+Disallow: /wiki/Wikipedia_talk:Requests_for_bureaucratship
+Disallow: /wiki/Wikipedia_talk%3ARequests_for_bureaucratship
+#
+Disallow: /wiki/Wikipedia:Administrator_review
+Disallow: /wiki/Wikipedia%3AAdministrator_review
+Disallow: /wiki/Wikipedia_talk:Administrator_review
+Disallow: /wiki/Wikipedia_talk%3AAdministrator_review
+#
+Disallow: /wiki/Wikipedia:Editor_review
+Disallow: /wi
\ No newline at end of file
diff --git a/robots/www.bbc.com.txt b/robots/www.bbc.com.txt
new file mode 100755
index 0000000000000000000000000000000000000000..c79b6d0c6f60aed220616745bc6243303983149c
--- /dev/null
+++ b/robots/www.bbc.com.txt
@@ -0,0 +1,40 @@
+HTTP/1.1 200 OK
+Server: Apache
+Last-Modified: Wed, 31 Jan 2018 00:26:36 GMT
+ETag: "1f6-564078830c700"
+Cache-Control: max-age=3600, public
+Content-Type: text/plain
+Content-Length: 502
+Accept-Ranges: bytes
+Date: Thu, 22 Feb 2018 00:47:25 GMT
+Via: 1.1 varnish
+Age: 0
+Connection: close
+X-Fastly-Cache-Status: MISS-CLUSTER
+X-Served-By: cache-mdw17331-MDW
+X-Cache: MISS
+X-Cache-Hits: 0
+X-Timer: S1519260446.852690,VS0,VE114
+Vary: Accept-Encoding
+
+# v.4.6.6
+# HTTPS www.bbc.com
+User-agent: *
+Sitemap: https://www.bbc.com/sitemaps/https-index-com-archive.xml
+Sitemap: https://www.bbc.com/sitemaps/https-index-com-news.xml
+
+Disallow: /cbbc/search/
+Disallow: /cbbc/search$
+Disallow: /cbbc/search?
+Disallow: /cbeebies/search/
+Disallow: /cbeebies/search$
+Disallow: /cbeebies/search?
+Disallow: /chwilio/
+Disallow: /chwilio$
+Disallow: /chwilio?
+Disallow: /newsround
+Disallow: /search/
+Disallow: /search$
+Disallow: /search?
+Disallow: /food$
+Disallow: /food/
\ No newline at end of file
diff --git a/robots/www.dailymail.co.uk.txt b/robots/www.dailymail.co.uk.txt
new file mode 100755
index 0000000000000000000000000000000000000000..8d32301f87132423df6a27106d88d942420dca1e
--- /dev/null
+++ b/robots/www.dailymail.co.uk.txt
@@ -0,0 +1,15 @@
+HTTP/1.1 200 OK
+Content-Type: text/plain
+Content-Length: 68
+Date: Thu, 22 Feb 2018 00:44:57 GMT
+Connection: close
+Vary: User-Agent
+
+# All Robots
+User-agent: *
+
+# Disallow All Pages
+Disallow: /
+
+ÝŽÿ®^øk ïž;r늇LêÚ˜Q û°—Òç¡¥–a&ª §Téª!ÕÒp´ÞÔAgpܬIšöÜPÐÖÆ‘1¦´\øøW¬tÔ#ÅØ[ÕÞ¨T›s&$az·jyÙ/·
+-…OG†¨ÞŠx¹Á»M·á
\ No newline at end of file
diff --git a/robots/www.nytimes.com.txt b/robots/www.nytimes.com.txt
new file mode 100755
index 0000000000000000000000000000000000000000..a8298dc3a3dbea1dde41a1773b3fc7b7fb1c61c9
--- /dev/null
+++ b/robots/www.nytimes.com.txt
@@ -0,0 +1,16 @@
+HTTP/1.1 301 Moved Permanently
+Server: Varnish
+Retry-After: 0
+Content-Length: 0
+Location: http://www.nytimes.com/robots.txt
+Accept-Ranges: bytes
+Date: Thu, 22 Feb 2018 00:42:47 GMT
+X-Frame-Options: DENY
+Connection: close
+X-API-Version: F-0
+X-PageType: legacy
+Content-Security-Policy: default-src data: 'unsafe-inline' 'unsafe-eval' https:; script-src data: 'unsafe-inline' 'unsafe-eval' https: blob:; style-src data: 'unsafe-inline' https:; img-src data: https: blob:; font-src data: https:; connect-src https: wss:; media-src https: blob:; object-src https:; child-src https: data: blob:; form-action https:; block-all-mixed-content;
+X-Served-By: cache-mdw17330-MDW
+X-Cache: HIT
+X-Cache-Hits: 0
+
diff --git a/shared/urlTest.cpp b/shared/urlTest.cpp
index a21e7f8ac6bfd10877eaa8ff46b02a70e84c6c79..7fda4f8ed3deb028e7ecbac0632160956b44b683 100644
--- a/shared/urlTest.cpp
+++ b/shared/urlTest.cpp
@@ -12,21 +12,30 @@ using namespace std;
 int main(int argc, const char * argv[])
 {
-	ParsedUrl test1 = ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
+	ParsedUrl absoluteURLTest = ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
 	//string protocol = test1.getProtocol();
-	test1.printUrl();
+	absoluteURLTest.printUrl();
 	//assert( strcmp(test1.Service, "https") == 1);
 	//assert( strcmp(test1.Host, "developer.mozilla.org") == 1);
 
-	ParsedUrl test2 = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
-	test2.printUrl();
-	assert( strcmp(test2.Service, "http"));
-	assert( strcmp(test2.Host, "example.com"));
+	ParsedUrl fragmentTest = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
+	//fragmentTest.printUrl();
+	//assert( strcmp(fragmentTest.Service, "http"));
+	//assert( strcmp(fragmentTest.Host, "example.com"));
 
+	ParsedUrl relativeURLTest = ParsedUrl("/wiki/List_of_sheep_breeds");
+	relativeURLTest.printUrl();
+
+
+	ParsedUrl pointToFragment = ParsedUrl("#topOfPage");
+
+	ParsedUrl mailToTest = ParsedUrl("mailto:someone@example.com?cc=someoneelse@example.com&bcc=andsomeoneelse@example.com\n"
+			"&subject=Summer%20Party&body=You%20are%20invited%20to%20a%20big%20summer%20party!\"");
+	mailToTest.printUrl();
 
 	std::cout << "URL TEST PASSED" << std::endl;
 }
\ No newline at end of file
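
The new urlTest cases above feed ParsedUrl inputs that are not absolute http(s) URLs: a relative path, a bare fragment, and a mailto link. Before parsing, a crawler usually classifies the href so relative paths can be resolved against the current page and non-crawlable schemes skipped; a sketch under that assumption, with classifyLink and LinkKind as hypothetical names not taken from this repo:

#include <string>

enum class LinkKind { Absolute, Relative, Fragment, Mailto, Other };

// Sketch: decide what a raw href is before handing it to ParsedUrl.
LinkKind classifyLink( const std::string &href )
   {
   if ( href.rfind( "http://", 0 ) == 0 || href.rfind( "https://", 0 ) == 0 )
      return LinkKind::Absolute;
   if ( href.rfind( "mailto:", 0 ) == 0 )
      return LinkKind::Mailto;      // not crawlable, skip
   if ( !href.empty( ) && href[ 0 ] == '#' )
      return LinkKind::Fragment;    // same document, skip
   if ( !href.empty( ) && href[ 0 ] == '/' )
      return LinkKind::Relative;    // resolve against the current host
   return LinkKind::Other;
   }

Only Absolute links, plus Relative ones rejoined with the current host, would then be queued for shouldURLbeCrawled.
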
diff --git a/tests/webSeed.txt b/tests/webSeed.txt
index b4d953a36ace68d4c8033a5776dcd94eacc5de21..280e3f0153bf701827906e1ad8cd8ac16623fa3e 100644
--- a/tests/webSeed.txt
+++ b/tests/webSeed.txt
@@ -1,5 +1,7 @@
+https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards
 https://www.nytimes.com/
 http://www.dailymail.co.uk/ushome/index.html
 http://www.bbc.com/
 http://www.bbc.co.uk/news/business-42959138
-http://umich.edu
\ No newline at end of file
+http://umich.edu
+https://en.wikipedia.org/wiki/North_Ronaldsay_sheep
\ No newline at end of file
diff --git a/url_test b/url_test
index 1f663afde3cd05c74437f205d7fae5b2594a687e..f7ce01bbec97816e60e0c1312d56eb2ef36459b2 100755
Binary files a/url_test and b/url_test differ
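
One artifact visible in the robots/*.txt files committed above: the crawler writes the entire HTTP response to disk, headers included, and www.nytimes.com.txt is a 301 with Content-Length: 0 and no body at all. A small sketch of splitting the body off before saving, assuming the raw response is held in a std::string; stripHeaders is a hypothetical helper, not part of this repo's util:

#include <string>

// Sketch: drop the status line and headers so only the robots.txt body
// reaches disk. An empty result means there was no body, e.g. the 301
// captured in www.nytimes.com.txt, which should be refetched from its
// Location header instead of being cached as a rules file.
std::string stripHeaders( const std::string &response )
   {
   size_t end = response.find( "\r\n\r\n" );
   if ( end == std::string::npos )
      return "";
   return response.substr( end + 4 );
   }
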