working on crawler buffer

6f1b122b · jsclose · c42194a6 · 6f1b122b · 6f1b122b · 6f1b122b
Commit 6f1b122b authored 7 years ago by jsclose
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -107,13 +107,18 @@ void SocketReader::httpsRequest(){
 	// Read from the SSL until there's no more data.
-	char buffer[ 10240 ];
+	char * SSLBuffer = new char[ 11240 ];
 	int bytes;
+	while ( ( bytes = SSL_read( ssl, SSLBuffer,
-	while ( ( bytes = SSL_read( ssl, buffer,
+										 10240  ) ) > 0 )
-										 sizeof( buffer ) ) ) > 0 )
+		{
-		write( 1, buffer, bytes );
+		write( 1, SSLBuffer, bytes );
+		size_t test = sizeof(SSLBuffer);
+		cout << test;
+		}
+		//write( 1, SSLBuffer, bytes );
+	buffer = SSLBuffer;
 	SSL_shutdown( ssl );
 	SSL_free( ssl );
 	SSL_CTX_free( ctx );

--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -47,7 +47,9 @@ void Spider::FuncToRun()
 				StreamReader *reader = request( currentUrl );
+				string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" +  string(url.Host, strlen(url.Host)) + ".txt";
+				int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
 				//parser.parse(reader);
 				cond = true;
 				}
@@ -110,12 +112,12 @@ bool Spider::shouldURLbeCrawled( string url )
 	return false;
 	}
-/*
 //check if path in url is in the robots txt
 bool Spider::checkRobots(string url_in)
 	{
 	ParsedUrl url = ParsedUrl(url_in);
-	string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host));
+	string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host)) + ".txt";
 	int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
 	//File does not exist yet
 	if(robotsFileD == -1)
@@ -123,7 +125,7 @@ bool Spider::checkRobots(string url_in)
 		robotsFileD = getRobots(url);
 		}
-	//char* robotsTXT = util::getFileMap(robotsFileD);
+	char* robotsTXT = util::getFileMap(robotsFileD);
 	return 1;
 	}
@@ -134,23 +136,28 @@ int Spider::getRobots(ParsedUrl url )
 	{
-	string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host));
+	string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host)) + ".txt";
-	string pathToWebRobots =  "http://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
+	string pathToWebRobots =  "https://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
 	//string(url.Service, strlen(url.Service))+
 	SocketReader *reader = new SocketReader(pathToWebRobots);
 	reader->fillBuffer();
-	int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
+	if(reader->buffer != NULL)
-	if( fd == -1)
 		{
-		cerr << "Error getting Robots.txt file " << endl;
+		int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
+		if( fd == -1)
+			cerr << "Error getting Robots.txt file " << endl;
+		return fd;
 		}
-	return fd;
-	return 1;
+	cerr << "issue filling buffer from robots.txt" << endl;
+	return -1;
 	};
-*/
 /*
 returns true if fileMap was created, otherwise false
 Modifies the filemap to be a char* of the file of the url passed

--- a/docMap.txt
+++ b/docMap.txt
--- a/robots/en.wikipedia.org.txt
+++ b/robots/en.wikipedia.org.txt
+ki/Wikipedia%3AEditor_review
+Disallow: /wiki/Wikipedia_talk:Editor_review
+Disallow: /wiki/Wikipedia_talk%3AEditor_review
+#
+Disallow: /wiki/Wikipedia:Article_Incubator
+Disallow: /wiki/Wikipedia%3AArticle_Incubator
+Disallow: /wiki/Wikipedia_talk:Article_Incubator
+Disallow: /wiki/Wikipedia_talk%3AArticle_Incubator
+#
+Disallow: /wiki/Category:Noindexed_pages
+Disallow: /wiki/Category%3ANoindexed_pages
+#
+# </pre>lk:Arbitration_Committee_Elections
+Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee_Elections
+#
+Disallow: /wiki/Wikipedia:Mediation_Committee
+Disallow: /wiki/Wikipedia%3AMediation_Committee
+Disallow: /wiki/Wikipedia_talk:Mediation_Committee
+Disallow: /wiki/Wikipedia_talk%3AMediation_Committee
+#
+Disallow: /wiki/Wikipedia:Mediation_Cabal/Cases
+Disallow: /wiki/Wikipedia%3AMediation_Cabal/Cases
+#
+Disallow: /wiki/Wikipedia:Requests_for_bureaucratship
+Disallow: /wiki/Wikipedia%3ARequests_for_bureaucratship
+Disallow: /wiki/Wikipedia_talk:Requests_for_bureaucratship
+Disallow: /wiki/Wikipedia_talk%3ARequests_for_bureaucratship
+#
+Disallow: /wiki/Wikipedia:Administrator_review
+Disallow: /wiki/Wikipedia%3AAdministrator_review
+Disallow: /wiki/Wikipedia_talk:Administrator_review
+Disallow: /wiki/Wikipedia_talk%3AAdministrator_review
+#
+Disallow: /wiki/Wikipedia:Editor_review
+Disallow: /wi
\ No newline at end of file
--- a/robots/www.bbc.com.txt
+++ b/robots/www.bbc.com.txt
+HTTP/1.1 200 OK
+Server: Apache
+Last-Modified: Wed, 31 Jan 2018 00:26:36 GMT
+ETag: "1f6-564078830c700"
+Cache-Control: max-age=3600, public
+Content-Type: text/plain
+Content-Length: 502
+Accept-Ranges: bytes
+Date: Thu, 22 Feb 2018 00:47:25 GMT
+Via: 1.1 varnish
+Age: 0
+Connection: close
+X-Fastly-Cache-Status: MISS-CLUSTER
+X-Served-By: cache-mdw17331-MDW
+X-Cache: MISS
+X-Cache-Hits: 0
+X-Timer: S1519260446.852690,VS0,VE114
+Vary: Accept-Encoding
+# v.4.6.6
+# HTTPS  www.bbc.com
+User-agent: *
+Sitemap: https://www.bbc.com/sitemaps/https-index-com-archive.xml
+Sitemap: https://www.bbc.com/sitemaps/https-index-com-news.xml
+Disallow: /cbbc/search/
+Disallow: /cbbc/search$
+Disallow: /cbbc/search?
+Disallow: /cbeebies/search/
+Disallow: /cbeebies/search$
+Disallow: /cbeebies/search?
+Disallow: /chwilio/
+Disallow: /chwilio$
+Disallow: /chwilio?
+Disallow: /newsround
+Disallow: /search/
+Disallow: /search$
+Disallow: /search?
+Disallow: /food$
+Disallow: /food/
\ No newline at end of file
--- a/robots/www.dailymail.co.uk.txt
+++ b/robots/www.dailymail.co.uk.txt
+HTTP/1.1 200 OK
+Content-Type: text/plain
+Content-Length: 68
+Date: Thu, 22 Feb 2018 00:44:57 GMT
+Connection: close
+Vary: User-Agent
+# All Robots
+User-agent: *
+# Disallow All Pages
+Disallow: /
+^k	;r늇LژQ
硥a&	T!pAgp܁IPƑ1\Wt#[ިTs&$azjy/
+-OGxM
\ No newline at end of file
--- a/robots/www.nytimes.com.txt
+++ b/robots/www.nytimes.com.txt
+HTTP/1.1 301 Moved Permanently
+Server: Varnish
+Retry-After: 0
+Content-Length: 0
+Location: http://www.nytimes.com/robots.txt
+Accept-Ranges: bytes
+Date: Thu, 22 Feb 2018 00:42:47 GMT
+X-Frame-Options: DENY
+Connection: close
+X-API-Version: F-0
+X-PageType: legacy
+Content-Security-Policy: default-src data: 'unsafe-inline' 'unsafe-eval' https:; script-src data: 'unsafe-inline' 'unsafe-eval' https: blob:; style-src data: 'unsafe-inline' https:; img-src data: https: blob:; font-src data: https:; connect-src https: wss:; media-src https: blob:; object-src https:; child-src https: data: blob:; form-action https:; block-all-mixed-content;
+X-Served-By: cache-mdw17330-MDW
+X-Cache: HIT
+X-Cache-Hits: 0
--- a/shared/urlTest.cpp
+++ b/shared/urlTest.cpp
@@ -12,21 +12,30 @@ using namespace std;
 int main(int argc, const char * argv[])
 	{
-	ParsedUrl test1 =  ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
+	ParsedUrl absoluteURLTest =  ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
 	//string protocol = test1.getProtocol();
-	test1.printUrl();
+	absoluteURLTest.printUrl();
 	//assert( strcmp(test1.Service, "https") == 1);
 	//assert( strcmp(test1.Host, "developer.mozilla.org") == 1);
-	ParsedUrl test2 = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
+	ParsedUrl fragmentTest = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
-	test2.printUrl();
+	//fragmentTest.printUrl();
-	assert( strcmp(test2.Service, "http"));
+	//assert( strcmp(fragmentTest.Service, "http"));
-	assert( strcmp(test2.Host, "example.com"));
+	//assert( strcmp(fragmentTest.Host, "example.com"));
+	ParsedUrl relativeURLTest = ParsedUrl("/wiki/List_of_sheep_breeds");
+	relativeURLTest.printUrl();
+	ParsedUrl pointToFragment = ParsedUrl("#topOfPage");
+	ParsedUrl mailToTest = ParsedUrl("mailto:someone@example.com?cc=someoneelse@example.com&bcc=andsomeoneelse@example.com\n"
+														"&subject=Summer%20Party&body=You%20are%20invited%20to%20a%20big%20summer%20party!\"");
+	mailToTest.printUrl();
 	std::cout << "URL TEST PASSED" << std::endl;
 	}
\ No newline at end of file
--- a/tests/webSeed.txt
+++ b/tests/webSeed.txt
+https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards
 https://www.nytimes.com/
 http://www.dailymail.co.uk/ushome/index.html
 http://www.bbc.com/
 http://www.bbc.co.uk/news/business-42959138
 http://umich.edu
\ No newline at end of file
+https://en.wikipedia.org/wiki/North_Ronaldsay_sheep
\ No newline at end of file
--- a/url_test
+++ b/url_test