Skip to content
Snippets Groups Projects
Commit 6f1b122b authored by jsclose's avatar jsclose
Browse files

working on crawler buffer

parent c42194a6
No related branches found
No related tags found
No related merge requests found
...@@ -107,13 +107,18 @@ void SocketReader::httpsRequest(){ ...@@ -107,13 +107,18 @@ void SocketReader::httpsRequest(){
// Read from the SSL until there's no more data. // Read from the SSL until there's no more data.
char buffer[ 10240 ]; char * SSLBuffer = new char[ 11240 ];
int bytes; int bytes;
while ( ( bytes = SSL_read( ssl, SSLBuffer,
while ( ( bytes = SSL_read( ssl, buffer, 10240 ) ) > 0 )
sizeof( buffer ) ) ) > 0 ) {
write( 1, buffer, bytes ); write( 1, SSLBuffer, bytes );
size_t test = sizeof(SSLBuffer);
cout << test;
}
//write( 1, SSLBuffer, bytes );
buffer = SSLBuffer;
SSL_shutdown( ssl ); SSL_shutdown( ssl );
SSL_free( ssl ); SSL_free( ssl );
SSL_CTX_free( ctx ); SSL_CTX_free( ctx );
......
...@@ -47,7 +47,9 @@ void Spider::FuncToRun() ...@@ -47,7 +47,9 @@ void Spider::FuncToRun()
StreamReader *reader = request( currentUrl ); StreamReader *reader = request( currentUrl );
string pathToDisk = util::GetCurrentWorkingDir() + "/crawlerOutput/" + string(url.Host, strlen(url.Host)) + ".txt";
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk);
//parser.parse(reader); //parser.parse(reader);
cond = true; cond = true;
} }
...@@ -110,12 +112,12 @@ bool Spider::shouldURLbeCrawled( string url ) ...@@ -110,12 +112,12 @@ bool Spider::shouldURLbeCrawled( string url )
return false; return false;
} }
/*
//check if path in url is in the robots txt //check if path in url is in the robots txt
bool Spider::checkRobots(string url_in) bool Spider::checkRobots(string url_in)
{ {
ParsedUrl url = ParsedUrl(url_in); ParsedUrl url = ParsedUrl(url_in);
string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)); string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt";
int robotsFileD = util::getFileDescriptor(pathToRobots , "R"); int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
//File does not exist yet //File does not exist yet
if(robotsFileD == -1) if(robotsFileD == -1)
...@@ -123,7 +125,7 @@ bool Spider::checkRobots(string url_in) ...@@ -123,7 +125,7 @@ bool Spider::checkRobots(string url_in)
robotsFileD = getRobots(url); robotsFileD = getRobots(url);
} }
//char* robotsTXT = util::getFileMap(robotsFileD); char* robotsTXT = util::getFileMap(robotsFileD);
return 1; return 1;
} }
...@@ -134,23 +136,28 @@ int Spider::getRobots(ParsedUrl url ) ...@@ -134,23 +136,28 @@ int Spider::getRobots(ParsedUrl url )
{ {
string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)); string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host)) + ".txt";
string pathToWebRobots = "http://" + string(url.Host, strlen(url.Host)) + "/robots.txt"; string pathToWebRobots = "https://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
//string(url.Service, strlen(url.Service))+ //string(url.Service, strlen(url.Service))+
SocketReader *reader = new SocketReader(pathToWebRobots); SocketReader *reader = new SocketReader(pathToWebRobots);
reader->fillBuffer(); reader->fillBuffer();
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots); if(reader->buffer != NULL)
if( fd == -1)
{ {
cerr << "Error getting Robots.txt file " << endl; int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
if( fd == -1)
cerr << "Error getting Robots.txt file " << endl;
return fd;
} }
return fd;
return 1; cerr << "issue filling buffer from robots.txt" << endl;
return -1;
}; };
*/
/* /*
returns true if fileMap was created, otherwise false returns true if fileMap was created, otherwise false
Modifies the filemap to be a char* of the file of the url passed Modifies the filemap to be a char* of the file of the url passed
......
No preview for this file type
ki/Wikipedia%3AEditor_review
Disallow: /wiki/Wikipedia_talk:Editor_review
Disallow: /wiki/Wikipedia_talk%3AEditor_review
#
Disallow: /wiki/Wikipedia:Article_Incubator
Disallow: /wiki/Wikipedia%3AArticle_Incubator
Disallow: /wiki/Wikipedia_talk:Article_Incubator
Disallow: /wiki/Wikipedia_talk%3AArticle_Incubator
#
Disallow: /wiki/Category:Noindexed_pages
Disallow: /wiki/Category%3ANoindexed_pages
#
# </pre>lk:Arbitration_Committee_Elections
Disallow: /wiki/Wikipedia_talk%3AArbitration_Committee_Elections
#
Disallow: /wiki/Wikipedia:Mediation_Committee
Disallow: /wiki/Wikipedia%3AMediation_Committee
Disallow: /wiki/Wikipedia_talk:Mediation_Committee
Disallow: /wiki/Wikipedia_talk%3AMediation_Committee
#
Disallow: /wiki/Wikipedia:Mediation_Cabal/Cases
Disallow: /wiki/Wikipedia%3AMediation_Cabal/Cases
#
Disallow: /wiki/Wikipedia:Requests_for_bureaucratship
Disallow: /wiki/Wikipedia%3ARequests_for_bureaucratship
Disallow: /wiki/Wikipedia_talk:Requests_for_bureaucratship
Disallow: /wiki/Wikipedia_talk%3ARequests_for_bureaucratship
#
Disallow: /wiki/Wikipedia:Administrator_review
Disallow: /wiki/Wikipedia%3AAdministrator_review
Disallow: /wiki/Wikipedia_talk:Administrator_review
Disallow: /wiki/Wikipedia_talk%3AAdministrator_review
#
Disallow: /wiki/Wikipedia:Editor_review
Disallow: /wi
\ No newline at end of file
HTTP/1.1 200 OK
Server: Apache
Last-Modified: Wed, 31 Jan 2018 00:26:36 GMT
ETag: "1f6-564078830c700"
Cache-Control: max-age=3600, public
Content-Type: text/plain
Content-Length: 502
Accept-Ranges: bytes
Date: Thu, 22 Feb 2018 00:47:25 GMT
Via: 1.1 varnish
Age: 0
Connection: close
X-Fastly-Cache-Status: MISS-CLUSTER
X-Served-By: cache-mdw17331-MDW
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1519260446.852690,VS0,VE114
Vary: Accept-Encoding
# v.4.6.6
# HTTPS www.bbc.com
User-agent: *
Sitemap: https://www.bbc.com/sitemaps/https-index-com-archive.xml
Sitemap: https://www.bbc.com/sitemaps/https-index-com-news.xml
Disallow: /cbbc/search/
Disallow: /cbbc/search$
Disallow: /cbbc/search?
Disallow: /cbeebies/search/
Disallow: /cbeebies/search$
Disallow: /cbeebies/search?
Disallow: /chwilio/
Disallow: /chwilio$
Disallow: /chwilio?
Disallow: /newsround
Disallow: /search/
Disallow: /search$
Disallow: /search?
Disallow: /food$
Disallow: /food/
\ No newline at end of file
HTTP/1.1 200 OK
Content-Type: text/plain
Content-Length: 68
Date: Thu, 22 Feb 2018 00:44:57 GMT
Connection: close
Vary: User-Agent
# All Robots
User-agent: *
# Disallow All Pages
Disallow: /
^k ;r늇LژQ 硥a& T!pAgp܁IPƑ1\Wt#[ިTs&$azjy/
-OGxM
\ No newline at end of file
HTTP/1.1 301 Moved Permanently
Server: Varnish
Retry-After: 0
Content-Length: 0
Location: http://www.nytimes.com/robots.txt
Accept-Ranges: bytes
Date: Thu, 22 Feb 2018 00:42:47 GMT
X-Frame-Options: DENY
Connection: close
X-API-Version: F-0
X-PageType: legacy
Content-Security-Policy: default-src data: 'unsafe-inline' 'unsafe-eval' https:; script-src data: 'unsafe-inline' 'unsafe-eval' https: blob:; style-src data: 'unsafe-inline' https:; img-src data: https: blob:; font-src data: https:; connect-src https: wss:; media-src https: blob:; object-src https:; child-src https: data: blob:; form-action https:; block-all-mixed-content;
X-Served-By: cache-mdw17330-MDW
X-Cache: HIT
X-Cache-Hits: 0
...@@ -12,21 +12,30 @@ using namespace std; ...@@ -12,21 +12,30 @@ using namespace std;
int main(int argc, const char * argv[]) int main(int argc, const char * argv[])
{ {
ParsedUrl test1 = ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ; ParsedUrl absoluteURLTest = ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
//string protocol = test1.getProtocol(); //string protocol = test1.getProtocol();
test1.printUrl(); absoluteURLTest.printUrl();
//assert( strcmp(test1.Service, "https") == 1); //assert( strcmp(test1.Service, "https") == 1);
//assert( strcmp(test1.Host, "developer.mozilla.org") == 1); //assert( strcmp(test1.Host, "developer.mozilla.org") == 1);
ParsedUrl test2 = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument"); ParsedUrl fragmentTest = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
test2.printUrl(); //fragmentTest.printUrl();
assert( strcmp(test2.Service, "http")); //assert( strcmp(fragmentTest.Service, "http"));
assert( strcmp(test2.Host, "example.com")); //assert( strcmp(fragmentTest.Host, "example.com"));
ParsedUrl relativeURLTest = ParsedUrl("/wiki/List_of_sheep_breeds");
relativeURLTest.printUrl();
ParsedUrl pointToFragment = ParsedUrl("#topOfPage");
ParsedUrl mailToTest = ParsedUrl("mailto:someone@example.com?cc=someoneelse@example.com&bcc=andsomeoneelse@example.com\n"
"&subject=Summer%20Party&body=You%20are%20invited%20to%20a%20big%20summer%20party!\"");
mailToTest.printUrl();
std::cout << "URL TEST PASSED" << std::endl; std::cout << "URL TEST PASSED" << std::endl;
} }
\ No newline at end of file
https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards
https://www.nytimes.com/ https://www.nytimes.com/
http://www.dailymail.co.uk/ushome/index.html http://www.dailymail.co.uk/ushome/index.html
http://www.bbc.com/ http://www.bbc.com/
http://www.bbc.co.uk/news/business-42959138 http://www.bbc.co.uk/news/business-42959138
http://umich.edu http://umich.edu
\ No newline at end of file https://en.wikipedia.org/wiki/North_Ronaldsay_sheep
\ No newline at end of file
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment