From 8f7de9b7b798515f9c4a58437dddfea98be3df79 Mon Sep 17 00:00:00 2001
From: jsclose <jsclose@umich.edu>
Date: Sun, 18 Feb 2018 17:58:01 -0500
Subject: [PATCH] working on ssl

---
 .gitignore               |   1 +
 crawler/SocketReader.cpp |  70 ++++++++++++++++++++++++++++++++++++++-
 crawler/SocketReader.h   |   2 ++
 crawler/StreamReader.h   |   1 +
 crawler/spider.cpp       |   4 ++-
 docMap.txt               | Bin 364 -> 476 bytes
 main.cpp                 |   3 +-
 shared/url.h             |  41 +++++++++++++++++++++++
 8 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 755395c..ca34380 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea/*
+openssl-1.1.0g/*
 .vagrant/*
 CMakeLists.txt
 cmake-build-debug/*
diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp
index 2ffe3e4..6519fc1 100644
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -4,7 +4,7 @@
 
 #include "SocketReader.h"
 
-
+/*
 void SocketReader::fillBuffer()
 	{
 	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
@@ -51,4 +51,72 @@ void SocketReader::fillBuffer()
 	close( s );
 	return;
 	}
+*/
+
+void SocketReader::fillBuffer( )
+	{
+	struct hostent *host = gethostbyname( url.Host );
+	assert( host );
+
+	struct sockaddr_in address;
+	memset( &address, 0, sizeof( address ) );
+	address.sin_family = AF_INET;
+	address.sin_port = htons( 443 );
+	memcpy( &address.sin_addr, host->h_addr, host->h_length );
+
+	// Create a TCP/IP socket.
+
+	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
+	assert( s != -1 );
+
+	// Connect the socket to the host address.
+
+	int connectResult = connect( s, ( struct sockaddr * )&address,
+			sizeof( address ) );
+	assert( connectResult == 0 );
+
+	// Build an SSL layer and set it to read/write
+	// to the socket we've connected.
+
+	SSL_library_init( );
+
+	SSL_CTX *ctx = SSL_CTX_new( SSLv23_method( ) );
+	assert( ctx );
+	SSL *ssl = SSL_new( ctx );
+	assert( ssl );
+
+	SSL_set_fd( ssl, s );
+
+	// Establish an SSL connection.
+
+	int sslConnectResult = SSL_connect( ssl );
+	assert( sslConnectResult == 1 );
+
+	// Send a GET message for the desired page through the SSL.
+
+	string getMessage = "GET ";
+	getMessage += url.CompleteUrl;
+	getMessage += " HTTP/1.1\r\nHost: ";
+	getMessage += url.Host;
+	getMessage += "\r\nConnection: close\r\n\r\n";
+
+	cout << getMessage << endl;
+	SSL_write( ssl, getMessage.c_str( ), getMessage.length( ) );
+
+	// Read from the SSL until there's no more data.
+
+	char buffer[ 10240 ];
+	int bytes;
+
+	while ( ( bytes = SSL_read( ssl, buffer,
+			sizeof( buffer ) ) ) > 0 )
+		write( 1, buffer, bytes );
+
+	SSL_shutdown( ssl );
+	SSL_free( ssl );
+	SSL_CTX_free( ctx );
+	close( s );
+	}
\ No newline at end of file
diff --git a/crawler/SocketReader.h b/crawler/SocketReader.h
index 68734df..0c7ca21 100644
--- a/crawler/SocketReader.h
+++ b/crawler/SocketReader.h
@@ -14,6 +14,8 @@ public:
 	SocketReader( string url_in ) : url( ParsedUrl( url_in ) ) { }
 
 	virtual void fillBuffer();
+	//virtual void fillBuffer(char ssl);
+
 private:
 	ParsedUrl url;
 };
diff --git a/crawler/StreamReader.h b/crawler/StreamReader.h
index 5e35b08..5f60e8c 100644
--- a/crawler/StreamReader.h
+++ b/crawler/StreamReader.h
@@ -14,6 +14,7 @@
 #include <string.h>
 #include <string>
 #include <cassert>
+#include <openssl/ssl.h>
 
 
 
diff --git a/crawler/spider.cpp b/crawler/spider.cpp
index e7466d2..089407e 100644
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -34,9 +34,11 @@ void Spider::FuncToRun()
 
 	while ( cond )
 		{
-		string currentUrl = getUrl( );
+
+		string currentUrl = getUrl( );  // get the next url from the url frontier
 		char *fileMap;
 
+		// url has not been seen before, or the time since it was last crawled exceeds the recrawl criteria
 		if ( shouldURLbeCrawled( currentUrl ))
 			{
 			bool success = writeDocToDisk(currentUrl);
diff --git a/docMap.txt b/docMap.txt
index a0fd9ed502cd83e2bd416bcb013d3adf973b0d16..840e157a464ac07a84efd03a820fc0dc8c9aaac7 100644
GIT binary patch
delta 10
RcmaFEbccCE4&%gv0st9|1Y7_B

delta 7
Ocmcb^{Dx^m4kG{!ssif(

diff --git a/main.cpp b/main.cpp
index d9e3471..4dab388 100644
--- a/main.cpp
+++ b/main.cpp
@@ -44,12 +44,11 @@ int main( int argc, const char *argv[] )
 	 */
 
 	//
-	string mode = "local";
+	string mode = "web";
 
 	// Seed urls?
 	string seed;
 	//
 	int numberOfSpiders;
-	int numberOfParsers;
 	bool restoreFromLog;
diff --git a/shared/url.h b/shared/url.h
index 5315673..875520e 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -6,6 +6,9 @@
 #include <string>
 #include <iostream>
+#include "../util/util.h"
+//#include "../crawler/StreamReader.h"
+//#include "../crawler/SocketReader.h"
 
 using namespace std;
 
 //
@@ -157,6 +160,44 @@ public:
 		Host = Path = p;
 		}
 
+/*
+	// Check whether the path in this url is allowed by the host's robots.txt.
+	void checkRobots()
+		{
+		string pathToRobots = util::GetCurrentWorkingDir( ) + '/' + Service;
+		int robotsFileD = util::getFileDescriptor( pathToRobots, "R" );
+		// File does not exist yet, so fetch it.
+		if ( robotsFileD == -1 )
+			robotsFileD = getRobots( );
+
+		char *robotsTXT = util::getFileMap( robotsFileD );
+		// TODO: parse robotsTXT and test Path against its rules.
+		}
+
+	// Makes a request to fetch a new robots.txt file;
+	// returns the file descriptor, or -1 on failure.
+	int getRobots( )
+		{
+		string pathToRobots = util::GetCurrentWorkingDir( ) + '/' + Service;
+		StreamReader *reader = new SocketReader( CompleteUrl + "/robots.txt" );
+		reader->fillBuffer( );
+		int fd = util::writeToNewFileToLocation( reader->buffer, pathToRobots );
+		if ( fd == -1 )
+			cerr << "Error getting Robots.txt file " << endl;
+		return fd;
+		}
+*/
+
 	~ParsedUrl( )
 		{
 		delete [ ] pathBuffer;
--
GitLab
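
A note on the new fillBuffer( ): it asserts on every failure and writes the
response to stdout rather than storing it. Below is a minimal sketch of the
same HTTPS GET with runtime error handling in place of the asserts and the
response accumulated into a string. It assumes the OpenSSL 1.1.0g API this
commit vendors; fetchHttps is a hypothetical helper name, not part of the
commit.

// Sketch: HTTPS GET with error handling instead of asserts.
// Returns the raw HTTP response, or an empty string on any failure.
// fetchHttps is a hypothetical helper, not part of this commit.

#include <openssl/ssl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstring>
#include <string>

std::string fetchHttps( const std::string &host, const std::string &path )
	{
	// Resolve the host and build its socket address on port 443.
	struct hostent *hostEntry = gethostbyname( host.c_str( ) );
	if ( !hostEntry )
		return "";

	struct sockaddr_in address;
	memset( &address, 0, sizeof( address ) );
	address.sin_family = AF_INET;
	address.sin_port = htons( 443 );
	memcpy( &address.sin_addr, hostEntry->h_addr, hostEntry->h_length );

	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
	if ( s == -1 )
		return "";
	if ( connect( s, ( struct sockaddr * )&address, sizeof( address ) ) != 0 )
		{
		close( s );
		return "";
		}

	// Wrap the connected socket in an SSL session.
	SSL_library_init( );
	SSL_CTX *ctx = SSL_CTX_new( SSLv23_method( ) );
	SSL *ssl = ctx ? SSL_new( ctx ) : nullptr;
	if ( !ssl || SSL_set_fd( ssl, s ) != 1 || SSL_connect( ssl ) != 1 )
		{
		if ( ssl )
			SSL_free( ssl );
		if ( ctx )
			SSL_CTX_free( ctx );
		close( s );
		return "";
		}

	// Send the request, then accumulate the response instead of
	// writing it to stdout.
	std::string request = "GET " + path + " HTTP/1.1\r\nHost: " + host +
			"\r\nConnection: close\r\n\r\n";
	SSL_write( ssl, request.c_str( ), request.length( ) );

	std::string response;
	char buffer[ 10240 ];
	int bytes;
	while ( ( bytes = SSL_read( ssl, buffer, sizeof( buffer ) ) ) > 0 )
		response.append( buffer, bytes );

	SSL_shutdown( ssl );
	SSL_free( ssl );
	SSL_CTX_free( ctx );
	close( s );
	return response;
	}

For hosts behind SNI-based virtual hosting, the client would also need
SSL_set_tlsext_host_name( ssl, host.c_str( ) ) before SSL_connect( ); the
patch omits this, and so does the sketch.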