From 8f7de9b7b798515f9c4a58437dddfea98be3df79 Mon Sep 17 00:00:00 2001
From: jsclose <jsclose@umich.edu>
Date: Sun, 18 Feb 2018 17:58:01 -0500
Subject: [PATCH] working on ssl

---
 .gitignore               |   1 +
 crawler/SocketReader.cpp |  70 ++++++++++++++++++++++++++++++++++++++-
 crawler/SocketReader.h   |   2 ++
 crawler/StreamReader.h   |   1 +
 crawler/spider.cpp       |   4 ++-
 docMap.txt               | Bin 364 -> 476 bytes
 main.cpp                 |   3 +-
 shared/url.h             |  41 +++++++++++++++++++++++
 8 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 755395c..ca34380 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea/*
+openssl-1.1.0g/*
 .vagrant/*
 CMakeLists.txt
 cmake-build-debug/*
diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp
index 2ffe3e4..6519fc1 100644
--- a/crawler/SocketReader.cpp
+++ b/crawler/SocketReader.cpp
@@ -4,7 +4,7 @@
 
 #include "SocketReader.h"
 
-
+/*
 void SocketReader::fillBuffer()
 	{
 	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
@@ -51,4 +51,72 @@ void SocketReader::fillBuffer()
 	close( s );
 	return;
 	}
+*/
+
+void SocketReader::fillBuffer( )
+	{
+	struct hostent *host = gethostbyname( url.Host );
+	assert( host );
+
+	struct sockaddr_in address;
+	memset( &address, 0, sizeof( address ) );
+	address.sin_family = AF_INET;
+	address.sin_port = htons( 443 );
+	memcpy( &address.sin_addr, host->h_addr, host->h_length );
+
+	// Create a TCP/IP socket.
+
+	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
+	assert( s != -1 );
+
+	// Connect the socket to the host address.
+
+	int connectResult = connect( s, ( struct sockaddr * )&address,
+			sizeof( address ) );
+	assert( connectResult == 0 );
+
+	// Build an SSL layer and set it to read/write
+	// to the socket we've connected.
+
+	SSL_library_init( );
+
+	SSL_CTX *ctx = SSL_CTX_new( SSLv23_method( ) );
+	assert( ctx );
+	SSL *ssl = SSL_new( ctx );
+	assert( ssl );
+
+	SSL_set_fd( ssl, s );
+
+	// Establish an SSL connection.
+
+	int sslConnectResult = SSL_connect( ssl );
+	assert( sslConnectResult == 1 );
+
+	// Send a GET message for the desired page through the SSL.
+
+	string getMessage = "GET ";
+	getMessage += url.CompleteUrl;
+	getMessage += " HTTP/1.1\r\nHost: ";
+	getMessage += url.Host;
+	getMessage += "\r\nConnection: close\r\n\r\n";
+
+	cout << getMessage << endl;
+	SSL_write( ssl, getMessage.c_str( ), getMessage.length( ) );
+
+	// Read from the SSL until there's no more data.
+
+	char buffer[ 10240 ];
+	int bytes;
+
+	while ( ( bytes = SSL_read( ssl, buffer,
+			sizeof( buffer ) ) ) > 0 )
+		write( 1, buffer, bytes );
+
+	SSL_shutdown( ssl );
+	SSL_free( ssl );
+	SSL_CTX_free( ctx );
+	close( s );
+	}
\ No newline at end of file
diff --git a/crawler/SocketReader.h b/crawler/SocketReader.h
index 68734df..0c7ca21 100644
--- a/crawler/SocketReader.h
+++ b/crawler/SocketReader.h
@@ -14,6 +14,8 @@ public:
 	SocketReader( string url_in ) : url( ParsedUrl( url_in ) ) { }
 
 	virtual void fillBuffer();
+	//virtual void fillBuffer(char ssl);
+
 private:
 	ParsedUrl url;
 };
diff --git a/crawler/StreamReader.h b/crawler/StreamReader.h
index 5e35b08..5f60e8c 100644
--- a/crawler/StreamReader.h
+++ b/crawler/StreamReader.h
@@ -14,6 +14,7 @@
 #include <string.h>
 #include <string>
 #include <cassert>
+#include <openssl/ssl.h>
 
 
 
diff --git a/crawler/spider.cpp b/crawler/spider.cpp
index e7466d2..089407e 100644
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -34,9 +34,11 @@ void Spider::FuncToRun()
 
 	while ( cond )
 		{
-		string currentUrl = getUrl( );
+
+		string currentUrl = getUrl( );  // get the next url from the url frontier
 		char *fileMap;
 
+		// url has not been seen before, or the time since it was last crawled exceeds the recrawl criteria
 		if ( shouldURLbeCrawled( currentUrl ))
 			{
 			bool success = writeDocToDisk(currentUrl);
diff --git a/docMap.txt b/docMap.txt
index a0fd9ed502cd83e2bd416bcb013d3adf973b0d16..840e157a464ac07a84efd03a820fc0dc8c9aaac7 100644
GIT binary patch
delta 10
RcmaFEbccCE4&%gv0st9|1Y7_B

delta 7
Ocmcb^{Dx^m4kG{!ssif(

diff --git a/main.cpp b/main.cpp
index d9e3471..4dab388 100644
--- a/main.cpp
+++ b/main.cpp
@@ -44,12 +44,11 @@ int main( int argc, const char *argv[] )
 	 */
 
 	//
-	string mode = "local";
+	string mode = "web";
 
 	// Seed urls?
 	string seed;
 	//
 	int numberOfSpiders;
-	int numberOfParsers;
 	bool restoreFromLog;
diff --git a/shared/url.h b/shared/url.h
index 5315673..875520e 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -6,6 +6,9 @@
 #include <string>
 #include <iostream>
+#include "../util/util.h"
+//#include "../crawler/StreamReader.h"
+//#include "../crawler/SocketReader.h"
 
 using namespace std;
 
 //
@@ -157,6 +160,44 @@ public:
 		Host = Path = p;
 		}
 
+/*
+	// Check whether the path in this url is allowed by the host's robots.txt.
+	void checkRobots()
+		{
+		string pathToRobots = util::GetCurrentWorkingDir( ) + '/' + Service;
+		int robotsFileD = util::getFileDescriptor( pathToRobots, "R" );
+		// File does not exist yet, so fetch it.
+		if ( robotsFileD == -1 )
+			robotsFileD = getRobots( );
+
+		char *robotsTXT = util::getFileMap( robotsFileD );
+		// TODO: parse robotsTXT and test Path against its rules.
+		}
+
+	// Makes a request to fetch a new robots.txt file;
+	// returns the file descriptor, or -1 on failure.
+	int getRobots( )
+		{
+		string pathToRobots = util::GetCurrentWorkingDir( ) + '/' + Service;
+		StreamReader *reader = new SocketReader( CompleteUrl + "/robots.txt" );
+		reader->fillBuffer( );
+		int fd = util::writeToNewFileToLocation( reader->buffer, pathToRobots );
+		if ( fd == -1 )
+			cerr << "Error getting Robots.txt file " << endl;
+		return fd;
+		}
+*/
+
 	~ParsedUrl( )
 		{
 		delete [ ] pathBuffer;
--
GitLab
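
A note on the new fillBuffer( ): it asserts on every failure and writes the
response to stdout rather than storing it. Below is a minimal sketch of the
same HTTPS GET with runtime error handling in place of the asserts and the
response accumulated into a string. It assumes the OpenSSL 1.1.0g API this
commit vendors; fetchHttps is a hypothetical helper name, not part of the
commit.

// Sketch: HTTPS GET with error handling instead of asserts.
// Returns the raw HTTP response, or an empty string on any failure.
// fetchHttps is a hypothetical helper, not part of this commit.

#include <openssl/ssl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <cstring>
#include <string>

std::string fetchHttps( const std::string &host, const std::string &path )
	{
	// Resolve the host and build its socket address on port 443.
	struct hostent *hostEntry = gethostbyname( host.c_str( ) );
	if ( !hostEntry )
		return "";

	struct sockaddr_in address;
	memset( &address, 0, sizeof( address ) );
	address.sin_family = AF_INET;
	address.sin_port = htons( 443 );
	memcpy( &address.sin_addr, hostEntry->h_addr, hostEntry->h_length );

	int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
	if ( s == -1 )
		return "";
	if ( connect( s, ( struct sockaddr * )&address, sizeof( address ) ) != 0 )
		{
		close( s );
		return "";
		}

	// Wrap the connected socket in an SSL session.
	SSL_library_init( );
	SSL_CTX *ctx = SSL_CTX_new( SSLv23_method( ) );
	SSL *ssl = ctx ? SSL_new( ctx ) : nullptr;
	if ( !ssl || SSL_set_fd( ssl, s ) != 1 || SSL_connect( ssl ) != 1 )
		{
		if ( ssl )
			SSL_free( ssl );
		if ( ctx )
			SSL_CTX_free( ctx );
		close( s );
		return "";
		}

	// Send the request, then accumulate the response instead of
	// writing it to stdout.
	std::string request = "GET " + path + " HTTP/1.1\r\nHost: " + host +
			"\r\nConnection: close\r\n\r\n";
	SSL_write( ssl, request.c_str( ), request.length( ) );

	std::string response;
	char buffer[ 10240 ];
	int bytes;
	while ( ( bytes = SSL_read( ssl, buffer, sizeof( buffer ) ) ) > 0 )
		response.append( buffer, bytes );

	SSL_shutdown( ssl );
	SSL_free( ssl );
	SSL_CTX_free( ctx );
	close( s );
	return response;
	}

For hosts behind SNI-based virtual hosting, the client would also need
SSL_set_tlsext_host_name( ssl, host.c_str( ) ) before SSL_connect( ); the
patch omits this, and so does the sketch.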