From 9d4652386e31954439e15b4ff2aa2aa922943bcc Mon Sep 17 00:00:00 2001
From: bergkampben <benbergk@umich.edu>
Date: Wed, 21 Mar 2018 16:03:29 -0400
Subject: [PATCH] Change ParsedUrl fields from char* to std::string

---
 crawler/Readers/HttpReader.cpp  |  26 +++---
 crawler/Readers/HttpsReader.cpp |  36 ++++----
 crawler/spider.cpp              |  47 +++++------
 shared/url.h                    | 142 +++++++++++++++++---------------
 4 files changed, 132 insertions(+), 119 deletions(-)

diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp
index 200c117..0424d16 100644
--- a/crawler/Readers/HttpReader.cpp
+++ b/crawler/Readers/HttpReader.cpp
@@ -6,7 +6,7 @@ std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" );
 bool HttpReader::request ( )
 	{
 	try
-		{
+	{
 
 
 		sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
@@ -14,11 +14,11 @@ bool HttpReader::request ( )
 
 		// Get the host address.
 
-		struct hostent *host = gethostbyname( url.Host );
+		struct hostent *host = gethostbyname( url.getHost().c_str() );
 		if ( host == nullptr )
 			throw HTTPConnectionError;
 
-		if( strcmp(url.Service, "http") != 0)
+		if(url.getService() != "http")
 			throw HTTPConnectionError;
 
 		assert( host );
@@ -32,7 +32,7 @@ bool HttpReader::request ( )
 		// Connect to the host.
 
 		int connectResult = connect( sock, ( struct sockaddr * ) &address,
-		                             sizeof( address ) );
+									 sizeof( address ) );
 		assert( connectResult == 0 );
 
 		// Send a GET message for the desired page.
@@ -40,9 +40,9 @@ bool HttpReader::request ( )
 		cout << "Socket Reader is pulling from the web" << endl;
 
 		string getMessage = "GET ";
-		getMessage += url.CompleteUrl;
+		getMessage += url.getCompleteUrl();
 		getMessage += " HTTP/1.1\r\nHost: ";
-		getMessage += url.Host;
+		getMessage += url.getHost();
 		getMessage += "\r\nConnection: close\r\n\r\n";
 
 		cout << getMessage << endl;
@@ -51,12 +51,12 @@ bool HttpReader::request ( )
 		bool isSuccess = checkStatus( );
 		return isSuccess;
 
-		}
+	}
 	catch ( std::exception & e )
-		{
+	{
 		cerr << "Error trying to connect to Host" << endl;
 		return false;
-		}
+	}
 	}
 
 bool HttpReader::fillBuffer ( char *buf, size_t buf_size )
@@ -72,9 +72,9 @@ string HttpReader::PageToString ( )
 	int bytes = 0;
 
 	while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 )
-		{
+	{
 		temp += string( buf, bytes );
-		}
+	}
 	return temp;
 	}
 
@@ -97,10 +97,10 @@ bool HttpReader::checkStatus ( )
 	else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
 		return true;
 	else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
-		{
+	{
 		cerr << "URL REDIRECTION" << endl;
 		return false;
-		}
+	}
 	cerr << "Bad Request of TYPE::  " << buff << endl;
 	return false;
 	}
diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp
index 78d2b5e..31b9528 100644
--- a/crawler/Readers/HttpsReader.cpp
+++ b/crawler/Readers/HttpsReader.cpp
@@ -6,13 +6,13 @@ std::runtime_error HTTPSconnectionError( "Error connecting HTTPS to url" );
 bool HttpsReader::request ( )
 	{
 	try
-		{
-		struct hostent *host = gethostbyname( url.Host );
+	{
+		struct hostent *host = gethostbyname( url.getHost().c_str() );
 
 		if ( host == nullptr )
 			throw HTTPSconnectionError;
 
-		if( strcmp(url.Service, "https") != 0)
+		if( url.getService() != "https")
 			throw HTTPSconnectionError;
 
 		assert( host );
@@ -30,7 +30,7 @@ bool HttpsReader::request ( )
 		// Connect the socket to the host address.
 
 		int connectResult = connect( sock, ( struct sockaddr * ) &address,
-		                             sizeof( address ) );
+									 sizeof( address ) );
 		assert( connectResult == 0 );
 
 		// Build an SSL layer and set it to read/write
@@ -54,9 +54,9 @@ bool HttpsReader::request ( )
 		// Send a GET message for the desired page through the SSL.
 
 		string getMessage = "GET ";
-		getMessage += url.CompleteUrl;
+		getMessage += url.getCompleteUrl();
 		getMessage += " HTTP/1.1\r\nHost: ";
-		getMessage += url.Host;
+		getMessage += url.getHost();
 		getMessage += "\r\nConnection: close\r\n\r\n";
 
 		cout << getMessage << endl;
@@ -64,12 +64,12 @@ bool HttpsReader::request ( )
 
 		bool isSuccess = checkStatus( );
 		return isSuccess;
-		}
+	}
 	catch ( std::exception & e )
-		{
+	{
 		cerr << "Error trying to connect to Host" << endl;
 		return false;
-		}
+	}
 	}
 
 bool HttpsReader::fillBuffer ( char *buf, size_t buf_size )
@@ -85,9 +85,9 @@ string HttpsReader::PageToString ( )
 	int bytes = 0;
 
 	while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 )
-		{
+	{
 		temp += string( buf, bytes );
-		}
+	}
 	return temp;
 	}
 
@@ -102,13 +102,13 @@ bool HttpsReader::checkStatus ( )
 
 	if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 )
 		return true;
-	 else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
-		 return true;
-	 else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
-		 {
-		 cerr << "URL REDIRECTION" << endl;
-		 return false;
-		 }
+	else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
+		return true;
+	else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
+	{
+		cerr << "URL REDIRECTION" << endl;
+		return false;
+	}
 	cerr << "Bad Request of TYPE::  " << buff << endl;
 	return false;
 
diff --git a/crawler/spider.cpp b/crawler/spider.cpp
index c50c658..8cbda41 100644
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -20,24 +20,25 @@ StreamReader *SR_factory ( ParsedUrl url, string mode )
 
 	StreamReader *newReader = nullptr;
 	if ( mode == "local" )
-		{
-		newReader = new LocalReader( string(url.CompleteUrl, strlen(url.CompleteUrl) ) );
-		}
+	{
+		newReader = new LocalReader( url.getCompleteUrl() );
+	}
 	else if ( mode == "web" )
+	{
+		if ( url.getService() == "http" )
 		{
-		if ( !strcmp( url.Service, "http" ) )
-			{
 			newReader = new HttpReader( url );
-			}
-		else if ( !strcmp( url.Service, "https" ) )
-			{
+		}
+		else if ( url.getService() == "https" )
+		{
 			newReader = new HttpsReader( url );
-			}
+		}
 		else
-			{
+		{
 			cerr << "Error reading service type\n";
-			}
+			cerr << "Service Type: " << url.getService() << "\n";
 		}
+	}
 
 	return newReader;
 	}
@@ -45,14 +46,14 @@ StreamReader *SR_factory ( ParsedUrl url, string mode )
 void printDocIndex ( DocIndex *dict )
 	{
 	for ( auto it = dict->begin( ); it != dict->end( ); it++ )
-		{
+	{
 		cout << it->first << " : ";
 		for ( int i = 0; i < it->second.size( ); ++i )
-			{
+		{
 			cout << it->second[ i ] << " ";
-			}
-		cout << std::endl;
 		}
+		cout << std::endl;
+	}
 	cout << std::endl;
 
 	}
@@ -80,18 +81,18 @@ void Spider::run ( )
 	int cond = 0;
 
 	while ( cond < 250 )
-		{
+	{
 		ParsedUrl currentUrl = getUrl( );
-		size_t docID = hash( currentUrl.CompleteUrl );
+		size_t docID = hash( currentUrl.getCompleteUrl().c_str() );
 		if ( shouldURLbeCrawled( docID ) )
-			{
+		{
 			StreamReader *reader = SR_factory( currentUrl, this->mode );
 			if(reader)
-				{
+			{
 				bool success = reader->request( );
 				if ( success )
-					{
-					cout << "Parsing " << currentUrl.CompleteUrl;
+				{
+					cout << "Parsing " << currentUrl.getCompleteUrl();
 					DocIndex *dict = parser.execute( reader );
 					IndexerQueue->Push( dict );
 
@@ -100,16 +101,16 @@ void Spider::run ( )
 					//delete dict;
 
 					cond++;
-					}
 				}
+			}
 
 
 			delete reader;
 
 
-			}
 		}
 	}
+	}
 
 
 
diff --git a/shared/url.h b/shared/url.h
index 08d8c41..20aad82 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -19,42 +19,50 @@ using namespace std;
 
 class ParsedUrl
 	{
-public:
-	char *CompleteUrl,
-			*Service,
-			*Host,
-			*Domain,
-			*Path,
-			*AnchorText;
+private:
+	string CompleteUrl,
+			Service,
+			Host,
+			Domain,
+			Path,
+			AnchorText;
 	double Score;
 
+public:
 	ParsedUrl ( string input_url )
 		{
 		// Assumes url points to static text but
 		// does not check.
+		char *temp_CompleteUrl,
+				*temp_Service,
+				*temp_Host,
+				*temp_Domain,
+				*temp_Path,
+				*temp_AnchorText,
+				*temp_pathBuffer;
 
 		//intialize anchor text to "null"
 		char *null = new char[2];
 		strcpy( null, string( "" ).c_str( ) );
-		AnchorText = null;
+		temp_AnchorText = null;
 
 		char *url = new char[input_url.length( ) + 1];
 		strcpy( url, input_url.c_str( ) );
 
-		CompleteUrl = url;
+		temp_CompleteUrl = url;
 
-		pathBuffer = new char[strlen( url ) + 1];
+		temp_pathBuffer = new char[strlen( url ) + 1];
 		char *f, *t;
-		for ( t = pathBuffer, f = url; ( *t++ = *f++ ); );
+		for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); );
 
-		Service = pathBuffer;
+		temp_Service = temp_pathBuffer;
 
 		const char Colon = ':', Slash = '/', HashTag = '#', Period = '.';
 		char *p;
-		for ( p = pathBuffer; *p && *p != Colon; p++ );
+		for ( p = temp_pathBuffer; *p && *p != Colon; p++ );
 
 		if ( *p )
-			{
+		{
 			// Mark the end of the Service.
 			*p++ = 0;
 
@@ -63,7 +71,7 @@ public:
 			if ( *p == Slash )
 				p++;
 
-			Host = p;
+			temp_Host = p;
 
 			for ( ; *p && *p != Slash; p++ );
 
@@ -73,24 +81,24 @@ public:
 
 			//char * domainBuffer = new char[ 20 ];
 			//get the domain:
-			char *i = Host;
-			Domain = null;
+			char *i = temp_Host;
+			temp_Domain = null;
 			if(i)
-				{
+			{
 				for ( ; *i; i++ )
-					{
+				{
 
 					if ( *i == Period )
-						Domain = i;
-
-					}
+						temp_Domain = i;
 
 				}
 
+			}
+
 
 			// Whatever remains is the Path. // need to remove fragments
 
-			Path = p;
+			temp_Path = p;
 			for ( ; *p && *p != HashTag; p++ );
 
 			if ( *p )
@@ -98,9 +106,18 @@ public:
 				*p++ = 0;
 
 
-			}
+		}
 		else
-			Host = Path = p;
+			temp_Host = temp_Path = temp_Domain = p;  // no ':' in the URL: p sits on the terminator, so all three are "" (Domain was otherwise read uninitialized below)
+
+		CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl));
+		Service = string(temp_Service, strlen(temp_Service));
+		Host = string(temp_Host, strlen(temp_Host));
+		Domain = string(temp_Domain, strlen(temp_Domain));
+		Path = string(temp_Path, strlen(temp_Path));
+		AnchorText = string(temp_AnchorText, strlen(temp_AnchorText));
+		pathBuffer = temp_pathBuffer;
+		delete[] url; delete[] null;  // the strings above own copies; no member points into these scratch buffers, so free them here
 
 		setScore( );
 		}
@@ -119,72 +136,67 @@ public:
 
 	void setScore()
 		{
-		double lengthOfUrl = strlen(CompleteUrl);
+		double lengthOfUrl = CompleteUrl.length();
 		Score += 4 * 1/ log( lengthOfUrl );
 
 		if(lengthOfUrl > 4)
-			{
+		{
 
-			if(this->Domain )
-
-					{
-					if ( strcmp ( Domain , ORG ) )
-						Score += 5;
-					else if ( strcmp ( Domain , EDU ) )
-						Score += 4;
-					else if ( strcmp ( Domain , GOV ) )
-						Score += 3;
-					else if ( strcmp ( Domain , COM ) )
-						Score += 2;
-					else if ( strcmp ( Domain , NET ) )
-						Score += 1;
-					else if ( strcmp ( Domain , INT ) )
-						Score += 1;
-					else if ( strcmp ( Domain , MIL ) )
-						Score += .5;
-					}
+			if(this->Domain.length() )
 
+			{
+				// Award the bonus for the first TLD constant equal to Domain.
+				// strcmp() returns 0 on a match, so its bare result must not be the condition.
+				if ( Domain == ORG )
+					Score += 5;
+				else if ( Domain == EDU )
+					Score += 4;
+				else if ( Domain == GOV )
+					Score += 3;
+				else if ( Domain == COM )
+					Score += 2;
+				else if ( Domain == NET || Domain == INT )
+					Score += 1;
+				else if ( Domain == MIL )
+					Score += .5;
 			}
+
+		}
+		}
+
+	std::string getDomain ( )
+		{
+		return Domain;
+		}
+
+	std::string getService ( )
+		{
+		return Service;
 		}
 
 	std::string getCompleteUrl ( )
 		{
-		std::string completeUrl = "";
-		completeUrl.assign( this->CompleteUrl );
-		return completeUrl;
+		return CompleteUrl;
 		}
 
 	std::string getHost ( )
 		{
-		std::string host = "";
-		host.assign( this->Host );
-		return host;
+		return Host;
 		}
 
 	std::string getPath ( )
 		{
-		std::string path = "";
-		path.assign( this->Path );
-		return path;
+		return Path;
 		}
 
 	std::string getAnchorText ( )
 		{
-		std::string anchorText = "";
-		anchorText.assign( this->AnchorText );
-		return anchorText;
+		return AnchorText;
 		}
 
 	void setAnchorText ( std::string anchorText )
 		{
-		char *anchorCharStar = new char[anchorText.size( )];
-
-		for ( int i = 0; i < anchorText.size( ); ++i )
-			{
-			anchorCharStar += anchorText[ i ];
-			}
-		anchorCharStar += '\0';
-		this->AnchorText = anchorCharStar;
+		AnchorText = anchorText;
 		}
 
 
-- 
GitLab