working on testing url + robots.txt in spider.h

c42194a6 · jsclose · 81d48785 · c42194a6 · c42194a6 · c42194a6
Commit c42194a6 authored 7 years ago by jsclose
--- a/crawler/StreamReader.h
+++ b/crawler/StreamReader.h
@@ -23,9 +23,6 @@ class StreamReader
 public:
 	StreamReader() {};
 	virtual void fillBuffer() = 0;
-
-
-protected:
 	char *buffer;

 	};
\ No newline at end of file
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -96,6 +96,7 @@ bool Spider::shouldURLbeCrawled( string url )
 	//search for url in doc cache
 	auto locationOnDisk = this->docMapLookup->find( url );

+	//bool protectedByRobots = checkRobots( url );
 	//if it doesnt find anything for that url key
 	if ( locationOnDisk == this->docMapLookup->end( ))
 		{
@@ -109,7 +110,47 @@ bool Spider::shouldURLbeCrawled( string url )
 	return false;
 	}

+/*
+//check if path in url is in the robots txt
+bool Spider::checkRobots(string url_in)
+	{
+	ParsedUrl url = ParsedUrl(url_in);
+	string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host));
+	int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
+	//File does not exist yet
+	if(robotsFileD == -1)
+		{
+		robotsFileD = getRobots(url);
+		}

+	//char* robotsTXT = util::getFileMap(robotsFileD);
+	return 1;
+	}
+
+
+
+//Requests the host's robots.txt, writes it to disk, and returns the new file's descriptor (-1 on error)
+int Spider::getRobots(ParsedUrl url )
+	{
+
+
+	string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" +  string(url.Host, strlen(url.Host));
+	string pathToWebRobots =  "http://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
+	//string(url.Service, strlen(url.Service))+
+	SocketReader *reader = new SocketReader(pathToWebRobots);
+	reader->fillBuffer();
+
+	int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
+	if( fd == -1)
+		{
+		cerr << "Error getting Robots.txt file " << endl;
+		}
+	return fd;
+
+	return 1;
+
+	};
+*/
 /*
 returns true if fileMap was created, otherwise false
 Modifies the filemap to be a char* of the file of the url passed

--- a/crawler/spider.h
+++ b/crawler/spider.h
@@ -36,9 +36,12 @@ public:

 	bool writeDocToDisk(string url);

-
 	bool shouldURLbeCrawled( string URL );

+	int getRobots(ParsedUrl url );
+	bool checkRobots(string url);
+
+
 private:

 	int locationOnDisk;

--- a/docMap.txt
+++ b/docMap.txt
--- a/shared/url.h
+++ b/shared/url.h
@@ -7,96 +7,9 @@
 #include <string>
 #include <iostream>
 #include "../util/util.h"
-//#include "../crawler/StreamReader.h"
 //#include "../crawler/SocketReader.h"
 using namespace std;

-//
-//class Url
-//	{
-//
-//public:
-//	std::string url;
-//
-//	Url( string url_in ) : url( url_in )
-//		{ };
-//
-//
-//	//Removes/ parses url
-//	void clean();
-//
-//	//parses domain from url
-//	string getDomain()
-//		{
-//		string domain;
-//		string protocol = getProtocol();
-//		int domainStart = protocol.size() + 3;
-//		if(url [ domainStart ] == 'w' )
-//			domainStart += 4;//starts with www.
-//
-//
-//
-//		for( int domainEnd = domainStart ; domainEnd < url.size() ; domainEnd++ )
-//			{
-//			if ( url[ domainEnd ] == '/'  || url[ domainEnd ] == ':')
-//				return domain;
-//			else
-//				domain.push_back( url[ domainEnd ] );
-//			}
-//		}
-//
-//	//return .gov, .com, .edu
-//	string getDomainType()
-//		{
-//		string domain = getDomain();
-//		string type = "";
-//
-//		auto i = domain.end();
-//		--i;
-//		while(*i != '.')
-//			{
-//			type.push_back(( *i ));
-//			--i;
-//			}
-//		reverse(type.begin(), type.end() );
-//
-//
-//		return type;
-//
-//		}
-//
-//	/*
-//	 * HTTP, HTTPS, MAILTO etc
-//	*/
-//	string getProtocol()
-//		{
-//		string protocol;
-//		for ( int i = 0; i < url.size( ); i++ )
-//			{
-//			if( url [ i ] != ':' )
-//				protocol.push_back( url [ i ] );
-//			else
-//				return protocol;
-//			}
-//
-//
-//		};
-//
-//	/*
-//	 * Returns URL such that
-//	 * http://www.example.com:80/path/to/myfile.html#SomewhereInTheDocument
-//	 * becomes http://www.example.com:80/path/to/myfile.html
-//	 */
-//	void removeAnchor( ){
-//		int i = 0;
-//		string cleaned;
-//		while( url[ i ] != '#')
-//			cleaned.push_back( url [ i++ ] );
-//
-//		url = cleaned;
-//		}
-//	};
-



@@ -110,6 +23,7 @@ public:
 	char  *CompleteUrl,
 			*Service,
 			*Host,
+			*Domain,
 			*Path;

 	ParsedUrl( string input_url )
@@ -129,7 +43,7 @@ public:

 		Service = pathBuffer;

-		const char Colon = ':', Slash = '/';
+		const char Colon = ':', Slash = '/', HashTag = '#', Period = '.';
 		char *p;
 		for ( p = pathBuffer;  *p && *p != Colon;  p++ )
 			;
@@ -153,51 +67,43 @@ public:
 				// Mark the end of the Host.
 				*p++ = 0;

-			// Whatever remains is the Path.
+			//char * domainBuffer = new char[ 20 ];
+			//TODO: get the domain -- the loop below only scans back toward the last '.' and never assigns Domain
+			for(int i = strlen(Host); Host[i] != Period; i--){
+				}
+
+
+
+
+
+			// Whatever remains is the Path; any '#fragment' suffix is cut off below.
+
 			Path = p;
+			for ( ;  *p && *p != HashTag;  p++ )
+				;
+
+			if ( *p )
+				// Mark the end of the Path, remove fragments.
+				*p++ = 0;
+
+
 			}
 		else
 			Host = Path = p;
 		}

-/*
-	//check if path in url is in the robots txt
-	void checkRobots()
+	void printUrl()
 		{
-			string pathToRobots = util::GetCurrentWorkingDir() + '/' +  Service;
-			int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
-			//File does not exist yet
-			if(robotsFileD == -1)
-				{
-					robotsFileD = getRobots();
-				}
-
-			char* robotsTXT = util::getFileMap(robotsFileD);
-
+		cout << "Complete URL: " << CompleteUrl << endl;
+		cout << "Service: " << Service << endl;
+		cout << "Host: " << Host << endl;
+		cout << "Path: " << Path << endl;


 		}



-	//Makes request to get a new robots txt file, returns the file pointer
-	int getRobots( )
-		{
-
-			StreamReader reader;
-			string pathToRobots = util::GetCurrentWorkingDir() + '/' +  Service;
-			reader = new SocketReader(CompleteUrl+ '/' + 'robots.txt');
-			reader->fillBuffer();
-			int fd = util::writeToNewFileToLocation( reader->buffer, pathToRobots);
-			if( fd == -1)
-				{
-				cerr << "Error getting Robots.txt file " << endl;
-				}
-		return fd;
-
-		return 1;
-		};
- */
 	~ParsedUrl( )
 		{
 		delete [ ] pathBuffer;

--- a/shared/urlTest.cpp
+++ b/shared/urlTest.cpp
@@ -12,25 +12,20 @@ using namespace std;
 int main(int argc, const char * argv[])
 	{

-	Url test1 =  Url("https://developer.mozilla.org/en-US/docs/Learn" ) ;
-	string protocol = test1.getProtocol();
-	assert( protocol == "https");
-	string domain = test1.getDomain();
-	assert( domain == "developer.mozilla.org");
-	string domainType = test1.getDomainType();
-	assert( domainType == "org");
-
-
-
-	Url test2 = Url("http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
-	test2.removeAnchor();
-	assert( test2.url == "http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2" );
-	protocol = test2.getProtocol();
-	assert( protocol == "http");
-	domain = test2.getDomain();
-	assert( domain == "example.com");
-	domainType = test2.getDomainType();
-	assert( domainType == "com");
+	ParsedUrl test1 =  ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
+	//string protocol = test1.getProtocol();
+	test1.printUrl();
+	//assert( strcmp(test1.Service, "https") == 0);
+	//assert( strcmp(test1.Host, "developer.mozilla.org") == 0);
+
+
+
+
+	ParsedUrl test2 = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
+	test2.printUrl();
+	assert( strcmp(test2.Service, "http"));
+	assert( strcmp(test2.Host, "example.com"));
+

 	std::cout << "URL TEST PASSED" << std::endl;


--- a/url_test
+++ b/url_test