working on url testing + docMap work

e4c9b47b · jsclose · 32fcdac0 · e4c9b47b · e4c9b47b · e4c9b47b
Commit e4c9b47b authored 7 years ago by jsclose
--- a/crawler/spider.cpp
+++ b/crawler/spider.cpp
@@ -28,19 +28,25 @@ void Spider::FuncToRun()
 		string currentUrl = getUrl( );
 		char *fileMap;

-		shouldURLbeCrawled( currentUrl );
+		bool toCrawl = shouldURLbeCrawled( currentUrl );
+		if(toCrawl)
+			//url has not been seen
+			{
+			if ( request( currentUrl, fileMap ))
+				{
+				// markURLSeen( currentUrl );

+				//parser.parse(fileMap);
+				cond = false;
+				} else
+				{
+				cerr << "Error connecting";
+				}

-		if ( request( currentUrl, fileMap ))
-			{
-			// markURLSeen( currentUrl );

-			//parser.parse(fileMap);
-			cond = false;
-			} else
-			{
-			cerr << "Error connecting";
 			}
+
+
 		}
 	}

@@ -55,16 +61,23 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec

 bool Spider::shouldURLbeCrawled( string url )
 	{
-
+	//search for url in doc cache
 	auto locationOnDisk = this->docMapLookup->find(url);

+	//if it doesn't find anything for that url key
 	if ( locationOnDisk == this->docMapLookup->end() )
 		{
-		cerr << "Url Not Found In Cache Lookup";
-		int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt" );
-		if ( file )
+		//cerr << "Url Not Found In Cache Lookup" << endl;
+		//get file descriptor for the docMap on disk
+		int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt", "W" );
+		//check if its available
+		if ( file == -1 )
+			cerr << "Error opening docMap" << endl;
+		else
 			{
+			//get the current size of the docMap
 			size_t seekPosition = FileSize( file );
+			//seek to the end of the file
 			off_t resultPosition = lseek( file, seekPosition, SEEK_SET );

 			if ( resultPosition == -1 )
@@ -73,26 +86,61 @@ bool Spider::shouldURLbeCrawled( string url )
 					  ", error = " << errno;
 				return errno;
 				}
-
 			cout << "Current docMap position on disk" << endl;
 			cout << resultPosition << endl;

+			size_t success = write( file, "Hello World!\n", 14 );
+			if ( success == -1 )
+				{
+				cerr << "Error writing document object to document map" << endl;
+				}


+			this->docMapLookup->insert( std::pair < string, int >( url, resultPosition ));
+			for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
+				std::cout << it->first << " => " << it->second << '\n';

-			}
-		else
-			cerr << "Error opening docMap" << endl;
-
+			close( file );

+			}
 		}


+
 	else
-		std::cout << locationOnDisk->first << " is " << locationOnDisk->second;
+		{



+		//maps url id -> location on disk (where to seek to)
+
+		std::cout << locationOnDisk->first << " is " << locationOnDisk->second;
+		/*
+		int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt", "R" );
+		//check if its available
+		if ( file )
+			{
+			size_t seekPosition = locationOnDisk->second;
+			off_t resultPosition = lseek( file, seekPosition, SEEK_SET );
+			int bytes = 14;
+			if ( bytes >0 )
+				{
+				char *buffer = new char[ bytes ];
+				ssize_t bytesRead;
+				if ( bytesRead = read( file, buffer, bytes ) )
+					write( 1, buffer, bytesRead );
+				else
+					{
+					cerr << "Could not read " << bytes << " bytes at position " <<
+						  position << ", error = " << errno;
+					return errno;
+					}
+				}
+			}
+
+		return false;
+		 */
+		}

 	}


--- a/docMap.txt
+++ b/docMap.txt
--- a/main.cpp
+++ b/main.cpp
@@ -57,8 +57,10 @@ int main(int argc, const char * argv[])

    cout << "Pushed File\n";
    urlFrontier.Push("tests/cats.html");
+    urlFrontier.Push("tests/store.html");

-	unordered_map<string, int>* docMapLookUp = new unordered_map<string, int>();
+
+   unordered_map<string, int>* docMapLookUp = new unordered_map<string, int>();




--- a/shared/url.h
+++ b/shared/url.h
@@ -5,19 +5,101 @@
 #pragma once

 #include <string>
+#include <iostream>
+using namespace std;
 class Url
 	{

+public:
 	std::string url;

-	Url( string url_in ) : url( url_in) {};
+	Url( string url_in ) : url( url_in )
+		{ };


 	//Removes/ parses url
 	void clean();

 	//parses domain from url
-	void getDomain();
+	// Returns the host part of `url` with any scheme ("xxx://") and a leading
+	// "www." removed, e.g. "http://www.example.com:80/a" -> "example.com".
+	// The host ends at the first '/', ':', '?' or '#', or at the end of the
+	// string, so "http://example.com" (no path) yields "example.com" instead
+	// of falling off the end of the function without returning a value.
+	string getDomain()
+		{
+		const size_t schemeEnd = url.find( "://" );
+		size_t hostStart = ( schemeEnd == string::npos ) ? 0 : schemeEnd + 3;
+
+		// Strip only a literal "www." prefix; hosts that merely begin with
+		// 'w' (e.g. "wikipedia.org") must be kept intact.
+		if ( url.compare( hostStart, 4, "www." ) == 0 )
+			hostStart += 4;
+
+		const size_t hostEnd = url.find_first_of( "/:?#", hostStart );
+		if ( hostEnd == string::npos )
+			return url.substr( hostStart );
+		return url.substr( hostStart, hostEnd - hostStart );
+		}
+
+
+	// Returns the last dot-separated label of the domain, e.g. "org" for
+	// "developer.mozilla.org" (.gov, .com, .edu, ...).
+	// Returns an empty string when the domain contains no '.'.
+	//
+	// rfind() is used instead of walking an iterator back from end():
+	// end() is one past the last character and must not be dereferenced,
+	// and a backwards walk needs an explicit lower bound for domains
+	// without a '.'. It also avoids needing reverse() from <algorithm>,
+	// which this header does not include.
+	string getDomainType()
+		{
+		const string domain = getDomain();
+		const size_t lastDot = domain.rfind( '.' );
+
+		if ( lastDot == string::npos )
+			return string();
+
+		return domain.substr( lastDot + 1 );
+		}
+
+	/*
+	 * Returns the scheme of the URL (http, https, mailto, etc.): every
+	 * character before the first ':'.
+	 * Returns an empty string when the URL contains no ':', so that every
+	 * path through the function returns a value.
+	 */
+	string getProtocol()
+		{
+		const size_t colon = url.find( ':' );
+
+		// No ':' means there is no scheme to report.
+		if ( colon == string::npos )
+			return string();
+
+		return url.substr( 0, colon );
+		}
+
+	/*
+	 * Strips the fragment ("#...") from the URL in place, so that
+	 * http://www.example.com:80/path/to/myfile.html#SomewhereInTheDocument
+	 * becomes http://www.example.com:80/path/to/myfile.html
+	 * A URL without '#' is left unchanged; the search is bounded by the
+	 * string length, so it cannot read past the end of such a URL.
+	 */
+	void removeAnchor( )
+		{
+		const size_t hashPos = url.find( '#' );
+
+		if ( hashPos == string::npos )
+			return;
+
+		// erase( pos ) drops everything from pos to the end of the string.
+		url.erase( hashPos );
+		}
+
+
+


 	};

--- a/shared/urlTest.cpp
+++ b/shared/urlTest.cpp
+//
+// Created by Jake Close on 2/13/18.
+//
+
+#include "url.h"
+#include <string>
+using namespace std;
+#include <stdlib.h>
+#include <iostream>
+#include <assert.h>
+
+int main(int argc, const char * argv[])
+	{
+	// Case 1: https URL with a path and no "www." prefix, port or anchor.
+	Url mozilla( "https://developer.mozilla.org/en-US/docs/Learn" );
+	const string mozillaScheme = mozilla.getProtocol();
+	assert( mozillaScheme == "https" );
+	const string mozillaHost = mozilla.getDomain();
+	assert( mozillaHost == "developer.mozilla.org" );
+	const string mozillaType = mozilla.getDomainType();
+	// Domain-type check is currently disabled:
+	//assert( mozillaType == "org");
+
+	// Case 2: http URL with "www.", a port, a query string and an anchor.
+	// The anchor is stripped first; the remaining checks run on the result.
+	Url example( "http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument" );
+	example.removeAnchor();
+	assert( example.url == "http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2" );
+	const string exampleScheme = example.getProtocol();
+	assert( exampleScheme == "http" );
+	const string exampleHost = example.getDomain();
+	assert( exampleHost == "example.com" );
+	const string exampleType = example.getDomainType();
+	// Domain-type check is currently disabled:
+	//assert( exampleType == "com");
+
+	std::cout << "URL TEST PASSED" << std::endl;
+	}
\ No newline at end of file
--- a/util/util.h
+++ b/util/util.h
@@ -30,9 +30,18 @@ size_t FileSize( int f )
 *
 */

-int getFileDescriptor( string fileName )
+int getFileDescriptor( string fileName , string type)
 	{
-	return open( fileName.c_str( ), O_RDONLY );
+	// "R" opens read-only; "W" opens write-only, creating the file (0600).
+	if ( type == "R" )
+		return open( fileName.c_str( ), O_RDONLY );
+	if ( type == "W" )
+		return open( fileName.c_str( ), O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR );
+
+	// Unknown mode: report failure the same way open() does instead of
+	// flowing off the end of a non-void function (undefined behavior).
+	// Callers already treat -1 as "could not open".
+	return -1;

 	}