Skip to content
Snippets Groups Projects
Commit e4c9b47b authored by jsclose
Browse files

working on url testing + docMap work

parent 32fcdac0
Branches
No related tags found
No related merge requests found
......@@ -28,19 +28,25 @@ void Spider::FuncToRun()
string currentUrl = getUrl( );
char *fileMap;
shouldURLbeCrawled( currentUrl );
bool toCrawl = shouldURLbeCrawled( currentUrl );
if(toCrawl)
//url has not been seen
{
if ( request( currentUrl, fileMap ))
{
// markURLSeen( currentUrl );
//parser.parse(fileMap);
cond = false;
} else
{
cerr << "Error connecting";
}
if ( request( currentUrl, fileMap ))
{
// markURLSeen( currentUrl );
//parser.parse(fileMap);
cond = false;
} else
{
cerr << "Error connecting";
}
}
}
......@@ -55,16 +61,23 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
// Looks `url` up in this->docMapLookup.
//  - Miss: opens the on-disk docMap for writing, seeks to its current end,
//    writes a placeholder record there, stores (url -> offset) in
//    docMapLookup and dumps the whole lookup table to stdout.
//  - Hit: prints the url and the offset stored for it.
// NOTE(review): in the lines visible here the only live `return` is
// `return errno;` on the seek-failure path; every other path reaches the
// closing brace of this bool function without returning a value.
// NOTE(review): this span comes from a diff view, so some statements below
// appear twice (pre-change and post-change versions next to each other).
bool Spider::shouldURLbeCrawled( string url )
{
//search for url in doc cache
auto locationOnDisk = this->docMapLookup->find(url);
//if it doesn't find anything for that url key
if ( locationOnDisk == this->docMapLookup->end() )
{
// NOTE(review): the next three lines look like the pre-change side of the
// diff (one-argument getFileDescriptor, truthiness test on the descriptor).
cerr << "Url Not Found In Cache Lookup";
int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt" );
if ( file )
//cerr << "Url Not Found In Cache Lookup" << endl;
//get file descriptor for the docMap on disk
int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt", "W" );
//check if it's available
if ( file == -1 )
cerr << "Error opening docMap" << endl;
else
{
//get the current size of the docMap
size_t seekPosition = FileSize( file );
//seek to the end of the file
off_t resultPosition = lseek( file, seekPosition, SEEK_SET );
if ( resultPosition == -1 )
......@@ -73,26 +86,61 @@ bool Spider::shouldURLbeCrawled( string url )
", error = " << errno;
// NOTE(review): errno is an int but the function returns bool, so any
// non-zero errno is reported to the caller as `true`.
return errno;
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
// Placeholder payload: the literal is 13 characters, so a length of 14 also
// writes its terminating NUL byte.
// NOTE(review): write() returns ssize_t; storing it in size_t means the
// `== -1` test below only matches through unsigned conversion.
size_t success = write( file, "Hello World!\n", 14 );
if ( success == -1 )
{
cerr << "Error writing document object to document map" << endl;
}
// Remember where this url's record starts so it can be sought to later.
this->docMapLookup->insert( std::pair < string, int >( url, resultPosition ));
// Debug dump of every (url => offset) entry currently in the lookup table.
for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
std::cout << it->first << " => " << it->second << '\n';
}
// NOTE(review): this else/cerr pair looks like the pre-change side of the
// diff; the post-change code already reports the open failure above.
else
cerr << "Error opening docMap" << endl;
close( file );
}
}
else
std::cout << locationOnDisk->first << " is " << locationOnDisk->second;
{
//maps url id -> location on disk (where to seek to)
std::cout << locationOnDisk->first << " is " << locationOnDisk->second;
/*
int file = getFileDescriptor( "/Users/jakeclose/Desktop/398/project/eecs398-search/docMap.txt", "R" );
//check if its available
if ( file )
{
size_t seekPosition = locationOnDisk->second;
off_t resultPosition = lseek( file, seekPosition, SEEK_SET );
int bytes = 14;
if ( bytes >0 )
{
char *buffer = new char[ bytes ];
ssize_t bytesRead;
if ( bytesRead = read( file, buffer, bytes ) )
write( 1, buffer, bytesRead );
else
{
cerr << "Could not read " << bytes << " bytes at position " <<
position << ", error = " << errno;
return errno;
}
}
}
return false;
*/
}
}
......
No preview for this file type
......@@ -57,8 +57,10 @@ int main(int argc, const char * argv[])
cout << "Pushed File\n";
urlFrontier.Push("tests/cats.html");
urlFrontier.Push("tests/store.html");
unordered_map<string, int>* docMapLookUp = new unordered_map<string, int>();
unordered_map<string, int>* docMapLookUp = new unordered_map<string, int>();
......
......@@ -5,19 +5,101 @@
#pragma once
#include <string>
#include <iostream>
using namespace std;
/*
 * Thin wrapper around a URL string with helpers that extract the protocol,
 * the domain and the domain type, and that strip a trailing anchor.
 * None of the accessors modify `url`; only removeAnchor() does.
 */
class Url
	{
public:
	std::string url;

	Url( std::string url_in ) : url( url_in )
		{ }

	//Removes/ parses url
	void clean();

	/*
	 * Parses the domain from the url.
	 * Skips the "<protocol>://" prefix when present, drops a leading "www.",
	 * and stops at the first '/', ':', '?' or '#' (or at the end of the url).
	 * http://www.example.com:80/path  ->  example.com
	 */
	std::string getDomain() const
		{
		const std::string protocol = getProtocol();

		// Only skip the scheme when it is really followed by "://"; otherwise
		// the url is treated as starting directly with the host.
		std::string::size_type domainStart = 0;
		if ( url.compare( protocol.size(), 3, "://" ) == 0 )
			domainStart = protocol.size() + 3;

		// Match the full "www." prefix so hosts that merely begin with 'w'
		// (e.g. wikipedia.org) are left intact.
		if ( url.compare( domainStart, 4, "www." ) == 0 )
			domainStart += 4;

		const std::string::size_type domainEnd = url.find_first_of( "/:?#", domainStart );
		if ( domainEnd == std::string::npos )
			return url.substr( domainStart );
		return url.substr( domainStart, domainEnd - domainStart );
		}

	/*
	 * Returns the text after the last '.' of the domain (gov, com, edu, ...),
	 * or an empty string when the domain contains no '.'.
	 */
	std::string getDomainType() const
		{
		const std::string domain = getDomain();
		const std::string::size_type lastDot = domain.rfind( '.' );
		if ( lastDot == std::string::npos )
			return std::string();
		return domain.substr( lastDot + 1 );
		}

	/*
	 * HTTP, HTTPS, MAILTO etc
	 * Returns everything before the first ':' of the url, or an empty string
	 * when the url contains no ':' at all.
	 */
	std::string getProtocol() const
		{
		const std::string::size_type colon = url.find( ':' );
		if ( colon == std::string::npos )
			return std::string();
		return url.substr( 0, colon );
		}

	/*
	 * Returns URL such that
	 * http://www.example.com:80/path/to/myfile.html#SomewhereInTheDocument
	 * becomes http://www.example.com:80/path/to/myfile.html
	 * A url without '#' is left unchanged.
	 */
	void removeAnchor( )
		{
		const std::string::size_type hash = url.find( '#' );
		if ( hash != std::string::npos )
			url.erase( hash );
		}
	};
......
//
// Created by Jake Close on 2/13/18.
//
#include "url.h"
#include <string>
using namespace std;
#include <stdlib.h>
#include <iostream>
#include <assert.h>
int main(int argc, const char * argv[])
{
Url test1 = Url("https://developer.mozilla.org/en-US/docs/Learn" ) ;
string protocol = test1.getProtocol();
assert( protocol == "https");
string domain = test1.getDomain();
assert( domain == "developer.mozilla.org");
string domainType = test1.getDomainType();
//assert( domainType == "org");
Url test2 = Url("http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
test2.removeAnchor();
assert( test2.url == "http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2" );
protocol = test2.getProtocol();
assert( protocol == "http");
domain = test2.getDomain();
assert( domain == "example.com");
domainType = test2.getDomainType();
//assert( domainType == "com");
std::cout << "URL TEST PASSED" << std::endl;
}
\ No newline at end of file
......@@ -30,9 +30,18 @@ size_t FileSize( int f )
*
*/
// Opens `fileName` and returns its file descriptor.
//   type == "R": open read-only.
//   type == "W": open write-only, creating the file (owner read/write
//                permissions) when it does not exist yet.
// Returns -1 when open() fails or when `type` is neither "R" nor "W", so
// callers can test every failure with a single `== -1` check.
int getFileDescriptor( std::string fileName , std::string type )
	{
	if ( type == "R" )
		return open( fileName.c_str( ), O_RDONLY );

	if ( type == "W" )
		return open( fileName.c_str( ), O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR );

	// Unknown mode: report failure instead of falling off the end of a
	// non-void function.
	return -1;
	}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment