Skip to content
Snippets Groups Projects
Commit 0fe06957 authored by benbergk's avatar benbergk
Browse files

formatting changes and added lock around file

parent 2f874e38
Branches
No related tags found
No related merge requests found
......@@ -16,11 +16,7 @@
#include "LocalReader.h"
#include "SocketReader.h"
namespace filepath
{
// Location of the document map file, relative to the current working
// directory (users prepend util::GetCurrentWorkingDir()).
// The pointer itself is const so the path cannot be reassigned at runtime
// and the definition has internal linkage.
const char* const DOC_MAP = "/docMap.txt";
}
#include "../shared/documentMap.h"
string Spider::getUrl()
......@@ -41,11 +37,10 @@ void Spider::FuncToRun()
string currentUrl = getUrl( );
char *fileMap;
bool toCrawl = shouldURLbeCrawled( currentUrl );
if ( toCrawl )
//url has not been seen
if ( shouldURLbeCrawled( currentUrl ))
{
if ( cond )
bool success = writeDocToDisk(currentUrl);
if ( success && cond )
{
......@@ -75,95 +70,39 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
*/
/*
 * Records `url` in the document map.
 *
 * A Document is built for the url and written with Document::WriteToDocMap.
 * When that call reports -1 nothing else happens and false is returned.
 * Otherwise the returned file position is stored in docMapLookup under the
 * url, every entry of docMapLookup is printed to stdout, and true is returned.
 */
bool Spider::writeDocToDisk(string url)
{
    Document doc( url );
    const int positionOnDisk = doc.WriteToDocMap( );

    if ( positionOnDisk != -1 )
    {
        this->docMapLookup->insert( std::make_pair( url, positionOnDisk ));

        // Dump the complete lookup table after each successful insert.
        for ( const auto &entry : *this->docMapLookup )
            std::cout << entry.first << " => " << entry.second << '\n';

        return true;
    }

    return false;
}
/*
 * Decides whether `url` still needs to be crawled.
 *
 * A url counts as already seen when it has an entry in docMapLookup.
 *   - seen:   the stored docMap record is echoed through
 *             Document::PrintDocMap (testing aid) and false is returned.
 *   - unseen: true is returned. Nothing is recorded here; writing the record
 *             and inserting into docMapLookup is the job of writeDocToDisk,
 *             which goes through Document::WriteToDocMap.
 *
 * This function performs no file I/O of its own, so it cannot leak file
 * descriptors or buffers and never returns an errno value as a bool.
 */
bool Spider::shouldURLbeCrawled( string url )
{
    //search for url in doc cache
    auto locationOnDisk = this->docMapLookup->find( url );

    //no entry for that url key: it has not been crawled yet
    if ( locationOnDisk == this->docMapLookup->end( ))
        return true;

    //Just for testing: maps url -> location on disk (where to seek to)
    Document::PrintDocMap( url, locationOnDisk->second );
    return false;
}
......
......@@ -37,8 +37,7 @@ public:
//Where to write to disk? What type of data are we reading in?
int writeFileToDisk( char *fileContents, string locationOnDisk );
void markURLSeen( string URL );
// Writes a docMap record for url via Document::WriteToDocMap and stores the
// returned position in docMapLookup. Returns false when that write reports -1.
bool writeDocToDisk(string url);
// Looks URL up in docMapLookup to decide whether it should be crawled.
bool shouldURLbeCrawled( string URL );
......
......@@ -7,18 +7,113 @@
#include "url.h"
#include <string>
#include <vector>
#include <pthread.h>
using namespace std;
namespace filepath
{
// Location of the document map file, relative to the current working
// directory (users prepend util::GetCurrentWorkingDir()).
// The pointer itself is const: the path cannot be reassigned at runtime, and
// the definition has internal linkage, so including this file from several
// translation units does not produce duplicate-symbol link errors.
const char* const DOC_MAP = "/docMap.txt";
}
// Guards the docMap file: Document::WriteToDocMap and Document::PrintDocMap
// lock it for the duration of their file access.
// NOTE(review): this is a non-inline definition. If this file is a header
// included from more than one .cpp, it will be defined once per translation
// unit (duplicate symbol at link time) - confirm, and if so keep only an
// `extern` declaration here with the definition in a single .cpp.
pthread_mutex_t docMap_mutex = PTHREAD_MUTEX_INITIALIZER;
class Document
{
public:
Url url;
private:
ParsedUrl url;
long docID;
bool lastCrawlStatus;
int lastCrawlDate;
int lastCrawlPageCount;
//add more info fields here
public:
Document(string url_in) : url(ParsedUrl(url_in)) {}
int WriteToDocMap()
{
pthread_mutex_lock(&docMap_mutex);
//for now just write url
string loc = util::GetCurrentWorkingDir() + filepath::DOC_MAP;
int file = util::getFileDescriptor(loc.c_str(), "W");
off_t resultPosition = 0;
//check if its available
if (file == -1) {
cerr << "Error opening docMap" << endl;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
} else
{
//get the current size of the docMap
size_t seekPosition = util::FileSize(file);
//seek to the end of the file
resultPosition = lseek(file, seekPosition, SEEK_SET);
if (resultPosition == -1) {
cerr << "Could not seek to " << seekPosition <<
", error = " << errno;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
size_t success = write(file, "Hello World!\n", 14);
if (success == -1)
{
cerr << "Error writing document object to document map" << endl;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
}
}
close( file );
pthread_mutex_unlock(&docMap_mutex);
return resultPosition;
}
static void PrintDocMap(string url, int location)
{
pthread_mutex_lock(&docMap_mutex);
std::cout << url << " is " << location;
string loc = util::GetCurrentWorkingDir() + filepath::DOC_MAP;
int file = util::getFileDescriptor( loc.c_str(), "R" );
Document() {};
//check if its available
if ( file )
{
off_t resultPosition = lseek( file, (size_t)location, SEEK_SET );
int bytes = 14;
if ( bytes > 0 )
{
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( bytesRead = read( file, buffer, bytes ))
write( 1, buffer, bytesRead );
else
{
cerr << "Could not read " << bytes << " bytes at position " <<
resultPosition << ", error = " << errno;
pthread_mutex_unlock(&docMap_mutex);
return;
}
}
}
pthread_mutex_unlock(&docMap_mutex);
return;
}
};
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment