Skip to content
Snippets Groups Projects
Commit e8774451 authored by benbergk's avatar benbergk
Browse files

added command line args

parent 0fe06957
No related branches found
No related tags found
No related merge requests found
......@@ -87,6 +87,8 @@ bool Spider::writeDocToDisk(string url)
return true;
}
bool Spider::shouldURLbeCrawled( string url )
{
//search for url in doc cache
......
......@@ -34,9 +34,6 @@ public:
// else return false and error information, retry if necessary
StreamReader *request( string url );
//Where to write to disk? What type of data are we reading in?
int writeFileToDisk( char *fileContents, string locationOnDisk );
bool writeDocToDisk(string url);
......
......@@ -14,6 +14,8 @@
//#include "crawler/CrawlerStatistics.h"
#include <unordered_map>
#include "util/util.h"
#include <getopt.h>
// File-path constants, each expanding to a string literal.
// The previous definitions began with '=' and used single quotes (a
// multi-character char constant), so they could not be used as path strings.
// NOTE(review): the first path is absolute and the second is relative, exactly
// as originally written -- confirm which form is intended.
#define PATH_TO_BLACKLIST "/bin/blacklist.txt"
#define PATH_TO_VISITED_URL "bin/urls.txt"
......@@ -25,7 +27,7 @@
using namespace std;
int main( int argc, const char *argv[] )
int main( int argc, char *argv[] )
{
/*
*
......@@ -44,12 +46,47 @@ int main( int argc, const char *argv[] )
*/
//
string mode = "local";
// Seed urls?
string seed;
//
int numberOfSpiders;
int numberOfParsers;
int numberOfSpiders = 1;
opterr = true;
int choice;
int option_index = 0;
// Long-option table for getopt_long: --mode maps to 'm' and --num_crawlers to 'c'.
// Both options take a required argument, matching the "m:c:" short-option string,
// so optarg is always set when either option is returned.
// getopt_long requires the final element of the array to be all zeros.
option long_options[] = {
{"mode", required_argument, nullptr, 'm'},
{"num_crawlers", required_argument, nullptr, 'c'},
{nullptr, 0, nullptr, 0}
};
// Parse the command line.
//   -m / --mode          must be "web" or "local"
//   -c / --num_crawlers  number of spiders to spawn, 1 through 100
// Any invalid or unknown option prints a message to cerr and exits with status 1.
while ((choice = getopt_long(argc, argv, "m:c:", long_options, &option_index)) != -1) {
switch (choice) {
case 'm':
// optarg is null when a long option declared optional_argument is given
// without an attached "=value"; constructing a string from null is undefined.
if (optarg == nullptr) {
cerr << "Missing value for mode" << endl;
exit(1);
}
mode = optarg;
if (mode != "web" && mode != "local") {
cerr << "Unknown input option";
exit(1);
}
break;
case 'c':
if (optarg == nullptr) {
cerr << "Missing value for num_crawlers" << endl;
exit(1);
}
numberOfSpiders = atoi(optarg);
// atoi returns 0 for non-numeric text, so this check also rejects garbage input.
if (numberOfSpiders < 1) {
cerr << "Invalid number of crawlers!" << endl;
exit(1);
}
if (numberOfSpiders > 100) {
cerr << "Too many crawlers!";
exit(1);
}
break;
default:
cerr << "Unknown input option";
exit(1);
}
}
string seed;
bool restoreFromLog;
......@@ -84,15 +121,11 @@ unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >(
Crawler crawler( mode, &urlFrontier );
crawler.SpawnSpiders(3 , docMapLookUp);
crawler.SpawnSpiders(numberOfSpiders , docMapLookUp);
crawler.
WaitOnAllSpiders();
// This part is a work in progress: I was just trying to simulate the
// parser and see whether it could open and read the file.
}
\ No newline at end of file
......@@ -16,6 +16,7 @@ namespace filepath
const char* DOC_MAP = "/docMap.txt";
}
pthread_mutex_t docMap_mutex = PTHREAD_MUTEX_INITIALIZER;
class Document
......@@ -32,6 +33,11 @@ class Document
public:
Document(string url_in) : url(ParsedUrl(url_in)) {}
// Returns the text form of this document: the url's CompleteUrl followed by
// a single newline character.
string DocToString()
{
string serialized(url.CompleteUrl);
serialized += "\n";
return serialized;
}
int WriteToDocMap()
{
......@@ -43,38 +49,34 @@ class Document
int file = util::getFileDescriptor(loc.c_str(), "W");
off_t resultPosition = 0;
//check if its available
if (file == -1) {
cerr << "Error opening docMap" << endl;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
} else
{
//get the current size of the docMap
size_t seekPosition = util::FileSize(file);
//seek to the end of the file
resultPosition = lseek(file, seekPosition, SEEK_SET);
if (resultPosition == -1) {
cerr << "Could not seek to " << seekPosition <<
", error = " << errno;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
try {
//check if its available
if (file == -1) {
throw("error opening docMap");
} else {
//get the current size of the docMap
size_t seekPosition = util::FileSize(file);
//seek to the end of the file
resultPosition = lseek(file, seekPosition, SEEK_SET);
if (resultPosition == -1) {
throw("Could not seek");
}
cout << "Current docMap position on disk" << endl;
cout << resultPosition << endl;
size_t success = write(file, "Hello World!\n", 14);
if (success == -1)
{
cerr << "Error writing document object to document map" << endl;
close( file );
pthread_mutex_unlock(&docMap_mutex);
return -1;
size_t success = write(file, this->DocToString().c_str(), strlen(this->DocToString().c_str()));
if (success == -1) {
throw("Error writing document object to document map");
}
}
}
catch(const char* str){
cerr << str << endl;
close(file);
pthread_mutex_unlock(&docMap_mutex);
return -1;
}
close( file );
pthread_mutex_unlock(&docMap_mutex);
return resultPosition;
......
http://www.dailymail.co.uk/ushome/index.html
http://www.bbc.com/
http://www.bbc.co.uk/news/business-42959138
\ No newline at end of file
http://www.bbc.co.uk/news/business-42959138
http://umich.edu
\ No newline at end of file
......@@ -21,8 +21,8 @@
#include <cstdlib>
using namespace std;
// Flag naming the direction in which a file is accessed.
enum file_flag
{
read_only = 0,
write_only = 1
};
using namespace std;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment