diff --git a/crawler.exe b/crawler.exe index 667f57e74c03682722b27b595c47d8f0756ebdfb..3897f9acc8dad78397e8b3f699470a56c3e2c8eb 100755 Binary files a/crawler.exe and b/crawler.exe differ diff --git a/crawler/crawler.h b/crawler/crawler.h index 3ec2cadacf683ed555c55ad33bb1052249f21d9d..4de20d13df3d8bd768e97a2b8d3daf16d2e0ad58 100644 --- a/crawler/crawler.h +++ b/crawler/crawler.h @@ -4,7 +4,7 @@ #include "spider.h" #include<string> #include "../ProducerConsumerQueue.h" -#include "CrawlerStatistics.h" +//#include "CrawlerStatistics.h" /* * */ @@ -28,7 +28,7 @@ private: vector<Spider*> spiders; ProducerConsumerQueue<string> *urlFrontier; ProducerConsumerQueue<int> *fileQueue; - CrawlerStatistics housekeeper; + //CrawlerStatistics housekeeper; string mode; }; diff --git a/crawler/spider.cpp b/crawler/spider.cpp index cd6ecaecaca63b03caacf9a7f40321da66cb75f7..4bf8c7f235e4404a2bbc61dbd62a0094c334866c 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -7,6 +7,10 @@ #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> +#include "../util/util.h" + + + string Spider::getUrl() { @@ -19,14 +23,23 @@ void Spider::FuncToRun() std::cout << "Spider is crawling" << endl; bool cond = true; + while( cond ) { + string currentUrl = getUrl(); - if ( request( currentUrl ) ) + char * fileMap; + if ( request( currentUrl , fileMap ) ) { // markURLSeen( currentUrl ); - // writeHTMLtoDisk( ); - // addHTMLToQueue( ); + string HARDCODEDLOCATION = "../crawlerOutput/" + currentUrl; + int fd = writeFileToDisk(fileMap , HARDCODEDLOCATION ); + //Write to disk successful + if( fd !=-1 ) + { + addFDToQueue( fd ); + + } cond = false; } else @@ -37,50 +50,28 @@ void Spider::FuncToRun() } -bool Spider::request( string url ) +bool Spider::request( string url , char* fileMap) { - char buf[100]; if ( this->mode == "local" ) - { - ifstream inFile; - string in; - inFile.open(url); - if ( !inFile ) - { - cout << "Unable to open file"; - exit(1); // terminate with error//// - } - int i = 0; - 
while (i < 100 && inFile >> buf[i]) { - i++; + fileMap = getFileMap( url ); + if (fileMap != nullptr ) + return true; } - - inFile.close(); - int file = writeFileToDisk(buf, 100); - fileQueue->Push(file); - return true; - } return false; } -int Spider::writeFileToDisk( char * fileContents, size_t fileSize) +int Spider::writeFileToDisk( char * fileContents , string locationOnDisk) { - int fd = creat("/Users/benbergkamp/Desktop/398/eecs398-search/test.txt", S_IRWXU); - ssize_t bytes_written = 0; - if(fd != -1) - { - bytes_written = write(fd, fileContents, fileSize); - } else - { - cout << "ERROR CREATING FILE\n"; - } - if(bytes_written != 100) - { - cout << "ERROR: Only " << bytes_written << " bytes written\n"; - } - return fd; + return writeToNewFileToLocation( fileContents, locationOnDisk ); + +} + + +void Spider::addFDToQueue( int fileDescriptor ) +{ + fileQueue->Push( fileDescriptor ); } \ No newline at end of file diff --git a/crawler/spider.h b/crawler/spider.h index 4490a2dc0c84487c9649e5aacb64ba4a8b362d43..bbfd42bed2f5a483f36535cf6f5b4abb956e6ddd 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -26,13 +26,13 @@ public: //Makes request to given url // if successful, writes file to disk, stores location to memeber value // else return false and error information, retry if necessary - bool request( string url ); + bool request( string url , char* fileMap); //Where to write to disk? What type of data are we reading in? 
- int writeFileToDisk(char * fileContents, size_t fileSize ); + int writeFileToDisk( char * fileContents , string locationOnDisk); //Adds location - void addHTMLToQueue(); + void addFDToQueue( int fileDescriptor ); void markURLSeen( string URL ); diff --git a/crawlerOutput/example.txt b/crawlerOutput/example.txt new file mode 100644 index 0000000000000000000000000000000000000000..de2e0e3343c5daafdb76cd03ca31a5449162ee32 --- /dev/null +++ b/crawlerOutput/example.txt @@ -0,0 +1,64 @@ +<!DOCTYPE +html> +<html> +<head> +<!-- +HTML +Codes +by +Quackit.com +--> +<title> +Food +store +is +here</title> +<meta +name="viewport" +content="width=device-width, +initial-scale=1"> +<meta +name="keywords" +content="store +food +dinner +lunch"> +<meta +name="description" +content="The +food +store +sells +cat +food +for +dinner, +lunch, +and +breakfast."> +<style> +body +{background-color:#ffffff;background-repeat:no-repeat;background-position:top +left;background-attachment:fixed;} +h1{font-family:Arial, +sans-serif;color:#000000;background-color:#ffffff;} +p +{font-family:Georgia, +serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;} +</style> +</head> +<body> +<h1>COme +shop +Come +shop +at +our +Store</h1> +<p>Please +come +to +our +store!</p> +</body> +</html> diff --git a/main.cpp b/main.cpp index a7b2f0a0b666305e913992a722015c175ed00764..1ef7b1bc36f592701a06fc07328553e758c120a9 100644 --- a/main.cpp +++ b/main.cpp @@ -13,6 +13,7 @@ #include <string> #include "ProducerConsumerQueue.h" #include "crawler/spider.h" +//#include "crawler/CrawlerStatistics.h" #define PATH_TO_BLACKLIST = '/bin/blacklist.txt' diff --git a/util/util.h b/util/util.h new file mode 100644 index 0000000000000000000000000000000000000000..42910c921e65dd901a27cfa7694aead19ea9d78e --- /dev/null +++ b/util/util.h @@ -0,0 +1,89 @@ +// +// Created by Jake Close on 2/6/18. 
//

#pragma once

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <cerrno>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

// Every function below is defined in this header, so each one is marked
// `inline`; without it, including the header from more than one .cpp file
// produces multiple-definition errors at link time.


/*
 * Returns the size in bytes of the file behind the open descriptor f.
 * Returns 0 if fstat fails (for example when f is not a valid descriptor),
 * so callers never see an indeterminate size.
 */

inline size_t FileSize( int f )
	{
	struct stat fileInfo;
	if ( fstat( f, &fileInfo ) == -1 )
		return 0;
	return static_cast< size_t >( fileInfo.st_size );
	}


/*
 * Takes integer file descriptor, returns a read-only, private char* map of
 * the whole file.
 * Returns nullptr if the descriptor is -1 or the mapping fails (mmap rejects
 * a zero-length mapping, so an empty file also yields nullptr).
 * The descriptor is left open; it still belongs to the caller.
 */

inline char * getFileMap( int fileDescriptor )
	{
	if ( fileDescriptor == -1 )
		return nullptr;

	const size_t fileSize = FileSize( fileDescriptor );
	void *map = mmap( nullptr, fileSize, PROT_READ, MAP_PRIVATE, fileDescriptor, 0 );
	if ( map == MAP_FAILED )
		return nullptr;

	return static_cast< char * >( map );
	}


/*
 * Takes filename as input, maps file into character pointer.
 * If the file cannot be opened or mapped, returns a nullptr.
 */

inline char * getFileMap( string fileName )
	{
	const int f = open( fileName.c_str( ), O_RDONLY );
	if ( f == -1 )
		return nullptr;

	char *map = getFileMap( f );
	// The mapping does not depend on the descriptor once mmap has returned,
	// so close it here instead of leaking one descriptor per mapped file.
	close( f );
	return map;
	}


/*
 * Takes a pointer to the start of a NUL-terminated buffer and a location on
 * disk, creates (or truncates) the file at that location, writes the buffer
 * to it, and returns the still-open, write-only descriptor of the new file.
 *
 * Returns -1 if fileContents is null, the file cannot be created, or the
 * contents cannot be written completely; in the last case the descriptor is
 * closed before returning, so -1 always means "nothing usable was produced".
 *
 * NOTE(review): the length comes from strlen, so the buffer must be
 * NUL-terminated. A buffer returned by getFileMap is not guaranteed to be;
 * confirm how callers obtain fileContents.
 */

inline int writeToNewFileToLocation( char * fileContents , string locationOnDisk )
	{
	// Check before creat() so a bad pointer does not truncate an existing file.
	if ( fileContents == nullptr )
		{
		cout << "ERROR: no file contents to write\n";
		return -1;
		}

	const size_t fileSize = strlen( fileContents );

	const int fd = creat( locationOnDisk.c_str( ), S_IRWXU );
	if ( fd == -1 )
		{
		cout << "ERROR CREATING FILE\n";
		return -1;
		}

	// write() may transfer fewer bytes than requested or be interrupted by a
	// signal, so keep going until everything is out or a real error occurs.
	size_t bytesWritten = 0;
	while ( bytesWritten < fileSize )
		{
		const ssize_t result = write( fd, fileContents + bytesWritten, fileSize - bytesWritten );
		if ( result > 0 )
			{
			bytesWritten += static_cast< size_t >( result );
			continue;
			}
		if ( result == -1 && errno == EINTR )
			continue;
		break;
		}

	if ( bytesWritten != fileSize )
		{
		cout << "ERROR: Only " << bytesWritten << " bytes written\n";
		close( fd );
		return -1;
		}

	return fd;
	}