diff --git a/CMakeLists.txt b/CMakeLists.txt index d0cadfc23205c4a58b0b6e3258b36c4ee00a8be8..917dc8d67b824e1cc8a97925f5b325bb954058d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,17 +3,22 @@ project(eecs398_search) set(CMAKE_CXX_STANDARD 11) +add_executable(QueueTest + shared/ProducerConsumerQueue.h + shared/ProducerConsumerQueue_test.cpp) + add_executable(crawler-parser-test main.cpp shared/ProducerConsumerQueue.h shared/ThreadClass.h shared/url.h crawler/crawler.cpp - crawler/SocketReader.cpp - crawler/StreamReader.h + crawler/Readers/StreamReader.h + crawler/Readers/HttpReader.cpp + crawler/Readers/HttpsReader.cpp + crawler/Readers/LocalReader.cpp crawler/spider.cpp util/util.cpp - crawler/LocalReader.h shared/Document.cpp parser/Parser.cpp util/Stemmer.cpp @@ -59,3 +64,5 @@ add_executable(ISRWord-tests util/util.cpp constraintSolver/ISRWord.cpp constrai find_package(OpenSSL REQUIRED) target_link_libraries(crawler-parser-test OpenSSL::SSL pthread) + +target_link_libraries(QueueTest pthread) diff --git a/crawler-parser-test b/crawler-parser-test index f6e68d1aa6b62b614da5b0ab0ddae084aed2c717..279e5aa1a06283e62f69cb8a27448fedf898c850 100755 Binary files a/crawler-parser-test and b/crawler-parser-test differ diff --git a/crawler/CrawlerStatistics.cpp b/crawler/HouseKeeper.cpp similarity index 56% rename from crawler/CrawlerStatistics.cpp rename to crawler/HouseKeeper.cpp index 647dcc13973d08745148ba221e22638fe25a75ab..1e3749fbef4b0d6f420ef80cb8f86558077d5bc8 100644 --- a/crawler/CrawlerStatistics.cpp +++ b/crawler/HouseKeeper.cpp @@ -2,9 +2,9 @@ // Created by Ben Bergkamp on 2/1/18. // -#include "CrawlerStatistics.h" +#include "HouseKeeper.h" -void CrawlerStatistics::FuncToRun() +void HouseKeeper::FuncToRun() { //Sleep(3 minutes) //Gather data diff --git a/crawler/CrawlerStatistics.h b/crawler/HouseKeeper.h similarity index 56% rename from crawler/CrawlerStatistics.h rename to crawler/HouseKeeper.h index 6add7bc7d15d47d2e1ee4e8bbab994c81c21e4d3..53ab48d32c81228bde93b87272c0a0e51182cb8e 100644 --- a/crawler/CrawlerStatistics.h +++ b/crawler/HouseKeeper.h @@ -2,16 +2,17 @@ // Created by Ben Bergkamp on 2/1/18. // -#ifndef EECS398_SEARCH_CRAWLERSTATISTICS_H -#define EECS398_SEARCH_CRAWLERSTATISTICS_H +#pragma once #include "../shared/ThreadClass.h" - -class CrawlerStatistics : public ThreadClass +#include<string> +#include <pthread.h> +#include <iostream> +class HouseKeeper : public ThreadClass { public: - CrawlerStatistics() + HouseKeeper() { }; virtual void FuncToRun(); diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8fd6e070e866fc7f9387f66f7487b2a2f80e39bd --- /dev/null +++ b/crawler/Readers/HttpReader.cpp @@ -0,0 +1,93 @@ +// +// Created by Ben Bergkamp on 3/13/18. +// + +#include "HttpReader.h" + + +void HttpReader::request() + { + sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); + assert( sock != -1 ); + + // Get the host address. + + struct hostent *host = gethostbyname( url.Host ); + assert( host ); + + struct sockaddr_in address; + memset( &address, 0, sizeof( address ) ); + address.sin_family = AF_INET; + address.sin_port = htons( 80 ); + memcpy( &address.sin_addr, host->h_addr, host->h_length ); + + // Connect to the host. + + int connectResult = connect( sock, ( struct sockaddr * )&address, + sizeof( address ) ); + assert( connectResult == 0 ); + + // Send a GET message for the desired page. 
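// Review note (illustrative sketch, not part of this patch): the code above
// resolves the host with gethostbyname(), which returns a pointer to static
// data and is not thread-safe; with several Spider threads sharing this code
// path, a reentrant lookup via getaddrinfo() (declared in <netdb.h>, already
// pulled in through StreamReader.h) is safer. A minimal sketch reusing this
// function's `url` and `sock`, with the port still fixed at 80 as above:

   struct addrinfo hints, *res;
   memset( &hints, 0, sizeof( hints ) );
   hints.ai_family = AF_INET;        // IPv4, as in the code above
   hints.ai_socktype = SOCK_STREAM;  // TCP
   if ( getaddrinfo( url.Host, "80", &hints, &res ) == 0 )
      {
      connect( sock, res->ai_addr, res->ai_addrlen );
      freeaddrinfo( res );
      }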
+ + cout << "Socket Reader is pulling from the web" << endl; + + string getMessage = "GET "; + getMessage += url.CompleteUrl; + getMessage += " HTTP/1.1\r\nHost: "; + getMessage += url.Host; + getMessage += "\r\nConnection: close\r\n\r\n"; + + cout << getMessage << endl; + send( sock, getMessage.c_str( ), getMessage.length( ), 0 ); + + } + +bool HttpReader::fillBuffer(char * buf, size_t buf_size) + { + return (recv( sock, buf, buf_size, 0 ) == buf_size); + } + +string HttpReader::PageToString() + { + + string temp = ""; + char buf[10240]; + int bytes = 0; + + while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 ) + { + temp += string(buf, bytes); + } + return temp; + } + +ParsedUrl HttpReader::getUrl() + { + return url; + } + + +bool HttpReader::checkStatus() + { + string code = ""; + char buff[12]; + int bytes = 0; + + bytes = recv( sock, buff, 12, 0 ) ; + + if( strncmp(buff, "HTTP/1.1 200",11 ) == 0) + return true; + else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) + { + cerr << "URL REDIRECTION" << endl; + return false; + } + cerr << "Bad Request of TYPE:: " << buff << endl; + return false; + } + + +void HttpReader::closeReader() + { + close( sock ); + } \ No newline at end of file diff --git a/crawler/Readers/HttpReader.h b/crawler/Readers/HttpReader.h new file mode 100644 index 0000000000000000000000000000000000000000..dfc1d4a84cf3bb0a45c9b1cb5a854b1ceff6f169 --- /dev/null +++ b/crawler/Readers/HttpReader.h @@ -0,0 +1,26 @@ +// +// Created by Ben Bergkamp on 3/13/18. +// +#pragma once + +#include "StreamReader.h" + +class HttpReader : public StreamReader + { +public: + + HttpReader( ParsedUrl url_in ) : url( url_in ) { } + void request(); + bool fillBuffer(char * buf, size_t buf_size); + bool checkStatus(); + string PageToString(); + ParsedUrl getUrl(); + void closeReader(); + + +private: + + ParsedUrl url; + int sock; + + }; diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7fecd06063120b7170e137e9448df02f58020aa --- /dev/null +++ b/crawler/Readers/HttpsReader.cpp @@ -0,0 +1,110 @@ +// +// Created by Ben Bergkamp on 3/13/18. +// + +#include "HttpsReader.h" + +void HttpsReader::request() + { + struct hostent *host = gethostbyname( url.Host ); + assert( host ); + + struct sockaddr_in address; + memset( &address, 0, sizeof( address ) ); + address.sin_family = AF_INET; + address.sin_port = htons( 443 ); + memcpy( &address.sin_addr, host->h_addr, host->h_length ); + + // Create a TCP/IP socket. + + sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); + assert( sock != -1 ); + + // Connect the socket to the host address. + + int connectResult = connect( sock, ( struct sockaddr * )&address, + sizeof( address ) ); + assert( connectResult == 0 ); + + // Build an SSL layer and set it to read/write + // to the socket we've connected. + + ctx = SSL_CTX_new( SSLv23_method( ) ); + assert( ctx ); + ssl = SSL_new( ctx ); + assert( ssl ); + + SSL_set_fd( ssl, sock ); + + // Establish an SSL connection. + + int sslConnectResult = SSL_connect( ssl ); + assert( sslConnectResult == 1 ); + + // Send a GET message for the desired page through the SSL. 
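// Review note (illustrative sketch, not part of this patch): SSL_CTX_new()
// above assumes SSL_library_init() has already run. In this patch main.cpp
// only calls it when mode == "web", which happens to be the only mode in
// which SR_factory builds an HttpsReader, so the coupling holds -- but it is
// implicit. Since CMakeLists.txt sets CMAKE_CXX_STANDARD 11, a function-local
// static would make the reader safe from any call site (hypothetical guard,
// placed at the top of request()):

   // C++11 initializes a local static exactly once, even with many spider
   // threads racing here; the comma expression runs SSL_library_init( )
   // and stores 1.
   static const int sslReady = ( SSL_library_init( ), 1 );
   ( void ) sslReady;   // silence unused-variable warnings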
+ + string getMessage = "GET "; + getMessage += url.CompleteUrl; + getMessage += " HTTP/1.1\r\nHost: "; + getMessage += url.Host; + getMessage += "\r\nConnection: close\r\n\r\n"; + + cout << getMessage << endl; + SSL_write( ssl, getMessage.c_str( ), getMessage.length( ) ); + + } + +bool HttpsReader::fillBuffer(char * buf, size_t buf_size) + { + return (SSL_read( ssl, buf, buf_size ) == buf_size); + } + +string HttpsReader::PageToString() + { + + string temp = ""; + char buf[10240]; + int bytes = 0; + + while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 ) + { + temp += string(buf, bytes); + } + return temp; + } + +bool HttpsReader::checkStatus() + { + string code = ""; + char buff[12]; + int bytes = 0; + + bytes = SSL_read( ssl, buff, 12 ); + + + if( strncmp(buff, "HTTP/1.1 200",11 ) == 0) + return true; + else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) + { + cerr << "URL REDIRECTION" << endl; + return false; + } + cerr << "Bad Request of TYPE:: " << buff << endl; + return false; + + } + + +ParsedUrl HttpsReader::getUrl() + { + return url; + } + +void HttpsReader::closeReader() + { + SSL_shutdown(ssl); + SSL_free(ssl); + SSL_CTX_free(ctx); + close(sock); + } + diff --git a/crawler/Readers/HttpsReader.h b/crawler/Readers/HttpsReader.h new file mode 100644 index 0000000000000000000000000000000000000000..1f8860ec126560b03cf78f38bf1e3bd9086904e8 --- /dev/null +++ b/crawler/Readers/HttpsReader.h @@ -0,0 +1,27 @@ +// +// Created by Ben Bergkamp on 3/13/18. +// + +#pragma once + +#include "StreamReader.h" + +class HttpsReader : public StreamReader + { +public: + + HttpsReader( ParsedUrl url_in ) : url( url_in ) { } + + void request(); + bool fillBuffer(char * buf, size_t buf_size); + string PageToString(); + ParsedUrl getUrl(); + void closeReader(); + bool checkStatus(); + +private: + ParsedUrl url; + int sock; + SSL * ssl; + SSL_CTX * ctx; + }; diff --git a/crawler/Readers/LocalReader.cpp b/crawler/Readers/LocalReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..764c9fc370db13a71befc95dfbfd9931b1009169 --- /dev/null +++ b/crawler/Readers/LocalReader.cpp @@ -0,0 +1,45 @@ +// +// Created by Ben Bergkamp on 3/13/18. +// + +#include "LocalReader.h" + + +void LocalReader::request() + { + //FIXME + //open the file? + } + +bool LocalReader::fillBuffer(char * buf, size_t buf_size){ + + //FIXME + strcpy(buf, util::getFileMap( fileName )) ; + return true; + + } + +string LocalReader::PageToString() + { + //FIXME + string s("fix me"); + return s; + } + +ParsedUrl LocalReader::getUrl() + { + //FIXME + ParsedUrl url(""); + return url; + } + + +bool LocalReader::checkStatus() + { + return true; + } +void LocalReader::closeReader() + { + //FIXME + //close the file? + } \ No newline at end of file diff --git a/crawler/LocalReader.h b/crawler/Readers/LocalReader.h similarity index 52% rename from crawler/LocalReader.h rename to crawler/Readers/LocalReader.h index a58ec7d1243f6308461cfebe146260a52501faf6..08d9661bb434e0fbd7ccce54b41e1a5f954b751a 100644 --- a/crawler/LocalReader.h +++ b/crawler/Readers/LocalReader.h @@ -2,20 +2,25 @@ // Created by Ben Bergkamp on 2/14/18. 
// -#ifndef EECS398_SEARCH_LOCALREADER_H -#define EECS398_SEARCH_LOCALREADER_H +#pragma once #include "StreamReader.h" class LocalReader : public StreamReader { - string fileName; - void fillBuffer(){ - strcpy(buffer, util::getFileMap( fileName )) ; - } public: + LocalReader( string url_in ) : fileName( url_in ) { } + void request(); + bool fillBuffer(char * buf, size_t buf_size); + bool checkStatus(); + string PageToString(); + ParsedUrl getUrl(); + void closeReader(); + +private: + string fileName; + }; -#endif //EECS398_SEARCH_LOCALREADER_H diff --git a/crawler/Readers/StreamReader.h b/crawler/Readers/StreamReader.h new file mode 100644 index 0000000000000000000000000000000000000000..f7057cba85d957e06ec921e72be558af70e718cd --- /dev/null +++ b/crawler/Readers/StreamReader.h @@ -0,0 +1,37 @@ +// +// Created by Jake Close on 2/13/18. +// + +#pragma once + +#include "../../shared/url.h" +#include "../../util/util.h" +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <iostream> +#include <string.h> +#include <string> +#include <cassert> +#include <openssl/ssl.h> + +using namespace std; + + +class StreamReader + { +public: + StreamReader() {}; + virtual void request() = 0; + virtual bool fillBuffer(char * buf, size_t buf_size) = 0; + virtual bool checkStatus() = 0; + virtual string PageToString() = 0; + virtual ParsedUrl getUrl() =0; + virtual void closeReader() = 0; + }; + + +//fix interface with Parsed URL to parser +//implement getUrl function so parser can just call that +//remove stale code from spider.cpp diff --git a/crawler/SocketReader.cpp b/crawler/SocketReader.cpp deleted file mode 100644 index 172f64b5d3a4b5fc33bccbb4b2dab5d654bbc5ea..0000000000000000000000000000000000000000 --- a/crawler/SocketReader.cpp +++ /dev/null @@ -1,186 +0,0 @@ -// -// Created by Ben Bergkamp on 2/14/18. -// - -#include "SocketReader.h" - -char * GetArbitrarySizeBuffer(SSL* ssl) - { - - int buf_size = 10240; - int current_size = buf_size; - char* ssl_buffer = new char[buf_size]; - char* front = ssl_buffer; - int bytes; - - while ( ( bytes = SSL_read( ssl, front, buf_size ) ) > 0 ) - { - - current_size += buf_size; - char *temp = new char[current_size]; - strcpy(temp, ssl_buffer); - - front = temp + strlen(ssl_buffer); - delete[] ssl_buffer; - ssl_buffer = temp; - } - - return ssl_buffer; - } - - -char * GetArbitrarySizeBuffer(int s ) - { - - int buf_size = 10240; - int current_size = buf_size; - char* http_buff = new char[buf_size]; - char* front = http_buff; - int bytes; - - while ( ( bytes = recv( s, front, buf_size, 0 ) ) > 0 ) - { - - current_size += buf_size; - char *temp = new char[current_size]; - strcpy(temp, http_buff); - - front = temp + strlen(http_buff); - delete[] http_buff; - http_buff = temp; - - } - - return http_buff; - } - - - - -void SocketReader::httpRequest() - { - int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); - assert( s != -1 ); - - // Get the host address. - - struct hostent *host = gethostbyname( url.Host ); - assert( host ); - - struct sockaddr_in address; - memset( &address, 0, sizeof( address ) ); - address.sin_family = AF_INET; - address.sin_port = htons( 80 ); - memcpy( &address.sin_addr, host->h_addr, host->h_length ); - - // Connect to the host. - - int connectResult = connect( s, ( struct sockaddr * )&address, - sizeof( address ) ); - assert( connectResult == 0 ); - - // Send a GET message for the desired page. 
- - cout << "Socket Reader is pulling from the web" << endl; - - string getMessage = "GET "; - getMessage += url.CompleteUrl; - getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.Host; - getMessage += "\r\nConnection: close\r\n\r\n"; - - cout << getMessage << endl; - send( s, getMessage.c_str( ), getMessage.length( ), 0 ); - - // Read from the socket until there's no more data. - - char HTTPbuffer[ 10240 ]; - int bytes; - - - while ( ( bytes = recv( s, buffer, sizeof( buffer ), 0 ) ) > 0 ) - write( 1, buffer, bytes ); - - buffer = GetArbitrarySizeBuffer(s); - - close( s ); - return; - } - - -void SocketReader::httpsRequest(){ - - struct hostent *host = gethostbyname( url.Host ); - assert( host ); - - struct sockaddr_in address; - memset( &address, 0, sizeof( address ) ); - address.sin_family = AF_INET; - address.sin_port = htons( 443 ); - memcpy( &address.sin_addr, host->h_addr, host->h_length ); - - // Create a TCP/IP socket. - - int s = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); - assert( s != -1 ); - - // Connect the socket to the host address. - - int connectResult = connect( s, ( struct sockaddr * )&address, - sizeof( address ) ); - assert( connectResult == 0 ); - - // Build an SSL layer and set it to read/write - // to the socket we've connected. - - - SSL_library_init( ); - - SSL_CTX *ctx = SSL_CTX_new( SSLv23_method( ) ); - assert( ctx ); - SSL *ssl = SSL_new( ctx ); - assert( ssl ); - - SSL_set_fd( ssl, s ); - - // Establish an SSL connection. - - int sslConnectResult = SSL_connect( ssl ); - assert( sslConnectResult == 1 ); - - // Send a GET message for the desired page through the SSL. - - string getMessage = "GET "; - getMessage += url.CompleteUrl; - getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.Host; - getMessage += "\r\nConnection: close\r\n\r\n"; - - cout << getMessage << endl; - SSL_write( ssl, getMessage.c_str( ), getMessage.length( ) ); - - // Read from the SSL until there's no more data. - - buffer = GetArbitrarySizeBuffer(ssl); - - SSL_shutdown( ssl ); - SSL_free( ssl ); - SSL_CTX_free( ctx ); - close( s ); - - - } - - - - - -void SocketReader::fillBuffer( ) - { - if ( !strcmp(url.Service , "http" ) ) - httpRequest(); - else - httpsRequest(); - - - } \ No newline at end of file diff --git a/crawler/SocketReader.h b/crawler/SocketReader.h deleted file mode 100644 index 0eea515995711edd955ce89605af698f88ac70b0..0000000000000000000000000000000000000000 --- a/crawler/SocketReader.h +++ /dev/null @@ -1,25 +0,0 @@ -// -// Created by Ben Bergkamp on 2/14/18. -// - -#pragma once - -#include "StreamReader.h" - - -class SocketReader : public StreamReader - { -public: - SocketReader( ParsedUrl url_in ) : url( url_in ) { } - virtual void fillBuffer(); - void httpRequest(); - void httpsRequest(); - - - //virtual void fillBuffer(char ssl); - -private: - ParsedUrl url; - }; - - diff --git a/crawler/StreamReader.h b/crawler/StreamReader.h deleted file mode 100644 index 6945069b96ed5be3e2e005b2460a167202e0c17f..0000000000000000000000000000000000000000 --- a/crawler/StreamReader.h +++ /dev/null @@ -1,30 +0,0 @@ -// -// Created by Jake Close on 2/13/18. 
-// - -#pragma once - -#include "../shared/url.h" -#include "../util/util.h" -#include <unistd.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <netdb.h> -#include <iostream> -#include <string.h> -#include <string> -#include <cassert> -#include <openssl/ssl.h> - - - -class StreamReader - { -public: - StreamReader() {}; - virtual void fillBuffer() = 0; - //virtual void close() = 0; - //virtual void request(); - char *buffer; - - }; \ No newline at end of file diff --git a/crawler/crawler.cpp b/crawler/crawler.cpp index 55105ae576965acc81db09082b265cb25247c0d7..94efa3605d82ef34b4a61d35ed5f77c2fc4885df 100644 --- a/crawler/crawler.cpp +++ b/crawler/crawler.cpp @@ -4,11 +4,11 @@ #include "crawler.h" -void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup ) +void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup , unordered_map < size_t, int > *duplicateUrlMap ) { for ( size_t i = 0; i < num_spiders; i++ ) { - Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup ); + Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap ); temp->StartThread( ); this->spiders.push_back( temp ); } @@ -17,8 +17,21 @@ void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *d void Crawler::WaitOnAllSpiders() { + cout << "Waiting for spiders to finish...\n"; for ( Spider *spider : spiders ) { - spider->WaitForFinish( ); + spider->WaitForFinish( ); + delete spider; //FIXME do this in destructor? + } + } + + +void Crawler::KillAllSpiders() + { + cout << "Waiting for spiders to finish...\n"; + for ( Spider *spider : spiders ) + { + spider->Die( ); + delete spider; //FIXME do this in destructor? } } diff --git a/crawler/crawler.h b/crawler/crawler.h index 9b6c93880c28fa6d8c3575561bd9e2a479404c0f..f88cddf95e97bed337465d16216532204376fb0c 100644 --- a/crawler/crawler.h +++ b/crawler/crawler.h @@ -21,12 +21,13 @@ public: { }; //spawns a number of works - void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup ); + void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup , unordered_map < size_t, int > *duplicateUrlMap); //Creates a housekeeping thread void houseKeeper(); - void WaitOnAllSpiders(); + void KillAllSpiders( ); + void WaitOnAllSpiders( ); private: vector < Spider * > spiders; diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 84043df06fb7e0715bf73493a030889691759ac1..769991288655cb90fab632b4a06323df2a621274 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -1,25 +1,62 @@ -// -// Created by Ben Bergkamp on 1/31/18. 
-// - - - - #include "spider.h" #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> - -#include <unistd.h> - -#include "LocalReader.h" -#include "SocketReader.h" #include "../shared/Document.h" #include "../parser/Parser.h" +#include "Readers/HttpsReader.h" +#include "Readers/HttpReader.h" +#include "Readers/LocalReader.h" +#include "../parser/Parser.h" + +using DocIndex = const unordered_map< string, vector< unsigned long > >; -size_t Spider::hash ( const char *s ) +// FIND A BETTER PLACE TO PUT THIS FUNCTION + +StreamReader* SR_factory(ParsedUrl url, string mode) { + string localFile; + + StreamReader *newReader = nullptr + ; + if ( mode == "local" ) + { + newReader = new LocalReader( url.CompleteUrl ); + } + else if ( mode == "web" ) + { + if(!strcmp(url.Service, "http")) { + newReader = new HttpReader(url); + } + else if(!strcmp(url.Service,"https")){ + newReader = new HttpsReader(url); + } + else{ + cerr << "Error reading service type\n"; + } + } + + return newReader; + } + +void printDocIndex( DocIndex* dict ) + { + for ( auto it = dict->begin( ); it != dict->end( ); it++ ) + { + cout << it->first << " : "; + for ( int i = 0; i < it->second.size( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; + } + cout << std::endl; + + } + + +size_t Spider::hash(const char * s) { // http://www.cse.yorku.ca/~oz/hash.html size_t h = 5381; @@ -28,7 +65,6 @@ size_t Spider::hash ( const char *s ) h = ( ( h << 5 ) + h ) + c; return h; } - } ParsedUrl Spider::getUrl ( ) @@ -40,68 +76,31 @@ void Spider::FuncToRun ( ) { std::cout << "Spider is crawling" << endl; - bool cond = true; - + int cond = 0; - while ( cond ) + while ( cond < 25 ) { - - - // ParsedUrl stringUrl = getUrl( ); //get url from url frontier - char *fileMap; - ParsedUrl currentUrl = getUrl( ); - //url has not seen before or time since seen is past certain criteria - if ( shouldURLbeCrawled( currentUrl ) ) + ParsedUrl currentUrl = getUrl(); + size_t docID = hash(currentUrl.CompleteUrl); + if ( shouldURLbeCrawled( docID )) { - //bool success = writeDocToDisk(currentUrl); - //if ( success && cond ) - if ( cond ) - { - - - StreamReader *reader = request( currentUrl ); - size_t docID = hash( currentUrl.CompleteUrl ); - string localPath = util::GetCurrentWorkingDir( ); - // don't include debug in file path - unsigned long debug = findPrev( "cmake-build-debug", localPath.size( ) - 1, localPath ); - if ( debug < localPath.size( ) ) - { - localPath = subStr( localPath, 0, debug); - } - - string pathToDisk = localPath + "/crawlerOutput/" + to_string( docID ) + ".txt"; - int fd = util::writeToNewFileToLocation( reader->buffer, pathToDisk ); - - Document document( currentUrl, reader->buffer ); - auto dict = parser.execute( &document ); - - cout << "docID: " << docID << endl; - for ( auto it = dict->begin( ); it != dict->end( ); it++ ) - { - cout << it->first << " : "; - for ( int i = 0; i < it->second.size( ); ++i ) - { - cout << it->second[ i ] << " "; - } - cout << std::endl; - } - cout << std::endl; - delete dict; - dict = nullptr; - cond = true; - } - else - { - cerr << "Error connecting"; - } + StreamReader *reader = SR_factory( currentUrl, this->mode ); + DocIndex * dict = parser.execute (reader); + printDocIndex(dict); + reader->closeReader(); - } + delete reader; + delete dict; + cond++; + } } } + + /* Takes a URL. Hashes it. Checks if the url is in the docMapLookup. 
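(Note: in the implementation committed below, this disk-cache plan is
superseded -- shouldURLbeCrawled( ) now just hashes the complete URL and
checks the shared duplicateUrlMap, so each URL is crawled at most once per
run; the behavior described here survives only in the commented-out block.)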
If it is, check file on disk to see if its been crawled successfully if it has been indexed, (check last time index, maybe reindex?) return false (ignore this url) @@ -141,10 +140,21 @@ bool Spider::writeDocToDisk ( ParsedUrl url ) * and returns true */ - -bool Spider::shouldURLbeCrawled ( ParsedUrl url ) +bool Spider::shouldURLbeCrawled( size_t docID ) { + + if(this->duplicateUrlMap->find(docID) != this->duplicateUrlMap->end()){ + return false; + } + else + { + this->duplicateUrlMap->insert(std::make_pair(docID, 1)); + return true; + } + /* //search for url in doc cache + + auto locationOnDisk = this->docMapLookup->find( url.CompleteUrl ); //bool protectedByRobots = checkRobots( url ); @@ -159,9 +169,11 @@ bool Spider::shouldURLbeCrawled ( ParsedUrl url ) Document::PrintDocMap( url.CompleteUrl, locationOnDisk->second ); } return false; + */ + return true; } - +/* //check if path in url is in the robots txt bool Spider::checkRobots ( ParsedUrl url ) { @@ -202,33 +214,8 @@ int Spider::getRobots ( ParsedUrl url ) cerr << "issue filling buffer from robots.txt" << endl; return -1; - - }; - -/* -returns true if fileMap was created, otherwise false - Modifies the filemap to be a char* of the file of the url passed -*/ - -// make this become a stream reader factory -StreamReader *Spider::request ( ParsedUrl url ) - { - string localFile; - - StreamReader *newReader; - if ( this->mode == "local" ) - { - newReader = new LocalReader( url.CompleteUrl ); - } - else if ( this->mode == "web" ) - { - newReader = new SocketReader( url ); - } - - //remove fill buffer/ change to get request - newReader->fillBuffer( ); - return newReader; } +*/ //request function that handles sending over get request via socket or trying to open file diff --git a/crawler/spider.h b/crawler/spider.h index cb58d793886d557e8dc145ff42aa5b56f32bf77f..39384dc6ccb3dca5eaea9e4273a0a56fdf4df801 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -6,9 +6,9 @@ #include <fstream> #include "../shared/ProducerConsumerQueue.h" #include "../shared/ThreadClass.h" -#include<iostream> +#include <iostream> #include <unordered_map> -#include "StreamReader.h" +#include "Readers/StreamReader.h" #include "../util/util.h" #include "../parser/Parser.h" @@ -24,8 +24,8 @@ class Spider : public ThreadClass public: Spider( string mode_in, ProducerConsumerQueue < ParsedUrl > *url_q_in, - unordered_map < string, int > *doc_map_lookup_in ) - : mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( url_q_in) + unordered_map < string, int > *doc_map_lookup_in, unordered_map < size_t, int > *duplicate_url_map_in ) + : mode( mode_in ), urlFrontier( url_q_in ), docMapLookup( doc_map_lookup_in ), parser( url_q_in), duplicateUrlMap(duplicate_url_map_in) { }; @@ -35,23 +35,18 @@ public: virtual void FuncToRun(); - //Makes request to given url - // if successful, writes file to disk, stores location to memeber value - // else return false and error information, retry if necessary - StreamReader *request( ParsedUrl url ); - bool writeDocToDisk(ParsedUrl url); - bool shouldURLbeCrawled( ParsedUrl URL ); + bool shouldURLbeCrawled( size_t docId ); size_t hash(const char * s); - int getRobots(ParsedUrl url ); + //int getRobots(ParsedUrl url ); bool checkRobots(ParsedUrl url); - private: int locationOnDisk; ProducerConsumerQueue < ParsedUrl > *urlFrontier; + unordered_map < size_t, int > *duplicateUrlMap; string mode; unordered_map < string, int > *docMapLookup; Parser parser; diff --git a/main.cpp b/main.cpp index 
29b800a0163d9457e9bf7ebb09dc9474fce10a64..e5f67990fd9d1004fb5c7f28c287031ce1731e59 100644 --- a/main.cpp +++ b/main.cpp @@ -10,6 +10,7 @@ #include <pthread.h> #include <queue> #include "crawler/crawler.h" +#include <openssl/ssl.h> #include <string> //#include "crawler/CrawlerStatistics.h" #include <unordered_map> @@ -55,7 +56,7 @@ int main( int argc, char *argv[] ) int choice; int option_index = 0; option long_options[] = { - {"mode", optional_argument, nullptr, 'm'}, + {"mode", optional_argument, nullptr, 'm'}, {"num_crawlers", optional_argument, nullptr, 'c'} }; @@ -89,45 +90,47 @@ int main( int argc, char *argv[] ) bool restoreFromLog; - ProducerConsumerQueue < ParsedUrl > urlFrontier; + unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( ); + + ProducerConsumerQueue<ParsedUrl> *urlFrontier = new ProducerConsumerQueue<ParsedUrl>(); - cout << "Pushed File\n"; char *seeds; - if ( mode == "local" ) - seeds = util::getFileMap( "/tests/localSeed.txt" ); - else - seeds = util::getFileMap( "/tests/webSeed.txt" ); + if (mode == "local") + seeds = util::getFileMap("/tests/localSeed.txt"); + else { + seeds = util::getFileMap("/tests/webSeed.txt"); + SSL_library_init( ); + + } string testFile; - while ( *seeds ) - { - if ( *seeds == '\n') - { - cout << "Pushing to Url Frontier..." << endl; + while (*seeds) { + if (*seeds == '\n') { + ParsedUrl url = ParsedUrl(testFile); - urlFrontier.Push(url); + cout << "Pushing: " << testFile << " to queue\n"; + urlFrontier->Push(url); testFile = ""; - } - - else + } else testFile.push_back(*seeds); ++seeds; } - cout << "Pushing to Url Frontier..." << endl; - urlFrontier.Push(testFile); -//urlFrontier.Push("tests/store.html"); - - + if (testFile != "") { + cout << "Pushing: " << testFile << " to queue\n"; + ParsedUrl url = ParsedUrl(testFile); + urlFrontier->Push(url); + } unordered_map < string, int > *docMapLookUp = new unordered_map < string, int >( ); +Crawler crawler( mode, urlFrontier ); -Crawler crawler( mode, &urlFrontier ); - -crawler.SpawnSpiders(numberOfSpiders , docMapLookUp); +crawler.SpawnSpiders(numberOfSpiders , docMapLookUp, duplicateUrlMap); -crawler. +crawler.WaitOnAllSpiders(); -WaitOnAllSpiders(); + auto f = urlFrontier->Pop(); + int x = 0; + delete urlFrontier; } \ No newline at end of file diff --git a/makefile b/makefile index 8d2bafb418c9a23c8614806be7639e34fa9cd93a..236a6cee6926ec4572b59db75afd9120ca4f85a6 100644 --- a/makefile +++ b/makefile @@ -110,32 +110,6 @@ depend: $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 .PHONY : depend -#============================================================================= -# Target rules for targets named ISRWord-tests - -# Build rule for target. -ISRWord-tests: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 ISRWord-tests -.PHONY : ISRWord-tests - -# fast build rule for target. -ISRWord-tests/fast: - $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/build -.PHONY : ISRWord-tests/fast - -#============================================================================= -# Target rules for targets named StemmerTest - -# Build rule for target. -StemmerTest: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 StemmerTest -.PHONY : StemmerTest - -# fast build rule for target. 
-StemmerTest/fast: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/build -.PHONY : StemmerTest/fast - #============================================================================= # Target rules for targets named StringProcessingTest @@ -150,17 +124,17 @@ StringProcessingTest/fast: .PHONY : StringProcessingTest/fast #============================================================================= -# Target rules for targets named TokenizerTest +# Target rules for targets named ISRWord-tests # Build rule for target. -TokenizerTest: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 TokenizerTest -.PHONY : TokenizerTest +ISRWord-tests: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 ISRWord-tests +.PHONY : ISRWord-tests # fast build rule for target. -TokenizerTest/fast: - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/build -.PHONY : TokenizerTest/fast +ISRWord-tests/fast: + $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/build +.PHONY : ISRWord-tests/fast #============================================================================= # Target rules for targets named URLTEST @@ -188,6 +162,19 @@ SharedTableTest/fast: $(MAKE) -f CMakeFiles/SharedTableTest.dir/build.make CMakeFiles/SharedTableTest.dir/build .PHONY : SharedTableTest/fast +#============================================================================= +# Target rules for targets named crawler-parser-test + +# Build rule for target. +crawler-parser-test: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 crawler-parser-test +.PHONY : crawler-parser-test + +# fast build rule for target. +crawler-parser-test/fast: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/build +.PHONY : crawler-parser-test/fast + #============================================================================= # Target rules for targets named ParserTest @@ -201,6 +188,32 @@ ParserTest/fast: $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/build .PHONY : ParserTest/fast +#============================================================================= +# Target rules for targets named StemmerTest + +# Build rule for target. +StemmerTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 StemmerTest +.PHONY : StemmerTest + +# fast build rule for target. +StemmerTest/fast: + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/build +.PHONY : StemmerTest/fast + +#============================================================================= +# Target rules for targets named TokenizerTest + +# Build rule for target. +TokenizerTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 TokenizerTest +.PHONY : TokenizerTest + +# fast build rule for target. +TokenizerTest/fast: + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/build +.PHONY : TokenizerTest/fast + #============================================================================= # Target rules for targets named search-engine @@ -215,17 +228,17 @@ search-engine/fast: .PHONY : search-engine/fast #============================================================================= -# Target rules for targets named crawler-parser-test +# Target rules for targets named QueueTest # Build rule for target. 
-crawler-parser-test: cmake_check_build_system - $(MAKE) -f CMakeFiles/Makefile2 crawler-parser-test -.PHONY : crawler-parser-test +QueueTest: cmake_check_build_system + $(MAKE) -f CMakeFiles/Makefile2 QueueTest +.PHONY : QueueTest # fast build rule for target. -crawler-parser-test/fast: - $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/build -.PHONY : crawler-parser-test/fast +QueueTest/fast: + $(MAKE) -f CMakeFiles/QueueTest.dir/build.make CMakeFiles/QueueTest.dir/build +.PHONY : QueueTest/fast constraintSolver/ISRWord.o: constraintSolver/ISRWord.cpp.o @@ -281,32 +294,86 @@ constraintSolver/tests/ISRWordTests.cpp.s: $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/constraintSolver/tests/ISRWordTests.cpp.s .PHONY : constraintSolver/tests/ISRWordTests.cpp.s -crawler/SocketReader.o: crawler/SocketReader.cpp.o +crawler/Readers/HttpReader.o: crawler/Readers/HttpReader.cpp.o + +.PHONY : crawler/Readers/HttpReader.o + +# target to build an object file +crawler/Readers/HttpReader.cpp.o: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpReader.cpp.o +.PHONY : crawler/Readers/HttpReader.cpp.o + +crawler/Readers/HttpReader.i: crawler/Readers/HttpReader.cpp.i + +.PHONY : crawler/Readers/HttpReader.i + +# target to preprocess a source file +crawler/Readers/HttpReader.cpp.i: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpReader.cpp.i +.PHONY : crawler/Readers/HttpReader.cpp.i + +crawler/Readers/HttpReader.s: crawler/Readers/HttpReader.cpp.s + +.PHONY : crawler/Readers/HttpReader.s + +# target to generate assembly for a file +crawler/Readers/HttpReader.cpp.s: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpReader.cpp.s +.PHONY : crawler/Readers/HttpReader.cpp.s + +crawler/Readers/HttpsReader.o: crawler/Readers/HttpsReader.cpp.o -.PHONY : crawler/SocketReader.o +.PHONY : crawler/Readers/HttpsReader.o # target to build an object file -crawler/SocketReader.cpp.o: - $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/SocketReader.cpp.o -.PHONY : crawler/SocketReader.cpp.o +crawler/Readers/HttpsReader.cpp.o: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpsReader.cpp.o +.PHONY : crawler/Readers/HttpsReader.cpp.o -crawler/SocketReader.i: crawler/SocketReader.cpp.i +crawler/Readers/HttpsReader.i: crawler/Readers/HttpsReader.cpp.i -.PHONY : crawler/SocketReader.i +.PHONY : crawler/Readers/HttpsReader.i # target to preprocess a source file -crawler/SocketReader.cpp.i: - $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/SocketReader.cpp.i -.PHONY : crawler/SocketReader.cpp.i +crawler/Readers/HttpsReader.cpp.i: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpsReader.cpp.i +.PHONY : crawler/Readers/HttpsReader.cpp.i -crawler/SocketReader.s: crawler/SocketReader.cpp.s +crawler/Readers/HttpsReader.s: crawler/Readers/HttpsReader.cpp.s -.PHONY : crawler/SocketReader.s +.PHONY : crawler/Readers/HttpsReader.s # target to generate assembly for a file -crawler/SocketReader.cpp.s: - $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/SocketReader.cpp.s -.PHONY : 
crawler/SocketReader.cpp.s +crawler/Readers/HttpsReader.cpp.s: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/HttpsReader.cpp.s +.PHONY : crawler/Readers/HttpsReader.cpp.s + +crawler/Readers/LocalReader.o: crawler/Readers/LocalReader.cpp.o + +.PHONY : crawler/Readers/LocalReader.o + +# target to build an object file +crawler/Readers/LocalReader.cpp.o: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/LocalReader.cpp.o +.PHONY : crawler/Readers/LocalReader.cpp.o + +crawler/Readers/LocalReader.i: crawler/Readers/LocalReader.cpp.i + +.PHONY : crawler/Readers/LocalReader.i + +# target to preprocess a source file +crawler/Readers/LocalReader.cpp.i: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/LocalReader.cpp.i +.PHONY : crawler/Readers/LocalReader.cpp.i + +crawler/Readers/LocalReader.s: crawler/Readers/LocalReader.cpp.s + +.PHONY : crawler/Readers/LocalReader.s + +# target to generate assembly for a file +crawler/Readers/LocalReader.cpp.s: + $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/crawler/Readers/LocalReader.cpp.s +.PHONY : crawler/Readers/LocalReader.cpp.s crawler/crawler.o: crawler/crawler.cpp.o @@ -395,8 +462,8 @@ parser/Parser.o: parser/Parser.cpp.o # target to build an object file parser/Parser.cpp.o: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/parser/Parser.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.o .PHONY : parser/Parser.cpp.o parser/Parser.i: parser/Parser.cpp.i @@ -405,8 +472,8 @@ parser/Parser.i: parser/Parser.cpp.i # target to preprocess a source file parser/Parser.cpp.i: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/parser/Parser.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.i .PHONY : parser/Parser.cpp.i parser/Parser.s: parser/Parser.cpp.s @@ -415,8 +482,8 @@ parser/Parser.s: parser/Parser.cpp.s # target to generate assembly for a file parser/Parser.cpp.s: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/parser/Parser.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/parser/Parser.cpp.s .PHONY : parser/Parser.cpp.s parser/tests/parserTest.o: parser/tests/parserTest.cpp.o @@ -506,8 +573,8 @@ shared/Document.o: shared/Document.cpp.o # target to build an object file shared/Document.cpp.o: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/shared/Document.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.o .PHONY : shared/Document.cpp.o shared/Document.i: shared/Document.cpp.i @@ -516,8 +583,8 @@ shared/Document.i: shared/Document.cpp.i # target to preprocess a source file shared/Document.cpp.i: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.i $(MAKE) -f 
CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/shared/Document.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.i .PHONY : shared/Document.cpp.i shared/Document.s: shared/Document.cpp.s @@ -526,10 +593,37 @@ shared/Document.s: shared/Document.cpp.s # target to generate assembly for a file shared/Document.cpp.s: - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/shared/Document.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/shared/Document.cpp.s .PHONY : shared/Document.cpp.s +shared/ProducerConsumerQueue_test.o: shared/ProducerConsumerQueue_test.cpp.o + +.PHONY : shared/ProducerConsumerQueue_test.o + +# target to build an object file +shared/ProducerConsumerQueue_test.cpp.o: + $(MAKE) -f CMakeFiles/QueueTest.dir/build.make CMakeFiles/QueueTest.dir/shared/ProducerConsumerQueue_test.cpp.o +.PHONY : shared/ProducerConsumerQueue_test.cpp.o + +shared/ProducerConsumerQueue_test.i: shared/ProducerConsumerQueue_test.cpp.i + +.PHONY : shared/ProducerConsumerQueue_test.i + +# target to preprocess a source file +shared/ProducerConsumerQueue_test.cpp.i: + $(MAKE) -f CMakeFiles/QueueTest.dir/build.make CMakeFiles/QueueTest.dir/shared/ProducerConsumerQueue_test.cpp.i +.PHONY : shared/ProducerConsumerQueue_test.cpp.i + +shared/ProducerConsumerQueue_test.s: shared/ProducerConsumerQueue_test.cpp.s + +.PHONY : shared/ProducerConsumerQueue_test.s + +# target to generate assembly for a file +shared/ProducerConsumerQueue_test.cpp.s: + $(MAKE) -f CMakeFiles/QueueTest.dir/build.make CMakeFiles/QueueTest.dir/shared/ProducerConsumerQueue_test.cpp.s +.PHONY : shared/ProducerConsumerQueue_test.cpp.s + shared/SharedHashMapTest.o: shared/SharedHashMapTest.cpp.o .PHONY : shared/SharedHashMapTest.o @@ -590,11 +684,11 @@ util/Stemmer.o: util/Stemmer.cpp.o # target to build an object file util/Stemmer.cpp.o: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.o $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/Stemmer.cpp.o - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.o - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/Stemmer.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.o + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.o + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.o .PHONY : util/Stemmer.cpp.o util/Stemmer.i: util/Stemmer.cpp.i @@ -603,11 +697,11 @@ util/Stemmer.i: util/Stemmer.cpp.i # target to preprocess a source file util/Stemmer.cpp.i: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.i $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/Stemmer.cpp.i - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.i - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make 
CMakeFiles/crawler-parser-test.dir/util/Stemmer.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.i + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.i + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.i .PHONY : util/Stemmer.cpp.i util/Stemmer.s: util/Stemmer.cpp.s @@ -616,11 +710,11 @@ util/Stemmer.s: util/Stemmer.cpp.s # target to generate assembly for a file util/Stemmer.cpp.s: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.s $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/Stemmer.cpp.s - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.s - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/Stemmer.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Stemmer.cpp.s + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/Stemmer.cpp.s + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Stemmer.cpp.s .PHONY : util/Stemmer.cpp.s util/Tokenizer.o: util/Tokenizer.cpp.o @@ -629,9 +723,9 @@ util/Tokenizer.o: util/Tokenizer.cpp.o # target to build an object file util/Tokenizer.cpp.o: - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.o - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/Tokenizer.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.o + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.o .PHONY : util/Tokenizer.cpp.o util/Tokenizer.i: util/Tokenizer.cpp.i @@ -640,9 +734,9 @@ util/Tokenizer.i: util/Tokenizer.cpp.i # target to preprocess a source file util/Tokenizer.cpp.i: - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.i - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/Tokenizer.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.i + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.i .PHONY : util/Tokenizer.cpp.i util/Tokenizer.s: util/Tokenizer.cpp.s @@ -651,9 +745,9 @@ util/Tokenizer.s: util/Tokenizer.cpp.s # target to generate assembly for a file util/Tokenizer.cpp.s: - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.s - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/Tokenizer.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/Tokenizer.cpp.s + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/Tokenizer.cpp.s .PHONY : util/Tokenizer.cpp.s util/stringProcessing.o: util/stringProcessing.cpp.o @@ -662,11 +756,11 @@ 
util/stringProcessing.o: util/stringProcessing.cpp.o # target to build an object file util/stringProcessing.cpp.o: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.o $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.o - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.o - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.o + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.o + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.o .PHONY : util/stringProcessing.cpp.o util/stringProcessing.i: util/stringProcessing.cpp.i @@ -675,11 +769,11 @@ util/stringProcessing.i: util/stringProcessing.cpp.i # target to preprocess a source file util/stringProcessing.cpp.i: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.i $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.i - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.i - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.i + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.i + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.i .PHONY : util/stringProcessing.cpp.i util/stringProcessing.s: util/stringProcessing.cpp.s @@ -688,11 +782,11 @@ util/stringProcessing.s: util/stringProcessing.cpp.s # target to generate assembly for a file util/stringProcessing.cpp.s: - $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.s $(MAKE) -f CMakeFiles/StringProcessingTest.dir/build.make CMakeFiles/StringProcessingTest.dir/util/stringProcessing.cpp.s - $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.s - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/stringProcessing.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/stringProcessing.cpp.s + $(MAKE) -f CMakeFiles/StemmerTest.dir/build.make CMakeFiles/StemmerTest.dir/util/stringProcessing.cpp.s + $(MAKE) -f CMakeFiles/TokenizerTest.dir/build.make CMakeFiles/TokenizerTest.dir/util/stringProcessing.cpp.s .PHONY : util/stringProcessing.cpp.s util/tests/stemmerTest.o: util/tests/stemmerTest.cpp.o @@ -783,8 +877,8 @@ util/util.o: util/util.cpp.o # target to build an object file util/util.cpp.o: $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/util/util.cpp.o - $(MAKE) -f 
CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.o $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/util.cpp.o + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.o .PHONY : util/util.cpp.o util/util.i: util/util.cpp.i @@ -794,8 +888,8 @@ util/util.i: util/util.cpp.i # target to preprocess a source file util/util.cpp.i: $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/util/util.cpp.i - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.i $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/util.cpp.i + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.i .PHONY : util/util.cpp.i util/util.s: util/util.cpp.s @@ -805,8 +899,8 @@ util/util.s: util/util.cpp.s # target to generate assembly for a file util/util.cpp.s: $(MAKE) -f CMakeFiles/ISRWord-tests.dir/build.make CMakeFiles/ISRWord-tests.dir/util/util.cpp.s - $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.s $(MAKE) -f CMakeFiles/crawler-parser-test.dir/build.make CMakeFiles/crawler-parser-test.dir/util/util.cpp.s + $(MAKE) -f CMakeFiles/ParserTest.dir/build.make CMakeFiles/ParserTest.dir/util/util.cpp.s .PHONY : util/util.cpp.s # Help Target @@ -816,25 +910,32 @@ help: @echo "... clean" @echo "... depend" @echo "... edit_cache" - @echo "... ISRWord-tests" @echo "... rebuild_cache" - @echo "... StemmerTest" @echo "... StringProcessingTest" - @echo "... TokenizerTest" + @echo "... ISRWord-tests" @echo "... URLTEST" @echo "... SharedTableTest" + @echo "... crawler-parser-test" @echo "... ParserTest" + @echo "... StemmerTest" + @echo "... TokenizerTest" @echo "... search-engine" - @echo "... crawler-parser-test" + @echo "... QueueTest" @echo "... constraintSolver/ISRWord.o" @echo "... constraintSolver/ISRWord.i" @echo "... constraintSolver/ISRWord.s" @echo "... constraintSolver/tests/ISRWordTests.o" @echo "... constraintSolver/tests/ISRWordTests.i" @echo "... constraintSolver/tests/ISRWordTests.s" - @echo "... crawler/SocketReader.o" - @echo "... crawler/SocketReader.i" - @echo "... crawler/SocketReader.s" + @echo "... crawler/Readers/HttpReader.o" + @echo "... crawler/Readers/HttpReader.i" + @echo "... crawler/Readers/HttpReader.s" + @echo "... crawler/Readers/HttpsReader.o" + @echo "... crawler/Readers/HttpsReader.i" + @echo "... crawler/Readers/HttpsReader.s" + @echo "... crawler/Readers/LocalReader.o" + @echo "... crawler/Readers/LocalReader.i" + @echo "... crawler/Readers/LocalReader.s" @echo "... crawler/crawler.o" @echo "... crawler/crawler.i" @echo "... crawler/crawler.s" @@ -859,6 +960,9 @@ help: @echo "... shared/Document.o" @echo "... shared/Document.i" @echo "... shared/Document.s" + @echo "... shared/ProducerConsumerQueue_test.o" + @echo "... shared/ProducerConsumerQueue_test.i" + @echo "... shared/ProducerConsumerQueue_test.s" @echo "... shared/SharedHashMapTest.o" @echo "... shared/SharedHashMapTest.i" @echo "... 
shared/SharedHashMapTest.s" diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 25e606662feaefde5ee4c874be89be439cd86b6b..67bf55fdab4e0e5c92fb7fbd356cfe5e96f9f471 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -15,10 +15,10 @@ Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn ) * Executes the Parser * @return */ -const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document ) +const unordered_map< string, vector< unsigned long > > *Parser::execute ( StreamReader* reader) { Tokenizer tokenizer; - parse( document->DocToString( ), document->getUrl( ), &tokenizer ); + parse(reader, &tokenizer); return tokenizer.get( ); } @@ -27,56 +27,74 @@ const unordered_map< string, vector< unsigned long > > *Parser::execute ( Docume * @param inFile * @return */ -void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) +void Parser::parse ( StreamReader* reader, Tokenizer *tokenizer ) { unsigned long htmlIt = 0; unsigned long offsetTitle = 0; unsigned long offsetURL = 0; unsigned long offsetAnchor = 0; + ParsedUrl currentUrl = reader->getUrl(); // tokenize url offsetURL = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offsetURL, Tokenizer::URL ); // tokenize anchor + // TODO ParsedUrl with anchor text string anchorText = currentUrl.getAnchorText( ); if ( anchorText != "" ) { offsetAnchor = tokenizer->execute( anchorText, offsetAnchor, Tokenizer::ANCHOR ); } - // find titles - while ( htmlIt < html.size( ) ) + reader->request(); + bool success = reader->checkStatus(); + if(success) { - // if open bracket - if ( html[ htmlIt ] == '<' ) + string html = reader->PageToString(); + + while ( htmlIt < html.size( ) ) { - unsigned long begCloseTag = findNext( "</", htmlIt, html ); - unsigned long endCloseTag = findNext( ">", begCloseTag, html ); - string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt ); - htmlIt = endCloseTag + 2; - - // check if line is url - string url = extractUrl( line ); - if ( url != "" ) + // if open bracket + if ( html[ htmlIt ] == '<' ) { - - pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true ); + unsigned long begCloseTag = findNext( "</", htmlIt, html ); + unsigned long endCloseTag = findNext( ">", begCloseTag, html ); + string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt ); + htmlIt = endCloseTag + 2; + + // check if line is url + string url = extractUrl( line ); + if ( url != "" ) + { + if ( isLocal( url ) ) + { + string completeUrl = ""; + completeUrl.assign( currentUrl.CompleteUrl ); + url = completeUrl + url; + } + if ( isValid( url ) ) + { + ParsedUrl pUrl = ParsedUrl( url ); + urlFrontier->Push( pUrl ); + cout << url << endl; + } + } + // check if line is title + else + { + string title = extractTitle( line ); + if ( title != "" ) + { + offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE ); + } + } } - // check if line is title else { - string title = extractTitle( line ); - if ( title != "" ) - { - offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE ); - } + ++htmlIt; } } - else - { - ++htmlIt; - } } } diff --git a/parser/Parser.h b/parser/Parser.h index b38a91efd502c7e5351510ee70507da6e4f8d1d9..330783ebda1bd5627fe3e66fe5b813e8a60d4b15 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -10,6 +10,7 @@ #include "../util/stringProcessing.h" #include "../shared/Document.h" #include "../shared/ProducerConsumerQueue.h" +#include "../crawler/Readers/StreamReader.h" using namespace std; @@ -34,7 +35,7 @@ 
public: * Executes the Parser * @return */ - const unordered_map< string, vector< unsigned long > > *execute ( Document *document ); + const unordered_map< string, vector< unsigned long > > *execute ( StreamReader* reader ); private: @@ -45,7 +46,7 @@ private: * @param inFile * @return */ - void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ); + void parse ( StreamReader* reader, Tokenizer* tokenizer ); /** * Returns anchor text if found diff --git a/shared/ProducerConsumerQueue_test.cpp b/shared/ProducerConsumerQueue_test.cpp index 97180a4ffa2c16f1f32f0f8269cbd3c5da3160ce..0b6e2426b33c492d4cbd7481989e8031b47d2653 100644 --- a/shared/ProducerConsumerQueue_test.cpp +++ b/shared/ProducerConsumerQueue_test.cpp @@ -14,12 +14,8 @@ void* Producer(void* p) { ProducerConsumerQueue<int> * queue = (ProducerConsumerQueue<int>*)p; - for(int i = 0; i < 10; i++) + for(int i = 0; i < 500; i++) { - if(queue->Size()){ - pthread_yield_np(); // let the consumer thread run - } - queue->Push(i); pthread_mutex_lock(&cout_lock); @@ -34,7 +30,7 @@ void* Consumer(void* p) { ProducerConsumerQueue<int> * queue = (ProducerConsumerQueue<int>*)p; - for(int i = 0; i < 10; i++) + while(true) { int r = queue->Pop(); @@ -64,9 +60,6 @@ int main(int argc, const char * argv[]) pthread_mutex_unlock(&cout_lock); pthread_create(&producer, NULL, Producer, queue); - pthread_mutex_lock(&cout_lock); - std::cout << "Waiting for Producer and Consumer\n"; - pthread_mutex_unlock(&cout_lock); pthread_join(producer, NULL); pthread_join(consumer, NULL); diff --git a/shared/SharedHashMap.h b/shared/SharedHashMap.h index 03246b6fcab4a07e468a76af263b56eaf695627c..b42c24144f39b02652e812bf82d3c3d8993a04cb 100644 --- a/shared/SharedHashMap.h +++ b/shared/SharedHashMap.h @@ -4,6 +4,8 @@ #ifndef EECS398_SEARCH_SHAREDHASHMAP_H #define EECS398_SEARCH_SHAREDHASHMAP_H + +#pragma once #include <unordered_map> #include <pthread.h> diff --git a/shared/ThreadClass.h b/shared/ThreadClass.h index 8d594467373a76440c66b177086a402cf0047a39..5250e2d2f2be7a2a4e1c2d927475e1f70213b6e8 100644 --- a/shared/ThreadClass.h +++ b/shared/ThreadClass.h @@ -14,15 +14,20 @@ public: //Returns true if thread was created successfully bool StartThread() - { + { return (pthread_create(&thread, NULL, StaticFuncToRun, this) == 0); - } + } //Blocks until thread finishes void WaitForFinish() - { + { pthread_join(thread, NULL); - } + } + + void Die() + { + pthread_cancel(thread); + } protected: //IMPLEMENT THIS METHOD IN YOUR SUB CLASS WITH CODE YOU WANT YOUR THREAD TO RUN diff --git a/tests/localSeed.txt b/tests/localSeed.txt index 64f07d0719974c1fe6f53d3c736bea8fe32f4e62..4ee0d2d4bf6d26201437c131f954c14d5bb0f34a 100644 --- a/tests/localSeed.txt +++ b/tests/localSeed.txt @@ -1,2 +1 @@ -tests/cats.html -tests/store.html \ No newline at end of file +tests/cats.html \ No newline at end of file diff --git a/tests/plaintext.txt b/tests/plaintext.txt new file mode 100644 index 0000000000000000000000000000000000000000..63affee6bb3b3474adb533bfbf8092766a543c47 --- /dev/null +++ b/tests/plaintext.txt @@ -0,0 +1,2 @@ +<title> This is the title </title> +<body> This is the body </body> \ No newline at end of file diff --git a/tests/webSeed.txt b/tests/webSeed.txt index 0030d8e0b92971a9821336f4fd2fffbd92c4c0a1..1a77674a9f18e75e3c6acfe7b1187cb55845e4a0 100644 --- a/tests/webSeed.txt +++ b/tests/webSeed.txt @@ -1,5 +1,3 @@ -https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards -https://www.nytimes.com/ http://www.bbc.com/ -http://umich.edu 
-https://en.wikipedia.org/wiki/North_Ronaldsay_sheep \ No newline at end of file +https://www.nytimes.com/ +https://en.wikipedia.org/wiki/71st_British_Academy_Film_Awards \ No newline at end of file
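Review notes on the new readers (hedged sketches under stated assumptions; none of this is part of the commit above).

1. checkStatus(): both HttpReader and HttpsReader read exactly 12 bytes, compare only the first 11 of the 12-character literals "HTTP/1.1 200" / "HTTP/1.1 302" (so any "HTTP/1.1 20x" is accepted as 200), never check how many bytes recv()/SSL_read() actually returned, and on failure stream the non-NUL-terminated buff to cerr, which reads past the end of the array. A shared helper both readers could call (name and placement are assumptions):

    #include <cstring>
    #include <iostream>
    #include <string>

    static bool statusLineOk( const char *buff, int bytes )
       {
       if ( bytes < 12 )
          return false;                      // short or failed read
       if ( std::strncmp( buff, "HTTP/1.1 200", 12 ) == 0 )
          return true;
       if ( std::strncmp( buff, "HTTP/1.1 302", 12 ) == 0 )
          std::cerr << "URL REDIRECTION" << std::endl;
       else
          // bound the string so we never read past what was received
          std::cerr << "Bad Request of TYPE:: "
                    << std::string( buff, bytes ) << std::endl;
       return false;
       }

2. fillBuffer(): `recv( sock, buf, buf_size, 0 ) == buf_size` treats a legitimate short read as failure; TCP and TLS deliver data in arbitrary-sized pieces, so filling a fixed buffer needs a loop. A sketch for the HTTP case (the SSL_read variant is analogous; the helper name is an assumption):

    #include <cstddef>
    #include <sys/socket.h>
    #include <sys/types.h>

    static bool fillExactly( int sock, char *buf, size_t buf_size )
       {
       size_t got = 0;
       while ( got < buf_size )
          {
          ssize_t n = recv( sock, buf + got, buf_size - got, 0 );
          if ( n <= 0 )
             return false;   // error, or peer closed before buffer filled
          got += ( size_t ) n;
          }
       return true;
       }

   LocalReader::fillBuffer() has the opposite problem: strcpy() copies the whole file map and ignores buf_size entirely, so a file larger than the caller's buffer overflows it.

3. Smaller observations: Spider::FuncToRun() now stops after 25 URLs per spider (`cond < 25`), which reads like a temporary test limit; ThreadClass::Die() uses pthread_cancel(), which can cancel a spider while it holds the ProducerConsumerQueue lock and leave the remaining threads deadlocked; and main() ends with leftover debug code (`auto f = urlFrontier->Pop(); int x = 0;`) that will hang if Pop() blocks on an empty frontier once the spiders finish.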