diff --git a/CMakeLists.txt b/CMakeLists.txt index 210ca2f9e7bac367b7d32629764f62f36badfb7c..3bedf1fc146c110c18a4c8787a1e33566f482fa5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,27 @@ add_executable(QueueTest shared/ProducerConsumerQueue.h shared/ProducerConsumerQueue_test.cpp) +add_executable(TryPopTest + shared/TryPopTest.cpp + shared/ProducerConsumerQueue.h + shared/ThreadClass.h + shared/url.h + crawler/crawler.cpp + crawler/UrlFrontier.cpp + crawler/Readers/StreamReader.h + crawler/Readers/HttpReader.cpp + crawler/Readers/HttpsReader.cpp + crawler/Readers/LocalReader.cpp + crawler/spider.cpp + util/util.cpp + shared/Document.cpp + parser/Parser.cpp + util/Stemmer.cpp + util/Tokenizer.cpp + util/stringProcessing.cpp + indexer/Indexer.cpp + ) + add_executable(crawler-parser-Test main.cpp shared/ProducerConsumerQueue.h @@ -215,6 +236,8 @@ add_executable(MasterReader-tests find_package(OpenSSL REQUIRED) +target_link_libraries(TryPopTest OpenSSL::SSL) + target_link_libraries(ParserTest OpenSSL::SSL) target_link_libraries(isolated-integration OpenSSL::SSL pthread) diff --git a/crawler-parser-indexer-test b/crawler-parser-indexer-test index 2c6a3ff64c0191422cf70d8bc862d91c1fe4427c..238c146b7c13ae787c082a532ce10cd60eeb0778 100755 Binary files a/crawler-parser-indexer-test and b/crawler-parser-indexer-test differ diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp index 6e24e523c58238f0713c60eb930e8e903ff002b7..28b28d6411e697e041bc89ac42a9d66d96efb9ed 100644 --- a/crawler/Readers/HttpReader.cpp +++ b/crawler/Readers/HttpReader.cpp @@ -1,5 +1,6 @@ #include "HttpReader.h" +#include <sys/time.h> std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" ); @@ -55,6 +56,13 @@ bool HttpReader::request ( ) send( sock, getMessage.c_str( ), getMessage.length( ), 0 ); bool isSuccess = checkStatus( ); + + //set timeout option + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv, sizeof(tv)); + return isSuccess; } diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp index 6c080e4d8a701e16c2e49ee225c2e11a2307901b..c35374c717e7aadc0315754b7c1ccee1d16d30e4 100644 --- a/crawler/Readers/HttpsReader.cpp +++ b/crawler/Readers/HttpsReader.cpp @@ -40,6 +40,12 @@ bool HttpsReader::request ( ) assert( connectResult == 0 ); + // set timeout val before binding the ssl to the sock + struct timeval tv; + tv.tv_sec = 10; + tv.tv_usec = 0; + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (const char*)&tv, sizeof(tv)); + // Build an SSL layer and set it to read/write // to the socket we've connected. 
diff --git a/crawler/UrlFrontier.cpp b/crawler/UrlFrontier.cpp index 6b0d306779e72a00c3ccca37a4839a5a30d55f8f..c53491d5b2514e6c06f96a77d31dfb09db00ae02 100644 --- a/crawler/UrlFrontier.cpp +++ b/crawler/UrlFrontier.cpp @@ -16,21 +16,21 @@ // then adds both to the url map and the host map -bool UrlFrontier::checkUrl( ParsedUrl *url ) +bool UrlFrontier::checkUrl( ParsedUrl url ) { - if( Blacklist.find( url->getHost( ) ) != Blacklist.end( ) ) + if( Blacklist.find( url.getHost( ) ) != Blacklist.end( ) ) return false; //Looks to see if the complete url already exists, if so return - if ( this->duplicateUrlMap->find( url->getCompleteUrl( )) != this->duplicateUrlMap->end( )) + if ( this->duplicateUrlMap->find( url.getCompleteUrl( )) != this->duplicateUrlMap->end( )) { //update the anchor text - if ( !url->getAnchorText( ).empty( ) || url->getAnchorText( ) != "") + if ( !url.getAnchorText( ).empty( ) || url.getAnchorText( ) != "") { pthread_mutex_lock( &m ); - (*duplicateUrlMap)[ url->getCompleteUrl( ) ][ url->getAnchorText( ) ]++; + (*duplicateUrlMap)[ url.getCompleteUrl( ) ][ url.getAnchorText( ) ]++; pthread_mutex_unlock( &m ); } //add the new @@ -44,26 +44,26 @@ bool UrlFrontier::checkUrl( ParsedUrl *url ) time( &now ); double difference = 0; //Has the domain been seen? - if ( this->domainMap->find( url->getHost( )) != this->domainMap->end( )) + if ( this->domainMap->find( url.getHost( )) != this->domainMap->end( )) { //get the last time it was seen and find the time difference - time_t lastSeen = this->domainMap->at( url->getHost( )); + time_t lastSeen = this->domainMap->at( url.getHost( )); difference = difftime( now, lastSeen ); if ( difference == 0 ) difference = .01; else difference = difference / 100; - url->updateScore( difference ); + url.updateScore( difference ); pthread_mutex_lock( &m ); - (*domainMap)[ url->getHost( ) ] = now; + (*domainMap)[ url.getHost( ) ] = now; pthread_mutex_unlock( &m ); } else { pthread_mutex_lock( &m ); - this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time + this->domainMap->insert( std::make_pair( url.getHost( ), now )); //otherwise add to the map the current time pthread_mutex_unlock( &m ); @@ -72,7 +72,7 @@ bool UrlFrontier::checkUrl( ParsedUrl *url ) //add url to the duplicate url map pthread_mutex_lock( &m ); - (*duplicateUrlMap)[ url->getCompleteUrl( ) ][ url->getAnchorText( ) ] = 1; + (*duplicateUrlMap)[ url.getCompleteUrl( ) ][ url.getAnchorText( ) ] = 1; pthread_mutex_unlock( &m ); return true; @@ -80,10 +80,10 @@ bool UrlFrontier::checkUrl( ParsedUrl *url ) } -void UrlFrontier::Push( ParsedUrl *url ) +void UrlFrontier::Push( ParsedUrl url ) { //if the url has been seen? 
if so, dont add it - if ( url->isValid ) + if ( url.isValid ) { if ( checkUrl( url )) @@ -104,10 +104,37 @@ void UrlFrontier::Push( ParsedUrl *url ) } } - -ParsedUrl *UrlFrontier::Pop() +bool UrlFrontier::try_pop( ParsedUrl& result ) { + gettimeofday(&now, NULL); + timeToWait.tv_sec = now.tv_sec + 5; + timeToWait.tv_nsec = (now.tv_usec+1000UL*100)*1000UL; + + int retval; + + pthread_mutex_lock(&m); + + while(queue.empty()){ + retval = pthread_cond_timedwait(&consumer_cv, &m, &timeToWait); + if(retval != 0){ + fprintf(stderr, "pthread_cond_timedwait %s\n", + strerror(retval)); + pthread_mutex_unlock(&m); + return false; + } + } + + result = std::move(queue.top()); + queue.pop(); + + pthread_mutex_unlock(&m); + return true; + } + + +ParsedUrl UrlFrontier::Pop() + { pthread_mutex_lock( &m ); @@ -116,7 +143,7 @@ ParsedUrl *UrlFrontier::Pop() pthread_cond_wait( &consumer_cv, &m ); } - ParsedUrl *front = queue.top( ); + ParsedUrl front = queue.top( ); queue.pop( ); pthread_mutex_unlock( &m ); @@ -167,12 +194,11 @@ void UrlFrontier::writeDataToDisk() while ( !queue.empty( )) { - ParsedUrl *url = queue.top( ); + ParsedUrl url = queue.top( ); queue.pop( ); - string url_disk = url->getCompleteUrl() + "\n"; + string url_disk = url.getCompleteUrl() + "\n"; write( file, url_disk.c_str( ), strlen( url_disk.c_str( ) )); - url = 0; - delete url; + } pthread_mutex_unlock( &m ); @@ -196,7 +222,7 @@ void UrlFrontier::readDataFromDisk( ) if ( *files == '\n' ) { - ParsedUrl *url = new ParsedUrl( testFile ); + ParsedUrl url(testFile); cout << "Pushing: " << testFile << " to queue\n"; Push( url ); testFile = ""; @@ -212,8 +238,8 @@ void UrlFrontier::readDataFromDisk( ) void UrlFrontier::readBlackList() { - string blackListFile = "/crawler/blacklist.txt" - char *hosts = util::getFileMap( fileName ); + string blackListFile = "/crawler/blacklist.txt"; + char *hosts = util::getFileMap( blackListFile ); string toBlackList; while ( *hosts ) @@ -221,7 +247,7 @@ void UrlFrontier::readBlackList() if ( *hosts == '\n' ) { - Blacklist.insert(toBlackList) + Blacklist.insert(toBlackList); toBlackList = ""; } else @@ -231,7 +257,7 @@ void UrlFrontier::readBlackList() } } - } + diff --git a/crawler/UrlFrontier.h b/crawler/UrlFrontier.h index 0d6ddf753b733d4c6f72ae21e7d332244bfde3ad..c9040d8a2b9aa8c487f5677dc2df1d01f8a36f80 100644 --- a/crawler/UrlFrontier.h +++ b/crawler/UrlFrontier.h @@ -17,15 +17,15 @@ typedef unordered_map<string , anchorToCountMap> urlMap; class ComparisonClass { public: - bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) { + bool operator() (ParsedUrl lhs , ParsedUrl rhs) { //comparison code here - return lhs->getScore() > rhs->getScore(); + return lhs.getScore() > rhs.getScore(); } }; -class UrlFrontier +class UrlFrontier : public ProducerConsumerQueue<ParsedUrl> { public: @@ -33,16 +33,18 @@ class UrlFrontier readBlackList(); }; - void Push ( ParsedUrl * url ); - bool checkUrl(ParsedUrl * url); + void Push ( ParsedUrl url ) override; + bool try_pop( ParsedUrl& result ) override; + ParsedUrl Pop ( ) override ; + size_t Size() override; + + bool checkUrl(ParsedUrl url); void readBlackList( ); + void printAnchorTable( ); set < string > Blacklist ; - ParsedUrl * Pop ( ); - size_t Size(); - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER; - std::priority_queue<ParsedUrl *, std::vector<ParsedUrl*>, ComparisonClass> queue; + + std::priority_queue<ParsedUrl , std::vector<ParsedUrl>, ComparisonClass> queue; //Writes the duplicate url map and priorty queue 
from disk void writeDataToDisk( ); diff --git a/crawler/crawler.cpp b/crawler/crawler.cpp index 0ab5e240fc9da2deabec2b4fffbaaabf79a0650d..2f4571a0247004763a032131eb50c723844ca625 100644 --- a/crawler/crawler.cpp +++ b/crawler/crawler.cpp @@ -13,11 +13,11 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >; */ -void Crawler::SpawnSpiders ( size_t num_spiders , atomic_bool * alive) +void Crawler::SpawnSpiders ( size_t num_spiders , atomic_bool * alive, int numdocs) { for ( size_t i = 0; i < num_spiders; i++ ) { - Spider *temp = new Spider( this->mode, this->urlFrontier, this->IndexerQueue , alive); + Spider *temp = new Spider( this->mode, this->urlFrontier, this->IndexerQueue , alive, numdocs); temp->StartThread( ); this->spiders.push_back( temp ); } diff --git a/crawler/crawler.h b/crawler/crawler.h index f67e8f77da75cad0fc807cfe11208037bbe5ae00..8ab6cfd3db4ce9e53011fb175b6748be50aa872a 100644 --- a/crawler/crawler.h +++ b/crawler/crawler.h @@ -30,7 +30,7 @@ public: { }; //spawns a number of works - void SpawnSpiders ( size_t num_spiders, atomic_bool* alive ); + void SpawnSpiders ( size_t num_spiders, atomic_bool* alive, int numdocs ); diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 9d56276cf3e335ae5855ca2c84026d1aabd4c4a5..ea969ce93579345e6d940aac7261175d53334800 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -84,7 +84,7 @@ size_t Spider::hash ( const char *s ) * */ -ParsedUrl * Spider::getUrl ( ) +ParsedUrl Spider::getUrl ( ) { return urlFrontier->Pop( ); } @@ -105,45 +105,37 @@ void Spider::run ( ) { std::cout << "Spider is crawling" << endl; int cond = 0; + ParsedUrl currentUrl; - while (*alive && cond < 100) - { - if(cond % 25 == 0) - { - cout << "Spider has crawled" << to_string(cond) << endl; - } - - if(urlFrontier->Size() > 0) - { - - ParsedUrl * currentUrl = getUrl( ); - size_t docID = hash( currentUrl->getCompleteUrl().c_str() ); - if ( shouldURLbeCrawled( docID ) ) - { - StreamReader *reader = SR_factory( currentUrl, this->mode ); - if(reader) - { - bool success = reader->request( ); - if ( success ) - { - cout << "Parsing " << currentUrl->getCompleteUrl(); - DocIndex *dict = parser.execute( reader ); - IndexerQueue->Push( dict ); - - reader->closeReader( ); - //delete dict; - - cond++; - } + while (*alive && cond < docs_to_crawl) + { + bool not_empty = urlFrontier->try_pop(currentUrl); + + if(not_empty) { + size_t docID = hash(currentUrl.getCompleteUrl().c_str()); + if (shouldURLbeCrawled(docID)) { + StreamReader *reader = SR_factory(¤tUrl, this->mode); + if (reader) { + bool success = reader->request(); + if (success) { + cout << "Parsing " << currentUrl.getCompleteUrl(); + DocIndex *dict = parser.execute(reader); + IndexerQueue->Push(dict); + + reader->closeReader(); + //delete dict; + + cond++; } + } - delete reader; + delete reader; - } } + } } cout << "Spider has finished running " << endl; return; diff --git a/crawler/spider.h b/crawler/spider.h index fabd1d0abec963b34d9b75da16442122e9e7f2a5..b0f194847242cecc925858c271dd82b8ec7101c8 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -22,22 +22,24 @@ class Spider : public ThreadClass public: Spider ( string mode_in, - UrlFrontier *url_q_in, + UrlFrontier *url_q_in, ProducerConsumerQueue< DocIndex * > *doc_index_queue_in, - atomic_bool * bool_in + atomic_bool * bool_in, + int numdocs ) : mode( mode_in ), urlFrontier( url_q_in ), parser( url_q_in ), IndexerQueue( doc_index_queue_in ), - alive( bool_in ) + alive( bool_in ), + docs_to_crawl(numdocs) { }; //Takes a url off of 
the url frontier - ParsedUrl * getUrl ( ); + ParsedUrl getUrl ( ); virtual void run ( ); @@ -61,5 +63,6 @@ private: string mode; Parser parser; atomic_bool* alive; + int docs_to_crawl; }; \ No newline at end of file diff --git a/crawler/tests/urlFrontierTest.cpp b/crawler/tests/urlFrontierTest.cpp index 8fee275fdfd99216db39b9f7feab917f1f86b623..b9d1ea179c66d06271e5818136ad9f7168e58f85 100644 --- a/crawler/tests/urlFrontierTest.cpp +++ b/crawler/tests/urlFrontierTest.cpp @@ -45,8 +45,8 @@ int main ( int argc, char *argv[] ) //string url2 = "https:"; //string bad_url = "http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\" />"; - ParsedUrl * url = new ParsedUrl(url1); - ParsedUrl * url_1 = new ParsedUrl(url2); + ParsedUrl url(url1); + ParsedUrl url_1(url2); urlFrontier->Push(url); urlFrontier->Push(url_1); diff --git a/main.cpp b/main.cpp index 432c09779e950d6249fdafc0362da435750de28c..903106feb9c97bd8ad3467800f9fd1c5c8e517b6 100644 --- a/main.cpp +++ b/main.cpp @@ -23,13 +23,37 @@ #include <chrono> #include <future> #include <ctime> + using DocIndex = const unordered_map< string, vector< unsigned long > >; using namespace std; atomic_bool *alive = new atomic_bool(true); +//atomic_bool has_shutdown = false; + +/* +void wait_to_shutdown(Indexer& indexer, Crawler* crawler, UrlFrontier* urlFrontier, ProducerConsumerQueue< DocIndex * > *IndexerQueue) + { + cout << "Press anything to quit" << endl; + char c = 't'; + while(c != 'Q' && !has_shutdown) + { + c = getchar(); + } + if(has_shutdown) return; + crawler->passAnchorTextToIndex( ); + indexer.Kill(); + indexer.WaitForFinish( ); + urlFrontier->writeDataToDisk(); + delete urlFrontier; + delete IndexerQueue; +>>>>>>> QueueRefactor + + cout << "Indexer has finished running " << endl; + } +*/ void signalHandler( int signum ) { cout << "Interrupt signal (" << signum << ") received.\n"; @@ -145,7 +169,7 @@ int main ( int argc, char *argv[] ) if ( *seeds == '\n' ) { - ParsedUrl * url = new ParsedUrl( testFile ); + ParsedUrl url(testFile); cout << "Pushing: " << testFile << " to queue\n"; urlFrontier->Push( url ); testFile = ""; @@ -157,8 +181,8 @@ int main ( int argc, char *argv[] ) if ( testFile != "" ) { cout << "Pushing: " << testFile << " to queue\n"; - ParsedUrl * url = new ParsedUrl( testFile ); - urlFrontier->Push( url ); + ParsedUrl url1(testFile); + urlFrontier->Push( url1 ); } } else @@ -171,7 +195,10 @@ int main ( int argc, char *argv[] ) indexer.StartThread( ); Crawler *crawler = new Crawler( mode, urlFrontier, IndexerQueue, AnchorQueue ); - crawler->SpawnSpiders( numberOfSpiders , alive); + + //atomic_bool *alive = new atomic_bool(true); + crawler->SpawnSpiders( numberOfSpiders , alive, DocsToCrawl); + string input; @@ -179,8 +206,9 @@ int main ( int argc, char *argv[] ) if(DocsToCrawl > 0 ) { - cout << "Crawling 100,000 documents for each spider" << endl; + cout << "Crawling: " << DocsToCrawl << " documents for each spider" << endl; crawler->WaitOnAllSpiders( ); + //has_shutdown = true; crawler->passAnchorTextToIndex( ); indexer.Kill(); indexer.WaitForFinish( ); diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 771a470e297763abdd3e48f8ac8eaceea11232ca..3e8313b8cadf5ee74fd71c728974366349ae8197 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -361,9 +361,9 @@ void Parser::pushToUrlQueue ( string url, ParsedUrl * currentUrl, string anchorT { try { - ParsedUrl *pUrl = new ParsedUrl( url ); - pUrl->setAnchorText( anchorText ); - urlFrontier->Push( pUrl ); + ParsedUrl url_(url); + url_.setAnchorText( anchorText ); + 
urlFrontier->Push( url_ ); if ( debug ) { cout << url << endl; diff --git a/seedCreate.py b/seedCreate.py new file mode 100644 index 0000000000000000000000000000000000000000..d739fa2b1e8fb51884dbda23bd7adfb15fe9ba81 --- /dev/null +++ b/seedCreate.py @@ -0,0 +1,6 @@ +s = "http://www." +for i in range(0,10): + x = s + str(i) + ".com" + print(x) + + diff --git a/shared/ProducerConsumerQueue.cpp b/shared/ProducerConsumerQueue.cpp index 4b1c6bff6fa106bf58a92f08cb9c086785218b1f..511b1d9aed0d499d2d6a17a43d011da54b89953d 100644 --- a/shared/ProducerConsumerQueue.cpp +++ b/shared/ProducerConsumerQueue.cpp @@ -9,9 +9,9 @@ void ProducerConsumerQueue< T >::Push ( T obj ) { pthread_mutex_lock( &m ); - queue.push( obj ); + queue_.push( obj ); - if ( queue.size( ) == 1 ) + if ( queue_.size( ) == 1 ) { pthread_cond_broadcast( &consumer_cv ); } @@ -19,18 +19,48 @@ void ProducerConsumerQueue< T >::Push ( T obj ) pthread_mutex_unlock( &m ); } +template< class T > +bool ProducerConsumerQueue< T >::try_pop(T &result) + { + + gettimeofday(&now, NULL); + timeToWait.tv_sec = now.tv_sec + 5; + timeToWait.tv_nsec = (now.tv_usec+1000UL*100)*1000UL; + + + int retval; + + pthread_mutex_lock(&m); + + while(queue_.empty()){ + retval = pthread_cond_timedwait(&consumer_cv, &m, &timeToWait); + if(retval != 0){ + fprintf(stderr, "pthread_cond_timedwait %s\n", + strerror(retval)); + pthread_mutex_unlock(&m); + return false; + } + } + + result = std::move(queue_.front()); + queue_.pop(); + + pthread_mutex_unlock(&m); + return true; + } + template< class T > T ProducerConsumerQueue< T >::Pop ( ) { pthread_mutex_lock( &m ); - while ( queue.empty( ) == true ) + while ( queue_.empty( ) == true ) { pthread_cond_wait( &consumer_cv, &m ); } - T front = queue.front( ); - queue.pop( ); + T front = queue_.front( ); + queue_.pop( ); pthread_mutex_unlock( &m ); @@ -41,7 +71,7 @@ template< class T > size_t ProducerConsumerQueue< T >::Size ( ) { pthread_mutex_lock( &m ); - size_t size = queue.size( ); + size_t size = queue_.size( ); pthread_mutex_unlock( &m ); return size; } \ No newline at end of file diff --git a/shared/ProducerConsumerQueue.h b/shared/ProducerConsumerQueue.h index fbc5a29a66603ebebc9bcfd0f6d7e151afd7f343..d36c96ff880f6f916b064131cd4b20e83bc46719 100644 --- a/shared/ProducerConsumerQueue.h +++ b/shared/ProducerConsumerQueue.h @@ -7,6 +7,9 @@ #include <queue> #include <pthread.h> +#include <chrono> +#include <sys/time.h> + //for now use STL queue, create better one later @@ -14,10 +17,6 @@ template< class T > class ProducerConsumerQueue { -private: - std::queue< T > queue; - pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; - pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER; public: @@ -25,15 +24,20 @@ public: { } - void Push ( T obj ); - - T Pop ( ); - - size_t Size ( ); + virtual void Push ( T obj ); + virtual bool try_pop(T& result); + virtual T Pop ( ); + virtual size_t Size ( ); //Right now these pass objects by value but // probably should pass pointers in future +protected: + std::queue< T > queue_; + pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; + pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER; + struct timespec timeToWait; + struct timeval now; }; //Necessary because this class is templated diff --git a/shared/TryPopTest.cpp b/shared/TryPopTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6301e10337f796f4f06cf781956de43ad1c43409 --- /dev/null +++ b/shared/TryPopTest.cpp @@ -0,0 +1,54 @@ +// +// Created by Ben Bergkamp on 4/5/18. 
+// + +#include "ProducerConsumerQueue.h" +#include "../crawler/UrlFrontier.h" +#include <iostream> + +using namespace std; + +int main() + { + ProducerConsumerQueue< int > queue; + + queue.Push(2); + + int x; + + bool ret; + + cout << "-----Testing Producer Consumer Queue-----\n"; + cout << "Expecting: 1, 2, 0\n"; + + + ret = queue.try_pop(x); + + cout << "success: " << ret << endl; + cout << "val: " << x << endl; + + ret = queue.try_pop(x); + + cout << "success: " << ret << endl; + + cout << "-----Now Testing Url Frontier-----\n"; + cout << "Expecting: 1, http://www.espn.com, 0\n"; + + UrlFrontier fr; + + ParsedUrl ps("http://www.espn.com"); + ParsedUrl result; + + fr.Push(ps); + + ret = fr.try_pop(result); + + cout << "success: " << ret << endl; + cout << "val: " << result.getCompleteUrl() << endl; + + ret = queue.try_pop(x); + cout << "success: " << ret << endl; + + + + } \ No newline at end of file diff --git a/tests/testwebSeed.txt b/tests/testwebSeed.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaf1f9f0e53bd62c4fb8e2d61d2d5ee9befb8dba --- /dev/null +++ b/tests/testwebSeed.txt @@ -0,0 +1 @@ +http://www.alkdjhfalkd.com \ No newline at end of file diff --git a/tests/testwebSeed2.txt b/tests/testwebSeed2.txt new file mode 100644 index 0000000000000000000000000000000000000000..0350cf313294d25e319e9f156c79b18284bfbdc5 --- /dev/null +++ b/tests/testwebSeed2.txt @@ -0,0 +1,6 @@ +http://www.0.com +1 +http://www.1.com +akjdlhfad + +
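
For reference, below is a minimal, self-contained sketch of the timed try_pop pattern this patch adds to ProducerConsumerQueue and overrides in UrlFrontier. It is illustrative only and not part of the patch: the class name TimedQueue is made up, and the sketch keeps the same 5 second wait and 100 ms slack but normalizes the deadline so tv_nsec stays below one billion, which pthread_cond_timedwait requires (the committed expression (now.tv_usec + 1000UL*100) * 1000UL can exceed that bound when tv_usec is large).

#include <pthread.h>
#include <sys/time.h>
#include <cstdio>
#include <cstring>
#include <queue>
#include <utility>

// Sketch of the timed try_pop pattern introduced in this patch.
// TimedQueue is an illustrative name; the wait of 5 s + 100 ms mirrors
// the values used in ProducerConsumerQueue::try_pop.
template< class T >
class TimedQueue
{
public:
	void Push( T obj )
	{
		pthread_mutex_lock( &m );
		queue_.push( obj );
		if ( queue_.size( ) == 1 )
			pthread_cond_broadcast( &consumer_cv );
		pthread_mutex_unlock( &m );
	}

	// Waits up to roughly five seconds for an item; returns false on timeout.
	bool try_pop( T &result )
	{
		struct timeval now;
		struct timespec timeToWait;
		gettimeofday( &now, NULL );

		// Build an absolute deadline, carrying overflow from microseconds
		// into seconds so tv_nsec never reaches one billion.
		long usec = now.tv_usec + 100 * 1000;
		timeToWait.tv_sec = now.tv_sec + 5 + usec / 1000000;
		timeToWait.tv_nsec = ( usec % 1000000 ) * 1000;

		pthread_mutex_lock( &m );
		while ( queue_.empty( ) )
		{
			int retval = pthread_cond_timedwait( &consumer_cv, &m, &timeToWait );
			if ( retval != 0 )
			{
				fprintf( stderr, "pthread_cond_timedwait %s\n", strerror( retval ) );
				pthread_mutex_unlock( &m );
				return false;
			}
		}
		result = std::move( queue_.front( ) );
		queue_.pop( );
		pthread_mutex_unlock( &m );
		return true;
	}

private:
	std::queue< T > queue_;
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER;
};

Usage follows shared/TryPopTest.cpp: after Push, try_pop(result) returns true immediately with the popped value; calling try_pop on an empty queue blocks for about five seconds and then returns false, which is what lets the spiders exit cleanly instead of waiting forever on an empty UrlFrontier.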