diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b582313732d930377913beea730db08519ed949..2dab300962cc69b259b5cb347901d6698d1d7e46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,7 @@ add_executable(crawler-parser-test
         shared/ThreadClass.h
         shared/url.h
         crawler/crawler.cpp
+        crawler/UrlFrontier.cpp
         crawler/Readers/StreamReader.h
         crawler/Readers/HttpReader.cpp
         crawler/Readers/HttpsReader.cpp
@@ -53,6 +54,7 @@ add_executable(isolated-integration
         crawler/tests/crawlerTest.cpp
         shared/ProducerConsumerQueue.h
         shared/ThreadClass.h
+        crawler/UrlFrontier.cpp
         shared/url.h
         crawler/crawler.cpp
         crawler/Readers/StreamReader.h
diff --git a/crawler-parser-test b/crawler-parser-test
index a10a9df06cf72703b91379bc6bad67723c8c33df..a051b684568e23fa1e132fc36ccf5bcf3c6768fa 100755
Binary files a/crawler-parser-test and b/crawler-parser-test differ
diff --git a/crawler/tests/crawlerTest.cpp b/crawler/tests/crawlerTest.cpp
index 2fc947bd2c9ee80dcf81bdd56a93eb107b2fea21..e67f48a5b42436b9076c1922541f0dc2d1f565ae 100644
--- a/crawler/tests/crawlerTest.cpp
+++ b/crawler/tests/crawlerTest.cpp
@@ -31,7 +31,7 @@ int main ( int argc, char *argv[] )
 	char *seeds;
 	int numberOfSpiders = 1;
 	unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( );
-	ProducerConsumerQueue < ParsedUrl > *urlFrontier = new ProducerConsumerQueue < ParsedUrl >( );
+	UrlFrontier *urlFrontier = new UrlFrontier( );
 	ProducerConsumerQueue < DocIndex * > *IndexerQueue = new ProducerConsumerQueue < DocIndex * >( );
 	Indexer indexer( IndexerQueue );
 	string path = util::GetCurrentWorkingDir() +"/crawler/tests/testSeeds.txt";
@@ -66,9 +66,9 @@ int main ( int argc, char *argv[] )
 	//ParsedUrl url = ParsedUrl(bad_url);
 	ParsedUrl url1 = ParsedUrl(bad_url);
 	ParsedUrl url2 = ParsedUrl(bad_url2);
-	urlFrontier->Push(url1);
+	urlFrontier->Push(&url1);
 
-	urlFrontier->Push(url2);
+	urlFrontier->Push(&url2);
 
 	indexer.StartThread( );
 	Crawler crawler( mode, urlFrontier, IndexerQueue );
diff --git a/isolated-integration b/isolated-integration
index 85c7aa99ed2c98423473342fb196a327b89df840..b7acc4d01e0064e8a3f6e6a4a770401ad971e207 100755
Binary files a/isolated-integration and b/isolated-integration differ
diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp
index be53342c472741aa7d0e2e0ea2b4ffbc56fc3b2c..76589eac58d251441a4cc2eccf05d5088a5df413 100644
--- a/parser/tests/parserTest.cpp
+++ b/parser/tests/parserTest.cpp
@@ -12,7 +12,7 @@
 #include "../../crawler/Readers/HttpReader.h"
 #include "../../crawler/Readers/HttpsReader.h"
 #include "../../util/util.h"
-
+#include "../../crawler/UrlFrontier.h"
 using namespace std;
 
 void testSimple( );
@@ -52,7 +52,7 @@ void printDictionary ( const unordered_map< string, vector< unsigned long > > di
 void testSimple ( )
 	{
 	cout << "Testing Simple: " << endl;
-	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+	UrlFrontier urlFrontierTest;
 	Parser parser( &urlFrontierTest );
 	ParsedUrl fake_url = ParsedUrl( "http://www.cats.com" );
 	string filepath = util::GetCurrentWorkingDir( ) + "/tests/plaintext.txt";
@@ -93,11 +93,11 @@ void testSimple ( )
 void testHttp( )
 	{
 	cout << "Testing Complex: " << endl;
-	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+	UrlFrontier urlFrontierTest;
 	Parser parser( &urlFrontierTest );
 
 	ParsedUrl httpURL = ParsedUrl( "http://veronicacday.com/" );
-	HttpReader reader( httpURL );
+	HttpReader reader( &httpURL );
 	auto success = reader.request( );
 	if ( !success )
 		{
@@ -109,10 +109,10 @@ void testHttp( )
 	printDictionary( *dictionary );
 
 	urlFrontierTest.Pop( );
-	assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://trove.com/" );
-	assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://arcinnovations.xyz/" );
-	assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://gwydion.co/" );
-	assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" );
+	assert( urlFrontierTest.Pop( )->getCompleteUrl( ) == "https://trove.com/" );
+	assert( urlFrontierTest.Pop( )->getCompleteUrl( ) == "http://arcinnovations.xyz/" );
+	assert( urlFrontierTest.Pop( )->getCompleteUrl( ) == "https://gwydion.co/" );
+	assert( urlFrontierTest.Pop( )->getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" );
 
 	assert ( dictionary != nullptr );
 	assert ( dictionary->size( ) == 67 );
@@ -136,7 +136,7 @@ void testHttp( )
 void testURL ( )
 	{
 	cout << "Testing URL: " << endl;
-	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+	UrlFrontier urlFrontierTest ;
 	Parser parser( &urlFrontierTest );
 	ParsedUrl fake_url = ParsedUrl( "http://testurl.com" );
 	string filepath = util::GetCurrentWorkingDir( ) + "/tests/urlTest.html";
@@ -156,7 +156,7 @@ void testURL ( )
 	assert ( dictionary != nullptr );
 	assert ( dictionary->size( ) == 3 );
 	assert ( dictionary->at( "=testurl.com/" )[ 0 ] == 0 );
-	assert ( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://www.bafta.org/" );
+	assert ( urlFrontierTest.Pop( )->getCompleteUrl( ) == "http://www.bafta.org/" );
 	assert ( dictionary->find( "$bafta" ) == dictionary->end( ) );
 	assert ( dictionary->at( "$testurl" )[ 0 ] == 0 );
 	assert ( dictionary->at( "$com" )[ 0 ] == 1 );
@@ -170,7 +170,7 @@ void testURL ( )
 void testBody ( )
 	{
 	cout << "Testing Body: " << endl;
-	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+	UrlFrontier urlFrontierTest;
 	Parser parser( &urlFrontierTest );
 	ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" );
 	string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html";
@@ -220,7 +220,7 @@ void testBody ( )
 void testExtractBody ( )
 	{
 	cout << "Testing ExtractBody: " << endl;
-	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
+	UrlFrontier urlFrontierTest;
 	Parser parser( &urlFrontierTest );
 	ParsedUrl fake_url = ParsedUrl( "https://developer.mozilla.org/en-US/docs/Learn" );
 	string filepath = util::GetCurrentWorkingDir( ) + "/tests/testExtractBodyTest.html";
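
Note: this patch replaces every ProducerConsumerQueue< ParsedUrl > with the new UrlFrontier and moves the call sites to pointer semantics (Push( &url ), Pop( )->getCompleteUrl( )), but crawler/UrlFrontier.h itself is not part of the diff. For orientation, below is a minimal sketch of the interface these call sites imply. Everything beyond the Push/Pop signatures is an assumption: the include path is guessed from the repo layout, and the blocking mutex/condition-variable internals merely mirror the ProducerConsumerQueue being replaced; the committed implementation may differ.

// Hypothetical sketch of crawler/UrlFrontier.h, reconstructed from the call
// sites in this diff; NOT the committed implementation. Only Push( ParsedUrl * )
// and Pop( ) returning ParsedUrl * are exercised by the tests above.
#pragma once

#include <condition_variable>
#include <mutex>
#include <queue>

#include "../shared/url.h"   // ParsedUrl (assumed include path)

class UrlFrontier
	{
public:
	// Producer side: crawler/parser threads hand discovered URLs to the frontier.
	void Push ( ParsedUrl *url )
		{
		std::unique_lock< std::mutex > lock( mtx );
		urls.push( url );
		cv.notify_one( );
		}

	// Consumer side: blocks until a URL is available, then returns it.
	ParsedUrl *Pop ( )
		{
		std::unique_lock< std::mutex > lock( mtx );
		while ( urls.empty( ) )
			cv.wait( lock );
		ParsedUrl *url = urls.front( );
		urls.pop( );
		return url;
		}

private:
	std::queue< ParsedUrl * > urls;   // non-owning: callers keep the ParsedUrl alive
	std::mutex mtx;
	std::condition_variable cv;
	};

Consistent with the tests, which push addresses of stack-allocated ParsedUrl objects (Push( &url1 )), the sketch stores raw non-owning pointers, so callers must keep each ParsedUrl alive until it has been popped and consumed.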