diff --git a/CMakeLists.txt b/CMakeLists.txt index 4803feeaaad9d1628f81a4251d0d2c94ea4ee81e..80b2fbe42b88ae822eb153e5114f4d45b4d8c3b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,29 @@ add_executable(isolated-integration util/stringProcessing.cpp indexer/Indexer.cpp) + + + +add_executable(url-frontier-test + crawler/tests/urlFrontierTest.cpp + shared/ProducerConsumerQueue.h + shared/ThreadClass.h + shared/url.h + crawler/crawler.cpp + crawler/UrlFrontier.cpp + crawler/Readers/StreamReader.h + crawler/Readers/HttpReader.cpp + crawler/Readers/HttpsReader.cpp + crawler/Readers/LocalReader.cpp + crawler/spider.cpp + util/util.cpp + shared/Document.cpp + parser/Parser.cpp + util/Stemmer.cpp + util/Tokenizer.cpp + util/stringProcessing.cpp + indexer/Indexer.cpp) + add_executable(StringProcessingTest util/stringProcessing.cpp util/Stemmer.cpp @@ -118,6 +141,7 @@ find_package(OpenSSL REQUIRED) target_link_libraries(ParserTest OpenSSL::SSL) target_link_libraries(isolated-integration OpenSSL::SSL pthread) +target_link_libraries(url-frontier-test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-indexer-test OpenSSL::SSL pthread) diff --git a/ISRWord-tests b/ISRWord-tests index 41505cee0a4e4ce14664ef97256d66e1933e5621..c3787edeb5887451deb66e12db3e9707938054a0 100755 Binary files a/ISRWord-tests and b/ISRWord-tests differ diff --git a/crawler-parser-indexer-test b/crawler-parser-indexer-test index f997cca9827a90356576c20360b05e735d8bc86d..5d5083a568f57d11c40a1a010bc84ae4b201deb1 100755 Binary files a/crawler-parser-indexer-test and b/crawler-parser-indexer-test differ diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp index 0424d16003a1dd5fecfb6aac00eb64c1dc58cbdc..e9a9f8f106c71e191268da28c9098bb90e7b0f8a 100644 --- a/crawler/Readers/HttpReader.cpp +++ b/crawler/Readers/HttpReader.cpp @@ -14,11 +14,11 @@ bool HttpReader::request ( ) // Get the host address. 
- struct hostent *host = gethostbyname( url.getHost().c_str() ); + struct hostent *host = gethostbyname( url->getHost().c_str() ); if ( host == nullptr ) throw HTTPConnectionError; - if(url.getService() != "http") + if(url->getService() != "http") throw HTTPConnectionError; assert( host ); @@ -40,9 +40,9 @@ bool HttpReader::request ( ) cout << "Socket Reader is pulling from the web" << endl; string getMessage = "GET "; - getMessage += url.getCompleteUrl(); + getMessage += url->getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.getHost(); + getMessage += url->getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -78,7 +78,7 @@ string HttpReader::PageToString ( ) return temp; } -ParsedUrl HttpReader::getUrl ( ) +ParsedUrl * HttpReader::getUrl ( ) { return url; } diff --git a/crawler/Readers/HttpReader.h b/crawler/Readers/HttpReader.h index 6f1a88a6e821f080aafa8075239d4e9e92960bbf..a20fd74340167cbe5c55c5816142ef1ebe4c14ed 100644 --- a/crawler/Readers/HttpReader.h +++ b/crawler/Readers/HttpReader.h @@ -9,7 +9,7 @@ class HttpReader : public StreamReader { public: - HttpReader ( ParsedUrl url_in ) : url( url_in ) + HttpReader ( ParsedUrl * url_in ) : url( url_in ) { } bool request ( ); @@ -20,14 +20,14 @@ public: string PageToString ( ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); void closeReader ( ); private: - ParsedUrl url; + ParsedUrl * url; int sock; }; diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp index 31b9528573207b3ec29733342c24c4cef5668eb7..951d9149bc5811b553b3cf0a2928ba23324b0dfc 100644 --- a/crawler/Readers/HttpsReader.cpp +++ b/crawler/Readers/HttpsReader.cpp @@ -7,12 +7,12 @@ bool HttpsReader::request ( ) { try { - struct hostent *host = gethostbyname( url.getHost().c_str() ); + struct hostent *host = gethostbyname( url->getHost().c_str() ); if ( host == nullptr ) throw HTTPSconnectionError; - if( url.getService() != "https") + if( url->getService() != "https") throw HTTPSconnectionError; assert( host ); @@ -54,9 +54,9 @@ bool HttpsReader::request ( ) // Send a GET message for the desired page through the SSL. 
string getMessage = "GET "; - getMessage += url.getCompleteUrl(); + getMessage += url->getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.getHost(); + getMessage += url->getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -115,7 +115,7 @@ bool HttpsReader::checkStatus ( ) } -ParsedUrl HttpsReader::getUrl ( ) +ParsedUrl * HttpsReader::getUrl ( ) { return url; } diff --git a/crawler/Readers/HttpsReader.h b/crawler/Readers/HttpsReader.h index c993f62e8586d301d5ff6424a5b973b8e0d5d8a1..3d5e6cbf48c96bc046b53af125857db4d259afc6 100644 --- a/crawler/Readers/HttpsReader.h +++ b/crawler/Readers/HttpsReader.h @@ -10,7 +10,7 @@ class HttpsReader : public StreamReader { public: - HttpsReader ( ParsedUrl url_in ) : url( url_in ) + HttpsReader ( ParsedUrl * url_in ) : url( url_in ) { } bool request ( ); @@ -19,14 +19,14 @@ public: string PageToString ( ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); void closeReader ( ); bool checkStatus ( ); private: - ParsedUrl url; + ParsedUrl * url; int sock; SSL *ssl; SSL_CTX *ctx; diff --git a/crawler/Readers/LocalReader.cpp b/crawler/Readers/LocalReader.cpp index cef70bae71e13f472349b1950a57834e7298514e..217a2b480e680b211459d74b78162de34496130f 100644 --- a/crawler/Readers/LocalReader.cpp +++ b/crawler/Readers/LocalReader.cpp @@ -29,10 +29,10 @@ string LocalReader::PageToString ( ) return temp; } -ParsedUrl LocalReader::getUrl ( ) +ParsedUrl * LocalReader::getUrl ( ) { ParsedUrl url(test_url); - return url; + return &url; } bool LocalReader::checkStatus ( ) diff --git a/crawler/Readers/LocalReader.h b/crawler/Readers/LocalReader.h index 034e459422c4bd26cb4e448156fb87043ce55267..dbb716ba86bbeb522b4a0c255789be9e5d418f01 100644 --- a/crawler/Readers/LocalReader.h +++ b/crawler/Readers/LocalReader.h @@ -17,7 +17,7 @@ public: bool fillBuffer ( char *buf, size_t buf_size ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); bool checkStatus ( ); diff --git a/crawler/Readers/StreamReader.h b/crawler/Readers/StreamReader.h index 0ebd689c3a1b9ae54f2d05753e3171687572c309..621a809ff1ee1d4daace9176cb6014281c4cf632 100644 --- a/crawler/Readers/StreamReader.h +++ b/crawler/Readers/StreamReader.h @@ -30,7 +30,7 @@ public: virtual string PageToString ( ) = 0; - virtual ParsedUrl getUrl ( ) =0; + virtual ParsedUrl * getUrl ( ) =0; virtual void closeReader ( ) = 0; }; diff --git a/crawler/UrlFrontier.cpp b/crawler/UrlFrontier.cpp index 9a0532a38aa3a5b50974a2d5dd9015376f912004..b9b8bdb4e96f1cb2bcec9f3f678492db473d5cf8 100644 --- a/crawler/UrlFrontier.cpp +++ b/crawler/UrlFrontier.cpp @@ -2,4 +2,88 @@ // Created by Jake Close on 3/26/18. // -#include "UrlFrontier.h" +#include "urlFrontier.h" + + + +void UrlFrontier::checkUrl(ParsedUrl* url) + { + + if ( this->duplicateUrlMap->find( url->getCompleteUrl() ) != this->duplicateUrlMap->end( ) ) + return ; + + else + { + time_t now; + time( &now ); + double difference = 0; + //Has the domain been seen? 
+ if ( this->domainMap->find( url->getHost( )) != this->domainMap->end( )) + { + //get the last time it was seen and find the time difference + time_t lastSeen = this->domainMap->at( url->getHost( )); + difference = difftime( lastSeen, now ); + } + else + this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time + + + //add url to the duplicate url map + url->updateScore( difference ); + this->duplicateUrlMap->insert( std::make_pair( url->getCompleteUrl( ), 1 )); + return; + } + } + + +void UrlFrontier::Push( ParsedUrl * url ) + { + //if the url has been seen? if so, dont add it + + checkUrl(url); + + + //set the value of the last time the domain was seen to score + //url.setTime(difference); + //url.setScore(); + pthread_mutex_lock( &m ); + + queue.push( url ); + + if ( queue.size( ) == 1 ) + { + pthread_cond_broadcast( &consumer_cv ); + } + + pthread_mutex_unlock( &m ); + } + + + +ParsedUrl * UrlFrontier::Pop() + { + + + pthread_mutex_lock( &m ); + + while ( queue.empty( ) == true ) + { + pthread_cond_wait( &consumer_cv, &m ); + } + + ParsedUrl * front = queue.top( ); + queue.pop( ); + + pthread_mutex_unlock( &m ); + + return front; + + } + +size_t UrlFrontier::Size ( ) + { + pthread_mutex_lock( &m ); + size_t size = queue.size( ); + pthread_mutex_unlock( &m ); + return size; + } diff --git a/crawler/urlFrontierTest.h b/crawler/UrlFrontier.h similarity index 54% rename from crawler/urlFrontierTest.h rename to crawler/UrlFrontier.h index 301145aa12a570519c803a25c4dd3042d2c4d7ed..61adc3e82f51f811e27ada13f5bc019a9bffd0f5 100644 --- a/crawler/urlFrontierTest.h +++ b/crawler/UrlFrontier.h @@ -11,27 +11,34 @@ using namespace std; class ComparisonClass { - bool operator() (ParsedUrl lhs , ParsedUrl rhs) { +public: + bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) { //comparison code here - return lhs.getScore() < rhs.getScore(); + return lhs->getScore() < rhs->getScore(); } }; -class UrlFrontier : public ProducerConsumerQueue<ParsedUrl> +class UrlFrontier { public: - void Add ( ParsedUrl url ); + void Push ( ParsedUrl * url ); + void checkUrl(ParsedUrl * url); + + ParsedUrl * Pop ( ); + size_t Size(); + pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; + pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER; + std::priority_queue<ParsedUrl *, std::vector<ParsedUrl*>, ComparisonClass> queue; + - ParsedUrl Get ( ); private: unordered_map< string , bool > *duplicateUrlMap = new unordered_map< string, bool >( ); unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( ); - std::priority_queue<ParsedUrl, std::vector<ParsedUrl>, ComparisonClass> pq; }; diff --git a/crawler/crawler.h b/crawler/crawler.h index 1aec53195b1ade3e2b255e54776012a70687d679..b2727e2dedec2ab99abcaebfc314ae63c3fc503f 100644 --- a/crawler/crawler.h +++ b/crawler/crawler.h @@ -18,7 +18,7 @@ class Crawler public: Crawler ( string mode_in, - ProducerConsumerQueue< ParsedUrl > *url_q_in, + ProducerConsumerQueue< ParsedUrl * > *url_q_in, ProducerConsumerQueue< DocIndex * > *doc_index_queue_in ) : IndexerQueue( doc_index_queue_in ), mode( mode_in ), @@ -38,7 +38,7 @@ public: private: vector< Spider * > spiders; - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + ProducerConsumerQueue< ParsedUrl * > *urlFrontier; ProducerConsumerQueue< DocIndex * > *IndexerQueue; //CrawlerStatistics housekeeper; string mode; diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 8cbda419b69df25a569315c1fef9acf5fa0f7a31..7b2f53a014251f3965140d1978ab003f8a7867b7 100644 --- 
a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -14,29 +14,29 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >; // FIND A BETTER PLACE TO PUT THIS FUNCTION -StreamReader *SR_factory ( ParsedUrl url, string mode ) +StreamReader *SR_factory ( ParsedUrl * url, string mode ) { string localFile; StreamReader *newReader = nullptr; if ( mode == "local" ) { - newReader = new LocalReader( url.getCompleteUrl() ); + newReader = new LocalReader( url->getCompleteUrl() ); } else if ( mode == "web" ) { - if ( url.getService() == "http" ) + if ( url->getService() == "http" ) { newReader = new HttpReader( url ); } - else if ( url.getService() == "https" ) + else if ( url->getService() == "https" ) { newReader = new HttpsReader( url ); } else { cerr << "Error reading service type\n"; - cerr << "Service Type: " << url.getService() << "\n"; + cerr << "Service Type: " << url->getService() << "\n"; } } @@ -70,7 +70,7 @@ size_t Spider::hash ( const char *s ) } -ParsedUrl Spider::getUrl ( ) +ParsedUrl * Spider::getUrl ( ) { return urlFrontier->Pop( ); } @@ -82,8 +82,8 @@ void Spider::run ( ) while ( cond < 250 ) { - ParsedUrl currentUrl = getUrl( ); - size_t docID = hash( currentUrl.getCompleteUrl().c_str() ); + ParsedUrl * currentUrl = getUrl( ); + size_t docID = hash( currentUrl->getCompleteUrl().c_str() ); if ( shouldURLbeCrawled( docID ) ) { StreamReader *reader = SR_factory( currentUrl, this->mode ); @@ -92,11 +92,11 @@ void Spider::run ( ) bool success = reader->request( ); if ( success ) { - cout << "Parsing " << currentUrl.getCompleteUrl(); + cout << "Parsing " << currentUrl->getCompleteUrl(); DocIndex *dict = parser.execute( reader ); IndexerQueue->Push( dict ); - printDocIndex(dict); + //printDocIndex(dict); reader->closeReader( ); //delete dict; diff --git a/crawler/spider.h b/crawler/spider.h index 03b40f9a3e5d9fb55e9e9d96b3e30063c1655882..01cdc82d6b3dade0bf043bc2ba90f34e41dc25e6 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -23,7 +23,7 @@ class Spider : public ThreadClass public: Spider ( string mode_in, - ProducerConsumerQueue< ParsedUrl > *url_q_in, + ProducerConsumerQueue< ParsedUrl* > *url_q_in, unordered_map< size_t, int > *duplicate_url_map_in, ProducerConsumerQueue< DocIndex * > *doc_index_queue_in ) @@ -38,7 +38,7 @@ public: //Takes a url off of the url frontier - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); virtual void run ( ); @@ -54,7 +54,7 @@ public: private: int locationOnDisk; - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + ProducerConsumerQueue< ParsedUrl * > *urlFrontier; ProducerConsumerQueue< DocIndex * > *IndexerQueue; unordered_map< size_t, int > *duplicateUrlMap; string mode; diff --git a/crawler/tests/crawlerTest.cpp b/crawler/tests/crawlerTest.cpp index 4c94db4ff982e9a6ed9fbbce57279104ce100dc1..991c319ce2bdec4fb6d786836cf1d78740df69a4 100644 --- a/crawler/tests/crawlerTest.cpp +++ b/crawler/tests/crawlerTest.cpp @@ -58,10 +58,17 @@ int main ( int argc, char *argv[] ) */ SSL_library_init( ); - string url1 = "https://fivethirtyeight.com/features/fear-not-readers-we-have-rss-feeds/"; - string url2 = "https:"; - ParsedUrl url = ParsedUrl(url2); - urlFrontier->Push(url); + //string url1 = "https://fivethirtyeight.com"; + //string url2 = "https:"; + + string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1"; + string bad_url2 ="http-equiv=Content-Type"; + //ParsedUrl url = ParsedUrl(bad_url); + ParsedUrl url1 = ParsedUrl(bad_url); + ParsedUrl url2 = ParsedUrl(bad_url2); + urlFrontier->Push(url1); + + urlFrontier->Push(url2); 
indexer.StartThread( ); Crawler crawler( mode, urlFrontier, IndexerQueue ); diff --git a/crawler/tests/urlFrontierTest.cpp b/crawler/tests/urlFrontierTest.cpp index 8b3907e5cb52df59e3f30294a6f08cde3df98aa8..829fbe3750571c359f7affd1027d0d9c169fca2c 100644 --- a/crawler/tests/urlFrontierTest.cpp +++ b/crawler/tests/urlFrontierTest.cpp @@ -16,8 +16,7 @@ #include "../../util/util.h" #include <getopt.h> #include "../../indexer/Indexer.h" -#include "../urlFrontierTest.h" - +#include "../UrlFrontier.h" using DocIndex = const unordered_map< string, vector< unsigned long > >; @@ -44,8 +43,8 @@ int main ( int argc, char *argv[] ) //string url2 = "https:"; //string bad_url = "http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\" />"; - ParsedUrl url = ParsedUrl(url1); - urlFrontier->Add(url); + ParsedUrl * url = new ParsedUrl(url1); + urlFrontier->Push(url); indexer.StartThread( ); Crawler crawler( mode, urlFrontier, IndexerQueue ); diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 2ee12072b43f2603e6143e398bdfc0bf8af80968..ae7f2e1880d94c696686ab3eed56f6585bf3cfef 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -42,7 +42,7 @@ void Indexer::run ( ) docEnd.docNumWords = indexedCount; docEndings.push_back(docEnd); - if(currentBlockNumberWords >= 10000) { + if(currentBlockNumberWords >= 100000) { save(); reset(); } diff --git a/isolated-integration b/isolated-integration index e0ed25804c897030d5baf01a57fdfdced4f559bd..85c7aa99ed2c98423473342fb196a327b89df840 100755 Binary files a/isolated-integration and b/isolated-integration differ diff --git a/main.cpp b/main.cpp index dfc234ec6aeeb71a12316316afc1b6b58680ed79..4e23cfe4178812232f31db621ae2b9c0648c7f84 100644 --- a/main.cpp +++ b/main.cpp @@ -97,7 +97,7 @@ int main ( int argc, char *argv[] ) unordered_map< size_t, int > *duplicateUrlMap = new unordered_map< size_t, int >( ); - ProducerConsumerQueue< ParsedUrl > *urlFrontier = new ProducerConsumerQueue< ParsedUrl >( ); + ProducerConsumerQueue< ParsedUrl * > *urlFrontier = new ProducerConsumerQueue< ParsedUrl * >( ); ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( ); @@ -117,7 +117,7 @@ int main ( int argc, char *argv[] ) if ( *seeds == '\n' ) { - ParsedUrl url = ParsedUrl( testFile ); + ParsedUrl * url = new ParsedUrl( testFile ); cout << "Pushing: " << testFile << " to queue\n"; urlFrontier->Push( url ); testFile = ""; @@ -129,7 +129,7 @@ int main ( int argc, char *argv[] ) if ( testFile != "" ) { cout << "Pushing: " << testFile << " to queue\n"; - ParsedUrl url = ParsedUrl( testFile ); + ParsedUrl * url = new ParsedUrl( testFile ); urlFrontier->Push( url ); } diff --git a/parser/Parser.cpp b/parser/Parser.cpp index f0f56c028c296b7966a9544844f8df67b9d06dac..1fed6c1f40d91412181efcc0e0e1ca8fd8881cfa 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -5,7 +5,7 @@ * Parser Cstor * @param urlFrontierIn */ -Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn ) +Parser::Parser ( ProducerConsumerQueue< ParsedUrl* > *urlFrontierIn ) { urlFrontier = urlFrontierIn; } @@ -31,17 +31,17 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) unsigned long htmlIt = 0; unsigned long offset = 0; - ParsedUrl currentUrl = reader->getUrl( ); + ParsedUrl * currentUrl = reader->getUrl( ); // tokenize anchor // TODO ParsedUrl with anchor text - string anchorText = currentUrl.getAnchorText( ); + string anchorText = currentUrl->getAnchorText( ); if ( anchorText != "" ) { offset = tokenizer->execute( anchorText, offset, 
Tokenizer::ANCHOR ); } // tokenize url - offset = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offset, Tokenizer::URL ); + offset = tokenizer->execute( currentUrl->getHost( ) + "/" + currentUrl->getPath( ), offset, Tokenizer::URL ); string html = reader->PageToString( ); while ( htmlIt < html.size( ) ) @@ -256,22 +256,30 @@ bool Parser::isValid ( string url ) * @param anchorText --> will be "null" if empty * @param debug --> will print urls to std::cout */ -void Parser::pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug ) +void Parser::pushToUrlQueue ( string url, ParsedUrl * currentUrl, string anchorText, bool debug ) { - if ( isLocal( url ) ) + if ( isLocal( url )) { - url = currentUrl.getCompleteUrl( ) + url; + url = currentUrl->getCompleteUrl( ) + url; } - if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) ) + if ( isValid( url ) && url != currentUrl->getCompleteUrl( )) { - ParsedUrl pUrl = ParsedUrl( url ); - pUrl.setAnchorText( anchorText ); - urlFrontier->Push( pUrl ); - if ( debug ) + try { - cout << url << endl; - cout << anchorText << endl; + ParsedUrl *pUrl = new ParsedUrl( url ); + pUrl->setAnchorText( anchorText ); + urlFrontier->Push( pUrl ); + if ( debug ) + { + cout << url << endl; + cout << anchorText << endl; + } } + catch (exception e) + { + cerr << "HTML url parsed from web page had issue creating object" << endl; + } + } } @@ -308,7 +316,7 @@ void Parser::removeTag ( string & html, unsigned long & htmlIt, unsigned long sa */ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ) + ParsedUrl * currentUrl ) { // check if line is url string title = extractTitle( line ); @@ -372,7 +380,7 @@ bool Parser::isTag ( string html, string tag ) */ string Parser::extractBody ( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ) + ParsedUrl * currentUrl ) { string body = ""; unsigned long startParTag = findNext( "<p", 0, html ); diff --git a/parser/Parser.h b/parser/Parser.h index 33f3bbecaba4e091bf438a6c76ae1a3add9d4014..3f9642105cc483aa7bdc3ec3f1dd9af502ad51c1 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -27,7 +27,7 @@ public: * Parser Cstor * @param urlFrontierIn */ - Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn ); + Parser ( ProducerConsumerQueue< ParsedUrl* > *urlFrontierIn ); /** * Executes the Parser @@ -37,7 +37,7 @@ public: private: - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + ProducerConsumerQueue< ParsedUrl* > *urlFrontier; /** * Parses file @@ -92,7 +92,7 @@ private: * @param anchorText * @param debug --> will print urls to std::cout */ - void pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug ); + void pushToUrlQueue ( string url, ParsedUrl * currentUrl, string anchorText, bool debug ); /** * Returns true if given tag @@ -116,7 +116,7 @@ private: */ string extractBody ( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ); + ParsedUrl * currentUrl ); /** @@ -131,7 +131,7 @@ private: */ void extractAll ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ); + ParsedUrl * currentUrl ); //TODO delete?? 
may not need /** diff --git a/shared/url.h b/shared/url.h index 7f7de34157cc6a0acdf77cf7a67d998ea9d4fbe8..a5dc6810b85a4dbd6a6af24df66f3fcc1853b3d5 100644 --- a/shared/url.h +++ b/shared/url.h @@ -5,6 +5,7 @@ #include <iostream> #include "../util/util.h" #include <math.h> +#include <time.h> //#include "../crawler/SocketReader.h" using namespace std; @@ -28,6 +29,7 @@ private: AnchorText; double Score; + public: ParsedUrl() {} @@ -36,87 +38,99 @@ public: { // Assumes url points to static text but // does not check. - char *temp_CompleteUrl, - *temp_Service, - *temp_Host, - *temp_Domain, - *temp_Path, - *temp_AnchorText, - *temp_pathBuffer; + try + { - //intialize anchor text to "" - char *null = new char[2]; - strcpy( null, string( "" ).c_str( ) ); - temp_AnchorText = null; + char *temp_CompleteUrl, + *temp_Service, + *temp_Host, + *temp_Domain, + *temp_Path, + *temp_AnchorText, + *temp_pathBuffer; - char *url = new char[input_url.length( ) + 1]; - strcpy( url, input_url.c_str( ) ); + //intialize anchor text to "" + char *null = new char[2]; + strcpy( null, string( "" ).c_str( ) ); + temp_AnchorText = null; - temp_CompleteUrl = url; + char *url = new char[input_url.length( ) + 1]; + strcpy( url, input_url.c_str( ) ); - temp_pathBuffer = new char[strlen( url ) + 1]; - char *f, *t; - for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); ); + temp_CompleteUrl = url; - temp_Service = temp_pathBuffer; + temp_pathBuffer = new char[strlen( url ) + 1]; + char *f, *t; + for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); ); - const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; - char *p; - for ( p = temp_pathBuffer; *p && *p != Colon; p++ ); + temp_Service = temp_pathBuffer; - if ( *p ) - { - // Mark the end of the Service. - *p++ = 0; + const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; + char *p; + for ( p = temp_pathBuffer; *p && *p != Colon; p++ ); - if ( *p == Slash ) - p++; - if ( *p == Slash ) - p++; + if ( *p ) + { + // Mark the end of the Service. + *p++ = 0; - temp_Host = p; + if ( *p == Slash ) + p++; + if ( *p == Slash ) + p++; - for ( ; *p && *p != Slash; p++ ); + temp_Host = p; - if ( *p ) - // Mark the end of the Host. - *p++ = 0; + for ( ; *p && *p != Slash; p++ ); - //char * domainBuffer = new char[ 20 ]; - //get the domain: - char *i = temp_Host; - temp_Domain = null; - if(i) - { - for ( ; *i; i++ ) + if ( *p ) + // Mark the end of the Host. + *p++ = 0; + + //char * domainBuffer = new char[ 20 ]; + //get the domain: + char *i = temp_Host; + temp_Domain = null; + if(i) { - if ( *i == Period ) - temp_Domain = i; + for ( ; *i; i++ ) + { + if ( *i == Period ) + temp_Domain = i; + } } + + // Whatever remains is the Path. // need to remove fragments + + temp_Path = p; + for ( ; *p && *p != HashTag; p++ ); + + if ( *p ) + // Mark the end of the Path, remove fragments. + *p++ = 0; } + else + temp_Host = temp_Path = p; - // Whatever remains is the Path. // need to remove fragments - temp_Path = p; - for ( ; *p && *p != HashTag; p++ ); + CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); + Service = string(temp_Service, strlen(temp_Service)); + Host = string(temp_Host, strlen(temp_Host)); + Domain = string(temp_Domain, strlen(temp_Domain)); + Path = string(temp_Path, strlen(temp_Path)); + AnchorText = string(temp_AnchorText, strlen(temp_AnchorText)); + pathBuffer = temp_pathBuffer; - if ( *p ) - // Mark the end of the Path, remove fragments. 
- *p++ = 0; - } - else - temp_Host = temp_Path = p; + setScore( ); + } + catch (exception e) + { + cerr << "Error constructing a ParsedUrl from string url "<< endl; + throw std::runtime_error("Unable to construct ParsedUrl Object"); - CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); - Service = string(temp_Service, strlen(temp_Service)); - Host = string(temp_Host, strlen(temp_Host)); - Domain = string(temp_Domain, strlen(temp_Domain)); - Path = string(temp_Path, strlen(temp_Path)); - AnchorText = string(temp_AnchorText, strlen(temp_AnchorText)); - pathBuffer = temp_pathBuffer; + } - setScore( ); } void printUrl ( ) @@ -186,6 +200,18 @@ public: return Path; } + + double getScore ( ) + { + return Score; + } + + void updateScore( double time ) + { + + Score += 3 * time; + } + std::string getAnchorText ( ) { return AnchorText; diff --git a/url-frontier-test b/url-frontier-test new file mode 100755 index 0000000000000000000000000000000000000000..c380a78ab01564d2afaf0db2fd816644b584051e Binary files /dev/null and b/url-frontier-test differ
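
For orientation, below is a minimal, illustrative sketch of how the reworked frontier is meant to be driven after this change: URLs are heap-allocated ParsedUrl objects passed by pointer, Push deduplicates and scores them via checkUrl, and Pop blocks until a URL is available and returns the highest-scoring one first. The include paths, the seed URL, and the explicit delete are assumptions for the example, not part of the patch.

```cpp
// Sketch only: assumes the repo's UrlFrontier.h and shared/url.h as changed above.
#include "crawler/UrlFrontier.h"
#include "shared/url.h"

#include <iostream>

int main ( )
{
	UrlFrontier frontier;

	// Seed URLs are now heap-allocated and handed to the frontier by pointer.
	// The ParsedUrl constructor can throw on malformed input, so guard it.
	try
	{
		ParsedUrl *seed = new ParsedUrl( "https://eecs.umich.edu/" );   // example seed
		frontier.Push( seed );   // checkUrl( ) dedups and scores by domain recency
	}
	catch ( const std::exception &e )
	{
		std::cerr << "Could not parse seed URL" << std::endl;
	}

	if ( frontier.Size( ) > 0 )
	{
		// Pop( ) waits on consumer_cv while the queue is empty and returns
		// the highest-scoring URL first (ComparisonClass orders by getScore).
		ParsedUrl *next = frontier.Pop( );
		std::cout << next->getCompleteUrl( ) << std::endl;
		delete next;   // in this sketch the caller owns the popped pointer
	}

	return 0;
}
```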