diff --git a/CMakeLists.txt b/CMakeLists.txt index 412b88970990bc2d1d99d0b7f30af467f2a324c3..a3c0a916fef24037cb926cd96ecd45faa3f23ed7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ add_executable(crawler-parser-Test shared/ThreadClass.h shared/url.h crawler/crawler.cpp + crawler/UrlFrontier.cpp crawler/Readers/StreamReader.h crawler/Readers/HttpReader.cpp crawler/Readers/HttpsReader.cpp @@ -33,6 +34,8 @@ add_executable(crawler-parser-indexer-Test shared/ThreadClass.h shared/url.h crawler/crawler.cpp + crawler/UrlFrontier.cpp + crawler/HouseKeeper.cpp crawler/Readers/StreamReader.h crawler/Readers/HttpReader.cpp crawler/Readers/HttpsReader.cpp @@ -52,6 +55,7 @@ add_executable(isolated-integration crawler/tests/crawlerTest.cpp shared/ProducerConsumerQueue.h shared/ThreadClass.h + crawler/UrlFrontier.cpp shared/url.h crawler/crawler.cpp crawler/Readers/StreamReader.h @@ -67,6 +71,29 @@ add_executable(isolated-integration util/stringProcessing.cpp indexer/Indexer.cpp) + + + +add_executable(url-frontier-test + crawler/tests/urlFrontierTest.cpp + shared/ProducerConsumerQueue.h + shared/ThreadClass.h + shared/url.h + crawler/crawler.cpp + crawler/UrlFrontier.cpp + crawler/Readers/StreamReader.h + crawler/Readers/HttpReader.cpp + crawler/Readers/HttpsReader.cpp + crawler/Readers/LocalReader.cpp + crawler/spider.cpp + util/util.cpp + shared/Document.cpp + parser/Parser.cpp + util/Stemmer.cpp + util/Tokenizer.cpp + util/stringProcessing.cpp + indexer/Indexer.cpp) + add_executable(StringProcessingTest util/stringProcessing.cpp util/Stemmer.cpp @@ -92,6 +119,7 @@ add_executable(ParserTest shared/ProducerConsumerQueue.h util/stringProcessing.cpp util/Stemmer.cpp + crawler/UrlFrontier.cpp parser/tests/parserTest.cpp crawler/Readers/StreamReader.h crawler/Readers/LocalReader.cpp @@ -113,11 +141,26 @@ add_executable(ISRWord-tests util/stringProcessing.cpp util/Stemmer.cpp ) + + +add_executable(ISROR-tests + util/util.cpp + constraintSolver/ISR.cpp + constraintSolver/ISRWord.cpp + constraintSolver/ISROr.cpp + constraintSolver/tests/ISROrTests.cpp + constraintSolver/ISREndDoc.cpp + util/stringProcessing.cpp + util/Stemmer.cpp ) + + + find_package(OpenSSL REQUIRED) target_link_libraries(ParserTest OpenSSL::SSL) target_link_libraries(isolated-integration OpenSSL::SSL pthread) +target_link_libraries(url-frontier-test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-Test OpenSSL::SSL pthread) target_link_libraries(crawler-parser-indexer-Test OpenSSL::SSL pthread) diff --git a/ISRWord-tests b/ISRWord-tests deleted file mode 100755 index 41505cee0a4e4ce14664ef97256d66e1933e5621..0000000000000000000000000000000000000000 Binary files a/ISRWord-tests and /dev/null differ diff --git a/constraintSolver/ISR.h b/constraintSolver/ISR.h index a776245a23a0b2498aaef77aeaac98395b16d7d0..148909d2afb156ae09293cbfc99f458b38040b94 100644 --- a/constraintSolver/ISR.h +++ b/constraintSolver/ISR.h @@ -5,8 +5,15 @@ #pragma once //#include "Post.h" +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> -typedef size_t Location; // Location 0 is the null location. +typedef size_t Location; // Location 0 is the null location. class ISR { @@ -29,7 +36,6 @@ public: //Returns first instance of word after target location virtual Location Seek ( Location target ); - virtual ISR *GetDocumentISR ( ); //Returns the location of the end of the document virtual Location GetEndDocument ( ); diff --git a/constraintSolver/ISROr.cpp b/constraintSolver/ISROr.cpp index 433269b93fd1bc04c09386d30c4665c761f7ab3c..6a1aa56627a18c234c25dff884c040c56a723d16 100644 --- a/constraintSolver/ISROr.cpp +++ b/constraintSolver/ISROr.cpp @@ -10,6 +10,10 @@ Location ISROr::GetStartLocation ( ) return nearestStartLocation; } +Location ISROr::GetCurrentLocation(){ + return nearestStartLocation; + } + Location ISROr::GetEndLocation ( ) { @@ -17,6 +21,8 @@ Location ISROr::GetEndLocation ( ) } + + Location ISROr::Seek ( Location target ) { @@ -30,38 +36,39 @@ Location ISROr::Seek ( Location target ) - + return 1; } /* Returns the location of the next document that is a match */ -ISR *ISROr::Next ( ) +Location ISROr::Next ( ) { Location nearestEnd = this->nearestTerm->GetEndDocument( ); - while ( *Terms ) + for(auto Term : Terms) { - Location newSeekLocation = *Terms->Seek( nearestEnd + 1 ); + Location newSeekLocation = Term->Seek( nearestEnd + 1 ); if ( newSeekLocation < nearestStartLocation ) { nearestStartLocation = newSeekLocation; - nearestTerm = *Term; + nearestTerm = Term; } - *Terms++; } - return this->nearestTerm->GetDocumentISR( ); + return this->nearestTerm->currentLocation; } -ISR *ISROR::GetCurrentEndDoc ( ) + +/* +ISR *ISROr::GetCurrentEndDoc ( ) { return this->nearestTerm->GetDocumentISR( ); } - +*/ diff --git a/constraintSolver/ISROr.h b/constraintSolver/ISROr.h index aa74f680b54becfd74950a749a03bca469af260b..9486a3c471ae0a55b0ed8aef742ce6a8bb2689a1 100644 --- a/constraintSolver/ISROr.h +++ b/constraintSolver/ISROr.h @@ -5,24 +5,28 @@ #pragma once #include "ISR.h" - +#include <vector> // Find occurrences of any child ISR. - - -class ISROr : publicISR +using namespace std; +class ISROr : public ISR { public: - ISR **Terms; + vector<ISR*>Terms; unsigned NumberOfTerms; + Location GetCurrentLocation(); + Location GetStartLocation ( ); Location GetEndLocation ( ); Location Seek ( Location target ); - ISR *GetCurrentEndDoc ( ); + //ISR *GetCurrentEndDoc ( ); + + + Location First ( ) ; Location Next ( ); //{ Do a next on the nearest term, then return// the new nearest match.} @@ -34,12 +38,11 @@ public: // { Seek all the ISRs to the first occurrence just past the end of this document.returnSeek( DocumentEnd->GetEndLocation( ) + 1 );} - ISROr ( ISR **InputTerms ) : Terms( InputTerms ) + ISROr ( vector<ISR * > InputTerms ) : Terms( InputTerms ) { - ISR *currentTerm = *InputTerms; - While( *currentTerm ) - { + for(auto currentTerm : InputTerms) + { currentTerm->First( ); Location currentLocation = currentTerm->currentLocation; if ( currentLocation < nearestStartLocation ) @@ -53,7 +56,7 @@ public: nearestEndLocation = currentLocation; } ++NumberOfTerms; - *currentTerm++; + currentTerm++; } diff --git a/constraintSolver/ISRWord.cpp b/constraintSolver/ISRWord.cpp index 7be6255fdf4d4911590dd09741b6b54ffc210933..2c180c50eedb71c05babe0a0c45e74cc2902fe0c 100644 --- a/constraintSolver/ISRWord.cpp +++ b/constraintSolver/ISRWord.cpp @@ -12,7 +12,7 @@ ISRWord::ISRWord ( char *word ) : term( word ) { getChunks( ); currentChunk = 0; - currentLocation = first( ); + currentLocation = First( ); } // put into util file @@ -115,7 +115,7 @@ void ISRWord::getChunks() { //set current memory map //returns offset into corpus -Location ISRWord::first ( ) +Location ISRWord::First ( ) { string currentChunkSeekFileLocation = util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + @@ -147,7 +147,7 @@ Location ISRWord::first ( ) //find way to increment to next delta //return new location -Location ISRWord::next ( ) +Location ISRWord::Next ( ) { if ( *currentMemMap == '\n' ) { @@ -158,7 +158,7 @@ Location ISRWord::next ( ) return currentLocation; } - currentLocation = first( ); + currentLocation = First( ); } else { @@ -184,7 +184,7 @@ Location ISRWord::getCurrentLocation() //check seek lookup table to find if offset+absulte is bigger than target //if so, set location to that big chunk //go to next chunk -Location ISRWord::seek( Location target ) { +Location ISRWord::Seek( Location target ) { if(!wordSeekLookupTable.empty()) { auto best = wordSeekLookupTable.front(); for(auto entry : wordSeekLookupTable) { @@ -201,7 +201,7 @@ Location ISRWord::seek( Location target ) { } } } else { - while(next() <= target) { + while(Next() <= target) { } return currentLocation; } diff --git a/constraintSolver/ISRWord.h b/constraintSolver/ISRWord.h index 834e5373b197ac39c3634e2aec483acfbf79c3c1..b7a848df47663abd5eb02194063e850b1f0a38c1 100644 --- a/constraintSolver/ISRWord.h +++ b/constraintSolver/ISRWord.h @@ -12,51 +12,50 @@ #include <sys/types.h> #include "WordSeek.h" #include "../util/util.h" - +#include "ISR.h" using namespace std; //Find occurrences of individual words -typedef size_t Location; -class ISRWord +class ISRWord : public ISR { -public: - ISRWord ( char *word ); + public: + ISRWord ( char *word ); - vector< size_t > getSeekContents ( string fileName ); + vector< size_t > getSeekContents ( string fileName ); - unsigned GetDocumentCount ( ); + unsigned GetDocumentCount ( ); - unsigned GetNumberOfOccurrences ( ); + unsigned GetNumberOfOccurrences ( ); - // ISR* DocumentEnd; - Location first ( ); + // ISR* DocumentEnd; + Location First ( ); - Location next ( ); + Location Next ( ); - Location nextDocument ( ); + Location nextDocument ( ); - Location seek ( Location target ); + Location Seek ( Location target ); - // ISR *GetDocumentISR( ); + // ISR *GetDocumentISR( ); - Location GetEndDocument ( ); - Location currentLocation; - char *term; - char *masterIndex; - vector< size_t > listOfChunks; - vector< WordSeek > wordSeekLookupTable; - size_t currentChunk; - char *currentMemMap; + Location GetEndDocument ( ); + Location currentLocation; + char *term; + char *masterIndex; + vector< size_t > listOfChunks; + vector< WordSeek > wordSeekLookupTable; + size_t currentChunk; + char *currentMemMap; - //set member variables to all of the chunks that occur, update current chunk - void getChunks ( ); - Location getCurrentLocation(); + //set member variables to all of the chunks that occur, update current chunk + void getChunks ( ); + Location getCurrentLocation(); -private: + private: }; diff --git a/constraintSolver/tests/ISROrTests.cpp b/constraintSolver/tests/ISROrTests.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2db362b0a3e4f036aeaf68dc8b941e8e6d4ace7b --- /dev/null +++ b/constraintSolver/tests/ISROrTests.cpp @@ -0,0 +1,45 @@ +// +// Created by Jake Close on 3/16/18. +// + +#include <iostream> +#include <set> +#include "../../indexer/DocumentEnding.h" +#include "../ISRWord.h" +#include "../ISREndDoc.h" +#include "../ISROr.h" +#include <vector> +using namespace std; + +int main ( ) + { + char* query; + ISRWord *q1 = new ISRWord("iphone"); + ISRWord *q2 = new ISRWord("apple"); + vector< ISR* > input; + input.push_back(q1); + input.push_back(q2); + ISROr *queryOr = new ISROr(input); + ISREndDoc endDocs; + vector<size_t> locations; + vector<DocumentEnding> docEnds; + set<string> urls; + while(queryOr->GetCurrentLocation() != 9999999999999) { + locations.push_back(queryOr->Next()); + } + while(endDocs.next().url != "aaa") + { + for(auto locs : locations) + { + if(locs < endDocs.getCurrentDoc().docEndPosition && + locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) { + urls.insert(endDocs.getCurrentDoc().url); + } + } + + } + for(auto urrl : urls) { + cout << urrl << endl; + } + return 0; + } \ No newline at end of file diff --git a/constraintSolver/tests/ISRWordTests.cpp b/constraintSolver/tests/ISRWordTests.cpp index 7cbc0401b135536cec0c758fc7d7ac88a8766b5a..7f773e46b676e1502cde3dbc446517fab2f31531 100644 --- a/constraintSolver/tests/ISRWordTests.cpp +++ b/constraintSolver/tests/ISRWordTests.cpp @@ -20,17 +20,19 @@ int main ( ) vector<DocumentEnding> docEnds; set<string> urls; while(queryWord.getCurrentLocation() != 9999999999999) { - locations.push_back(queryWord.next()); + locations.push_back(queryWord.Next()); } - while(endDocs.next().url != "aaa") { - for(auto locs : locations) { + while(endDocs.next().url != "aaa") + { + for(auto locs : locations) + { if(locs < endDocs.getCurrentDoc().docEndPosition && locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) { urls.insert(endDocs.getCurrentDoc().url); } } - } + } for(auto urrl : urls) { cout << urrl << endl; } diff --git a/crawler-parser-indexer-test b/crawler-parser-indexer-test index f997cca9827a90356576c20360b05e735d8bc86d..274ebf64d038a2796444faa538c51f6b2bb9c77a 100755 Binary files a/crawler-parser-indexer-test and b/crawler-parser-indexer-test differ diff --git a/crawler-parser-test b/crawler-parser-test deleted file mode 100755 index a10a9df06cf72703b91379bc6bad67723c8c33df..0000000000000000000000000000000000000000 Binary files a/crawler-parser-test and /dev/null differ diff --git a/crawler/HouseKeeper.cpp b/crawler/HouseKeeper.cpp index 49907d11bf4f1362282ef7c85a58cb6b1810608f..6d2274588789e3457f67c9e61d415a04fb61b95f 100644 --- a/crawler/HouseKeeper.cpp +++ b/crawler/HouseKeeper.cpp @@ -1,11 +1,21 @@ // -// Created by Ben Bergkamp on 2/1/18. +// Created by Jake Close on 2/1/18. // - +#include <thread> // std::this_thread::sleep_for +#include <chrono> // std::chrono::seconds #include "HouseKeeper.h" -void HouseKeeper::FuncToRun ( ) - { +void HouseKeeper::run(){ //Sleep(3 minutes) //Gather data + cout << "SAVING STATE OF URL FRONTIER " << endl; + while(true) + { + std::this_thread::sleep_for (std::chrono::seconds(30)); + + crawler->urlFrontier->writeDataToDisk(); + + } + + } \ No newline at end of file diff --git a/crawler/HouseKeeper.h b/crawler/HouseKeeper.h index 700c786112c8a1e102ca58f0e0b3abd093b648c2..a9b27ccf1af689efc6d7fb755f2056e29235dfc3 100644 --- a/crawler/HouseKeeper.h +++ b/crawler/HouseKeeper.h @@ -8,20 +8,20 @@ #include<string> #include <pthread.h> #include <iostream> - +#include "crawler.h" class HouseKeeper : public ThreadClass { public: - HouseKeeper ( ) + HouseKeeper ( Crawler * crawler_in ) : crawler(crawler_in) { }; - virtual void FuncToRun ( ); + void run( ); private: //members + Crawler* crawler; }; -#endif //EECS398_SEARCH_CRAWLERSTATISTICS_H diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp index 0424d16003a1dd5fecfb6aac00eb64c1dc58cbdc..e9a9f8f106c71e191268da28c9098bb90e7b0f8a 100644 --- a/crawler/Readers/HttpReader.cpp +++ b/crawler/Readers/HttpReader.cpp @@ -14,11 +14,11 @@ bool HttpReader::request ( ) // Get the host address. - struct hostent *host = gethostbyname( url.getHost().c_str() ); + struct hostent *host = gethostbyname( url->getHost().c_str() ); if ( host == nullptr ) throw HTTPConnectionError; - if(url.getService() != "http") + if(url->getService() != "http") throw HTTPConnectionError; assert( host ); @@ -40,9 +40,9 @@ bool HttpReader::request ( ) cout << "Socket Reader is pulling from the web" << endl; string getMessage = "GET "; - getMessage += url.getCompleteUrl(); + getMessage += url->getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.getHost(); + getMessage += url->getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -78,7 +78,7 @@ string HttpReader::PageToString ( ) return temp; } -ParsedUrl HttpReader::getUrl ( ) +ParsedUrl * HttpReader::getUrl ( ) { return url; } diff --git a/crawler/Readers/HttpReader.h b/crawler/Readers/HttpReader.h index 6f1a88a6e821f080aafa8075239d4e9e92960bbf..a20fd74340167cbe5c55c5816142ef1ebe4c14ed 100644 --- a/crawler/Readers/HttpReader.h +++ b/crawler/Readers/HttpReader.h @@ -9,7 +9,7 @@ class HttpReader : public StreamReader { public: - HttpReader ( ParsedUrl url_in ) : url( url_in ) + HttpReader ( ParsedUrl * url_in ) : url( url_in ) { } bool request ( ); @@ -20,14 +20,14 @@ public: string PageToString ( ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); void closeReader ( ); private: - ParsedUrl url; + ParsedUrl * url; int sock; }; diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp index 31b9528573207b3ec29733342c24c4cef5668eb7..951d9149bc5811b553b3cf0a2928ba23324b0dfc 100644 --- a/crawler/Readers/HttpsReader.cpp +++ b/crawler/Readers/HttpsReader.cpp @@ -7,12 +7,12 @@ bool HttpsReader::request ( ) { try { - struct hostent *host = gethostbyname( url.getHost().c_str() ); + struct hostent *host = gethostbyname( url->getHost().c_str() ); if ( host == nullptr ) throw HTTPSconnectionError; - if( url.getService() != "https") + if( url->getService() != "https") throw HTTPSconnectionError; assert( host ); @@ -54,9 +54,9 @@ bool HttpsReader::request ( ) // Send a GET message for the desired page through the SSL. string getMessage = "GET "; - getMessage += url.getCompleteUrl(); + getMessage += url->getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.getHost(); + getMessage += url->getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -115,7 +115,7 @@ bool HttpsReader::checkStatus ( ) } -ParsedUrl HttpsReader::getUrl ( ) +ParsedUrl * HttpsReader::getUrl ( ) { return url; } diff --git a/crawler/Readers/HttpsReader.h b/crawler/Readers/HttpsReader.h index c993f62e8586d301d5ff6424a5b973b8e0d5d8a1..3d5e6cbf48c96bc046b53af125857db4d259afc6 100644 --- a/crawler/Readers/HttpsReader.h +++ b/crawler/Readers/HttpsReader.h @@ -10,7 +10,7 @@ class HttpsReader : public StreamReader { public: - HttpsReader ( ParsedUrl url_in ) : url( url_in ) + HttpsReader ( ParsedUrl * url_in ) : url( url_in ) { } bool request ( ); @@ -19,14 +19,14 @@ public: string PageToString ( ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); void closeReader ( ); bool checkStatus ( ); private: - ParsedUrl url; + ParsedUrl * url; int sock; SSL *ssl; SSL_CTX *ctx; diff --git a/crawler/Readers/LocalReader.cpp b/crawler/Readers/LocalReader.cpp index cef70bae71e13f472349b1950a57834e7298514e..217a2b480e680b211459d74b78162de34496130f 100644 --- a/crawler/Readers/LocalReader.cpp +++ b/crawler/Readers/LocalReader.cpp @@ -29,10 +29,10 @@ string LocalReader::PageToString ( ) return temp; } -ParsedUrl LocalReader::getUrl ( ) +ParsedUrl * LocalReader::getUrl ( ) { ParsedUrl url(test_url); - return url; + return &url; } bool LocalReader::checkStatus ( ) diff --git a/crawler/Readers/LocalReader.h b/crawler/Readers/LocalReader.h index 034e459422c4bd26cb4e448156fb87043ce55267..dbb716ba86bbeb522b4a0c255789be9e5d418f01 100644 --- a/crawler/Readers/LocalReader.h +++ b/crawler/Readers/LocalReader.h @@ -17,7 +17,7 @@ public: bool fillBuffer ( char *buf, size_t buf_size ); - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); bool checkStatus ( ); diff --git a/crawler/Readers/StreamReader.h b/crawler/Readers/StreamReader.h index 0ebd689c3a1b9ae54f2d05753e3171687572c309..621a809ff1ee1d4daace9176cb6014281c4cf632 100644 --- a/crawler/Readers/StreamReader.h +++ b/crawler/Readers/StreamReader.h @@ -30,7 +30,7 @@ public: virtual string PageToString ( ) = 0; - virtual ParsedUrl getUrl ( ) =0; + virtual ParsedUrl * getUrl ( ) =0; virtual void closeReader ( ) = 0; }; diff --git a/crawler/UrlFrontier.cpp b/crawler/UrlFrontier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de1f137b584e489f019f9b6c82dd390ae6c6617a --- /dev/null +++ b/crawler/UrlFrontier.cpp @@ -0,0 +1,153 @@ +// +// Created by Jake Close on 3/26/18. +// + +#include "urlFrontier.h" + + +//checks the current url to see if should be crawled +//first, checks if the exact url has already seen +//if so , doesnt add to the frontier +//then checks if the host has been seen +//if it has, it checks how long ago it was +// gets that difference and then updates the time score so it +// goes back in the queue +// then adds both to the url map and the host map + + +void UrlFrontier::checkUrl(ParsedUrl* url) + { + + //Looks to see if the complete url already exists, if so return + if ( this->duplicateUrlMap->find( url->getCompleteUrl() ) != this->duplicateUrlMap->end( ) ) + return ; + + else + { + time_t now; + time( &now ); + double difference = 0; + //Has the domain been seen? + if ( this->domainMap->find( url->getHost( )) != this->domainMap->end( )) + { + //get the last time it was seen and find the time difference + time_t lastSeen = this->domainMap->at( url->getHost( )); + difference = difftime( now , lastSeen); + if(difference == 0) + difference = 5 ; + else + difference = difference/10; + url->updateScore( difference ); + + } + else + this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time + + + //add url to the duplicate url map + this->duplicateUrlMap->insert( url->getCompleteUrl( ) ); + return; + } + } + + +void UrlFrontier::Push( ParsedUrl * url ) + { + //if the url has been seen? if so, dont add it + + checkUrl(url); + + + //set the value of the last time the domain was seen to score + //url.setTime(difference); + //url.setScore(); + pthread_mutex_lock( &m ); + + queue.push( url ); + + if ( queue.size( ) == 1 ) + { + pthread_cond_broadcast( &consumer_cv ); + } + + pthread_mutex_unlock( &m ); + } + + + + + +ParsedUrl * UrlFrontier::Pop() + { + + + pthread_mutex_lock( &m ); + + while ( queue.empty( ) == true ) + { + pthread_cond_wait( &consumer_cv, &m ); + } + + ParsedUrl * front = queue.top( ); + queue.pop( ); + + pthread_mutex_unlock( &m ); + + return front; + + } + +size_t UrlFrontier::Size ( ) + { + pthread_mutex_lock( &m ); + size_t size = queue.size( ); + pthread_mutex_unlock( &m ); + return size; + } + +// Get current date/time, format is YYYY-MM-DD.HH:mm:ss +const std::string currentDateTime() { + time_t now = time(0); + struct tm tstruct; + char buf[80]; + tstruct = *localtime(&now); + // Visit http://en.cppreference.com/w/cpp/chrono/c/strftime + // for more information about date/time format + strftime(buf, sizeof(buf), "%Y-%m-%d.%X", &tstruct); + + return buf; + } + + +void UrlFrontier::writeDataToDisk( ) + { + + + cout << "Writing queue to disk" << endl; + + string fileName = util::GetCurrentWorkingDir( ) + "/crawler/savedQueue.txt"; + + if( remove( fileName.c_str() ) != 0 ) + perror( "Error deleting file" ); + else + puts( "File successfully deleted" ); + int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU ); + + pthread_mutex_lock( &m ); + string currentTime = currentDateTime(); + write( file, currentTime.c_str( ), strlen( currentTime.c_str( ) ) ); + + while(! queue.empty() ) + { + ParsedUrl * url = queue.top( ); + queue.pop( ); + write( file, url->getCompleteUrl().c_str( ), strlen( url->getCompleteUrl().c_str( ) ) ); + url = 0; + delete url; + } + pthread_mutex_unlock( &m ); + + close( file ); + + return; + } diff --git a/crawler/UrlFrontier.h b/crawler/UrlFrontier.h new file mode 100644 index 0000000000000000000000000000000000000000..c8a48dee6c6bfa0508b959387c55d2d17e5b5a02 --- /dev/null +++ b/crawler/UrlFrontier.h @@ -0,0 +1,50 @@ +// +// Created by Jake Close on 3/26/18. +// +#pragma once +#include "../shared/ProducerConsumerQueue.h" +//#include "../shared/SharedHashMap.h" +#include "../shared/url.h" +#include <time.h> +#include <unordered_map> +#include <set> +using namespace std; + + +class ComparisonClass { +public: + bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) { + //comparison code here + return lhs->getScore() > rhs->getScore(); + } + }; + + + +class UrlFrontier + { + + public: + void Push ( ParsedUrl * url ); + void checkUrl(ParsedUrl * url); + + ParsedUrl * Pop ( ); + size_t Size(); + pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; + pthread_cond_t consumer_cv = PTHREAD_COND_INITIALIZER; + std::priority_queue<ParsedUrl *, std::vector<ParsedUrl*>, ComparisonClass> queue; + + //Writes the duplicate url map and priorty queue from disk + void writeDataToDisk( ); + //Constructs the priority queue and duplicate map from stored data + void readDataFromDisk(); + + + private: + set< string > *duplicateUrlMap = new set< string>( ); + unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( ); + + }; + + + diff --git a/crawler/crawler.cpp b/crawler/crawler.cpp index c03a939ac88e772aaa6de298f290d3f727687eac..0662e0d0978c5f4a720d0b84c57cbf261ecf00c8 100644 --- a/crawler/crawler.cpp +++ b/crawler/crawler.cpp @@ -12,17 +12,16 @@ */ -void Crawler::SpawnSpiders ( size_t num_spiders, - unordered_map< size_t, int > *duplicateUrlMap - ) +void Crawler::SpawnSpiders ( size_t num_spiders , atomic_bool * alive) { for ( size_t i = 0; i < num_spiders; i++ ) { - Spider *temp = new Spider( this->mode, this->urlFrontier , duplicateUrlMap, this->IndexerQueue ); + Spider *temp = new Spider( this->mode, this->urlFrontier, this->IndexerQueue , alive); temp->StartThread( ); this->spiders.push_back( temp ); } + } /* @@ -33,20 +32,36 @@ void Crawler::SpawnSpiders ( size_t num_spiders, void Crawler::WaitOnAllSpiders ( ) { cout << "Waiting for spiders to finish...\n"; + /* for ( Spider *spider : spiders ) { spider->WaitForFinish( ); + delete spider; //FIXME do this in destructor? } } + */ + + while( ! spiders.empty( ) ) + { + Spider *spider = spiders.back(); + spiders.pop_back(); + + + spider->WaitForFinish(); + spider = 0; + delete spider; + } + + } void Crawler::KillAllSpiders ( ) { - cout << "Waiting for spiders to finish...\n"; + //cout << "Waiting for spiders to finish...\n"; for ( Spider *spider : spiders ) { - spider->Die( ); - delete spider; //FIXME do this in destructor? + spider->kill( ); + //delete spider; //FIXME do this in destructor? } } diff --git a/crawler/crawler.h b/crawler/crawler.h index 1aec53195b1ade3e2b255e54776012a70687d679..a3240ff24cbb5eb6ec63dfef4fab3eb0aa8e16ac 100644 --- a/crawler/crawler.h +++ b/crawler/crawler.h @@ -5,6 +5,7 @@ #include<string> #include "../shared/ProducerConsumerQueue.h" #include <unordered_map> +#include "UrlFrontier.h" //#include "CrawlerStatistics.h" /* @@ -18,7 +19,7 @@ class Crawler public: Crawler ( string mode_in, - ProducerConsumerQueue< ParsedUrl > *url_q_in, + UrlFrontier *url_q_in, ProducerConsumerQueue< DocIndex * > *doc_index_queue_in ) : IndexerQueue( doc_index_queue_in ), mode( mode_in ), @@ -26,8 +27,7 @@ public: { }; //spawns a number of works - void SpawnSpiders ( size_t num_spiders, - unordered_map< size_t, int > *duplicateUrlMap ); + void SpawnSpiders ( size_t num_spiders, atomic_bool* alive ); //Creates a housekeeping thread void houseKeeper ( ); @@ -35,10 +35,12 @@ public: void KillAllSpiders ( ); void WaitOnAllSpiders ( ); + UrlFrontier *urlFrontier; + private: vector< Spider * > spiders; - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + //UrlFrontier *urlFrontier; ProducerConsumerQueue< DocIndex * > *IndexerQueue; //CrawlerStatistics housekeeper; string mode; diff --git a/crawler/savedQueue.txt b/crawler/savedQueue.txt new file mode 100755 index 0000000000000000000000000000000000000000..0d57631726560f5f518915d0051d5d160a27e9b1 --- /dev/null +++ b/crawler/savedQueue.txt @@ -0,0 +1 @@ +2018-03-29.17:01:29https://www.boston.com/help/privacy-policy/http://www.bostonglobe.com?p1=hat_re_bghttp://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/category/location-location-location/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/west-roxbury/http://www.boston.com/weather?p1=BGMenu_Subnavhttp://realestate.boston.com/section/renting/http://realestate.boston.com/section/luxury/http://realestate.boston.com/section/new-developments/https://advertising.bostonglobemedia.com/http://realestate.boston.com/section/ask-the-experthttp://www.aconcordcarpenter.com/http://pages.email.bostonglobe.com/AddressSignUphttps://twitter.com/GlobeHomeshttp://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.bostonglobe.com%2Fmetro%2F2017%2F11%2F30%2Ffour-men-allege-sexual-misconduct-senate-president-husband%2F40ABgRdciNITE1kAYrWsUN%2Fstory.html%3Fevent%3Devent25&t=Fourhttp://www.boston.com?p1=hat_homehttps://www.boston.com/help/contact-ushttp://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttp://www.bostonglobemedia.com/careers/https://www.boston.com/help/privacy-policy/http://www.bostonglobe.com?p1=hat_re_bghttp://www.boston.com?p1=hat_homehttps://www.boston.com/help/member-agreementhttp://twitter.com/intent/tweet?text=Fourhttp://www.bostonglobe.com?p1=hat_re_bghttp://www.bostonglobe.com?p1=hat_re_bghttp://sponsored.bostonglobe.com/mountwashingtonvalley/kick-back-mt-washington-valley/?p1=BG_SponsoredWellhttp://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="comments-blue-icon"><title id="comments-blue-icon">View Comments</title>http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/gardening/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/category/gardening/http://realestate.boston.com/neighborhood/boston/http://twitter.com/intent/tweet?text=Fourhttp://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/luxury/2017/02/14/location-location-location-newton-centre/http://realestate.boston.com/neighborhood/financial-district/http://realestate.boston.com/neighborhood/jamaica-plain/http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="15px" height="17.063px" viewBox="0 0 15 17.063" enable-background="new 0 0 15 17.063" xml:space="preserve" aria-labelledby="comment-title">http://realestate.boston.com/neighborhood/beacon-hill/http://www.bostonglobe.com?p1=hat_re_bghttp://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/milton/http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="arrow-up-title">http://realestate.boston.com/category/location-location-location/http://realestate.boston.com/neighborhood/braintree/https://www.boston.com/cars/car-guides/2018/03/27/experts-say-2018-audi-rs-3?s_campaign=bg:hp:well:carshttps://www.bostonglobe.com/metro/2017/10/27/the-stories-sexual-harassment-beacon-hill-are-overwhelming/0a4T5VADqH9ffipiXfpwGO/story.htmlhttp://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/east-boston/https://www.boston.com/cars/car-reviews/2018/03/27/the-2018-hyundai-kona-compact-crossover-makes-its-debut?s_campaign=bg:hp:well:carshttp://twitter.com/globehttp://realestate.boston.com/neighborhood/financial-district/http://realestate.boston.com/neighborhood/jamaica-plain/https://advertising.bostonglobemedia.com/http://twitter.com/GlobeAbrahamhttp://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/section/open-houses/https://twitter.com/GlobeHomeshttp://realestate.boston.com/neighborhood/milton/https://www.pinterest.com/bostonglobe/boards/https://www.boston.com/https://www.boston.com/help/contact-ushttp://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttps://play.radiopublic.com/love-letters-GAOxdphttp://apps.bostonglobe.com/spotlight/boston-racism-image-reality/?s_campaign=spotlightrace:HPrailhttp://subscribe.bostonglobe.com/B6704/?p1=BGFooter_DigitalSubscription_Bannerhttp://subscribe.bostonglobe.com/B5759/?p1=BGFooterhttp://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/needham-heights/http://www.bostonglobemedia.com/careers/https://www.boston.com/help/privacy-policy#adchoiceshttp://www.bostonglobe.com?p1=hat_re_bghttp://manage.bostonglobe.com/GiftTheGlobe/LandingPage.htmlhttp://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/category/location-location-location/https://bostonglobe.custhelp.com/app/answers/list?p1=BGFooterhttp://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/wellesley/https://www.bostonglobemedia.com/https://www.facebook.com/globehttp://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/weymouth/https://twitter.com/#!/BostonGlobehttp://realestate.boston.com/neighborhood/winthrop/http://nieonline.com/bostonglobe/https://advertising.bostonglobemedia.com/http://www.aconcordcarpenter.com/https://twitter.com/GlobeHomeshttp://subscribe.bostonglobe.com/B5759/?p1=BGFooterhttps://www.bostonglobemedia.com/careershttp://www.bostonglobe.com/?p1=SCHeader_Logohttps://www.boston.com/http://realestate.boston.com/news/2015/09/18/questions-to-ask-when-buying-a-condo/https://www.boston.com/help/contact-ushttp://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttp://www.bostonglobe.com/sports?p1=SCMenuhttp://www.bostonglobe.com?p1=hat_re_bghttp://www.bostonglobe.com?p1=hat_re_bghttp://realestate.boston.com?p1=hat_rehttp://realestate.boston.com/section/news/http://realestate.boston.com/category/location-location-location/http://realestate.boston.com/category/home-of-the-week/https://bostonglobe.custhelp.com/app/home?p1=BGFooterhttp://www.bostonglobe.com/lifestyle?p1=SCMenuhttp://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/style/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/beacon-hill/https:http://sponsored.bostonglobe.com/mountwashingtonvalley/new-englands-winter-playground/?p1=SC_Article_ReadMorehttps://secure.pqarchiver.com/boston-sub/no_default.html?ss=1&url=%2Fboston-sub%2Fadvancedsearch.htmlhttp://sponsored.bostonglobe.com/category/mountwashingtonvalley/?p1=SC_Article_ReadMore_Sectionhttp://realestate.boston.com/location-location-location/2017/02/14/like-live-melrose/http://pages.email.bostonglobe.com/GroundGameSignUp?p1=BG_homepage_newsletter_signuphttp://realestate.boston.com/category/location-location-location/http://realestate.boston.com?s_campaign=bg:hp:mainnav:realestatehttp://www.boston.com/section/cars?s_campaign=bg:hp:mainnav:carshttp://www.boston.com/section/cars?s_campaign=bg:hp:mainnav:carshttp://realestate.boston.com?s_campaign=bg:hp:mainnav:realestatehttps://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/neighborhood/allston-brighton/https://manage.bostonglobe.com/cs/mc/login.aspx?p1=BGFooterhttp://realestate.boston.com/neighborhood/needham-heights/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/neighborhood/fenway-kenmore/http://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttp://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/somerville/https://www.boston.com/help/privacy-policy#adchoiceshttps://www.boston.com/help/privacy-policy#adchoiceshttps://www.boston.com/help/privacy-policy#adchoiceshttps://www.boston.com/help/privacy-policy#adchoiceshttp://realestate.boston.com/category/my-first-home/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/brookline/https://www.boston.com/help/privacy-policy#adchoiceshttp://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/wakefield/https://www.boston.com/help/privacy-policy#adchoiceshttps://plus.google.com/108227564341535363126/abouthttp://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/winthrop/https://plus.google.com/108227564341535363126/abouthttp://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/melrose/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/weston/http://pages.email.bostonglobe.com/AddressSignUphttp://realestate.boston.com/category/gardening/http://realestate.boston.com/category/gardening/https://www.pinterest.com/bostonglobe/boards/http://realestate.boston.com/section/renting/https://www.boston.com/help/member-agreementhttps://www.boston.com/help/member-agreementhttp://realestate.boston.com/category/style/https://www.boston.com/help/member-agreementhttp://realestate.boston.com/section/luxury/https://www.boston.com/help/member-agreementhttp://realestate.boston.com/category/style/https://www.boston.com/help/member-agreementhttp://realestate.boston.com/category/style/https://www.boston.com/help/privacy-policy/http://www.bostonglobemedia.com/bg-brandlabhttps://www.boston.com/help/privacy-policy/https://www.boston.com/help/privacy-policy/https://www.boston.com/help/privacy-policy/https://www.boston.com/help/privacy-policy/http://www.bostonglobemedia.com/careers/http://www.bostonglobemedia.com/careers/http://www.bostonglobemedia.com/careers/http://www.bostonglobemedia.com/careers/http://www.bostonglobemedia.com/careers/https://www.boston.com/help/contact-ushttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttps://www.bostonglobe.com/bgcshttp://bostonglobe.com/opinionhttp://enable-javascript.com/https://www.conwayscenic.com/http://fivethirtyeight.com/https://www.eecs.umich.edu/https://www.nytimes.com/http://www.statnews.com/https://www.boston.com/https://www.boston.com/https://www.boston.com/https://www.boston.com/https://www.wired.com/http://www.espn.com/http://www.bbc.com/http://www.mtwashhttps://en.wikipedia.org/wiki/United_Stateshttp://gmpg.org/xfn/11http://gmpg.org/xfn/11http://gmpg.org/xfn/11http://gmpg.org/xfn/11http://gmpg.org/xfn/11http://realestate.boston.com/listings/268-l-4960-72298454/buy/25-havelock-rd/worcester/ma/01602/http://realestate.boston.com/section/news/http://www.bostonglobe.com/opinion/2018/03/28/your-commute-stinks-because-greater-boston-can-fathom-its-own-growth/9FMkxTCN2oT8nIdJ8TRXeO/story.html?p1=SCMenu_Articlehttps://www.bostonglobe.com/metro/2014/12/05/senator-partner-was-long-considered-success-story/OMpS5srxO08WO9Tz4kpzcJ/story.htmlhttps://www.boston.com/cars/car-news/2018/03/28/self-driving-cars-are-a-matter-of-life-and-death?s_campaign=bg:hp:well:carshttps://www.boston.com/cars/car-news/2018/03/28/real-id-rmv-wait-times?s_campaign=bg:hp:well:carshttp://www.bostonglobe.com/news/politics/2014/06/05/talk/ffiEx4Mf0sJjGrX3EJP3xL/story.htmlhttps://www.statnews.com/2018/03/29/government-health-data-innovation/https://bostonglobe.custhelp.com/app/answers/list?p1=BGFooterhttp://www.bostonglobe.com/lifestyle/comics?p1=SCMenu_Subnavhttps://manage.bostonglobe.com/cs/mc/login.aspx?p1=BGFooterhttps://www.boston.com/cars?s_campaign=bg:hp:well:carshttp://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/winchester/https://www.boston.com/help/privacy-policy#adchoiceshttp://subscribe.bostonglobe.com/B3428/?p1=BGFooterhttp://www.bostonglobe.com/news/politics?p1=SCMenuhttp://pages.email.bostonglobe.com/AddressSignUphttp://www.bostonglobe.com/lifestyle/crosswordhttps://www.boston.com/help/member-agreementhttp://www.bostonglobe.com/metro?p1=SCMenuhttps://www.bostonglobemedia.com/careershttp://www.bostonglobemedia.com/careers/https://www.facebook.com/globe" />http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="print-blue-icon"><title id="print-blue-icon">Print this Article</title>http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="email-blue-icon"><title id="email-blue-icon">Email to a Friend</title>http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="globe-medium-title">http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><title>Right Arrow</title>http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/category/design-new-england/https://www.boston.com/help/contact-ushttps://www.bostonglobe.com/metro/2017/11/28/amid-reports-rampant-harassment-state-house-senate-leader-says-his-office-has-fielded-only-two-complaints/IanujQF4nR7kDGfJuOR75L/story.htmlhttps://www.bostonglobe.com/news/bigpicture/2018/03/16/paralympic-winter-games/hHVXJtIdly8xVLTDQLC4EO/story.htmlhttps://plus.google.com/share?url=https%3A%2F%2Fwww.bostonglobe.com%2Fmetro%2F2017%2F11%2F30%2Ffour-men-allege-sexual-misconduct-senate-president-husband%2F40ABgRdciNITE1kAYrWsUN%2Fstory.html%3Fevent%3Devent25%26s_campaign%3dsm_gp%26hl=en-UShttps://secure.pqarchiver.com/boston-sub/no_default.html?ss=1&url=%2Fboston-sub%2Fadvancedsearch.htmlhttps://www.bostonglobe.com/bgcshttp://realestate.boston.com/neighborhood/watertown/http://www.boston.com/weather?p1=BGMenu_Subnavhttp://realestate.boston.com/section/news/http://realestate.boston.com/section/news/http://realestate.boston.com/section/news/http://realestate.boston.com/section/news/http://realestate.boston.com/section/news/https://www.pinterest.com/bostonglobe/boards/http://realestate.boston.com?p1=hat_rehttp://loveletters.boston.com?p1=hat_llhttp://realestate.boston.com?p1=hat_rehttp://realestate.boston.com?p1=hat_rehttp://loveletters.boston.com?p1=hat_llhttp://loveletters.boston.com?p1=hat_llhttp://realestate.boston.com?p1=hat_rehttp://loveletters.boston.com?p1=hat_llhttp://loveletters.boston.com?p1=hat_llhttp://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/section/ask-the-experthttp://realestate.boston.com/category/home-improvement/http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/section/open-houseshttp://realestate.boston.com/neighborhood/back-bay/http://subscribe.bostonglobe.com/B6704/?p1=BGFooter_DigitalSubscription_Bannerhttp://www.bostonglobe.com/newsletters?p1=BG_homepage_newsletter_signuphttp://realestate.boston.com?p1=hat_rehttp://loveletters.boston.com?p1=hat_llhttp://realestate.boston.com/category/style/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/luxury/https://www.pinterest.com/bostonglobe/boards/http://www.boston.com?p1=hat_homehttp://www.boston.com?p1=hat_homehttp://realestate.boston.com?p1=hat_rehttp://loveletters.boston.com?p1=hat_llhttp://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttps://twitter.com/GlobeHomeshttps://loveletters.boston.com/podcasthttp://pages.email.bostonglobe.com/GroundGameSignUp?p1=BG_homepage_newsletter_signuphttp://realestate.boston.com/category/style/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/luxury/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/gardening/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/luxury/http://realestate.boston.com/category/location-location-location/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/category/style/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/gardening/http://realestate.boston.com/section/luxury/http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/category/style/http://realestate.boston.com/category/gardening/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/my-first-home/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/section/renting/http://realestate.boston.com/section/luxury/http://realestate.boston.com/category/home-of-the-week/http://realestate.boston.com/category/ask-the-expert/http://realestate.boston.com/category/gardening/http://realestate.boston.com/section/open-houses/http://realestate.boston.com/category/home-improvement/https://www.pinterest.com/bostonglobe/boards/https://www.boston.com/help/contact-ushttp://manage.bostonglobe.com/GiftTheGlobe/LandingPage.htmlhttps://www.boston.com/cars/car-news/2018/03/28/real-id-rmv-wait-times?s_campaign=bg:hp:well:carshttp://realestate.boston.com/category/my-first-home/http://www.bostonglobe.com/news/bigpicture?p1=SCMenu_Subnavhttp://realestate.boston.com?s_campaign=bg:hp:mainnav:realestatehttps://www.facebook.com/Bostoncom-Real-Estate-822266861150702http://realestate.boston.com/neighborhood/medford/http://sponsored.bostonglobe.com/mountwashingtonvalley/kick-back-mt-washington-valley/?p1=BG_SponsoredWellhttp://realestate.boston.com/neighborhood/westwood/http://www.mtwashingtonvalley.orghttp://www.bostonglobe.com/?p1=SCMenu_Logohttp://www.bostonglobe.com/business?p1=SCMenuhttp://www.bostonglobe.com/opinion?p1=SCMenuhttp://www.bostonglobe.com/arts?p1=SCMenuhttps://www.boston.com/http://www.bernerhttp://www.bostonglobe.com/2018/03/06/subscribe-new-love-letters-podcast/m3ENKFbbJpq2eGb4BpOaSM/story.htmlhttp://realestate.boston.com/section/home-improvementhttp://realestate.boston.com/section/home-improvementhttps://www.bostonglobe.com/metro/2014/12/03/senator-moves-quell-turmoil-over-partner-tweets/uCAg3mW916Gs5T0ooAcfAM/story.htmlhttp://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/section/luxury/http://www.bostonglobe.com/news/bigpicturehttp://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.bostonglobe.com%2Fmetro%2F2017%2F11%2F30%2Ffour-men-allege-sexual-misconduct-senate-president-husband%2F40ABgRdciNITE1kAYrWsUN%2Fstory.html%3Fevent%3Devent25&title=Fourhttp://www.friendhttps://www.pinterest.com/bostonglobe/boards/http://realestate.boston.com/section/new-developments/http://realestate.boston.com/ask-the-expert/2018/03/28/buyers-agent-role-explained/?s_campaign=bg:hp:well:realestatehttps://loveletters.boston.com/2018/03/still-feel-guilty.htmlhttps://twitter.com/GlobeHomeshttps://www.politico.com/states/massachusetts/story/2015/12/hefner-considering-senate-run-029063http://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttps://www.boston.com/help/contact-ushttps://www.pinterest.com/bostonglobe/boards/https://www.pinterest.com/bostonglobe/boards/https://www.pinterest.com/bostonglobe/boards/https://www.pinterest.com/bostonglobe/boards/https://www.pinterest.com/bostonglobe/boards/http://twitter.com/intent/tweet?text=Kick%20back%20or%20kick%20into%20high%20gear%20this%20spring%20in%20Mt%20Washington%20Valley&url=http://sponsored.bostonglobe.com/mountwashingtonvalley/kick-back-mt-washington-valley/https://mtwashinghttp://mtwashingthttps://manage.bostonglobe.com/Order/newspaper/Newspaper.aspxhttps://manage.bostonglobe.com/Order/newspaper/Newspaper.aspxhttps://loveletters.boston.com/2018/03/feel-things-woman-never-felt-wife.htmlhttp://www.bostonglobe.com/businesshttp://www.facebook.com/sharer.php?u=http://sponsored.bostonglobe.com/mountwashingtonvalley/kick-back-mt-washington-valley/&t=Kick%20back%20or%20kick%20into%20high%20gear%20this%20spring%20in%20Mt%20Washington%20Valleyhttp://subscribe.bostonglobe.com/B5751/?p1=BG_hp_DigitalSubscriptionhttp://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="globe-logo--big" x="0px" y="0px" width="140px" height="17px" viewBox="0 0 140 17" enable-background="new 0 0 140 18.674" xml:space="preserve" aria-labelledby="globe-title">http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" id="globe-logo--big" x="0px" y="0px" width="140px" height="17px" viewBox="0 0 140 17" enable-background="new 0 0 140 18.674" xml:space="preserve" aria-labelledby="globe-title">http://realestate.boston.com/luxury/2017/02/14/location-location-location-newton-centre/http://sponsored.bostonglobe.com/category/mountwashingtonvalley/?p1=SC_Article_ReadMore_Sectionhttps://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:carshttp://realestate.boston.com?s_campaign=bg:hp:well:realestatehttp://nieonline.com/bostonglobe/https://www.bostonglobemedia.com/https://twitter.com/#!/BostonGlobehttp://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttp://realestate.boston.com/neighborhood/needham-heights/http://realestate.boston.com/neighborhood/allston-brighton/http://realestate.boston.com/location-location-location/2017/02/14/like-live-melrose/https://epaper.bostonglobe.com/launch.aspx?pbid=2c60291d-c20c-4780-9829-b3d9a12687cfhttps://epaper.bostonglobe.com/launch.aspx?pbid=2c60291d-c20c-4780-9829-b3d9a12687cfhttp://subscribe.bostonglobe.com/B3428/?p1=BGFooterhttp://realestate.boston.com/neighborhood/melrose/https://www.boston.com/https://www.boston.com/help/privacy-policy/http://realestate.boston.com/buying/2017/04/06/90988/http://realestate.boston.com/category/home-improvement/http://realestate.boston.com/buying/2018/03/28/three-must-see-open-houses-under-700000/?s_campaign=bg:hp:well:realestatehttp://www.boston.com/cars?s_campaign=bg:hp:well:carshttp://gmpg.org/xfn/11http://sponsored.bostonglobe.com/category/mountwashingtonvalley/?p1=SC_Article_ReadMore_Sectionhttp://mwvvibe.cohttps:http://www.bostonglobe.com/https://www.boston.com/help/contact-ushttps://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702/http://sponsored.bostonglobe.com/mountwashingtonvalley/new-englands-winter-playground/?p1=SC_Article_ReadMorehttps://www.bostonglobe.com/metro/2017/12/04/rosenberg-step-aside-during-investigation/TIAzKnrot0sISPmDUhG6PL/story.htmlhttp://www.bostonglobemedia.com/bg-brandlabhttp://www.bostonglobe.com/lifestyle/crossword?p1=SCMenu_Subnavhttps://itunes.apple.com/us/podcast/love-letters/id1354140820http://www.bostonglobe.com/todayspaper?p1=SCMenu_TodaysPaperhttp://www.bostonglobe.com/magazine?p1=SCMenu_Subnavhttp://gmpg.org/xfn/11http://gmpg.org/xfn/11http://realestate.boston.com/section/buyinghttp://loveletters.boston.com?p1=hat_llhttp://realestate.boston.com/section/news/http://realestate.boston.com?p1=hat_rehttp://www.boston.com?p1=hat_homehttp://www.mtwashingtonvalley.orghttp://www.bostonglobe.com/news/politics/2018/03/27/koch-effort-wellesley-founders-after-public-attention/72BYUzr8Yzq85CeGVSYfXO/story.html?p1=SCMenu_Articlehttps://www.facebook.com/globehttps://twitter.com/GlobeHomeshttps://twitter.com/GlobeHomeshttps://twitter.com/GlobeHomeshttp://www.bostonglobe.com/sports/redsox?p1=BGHeader_MainNavhttp://www.bostonglobe.com/magazine/2018/03/28/was-son-frightening-injury-owl-attack/dn582URJbdxBFuJ2QPLpsL/story.html?p1=SCMenu_Articlehttp://www.bostonglobe.com/metro/obituaries?p1=SCMenu_Subnavhttps://advertising.bostonglobemedia.com/https://advertising.bostonglobemedia.com/https://advertising.bostonglobemedia.com/https://advertising.bostonglobemedia.com/https://advertising.bostonglobemedia.com/http://realestate.boston.com/buying/2017/04/06/90988/http://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.bostonglobe.com%2Fmetro%2F2017%2F11%2F30%2Ffour-men-allege-sexual-misconduct-senate-president-husband%2F40ABgRdciNITE1kAYrWsUN%2Fstory.html%3Fevent%3Devent25%3Fevent%3Devent25&tFourhttp://realestate.boston.com/section/open-houses/http://go.bostonglobemedia.com/l/36752/2015-01-23/37crkhttp://realestate.boston.com/buying/2017/09/14/what-is-it-like-to-live-in-the-south-end/http://www.bostonglobe.com/metro/2018/03/28/martha-vineyard-woman-files-suit-alleging-her-home-was-used-film-pornography/hfgoBYNGzBnaQ2yoob6I2K/story.html?p1=SCMenu_Articlehttp://www.golfmwhttp://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/charlestown/https://www.linkedin.com/shareArticle?mini=true&url=http://sponsored.bostonglobe.com/mountwashingtonvalley/kick-back-mt-washington-valley/&title=Kick%20back%20or%20kick%20into%20high%20gear%20this%20spring%20in%20Mt%20Washington%20Valleyhttps://bostonglobe.custhelp.com/app/home?p1=BGFooterhttp://realestate.boston.com/buying/2017/09/14/what-is-it-like-to-live-in-the-south-end/https://www.horsehttp://realestate.boston.com/neighborhood/financial-district/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/melrose/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/melrose/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/newton/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/melrose/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/newton/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/needham-heights/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/newton/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/needham-heights/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/dorchester/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/downtown/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/melrose/http://realestate.boston.com/neighborhood/brookline/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/waltham/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/charlestown/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/south-boston/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/newton/http://realestate.boston.com/neighborhood/newton/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/wellesley/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/lexington/http://realestate.boston.com/neighborhood/braintree/http://realestate.boston.com/neighborhood/milton/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/south-end/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/wakefield/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/winthrop/http://realestate.boston.com/neighborhood/belmont/http://realestate.boston.com/neighborhood/somerville/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/cambridge/http://realestate.boston.com/neighborhood/concord/http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/neighborhood/reading/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/needham-heights/http://realestate.boston.com/neighborhood/quincy/http://realestate.boston.com/neighborhood/east-boston/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/west-roxbury/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/watertown/http://realestate.boston.com/neighborhood/hingham/http://realestate.boston.com/neighborhood/cape-cod/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/neighborhood/wayland/http://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/jamaica-plain/http://realestate.boston.com/neighborhood/back-bay/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/fenway-kenmore/http://realestate.boston.com/neighborhood/natick/http://realestate.boston.com/neighborhood/westwood/http://realestate.boston.com/neighborhood/medford/http://realestate.boston.com/neighborhood/arlington/http://realestate.boston.com/neighborhood/weymouth/http://realestate.boston.com/neighborhood/sudbury/http://realestate.boston.com/neighborhood/weston/http://realestate.boston.com/neighborhood/winchester/http://realestate.boston.com/neighborhood/beacon-hill/http://realestate.boston.com/news/2015/09/18/questions-to-ask-when-buying-a-condo/https://www.facebook.com/Bostoncom-Real-Estate-822266861150702https://www.facebook.com/Bostoncom-Real-Estate-822266861150702https://www.facebook.com/Bostoncom-Real-Estate-822266861150702http://realestate.boston.com/neighborhood/boston/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com/category/design-new-england/http://realestate.boston.com?s_campaign=bg:hp:well:realestatehttp://realestate.boston.com/neighborhood/north-end/http://realestate.boston.com/neighborhood/newton/https://www.blackhttp://realestate.boston.com/neighborhood/financial-district/http://realestate.boston.com/neighborhood/financial-district/http://realestate.boston.com/neighborhood/financial-district/http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-labelledby="bg-logo-title">http://realestate.boston.com/category/location-location-location/http://realestate.boston.com/category/location-location-location/https://www.pinterest.com/bostonglobe/boards/ \ No newline at end of file diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 8cbda419b69df25a569315c1fef9acf5fa0f7a31..b12d0cc0a805c8037225534b13b26b5f5e39bf84 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -9,34 +9,36 @@ #include "Readers/HttpReader.h" #include "Readers/LocalReader.h" #include "../parser/Parser.h" +#include "UrlFrontier.h" + using DocIndex = const unordered_map< string, vector< unsigned long > >; // FIND A BETTER PLACE TO PUT THIS FUNCTION -StreamReader *SR_factory ( ParsedUrl url, string mode ) +StreamReader *SR_factory ( ParsedUrl * url, string mode ) { string localFile; StreamReader *newReader = nullptr; if ( mode == "local" ) { - newReader = new LocalReader( url.getCompleteUrl() ); + newReader = new LocalReader( url->getCompleteUrl() ); } else if ( mode == "web" ) { - if ( url.getService() == "http" ) + if ( url->getService() == "http" ) { newReader = new HttpReader( url ); } - else if ( url.getService() == "https" ) + else if ( url->getService() == "https" ) { newReader = new HttpsReader( url ); } else { cerr << "Error reading service type\n"; - cerr << "Service Type: " << url.getService() << "\n"; + cerr << "Service Type: " << url->getService() << "\n"; } } @@ -70,7 +72,7 @@ size_t Spider::hash ( const char *s ) } -ParsedUrl Spider::getUrl ( ) +ParsedUrl * Spider::getUrl ( ) { return urlFrontier->Pop( ); } @@ -80,10 +82,10 @@ void Spider::run ( ) std::cout << "Spider is crawling" << endl; int cond = 0; - while ( cond < 250 ) + while (*alive) { - ParsedUrl currentUrl = getUrl( ); - size_t docID = hash( currentUrl.getCompleteUrl().c_str() ); + ParsedUrl * currentUrl = getUrl( ); + size_t docID = hash( currentUrl->getCompleteUrl().c_str() ); if ( shouldURLbeCrawled( docID ) ) { StreamReader *reader = SR_factory( currentUrl, this->mode ); @@ -92,11 +94,11 @@ void Spider::run ( ) bool success = reader->request( ); if ( success ) { - cout << "Parsing " << currentUrl.getCompleteUrl(); + cout << "Parsing " << currentUrl->getCompleteUrl(); DocIndex *dict = parser.execute( reader ); IndexerQueue->Push( dict ); - printDocIndex(dict); + //printDocIndex(dict); reader->closeReader( ); //delete dict; @@ -110,8 +112,14 @@ void Spider::run ( ) } } + cout << "Spider has finished running " << endl; + return; } +void Spider::kill() + { + *(this->alive) = false; + } /* @@ -159,6 +167,7 @@ bool Spider::writeDocToDisk ( ParsedUrl url ) bool Spider::shouldURLbeCrawled ( size_t docID ) { + /* if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ) ) { return false; @@ -168,6 +177,8 @@ bool Spider::shouldURLbeCrawled ( size_t docID ) this->duplicateUrlMap->insert( std::make_pair( docID, 1 ) ); return true; } + */ + return true; } /* diff --git a/crawler/spider.h b/crawler/spider.h index 03b40f9a3e5d9fb55e9e9d96b3e30063c1655882..fabd1d0abec963b34d9b75da16442122e9e7f2a5 100644 --- a/crawler/spider.h +++ b/crawler/spider.h @@ -12,7 +12,6 @@ #include "../util/util.h" #include "../parser/Parser.h" - using namespace std; using DocIndex = const unordered_map< string, vector< unsigned long > >; @@ -23,22 +22,22 @@ class Spider : public ThreadClass public: Spider ( string mode_in, - ProducerConsumerQueue< ParsedUrl > *url_q_in, - unordered_map< size_t, int > *duplicate_url_map_in, - ProducerConsumerQueue< DocIndex * > *doc_index_queue_in + UrlFrontier *url_q_in, + ProducerConsumerQueue< DocIndex * > *doc_index_queue_in, + atomic_bool * bool_in ) : mode( mode_in ), urlFrontier( url_q_in ), parser( url_q_in ), - duplicateUrlMap( duplicate_url_map_in ), - IndexerQueue( doc_index_queue_in ) + IndexerQueue( doc_index_queue_in ), + alive( bool_in ) { }; //Takes a url off of the url frontier - ParsedUrl getUrl ( ); + ParsedUrl * getUrl ( ); virtual void run ( ); @@ -48,16 +47,19 @@ public: size_t hash ( const char *s ); + + void kill ( ); + //int getRobots(ParsedUrl url ); bool checkRobots ( ParsedUrl url ); private: int locationOnDisk; - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + UrlFrontier *urlFrontier; ProducerConsumerQueue< DocIndex * > *IndexerQueue; - unordered_map< size_t, int > *duplicateUrlMap; string mode; Parser parser; + atomic_bool* alive; }; \ No newline at end of file diff --git a/crawler/tests/crawlerTest.cpp b/crawler/tests/crawlerTest.cpp index 4c94db4ff982e9a6ed9fbbce57279104ce100dc1..fc34d89d09acb8d7ace178eacd4dd3b17e668e55 100644 --- a/crawler/tests/crawlerTest.cpp +++ b/crawler/tests/crawlerTest.cpp @@ -31,7 +31,7 @@ int main ( int argc, char *argv[] ) char *seeds; int numberOfSpiders = 1; unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( ); - ProducerConsumerQueue < ParsedUrl > *urlFrontier = new ProducerConsumerQueue < ParsedUrl >( ); + UrlFrontier *urlFrontier = new UrlFrontier( ); ProducerConsumerQueue < DocIndex * > *IndexerQueue = new ProducerConsumerQueue < DocIndex * >( ); Indexer indexer( IndexerQueue ); string path = util::GetCurrentWorkingDir() +"/crawler/tests/testSeeds.txt"; @@ -58,14 +58,22 @@ int main ( int argc, char *argv[] ) */ SSL_library_init( ); - string url1 = "https://fivethirtyeight.com/features/fear-not-readers-we-have-rss-feeds/"; - string url2 = "https:"; - ParsedUrl url = ParsedUrl(url2); - urlFrontier->Push(url); + //string url1 = "https://fivethirtyeight.com"; + //string url2 = "https:"; + + string bad_url = "http-equiv=X-UA-Compatiblecontent=IE=edge,chrome=1"; + string bad_url2 ="http-equiv=Content-Type"; + string bad_url3 = "http-equiv=refresh content=1;url=/2.73.0/static/unsupp.html /><![endif]--><!--[if gt IE 9><!--><!--<![endif]--><title>White House says Trump continues to deny Stormy Daniels affair - CNNPolitics</title>"; + //ParsedUrl url = ParsedUrl(bad_url); + ParsedUrl * url1 = new ParsedUrl(bad_url3); + ParsedUrl * url2 = new ParsedUrl(bad_url2); + urlFrontier->Push(url1); + + urlFrontier->Push(url2); indexer.StartThread( ); Crawler crawler( mode, urlFrontier, IndexerQueue ); - crawler.SpawnSpiders( numberOfSpiders , duplicateUrlMap ); + crawler.SpawnSpiders( numberOfSpiders ); crawler.WaitOnAllSpiders( ); indexer.WaitForFinish( ); diff --git a/crawler/tests/urlFrontierTest.cpp b/crawler/tests/urlFrontierTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52db63e296095a6ae88dd75a6755c746316f1087 --- /dev/null +++ b/crawler/tests/urlFrontierTest.cpp @@ -0,0 +1,66 @@ +// +// Created by Jake Close on 3/21/18. +// + +#include "../crawler.h" +#include "../spider.h" +#include <iostream> +#include <stdlib.h> +#include <unistd.h> +#include <pthread.h> +#include <queue> +#include "../crawler.h" +#include <openssl/ssl.h> +#include <string> +#include <unordered_map> +#include "../../util/util.h" +#include <getopt.h> +#include "../../indexer/Indexer.h" +#include "../UrlFrontier.h" + +using DocIndex = const unordered_map< string, vector< unsigned long > >; + +using namespace std; + + +int main ( int argc, char *argv[] ) + { + + string mode = "web"; + char *seeds; + int numberOfSpiders = 1; + unordered_map < size_t, int > *duplicateUrlMap = new unordered_map < size_t, int >( ); + UrlFrontier *urlFrontier = new UrlFrontier( ); + + + ProducerConsumerQueue < DocIndex * > *IndexerQueue = new ProducerConsumerQueue < DocIndex * >( ); + Indexer indexer( IndexerQueue ); + string path = util::GetCurrentWorkingDir() +"/crawler/tests/testSeeds.txt"; + + + SSL_library_init( ); + string url1 = "http://www.bostonglobe.com"; + string url2 = "https://www.wired.com/"; + //string url2 = "https:"; + + //string bad_url = "http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\" />"; + ParsedUrl * url = new ParsedUrl(url1); + ParsedUrl * url_1 = new ParsedUrl(url2); + + urlFrontier->Push(url); + urlFrontier->Push(url_1); + + indexer.StartThread( ); + + Crawler crawler( mode, urlFrontier, IndexerQueue ); + crawler.SpawnSpiders( numberOfSpiders ); + + crawler.WaitOnAllSpiders( ); + indexer.WaitForFinish( ); + + + auto f = urlFrontier->Pop( ); + int x = 0; + delete urlFrontier; + return 1; + } \ No newline at end of file diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 2ee12072b43f2603e6143e398bdfc0bf8af80968..b12b3acb59df283ac13b3b5571d83383bdbfcbe5 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -16,7 +16,8 @@ void Indexer::run ( ) bool cond = true; - while(cond) { + while(alive) { + DocIndex * dictionary = pointerToDictionaries->Pop(); cout << "INDEX GOT A NEW Dictionary" << endl; DocumentEnding docEnd = DocumentEnding(); @@ -42,7 +43,7 @@ void Indexer::run ( ) docEnd.docNumWords = indexedCount; docEndings.push_back(docEnd); - if(currentBlockNumberWords >= 10000) { + if(currentBlockNumberWords >= 100000 || alive == false) { save(); reset(); } @@ -51,6 +52,8 @@ void Indexer::run ( ) save(); reset(); saveChunkDictionary(); + cout << "Indexer has finished running" << endl; + return ; } void Indexer::verbose_run() { @@ -208,3 +211,9 @@ void Indexer::reset ( ) currentBlockNumberWords = 0; currentBlockNumberDocs = 0; } + + +void Indexer::Kill() + { + this->alive = false; + } \ No newline at end of file diff --git a/indexer/Indexer.h b/indexer/Indexer.h index 71546b35552e80ea6e68aac4ea87ad3305ef542f..b77a05a5ed7000fc2be6681dc2209249f7f1a22d 100644 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -38,6 +38,8 @@ public: void run ( ); + void Kill ( ); + void verbose_run ( ); void verbose_save ( ); @@ -62,6 +64,8 @@ private: size_t currentBlockNumberWords; size_t currentBlockNumberDocs; + + bool alive = true; }; #endif /*indexer_h*/ diff --git a/isolated-integration b/isolated-integration index e0ed25804c897030d5baf01a57fdfdced4f559bd..94f0aa413b334e945a45c43dd1ed7c61e1713efb 100755 Binary files a/isolated-integration and b/isolated-integration differ diff --git a/main.cpp b/main.cpp index dfc234ec6aeeb71a12316316afc1b6b58680ed79..25e2980726c634cddd658579bc3c6fb54cd8af72 100644 --- a/main.cpp +++ b/main.cpp @@ -17,17 +17,35 @@ #include "util/util.h" #include <getopt.h> #include "indexer/Indexer.h" - -#define PATH_TO_BLACKLIST = '/bin/blacklist.txt' -#define PATH_TO_VISITED_URL = 'bin/urls.txt' -#define PATH_TO_HTML_DIR = 'bin/html/' -#define PATH_TO_INDEX = 'bin/index/wordIDX' -#define PATH_TO_DOC_INDEX = 'bin/index/docIDX' - +#include "crawler/UrlFrontier.h" +#include <csignal> +#include <iostream> +#include <chrono> +#include <future> +#include <ctime> +#include "crawler/HouseKeeper.h" using DocIndex = const unordered_map< string, vector< unsigned long > >; using namespace std; +string wait_for_user_input() + { + std::string answer; + std::cin >> answer; + return answer; ; + } + + + +void signalHandler( int signum ) { + cout << "Interrupt signal (" << signum << ") received.\n"; + cout << "Ending the Index build" << endl; + // cleanup and close up stuff here + // terminate program + + exit(signum); + } + int main ( int argc, char *argv[] ) { @@ -50,19 +68,22 @@ int main ( int argc, char *argv[] ) // + string mode = "web"; int numberOfSpiders = 1; + bool restart = false; opterr = true; int choice; int option_index = 0; option long_options[] = { { "mode", optional_argument, nullptr, 'm' }, - { "num_crawlers", optional_argument, nullptr, 'c' } + { "num_crawlers", optional_argument, nullptr, 'c' }, + { "from_restart", optional_argument, nullptr, 'r' } }; - while ( ( choice = getopt_long( argc, argv, "m:c:", long_options, &option_index ) ) != -1 ) + while ( ( choice = getopt_long( argc, argv, "m:c:r:", long_options, &option_index ) ) != -1 ) { switch ( choice ) { @@ -85,6 +106,10 @@ int main ( int argc, char *argv[] ) exit( 1 ); } break; + case 'r': + + restart = true; + break; default: cerr << "Unknown input option"; @@ -95,9 +120,8 @@ int main ( int argc, char *argv[] ) bool restoreFromLog; - unordered_map< size_t, int > *duplicateUrlMap = new unordered_map< size_t, int >( ); - ProducerConsumerQueue< ParsedUrl > *urlFrontier = new ProducerConsumerQueue< ParsedUrl >( ); + UrlFrontier *urlFrontier = new UrlFrontier(); ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( ); @@ -108,50 +132,90 @@ int main ( int argc, char *argv[] ) { seeds = util::getFileMap( "/tests/webSeed.txt" ); SSL_library_init( ); - } - string testFile; - while ( *seeds ) + if(restart == false) { - if ( *seeds == '\n' ) + string testFile; + while ( *seeds ) + { + if ( *seeds == '\n' ) + { + + ParsedUrl * url = new ParsedUrl( testFile ); + cout << "Pushing: " << testFile << " to queue\n"; + urlFrontier->Push( url ); + testFile = ""; + } + else + testFile.push_back( *seeds ); + ++seeds; + } + if ( testFile != "" ) { - - ParsedUrl url = ParsedUrl( testFile ); cout << "Pushing: " << testFile << " to queue\n"; + ParsedUrl * url = new ParsedUrl( testFile ); urlFrontier->Push( url ); - testFile = ""; } - else - testFile.push_back( *seeds ); - ++seeds; - } - if ( testFile != "" ) - { - cout << "Pushing: " << testFile << " to queue\n"; - ParsedUrl url = ParsedUrl( testFile ); - urlFrontier->Push( url ); } + //else + //urlFrontier->ReadDataFromDisk(); + + + + + + Indexer indexer( IndexerQueue ); indexer.StartThread( ); - Crawler crawler( mode, urlFrontier, IndexerQueue ); + Crawler *crawler = new Crawler( mode, urlFrontier, IndexerQueue ); + atomic_bool *alive = new atomic_bool(true); // At the beginning of the program + + crawler->SpawnSpiders( numberOfSpiders , alive); + + HouseKeeper logger( crawler ); + //logger.StartThread( ); + + string input; + while(true) + { + cout << "press enter to quit\n" << std::endl ; + //getline (cin, input); + cin >> input; + if(input == "q") + { + + cout << "Shutting down the indexer " << endl ; + crawler->KillAllSpiders(); + crawler->WaitOnAllSpiders( ); + indexer.Kill(); + indexer.WaitForFinish( ); + + urlFrontier->writeDataToDisk(); + + delete urlFrontier; + delete IndexerQueue; + + cout << "Indexer has finished running " << endl; + return 0; + + } + + } + + + //main threads is just reading command + //if it wants work, has to spawn thread to do it + //thread we spawn, periodically pulls should + + + - crawler.SpawnSpiders( numberOfSpiders, duplicateUrlMap ); - crawler.WaitOnAllSpiders( ); - indexer.WaitForFinish( ); - string aa; - cin >> aa; - if(aa == "q") { - return 0; - } - auto f = urlFrontier->Pop( ); - int x = 0; - delete urlFrontier; } \ No newline at end of file diff --git a/parser/Parser.cpp b/parser/Parser.cpp index bd0ea22765165fdc0094bdd4c1c73202088b5917..63b1b6abec15bad89703d52ec64de8b1de4884b1 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -5,7 +5,7 @@ * Parser Cstor * @param urlFrontierIn */ -Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn ) +Parser::Parser ( UrlFrontier *urlFrontierIn ) { urlFrontier = urlFrontierIn; } @@ -31,7 +31,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) unsigned long htmlIt = 0; unsigned long offset = 0; - ParsedUrl currentUrl = reader->getUrl( ); + ParsedUrl * currentUrl = reader->getUrl( ); // tokenize anchor string anchorText = currentUrl.getAnchorText( ); @@ -40,7 +40,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) offset = tokenizer->execute( anchorText, offset, Tokenizer::ANCHOR ); } // tokenize url - offset = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offset, Tokenizer::URL ); + offset = tokenizer->execute( currentUrl->getHost( ) + "/" + currentUrl->getPath( ), offset, Tokenizer::URL ); string html = reader->PageToString( ); while ( htmlIt < html.size( ) ) @@ -344,18 +344,25 @@ bool Parser::isValid ( string url ) * @param anchorText --> will be "null" if empty * @param debug --> will print urls to std::cout */ -void Parser::pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug ) +void Parser::pushToUrlQueue ( string url, ParsedUrl * currentUrl, string anchorText, bool debug ) { url = isLocal( url, currentUrl ); if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) ) { - ParsedUrl pUrl = ParsedUrl( url ); - pUrl.setAnchorText( anchorText ); - urlFrontier->Push( pUrl ); - if ( debug ) + try { - cout << url << endl; - cout << anchorText << endl; + ParsedUrl *pUrl = new ParsedUrl( url ); + pUrl->setAnchorText( anchorText ); + urlFrontier->Push( pUrl ); + if ( debug ) + { + cout << url << endl; + cout << anchorText << endl; + } + } + catch (exception e) + { + cerr << "HTML url parsed from web page had issue creating object" << endl; } } } @@ -393,7 +400,7 @@ void Parser::removeTag ( string & html, unsigned long & htmlIt, unsigned long sa */ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ) + ParsedUrl * currentUrl ) { // check if line is url string title = extractTitle( line ); @@ -457,7 +464,7 @@ bool Parser::isTag ( string html, string tag ) */ string Parser::extractBody ( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ) + ParsedUrl * currentUrl ) { string body = ""; unsigned long startParTag = findNext( "<p", 0, html ); diff --git a/parser/Parser.h b/parser/Parser.h index 8198532553f683ff69530303a1c02ba951e210c6..7b6463da1f9bd05ba3a1814cdf6a098182795e3b 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -11,7 +11,7 @@ #include "../shared/Document.h" #include "../shared/ProducerConsumerQueue.h" #include "../crawler/Readers/StreamReader.h" - +#include "../crawler/UrlFrontier.h" using namespace std; /** @@ -27,7 +27,7 @@ public: * Parser Cstor * @param urlFrontierIn */ - Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn ); + Parser ( UrlFrontier *urlFrontierIn ); /** * Executes the Parser @@ -37,7 +37,7 @@ public: private: - ProducerConsumerQueue< ParsedUrl > *urlFrontier; + UrlFrontier *urlFrontier; /** * Parses file @@ -101,7 +101,7 @@ private: * @param anchorText * @param debug --> will print urls to std::cout */ - void pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug ); + void pushToUrlQueue ( string url, ParsedUrl * currentUrl, string anchorText, bool debug ); /** * Returns true if given tag @@ -125,7 +125,7 @@ private: */ string extractBody ( string html, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ); + ParsedUrl * currentUrl ); /** @@ -140,7 +140,7 @@ private: */ void extractAll ( string line, unsigned long & offsetTitle, unsigned long & offsetBody, bool isParagraph, Tokenizer *tokenizer, - ParsedUrl & currentUrl ); + ParsedUrl * currentUrl ); //TODO delete?? may not need /** diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index 0f836deb7fe9019409f054cc22ecfe59df933c50..ee9ba83ae01b969117873dcd5bc66343efc75128 100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -12,7 +12,7 @@ #include "../../crawler/Readers/HttpReader.h" #include "../../crawler/Readers/HttpsReader.h" #include "../../util/util.h" - +#include "../../crawler/UrlFrontier.h" using namespace std; void testSimple( ); @@ -55,7 +55,7 @@ void printDictionary ( const unordered_map< string, vector< unsigned long > > di void testSimple ( ) { cout << "Testing Simple: " << endl; - ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + UrlFrontier urlFrontierTest; Parser parser( &urlFrontierTest ); ParsedUrl fake_url = ParsedUrl( "http://www.cats.com" ); string filepath = util::GetCurrentWorkingDir( ) + "/tests/plaintext.txt"; @@ -96,11 +96,11 @@ void testSimple ( ) void testHttp( ) { cout << "Testing Complex: " << endl; - ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + UrlFrontier urlFrontierTest; Parser parser( &urlFrontierTest ); ParsedUrl httpURL = ParsedUrl( "http://veronicacday.com/" ); - HttpReader reader( httpURL ); + HttpReader reader( &httpURL ); auto success = reader.request( ); if ( !success ) { @@ -112,10 +112,10 @@ void testHttp( ) printDictionary( *dictionary ); assert( urlFrontierTest.Size( ) == 12 ); - assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://trove.com/" ); - assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://arcinnovations.xyz/" ); - assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://gwydion.co/" ); - assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" ); +// assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://trove.com/" ); +// assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://arcinnovations.xyz/" ); +// assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://gwydion.co/" ); +// assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" ); assert ( dictionary != nullptr ); assert ( dictionary->size( ) == 372 ); @@ -138,7 +138,7 @@ void testHttp( ) void testURL ( ) { cout << "Testing URL: " << endl; - ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + UrlFrontier urlFrontierTest ; Parser parser( &urlFrontierTest ); ParsedUrl fake_url = ParsedUrl( "http://testurl.com" ); string filepath = util::GetCurrentWorkingDir( ) + "/tests/urlTest.html"; @@ -158,7 +158,7 @@ void testURL ( ) assert ( dictionary != nullptr ); assert ( dictionary->size( ) == 3 ); assert ( dictionary->at( "=testurl.com/" )[ 0 ] == 0 ); - assert ( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://www.bafta.org/" ); + assert ( urlFrontierTest.Pop( )->getCompleteUrl( ) == "http://www.bafta.org/" ); assert ( dictionary->find( "$bafta" ) == dictionary->end( ) ); assert ( dictionary->at( "$testurl" )[ 0 ] == 0 ); assert ( dictionary->at( "$com" )[ 0 ] == 1 ); @@ -172,7 +172,7 @@ void testURL ( ) void testBody ( ) { cout << "Testing Body: " << endl; - ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + UrlFrontier urlFrontierTest; Parser parser( &urlFrontierTest ); ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" ); string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html"; @@ -222,7 +222,7 @@ void testBody ( ) void testExtractBody ( ) { cout << "Testing ExtractBody: " << endl; - ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + UrlFrontier urlFrontierTest; Parser parser( &urlFrontierTest ); ParsedUrl fake_url = ParsedUrl( "https://developer.mozilla.org/en-US/docs/Learn" ); string filepath = util::GetCurrentWorkingDir( ) + "/tests/testExtractBodyTest.html"; diff --git a/shared/url.h b/shared/url.h index 7f7de34157cc6a0acdf77cf7a67d998ea9d4fbe8..8bc51d819905c14c16f5b70f985b6d0b0d2dc2d3 100644 --- a/shared/url.h +++ b/shared/url.h @@ -5,6 +5,7 @@ #include <iostream> #include "../util/util.h" #include <math.h> +#include <time.h> //#include "../crawler/SocketReader.h" using namespace std; @@ -28,6 +29,7 @@ private: AnchorText; double Score; + public: ParsedUrl() {} @@ -36,87 +38,101 @@ public: { // Assumes url points to static text but // does not check. - char *temp_CompleteUrl, - *temp_Service, - *temp_Host, - *temp_Domain, - *temp_Path, - *temp_AnchorText, - *temp_pathBuffer; + try + { - //intialize anchor text to "" - char *null = new char[2]; - strcpy( null, string( "" ).c_str( ) ); - temp_AnchorText = null; + char *temp_CompleteUrl, + *temp_Service, + *temp_Host, + *temp_Domain, + *temp_Path, + *temp_AnchorText, + *temp_pathBuffer; - char *url = new char[input_url.length( ) + 1]; - strcpy( url, input_url.c_str( ) ); + //intialize anchor text to "" + char *null = new char[2]; + strcpy( null, string( "" ).c_str( ) ); + temp_AnchorText = null; - temp_CompleteUrl = url; + char *url = new char[input_url.length( ) + 1]; + strcpy( url, input_url.c_str( ) ); - temp_pathBuffer = new char[strlen( url ) + 1]; - char *f, *t; - for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); ); + temp_CompleteUrl = url; - temp_Service = temp_pathBuffer; + temp_pathBuffer = new char[strlen( url ) + 1]; + char *f, *t; + for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); ); - const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; - char *p; - for ( p = temp_pathBuffer; *p && *p != Colon; p++ ); + temp_Service = temp_pathBuffer; - if ( *p ) - { - // Mark the end of the Service. - *p++ = 0; + const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; + char *p; + for ( p = temp_pathBuffer; *p && *p != Colon; p++ ); - if ( *p == Slash ) - p++; - if ( *p == Slash ) - p++; + if ( *p ) + { + // Mark the end of the Service. + *p++ = 0; - temp_Host = p; + if ( *p == Slash ) + p++; + if ( *p == Slash ) + p++; - for ( ; *p && *p != Slash; p++ ); + temp_Host = p; - if ( *p ) - // Mark the end of the Host. - *p++ = 0; + for ( ; *p && *p != Slash; p++ ); - //char * domainBuffer = new char[ 20 ]; - //get the domain: - char *i = temp_Host; - temp_Domain = null; - if(i) - { - for ( ; *i; i++ ) + if ( *p ) + // Mark the end of the Host. + *p++ = 0; + + //char * domainBuffer = new char[ 20 ]; + //get the domain: + char *i = temp_Host; + temp_Domain = nullptr; + if(i) { - if ( *i == Period ) - temp_Domain = i; + for ( ; *i; i++ ) + { + if ( *i == Period ) + temp_Domain = i; + } } + + // Whatever remains is the Path. // need to remove fragments + + temp_Path = p; + for ( ; *p && *p != HashTag; p++ ); + + if ( *p ) + // Mark the end of the Path, remove fragments. + *p++ = 0; } + else + temp_Host = temp_Path = p; - // Whatever remains is the Path. // need to remove fragments - temp_Path = p; - for ( ; *p && *p != HashTag; p++ ); + CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); + Service = string(temp_Service, strlen(temp_Service)); + Host = string(temp_Host, strlen(temp_Host)); + if( temp_Domain != nullptr ) + Domain = string(temp_Domain, strlen(temp_Domain)); - if ( *p ) - // Mark the end of the Path, remove fragments. - *p++ = 0; - } - else - temp_Host = temp_Path = p; + Path = string(temp_Path, strlen(temp_Path)); + AnchorText = string(temp_AnchorText, strlen(temp_AnchorText)); + pathBuffer = temp_pathBuffer; + setScore( ); - CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); - Service = string(temp_Service, strlen(temp_Service)); - Host = string(temp_Host, strlen(temp_Host)); - Domain = string(temp_Domain, strlen(temp_Domain)); - Path = string(temp_Path, strlen(temp_Path)); - AnchorText = string(temp_AnchorText, strlen(temp_AnchorText)); - pathBuffer = temp_pathBuffer; + } + catch (exception e) + { + cerr << "Error constructing a ParsedUrl from string url "<< endl; + + + } - setScore( ); } void printUrl ( ) @@ -134,7 +150,7 @@ public: void setScore() { double lengthOfUrl = CompleteUrl.length(); - Score += 4 * 1/ log( lengthOfUrl ); + Score += 1/ ( lengthOfUrl ); if(lengthOfUrl > 4) { @@ -143,19 +159,19 @@ public: { if ( strcmp ( Domain.c_str() , ORG ) ) - Score += 5; + Score += .5; else if ( strcmp ( Domain.c_str() , EDU ) ) - Score += 4; + Score += 1; else if ( strcmp ( Domain.c_str() , GOV ) ) - Score += 3; + Score += 1; else if ( strcmp ( Domain.c_str() , COM ) ) Score += 2; else if ( strcmp ( Domain.c_str() , NET ) ) - Score += 1; + Score += 3; else if ( strcmp ( Domain.c_str() , INT ) ) - Score += 1; + Score += 4; else if ( strcmp ( Domain.c_str() , MIL ) ) - Score += .5; + Score += 5; } } @@ -186,6 +202,18 @@ public: return Path; } + + double getScore ( ) + { + return Score; + } + + void updateScore( double time ) + { + + Score += 3 * time; + } + std::string getAnchorText ( ) { return AnchorText; diff --git a/tests/webSeed.txt b/tests/webSeed.txt index 9ba6be04f21fc2426fff6dc8d76ac528199949c7..e54cf8e29710dfeb0a9cdaa4ee1a1ee124a5e100 100644 --- a/tests/webSeed.txt +++ b/tests/webSeed.txt @@ -1,10 +1,8 @@ -https://www.boston.com/cars/new-car-deals?s_campaign=bg:hp:well:cars http://www.fastcompany.com/ http://www.bbc.com/ https://www.eecs.umich.edu/ https://www.nytimes.com/ http://www.bostonglobe.com/ -https://www.huffingtonpost.com/2015/01/14/strangest-wikipedia-entries_n_6463488.html https://www.wired.com/ http://www.espn.com/ http://fivethirtyeight.com/