Skip to content
Snippets Groups Projects
Commit 086ec1d9 authored by vcday's avatar vcday
Browse files

merge conflicts

parents 15992b70 9a1995c4
No related branches found
No related tags found
No related merge requests found
Showing
with 201 additions and 86 deletions
......@@ -13,6 +13,7 @@ add_executable(crawler-parser-Test
shared/ThreadClass.h
shared/url.h
crawler/crawler.cpp
crawler/UrlFrontier.cpp
crawler/Readers/StreamReader.h
crawler/Readers/HttpReader.cpp
crawler/Readers/HttpsReader.cpp
......@@ -33,6 +34,8 @@ add_executable(crawler-parser-indexer-Test
shared/ThreadClass.h
shared/url.h
crawler/crawler.cpp
crawler/UrlFrontier.cpp
crawler/HouseKeeper.cpp
crawler/Readers/StreamReader.h
crawler/Readers/HttpReader.cpp
crawler/Readers/HttpsReader.cpp
......@@ -52,6 +55,7 @@ add_executable(isolated-integration
crawler/tests/crawlerTest.cpp
shared/ProducerConsumerQueue.h
shared/ThreadClass.h
crawler/UrlFrontier.cpp
shared/url.h
crawler/crawler.cpp
crawler/Readers/StreamReader.h
......@@ -67,6 +71,29 @@ add_executable(isolated-integration
util/stringProcessing.cpp
indexer/Indexer.cpp)
# Test executable for the URL frontier. Builds crawler/tests/urlFrontierTest.cpp
# together with the crawler, stream-reader, parser, indexer and utility sources
# listed below.
add_executable(url-frontier-test
crawler/tests/urlFrontierTest.cpp
shared/ProducerConsumerQueue.h
shared/ThreadClass.h
shared/url.h
crawler/crawler.cpp
crawler/UrlFrontier.cpp
crawler/Readers/StreamReader.h
crawler/Readers/HttpReader.cpp
crawler/Readers/HttpsReader.cpp
crawler/Readers/LocalReader.cpp
crawler/spider.cpp
util/util.cpp
shared/Document.cpp
parser/Parser.cpp
util/Stemmer.cpp
util/Tokenizer.cpp
util/stringProcessing.cpp
indexer/Indexer.cpp)
add_executable(StringProcessingTest
util/stringProcessing.cpp
util/Stemmer.cpp
......@@ -92,6 +119,7 @@ add_executable(ParserTest
shared/ProducerConsumerQueue.h
util/stringProcessing.cpp
util/Stemmer.cpp
crawler/UrlFrontier.cpp
parser/tests/parserTest.cpp
crawler/Readers/StreamReader.h
crawler/Readers/LocalReader.cpp
......@@ -113,11 +141,26 @@ add_executable(ISRWord-tests
util/stringProcessing.cpp
util/Stemmer.cpp )
# Test executable for the OR constraint-solver ISR. Builds
# constraintSolver/tests/ISROrTests.cpp with the ISR, ISRWord, ISROr and
# ISREndDoc sources plus the string-processing and stemming utilities.
add_executable(ISROR-tests
util/util.cpp
constraintSolver/ISR.cpp
constraintSolver/ISRWord.cpp
constraintSolver/ISROr.cpp
constraintSolver/tests/ISROrTests.cpp
constraintSolver/ISREndDoc.cpp
util/stringProcessing.cpp
util/Stemmer.cpp )
# OpenSSL is mandatory: REQUIRED makes configuration fail when it is not found.
find_package(OpenSSL REQUIRED)
# ParserTest links OpenSSL only; the targets below link both OpenSSL and pthread.
target_link_libraries(ParserTest OpenSSL::SSL)
target_link_libraries(isolated-integration OpenSSL::SSL pthread)
target_link_libraries(url-frontier-test OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-Test OpenSSL::SSL pthread)
target_link_libraries(crawler-parser-indexer-Test OpenSSL::SSL pthread)
......
File deleted
......@@ -5,8 +5,15 @@
#pragma once
//#include "Post.h"
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
typedef size_t Location; // Location 0 is the null location.
typedef size_t Location; // Location 0 is the null location.
class ISR
{
......@@ -29,7 +36,6 @@ public:
//Returns first instance of word after target location
virtual Location Seek ( Location target );
virtual ISR *GetDocumentISR ( );
//Returns the location of the end of the document
virtual Location GetEndDocument ( );
......
......@@ -10,6 +10,10 @@ Location ISROr::GetStartLocation ( )
return nearestStartLocation;
}
// Reports the position this OR-ISR currently sits on, i.e. the value stored
// in nearestStartLocation.
Location ISROr::GetCurrentLocation ( )
    {
    return this->nearestStartLocation;
    }
Location ISROr::GetEndLocation ( )
{
......@@ -17,6 +21,8 @@ Location ISROr::GetEndLocation ( )
}
Location ISROr::Seek ( Location target )
{
......@@ -30,38 +36,39 @@ Location ISROr::Seek ( Location target )
return 1;
}
/*
Returns the location of the next document that is a match
*/
ISR *ISROr::Next ( )
Location ISROr::Next ( )
{
Location nearestEnd = this->nearestTerm->GetEndDocument( );
while ( *Terms )
for(auto Term : Terms)
{
Location newSeekLocation = *Terms->Seek( nearestEnd + 1 );
Location newSeekLocation = Term->Seek( nearestEnd + 1 );
if ( newSeekLocation < nearestStartLocation )
{
nearestStartLocation = newSeekLocation;
nearestTerm = *Term;
nearestTerm = Term;
}
*Terms++;
}
return this->nearestTerm->GetDocumentISR( );
return this->nearestTerm->currentLocation;
}
ISR *ISROR::GetCurrentEndDoc ( )
/*
ISR *ISROr::GetCurrentEndDoc ( )
{
return this->nearestTerm->GetDocumentISR( );
}
*/
......@@ -5,24 +5,28 @@
#pragma once
#include "ISR.h"
#include <vector>
// Find occurrences of any child ISR.
class ISROr : publicISR
using namespace std;
class ISROr : public ISR
{
public:
ISR **Terms;
vector<ISR*>Terms;
unsigned NumberOfTerms;
Location GetCurrentLocation();
Location GetStartLocation ( );
Location GetEndLocation ( );
Location Seek ( Location target );
ISR *GetCurrentEndDoc ( );
//ISR *GetCurrentEndDoc ( );
Location First ( ) ;
Location Next ( );
//{ Do a next on the nearest term, then return// the new nearest match.}
......@@ -34,12 +38,11 @@ public:
// { Seek all the ISRs to the first occurrence just past the end of this document.returnSeek( DocumentEnd->GetEndLocation( ) + 1 );}
ISROr ( ISR **InputTerms ) : Terms( InputTerms )
ISROr ( vector<ISR * > InputTerms ) : Terms( InputTerms )
{
ISR *currentTerm = *InputTerms;
While( *currentTerm )
{
for(auto currentTerm : InputTerms)
{
currentTerm->First( );
Location currentLocation = currentTerm->currentLocation;
if ( currentLocation < nearestStartLocation )
......@@ -53,7 +56,7 @@ public:
nearestEndLocation = currentLocation;
}
++NumberOfTerms;
*currentTerm++;
currentTerm++;
}
......
......@@ -12,7 +12,7 @@ ISRWord::ISRWord ( char *word ) : term( word )
{
getChunks( );
currentChunk = 0;
currentLocation = first( );
currentLocation = First( );
}
// put into util file
......@@ -115,7 +115,7 @@ void ISRWord::getChunks() {
//set current memory map
//returns offset into corpus
Location ISRWord::first ( )
Location ISRWord::First ( )
{
string currentChunkSeekFileLocation =
util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
......@@ -147,7 +147,7 @@ Location ISRWord::first ( )
//find way to increment to next delta
//return new location
Location ISRWord::next ( )
Location ISRWord::Next ( )
{
if ( *currentMemMap == '\n' )
{
......@@ -158,7 +158,7 @@ Location ISRWord::next ( )
return currentLocation;
}
currentLocation = first( );
currentLocation = First( );
}
else
{
......@@ -184,7 +184,7 @@ Location ISRWord::getCurrentLocation()
//check seek lookup table to find if offset+absulte is bigger than target
//if so, set location to that big chunk
//go to next chunk
Location ISRWord::seek( Location target ) {
Location ISRWord::Seek( Location target ) {
if(!wordSeekLookupTable.empty()) {
auto best = wordSeekLookupTable.front();
for(auto entry : wordSeekLookupTable) {
......@@ -201,7 +201,7 @@ Location ISRWord::seek( Location target ) {
}
}
} else {
while(next() <= target) {
while(Next() <= target) {
}
return currentLocation;
}
......
......@@ -12,51 +12,50 @@
#include <sys/types.h>
#include "WordSeek.h"
#include "../util/util.h"
#include "ISR.h"
using namespace std;
//Find occurrences of individual words
typedef size_t Location;
class ISRWord
class ISRWord : public ISR
{
public:
ISRWord ( char *word );
public:
ISRWord ( char *word );
vector< size_t > getSeekContents ( string fileName );
vector< size_t > getSeekContents ( string fileName );
unsigned GetDocumentCount ( );
unsigned GetDocumentCount ( );
unsigned GetNumberOfOccurrences ( );
unsigned GetNumberOfOccurrences ( );
// ISR* DocumentEnd;
Location first ( );
// ISR* DocumentEnd;
Location First ( );
Location next ( );
Location Next ( );
Location nextDocument ( );
Location nextDocument ( );
Location seek ( Location target );
Location Seek ( Location target );
// ISR *GetDocumentISR( );
// ISR *GetDocumentISR( );
Location GetEndDocument ( );
Location currentLocation;
char *term;
char *masterIndex;
vector< size_t > listOfChunks;
vector< WordSeek > wordSeekLookupTable;
size_t currentChunk;
char *currentMemMap;
Location GetEndDocument ( );
Location currentLocation;
char *term;
char *masterIndex;
vector< size_t > listOfChunks;
vector< WordSeek > wordSeekLookupTable;
size_t currentChunk;
char *currentMemMap;
//set member variables to all of the chunks that occur, update current chunk
void getChunks ( );
Location getCurrentLocation();
//set member variables to all of the chunks that occur, update current chunk
void getChunks ( );
Location getCurrentLocation();
private:
private:
};
//
// Created by Jake Close on 3/16/18.
//
#include <iostream>
#include <set>
#include "../../indexer/DocumentEnding.h"
#include "../ISRWord.h"
#include "../ISREndDoc.h"
#include "../ISROr.h"
#include <vector>
using namespace std;
int main ( )
{
char* query;
ISRWord *q1 = new ISRWord("iphone");
ISRWord *q2 = new ISRWord("apple");
vector< ISR* > input;
input.push_back(q1);
input.push_back(q2);
ISROr *queryOr = new ISROr(input);
ISREndDoc endDocs;
vector<size_t> locations;
vector<DocumentEnding> docEnds;
set<string> urls;
while(queryOr->GetCurrentLocation() != 9999999999999) {
locations.push_back(queryOr->Next());
}
while(endDocs.next().url != "aaa")
{
for(auto locs : locations)
{
if(locs < endDocs.getCurrentDoc().docEndPosition &&
locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) {
urls.insert(endDocs.getCurrentDoc().url);
}
}
}
for(auto urrl : urls) {
cout << urrl << endl;
}
return 0;
}
\ No newline at end of file
......@@ -20,17 +20,19 @@ int main ( )
vector<DocumentEnding> docEnds;
set<string> urls;
while(queryWord.getCurrentLocation() != 9999999999999) {
locations.push_back(queryWord.next());
locations.push_back(queryWord.Next());
}
while(endDocs.next().url != "aaa") {
for(auto locs : locations) {
while(endDocs.next().url != "aaa")
{
for(auto locs : locations)
{
if(locs < endDocs.getCurrentDoc().docEndPosition &&
locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) {
urls.insert(endDocs.getCurrentDoc().url);
}
}
}
}
for(auto urrl : urls) {
cout << urrl << endl;
}
......
No preview for this file type
File deleted
//
// Created by Ben Bergkamp on 2/1/18.
// Created by Jake Close on 2/1/18.
//
#include <thread> // std::this_thread::sleep_for
#include <chrono> // std::chrono::seconds
#include "HouseKeeper.h"
void HouseKeeper::FuncToRun ( )
{
void HouseKeeper::run(){
//Sleep(3 minutes)
//Gather data
cout << "SAVING STATE OF URL FRONTIER " << endl;
while(true)
{
std::this_thread::sleep_for (std::chrono::seconds(30));
crawler->urlFrontier->writeDataToDisk();
}
}
\ No newline at end of file
......@@ -8,20 +8,20 @@
#include<string>
#include <pthread.h>
#include <iostream>
#include "crawler.h"
class HouseKeeper : public ThreadClass
{
public:
HouseKeeper ( )
HouseKeeper ( Crawler * crawler_in ) : crawler(crawler_in)
{ };
virtual void FuncToRun ( );
void run( );
private:
//members
Crawler* crawler;
};
#endif //EECS398_SEARCH_CRAWLERSTATISTICS_H
......@@ -14,11 +14,11 @@ bool HttpReader::request ( )
// Get the host address.
struct hostent *host = gethostbyname( url.getHost().c_str() );
struct hostent *host = gethostbyname( url->getHost().c_str() );
if ( host == nullptr )
throw HTTPConnectionError;
if(url.getService() != "http")
if(url->getService() != "http")
throw HTTPConnectionError;
assert( host );
......@@ -40,9 +40,9 @@ bool HttpReader::request ( )
cout << "Socket Reader is pulling from the web" << endl;
string getMessage = "GET ";
getMessage += url.getCompleteUrl();
getMessage += url->getCompleteUrl();
getMessage += " HTTP/1.1\r\nHost: ";
getMessage += url.getHost();
getMessage += url->getHost();
getMessage += "\r\nConnection: close\r\n\r\n";
cout << getMessage << endl;
......@@ -78,7 +78,7 @@ string HttpReader::PageToString ( )
return temp;
}
ParsedUrl HttpReader::getUrl ( )
ParsedUrl * HttpReader::getUrl ( )
{
return url;
}
......
......@@ -9,7 +9,7 @@ class HttpReader : public StreamReader
{
public:
HttpReader ( ParsedUrl url_in ) : url( url_in )
HttpReader ( ParsedUrl * url_in ) : url( url_in )
{ }
bool request ( );
......@@ -20,14 +20,14 @@ public:
string PageToString ( );
ParsedUrl getUrl ( );
ParsedUrl * getUrl ( );
void closeReader ( );
private:
ParsedUrl url;
ParsedUrl * url;
int sock;
};
......@@ -7,12 +7,12 @@ bool HttpsReader::request ( )
{
try
{
struct hostent *host = gethostbyname( url.getHost().c_str() );
struct hostent *host = gethostbyname( url->getHost().c_str() );
if ( host == nullptr )
throw HTTPSconnectionError;
if( url.getService() != "https")
if( url->getService() != "https")
throw HTTPSconnectionError;
assert( host );
......@@ -54,9 +54,9 @@ bool HttpsReader::request ( )
// Send a GET message for the desired page through the SSL.
string getMessage = "GET ";
getMessage += url.getCompleteUrl();
getMessage += url->getCompleteUrl();
getMessage += " HTTP/1.1\r\nHost: ";
getMessage += url.getHost();
getMessage += url->getHost();
getMessage += "\r\nConnection: close\r\n\r\n";
cout << getMessage << endl;
......@@ -115,7 +115,7 @@ bool HttpsReader::checkStatus ( )
}
ParsedUrl HttpsReader::getUrl ( )
ParsedUrl * HttpsReader::getUrl ( )
{
return url;
}
......
......@@ -10,7 +10,7 @@ class HttpsReader : public StreamReader
{
public:
HttpsReader ( ParsedUrl url_in ) : url( url_in )
HttpsReader ( ParsedUrl * url_in ) : url( url_in )
{ }
bool request ( );
......@@ -19,14 +19,14 @@ public:
string PageToString ( );
ParsedUrl getUrl ( );
ParsedUrl * getUrl ( );
void closeReader ( );
bool checkStatus ( );
private:
ParsedUrl url;
ParsedUrl * url;
int sock;
SSL *ssl;
SSL_CTX *ctx;
......
......@@ -29,10 +29,10 @@ string LocalReader::PageToString ( )
return temp;
}
// Returns the URL this reader stands in for, parsed from test_url.
//
// The ParsedUrl is heap-allocated so the returned pointer stays valid after
// the call; handing back the address of a function-local object would give
// the caller a dangling pointer.
// NOTE(review): ownership passes to the caller here - confirm callers release
// it, or keep a ParsedUrl * member in LocalReader as HttpReader/HttpsReader do.
ParsedUrl * LocalReader::getUrl ( )
    {
    return new ParsedUrl( test_url );
    }
bool LocalReader::checkStatus ( )
......
......@@ -17,7 +17,7 @@ public:
bool fillBuffer ( char *buf, size_t buf_size );
ParsedUrl getUrl ( );
ParsedUrl * getUrl ( );
bool checkStatus ( );
......
......@@ -30,7 +30,7 @@ public:
virtual string PageToString ( ) = 0;
virtual ParsedUrl getUrl ( ) =0;
virtual ParsedUrl * getUrl ( ) =0;
virtual void closeReader ( ) = 0;
};
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment