Commit 58f44a4f authored by jsclose

code reformat for style

parent 9f3b7562
@@ -8,6 +8,7 @@
#include<string>
#include <pthread.h>
#include <iostream>
class HouseKeeper : public ThreadClass
{
...
@@ -3,7 +3,8 @@
//
#include "HttpReader.h"
std::runtime_error HTTPConnectionError("Error connecting HTTP to url");
std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" );
bool HttpReader::request()
{
@@ -15,7 +16,7 @@ bool HttpReader::request()
// Get the host address.
struct hostent *host = gethostbyname( url.Host );
if( host == nullptr)
if ( host == nullptr )
throw HTTPConnectionError;
assert( host );
@@ -48,14 +49,14 @@ bool HttpReader::request()
return isSuccess;
}
catch (std::exception& e)
catch (std::exception &e)
{
cerr << "Error trying to connect to Host" << endl;
return false;
}
}
bool HttpReader::fillBuffer(char * buf, size_t buf_size)
bool HttpReader::fillBuffer( char *buf, size_t buf_size )
{
return (recv( sock, buf, buf_size, 0 ) == buf_size);
}
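
Aside: recv on a TCP socket may legally return fewer bytes than requested, so the single-call comparison in fillBuffer above can report failure on a partial read even when the rest of the data is still in flight. A minimal sketch of a loop that fills the buffer completely (hypothetical helper, not part of this commit; the same caveat applies to SSL_read in HttpsReader below):

#include <sys/types.h>
#include <sys/socket.h>
#include <cstddef>

// Keep calling recv until buf_size bytes have arrived, the peer closes
// the connection (recv returns 0), or an error occurs (recv returns -1).
bool fillBufferFully( int sock, char *buf, size_t buf_size )
{
	size_t total = 0;
	while ( total < buf_size )
	{
		ssize_t got = recv( sock, buf + total, buf_size - total, 0 );
		if ( got <= 0 )
			return false;
		total += ( size_t ) got;
	}
	return true;
}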
@@ -67,10 +68,10 @@ string HttpReader::PageToString()
char buf[10240];
int bytes = 0;
while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 )
{
temp += string(buf, bytes);
}
while ((bytes = recv( sock, buf, 10240, 0 )) > 0 )
{
temp += string( buf, bytes );
}
return temp;
}
@@ -86,13 +87,13 @@ bool HttpReader::checkStatus()
char buff[12];
int bytes = 0;
bytes = recv( sock, buff, 12, 0 ) ;
bytes = recv( sock, buff, 12, 0 );
if( strncmp(buff, "HTTP/1.1 200",11 ) == 0)
if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 )
return true;
else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
else if ( strncmp( buff, "HTTP/1.1 400", 11 ) == 0 )
return true;
else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
else if ( strncmp( buff, "HTTP/1.1 302", 11 ) == 0 )
{
cerr << "URL REDIRECTION" << endl;
return false;
...
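
Aside: the status literals above are 12 characters long but strncmp is told to compare only 11, so "HTTP/1.1 200" actually matches any "HTTP/1.1 20x" line (and likewise for the 400 and 302 checks). A sketch that parses the code numerically instead, letting callers branch on ranges (hypothetical, not part of this commit; assumes the response line is NUL-terminated, which the 12-byte buff above is not):

#include <cstdio>

// Returns the numeric status from a line such as "HTTP/1.1 302 Found",
// or -1 if the line does not parse.
int parseStatus( const char *line )
{
	int code = -1;
	if ( sscanf( line, "HTTP/%*d.%*d %d", &code ) == 1 )
		return code;
	return -1;
}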
@@ -9,12 +9,20 @@ class HttpReader : public StreamReader
{
public:
HttpReader( ParsedUrl url_in ) : url( url_in ) { }
HttpReader( ParsedUrl url_in ) : url( url_in )
{ }
bool request();
bool fillBuffer(char * buf, size_t buf_size);
bool fillBuffer( char *buf, size_t buf_size );
bool checkStatus();
string PageToString();
ParsedUrl getUrl();
void closeReader();
...
@@ -4,7 +4,7 @@
#include "HttpsReader.h"
std::runtime_error HTTPSconnectionError("Error connecting HTTPS to url");
std::runtime_error HTTPSconnectionError( "Error connecting HTTPS to url" );
bool HttpsReader::request()
{
@@ -12,7 +12,7 @@ bool HttpsReader::request()
{
struct hostent *host = gethostbyname( url.Host );
if(host == nullptr)
if ( host == nullptr )
throw HTTPSconnectionError;
assert( host );
@@ -47,7 +47,7 @@ bool HttpsReader::request()
// Establish an SSL connection.
int sslConnectResult = SSL_connect( ssl );
if(sslConnectResult != 1)
if ( sslConnectResult != 1 )
throw HTTPSconnectionError;
assert( sslConnectResult == 1 );
@@ -65,13 +65,14 @@ bool HttpsReader::request()
bool isSuccess = checkStatus( );
return isSuccess;
}
catch (std::exception& e) {
cerr << "Error trying to connect to Host" << endl;
return false;
catch (std::exception &e)
{
cerr << "Error trying to connect to Host" << endl;
return false;
}
}
bool HttpsReader::fillBuffer(char * buf, size_t buf_size)
bool HttpsReader::fillBuffer( char *buf, size_t buf_size )
{
return (SSL_read( ssl, buf, buf_size ) == buf_size);
}
@@ -83,10 +84,10 @@ string HttpsReader::PageToString()
char buf[10240];
int bytes = 0;
while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 )
{
temp += string(buf, bytes);
}
while ((bytes = SSL_read( ssl, buf, 10240 )) > 0 )
{
temp += string( buf, bytes );
}
return temp;
}
@@ -99,15 +100,15 @@ bool HttpsReader::checkStatus()
bytes = SSL_read( ssl, buff, 12 );
if( strncmp(buff, "HTTP/1.1 200",11 ) == 0)
if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 )
return true;
else if ( strncmp( buff, "HTTP/1.1 400", 11 ) == 0 )
return true;
else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
return true;
else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
{
cerr << "URL REDIRECTION" << endl;
return false;
}
else if ( strncmp( buff, "HTTP/1.1 302", 11 ) == 0 )
{
cerr << "URL REDIRECTION" << endl;
return false;
}
cerr << "Bad Request of TYPE:: " << buff << endl;
return false;
@@ -122,9 +123,9 @@ ParsedUrl HttpsReader::getUrl()
void HttpsReader::closeReader()
{
SSL_shutdown(ssl);
SSL_free(ssl);
SSL_CTX_free(ctx);
close(sock);
SSL_shutdown( ssl );
SSL_free( ssl );
SSL_CTX_free( ctx );
close( sock );
}
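
Aside on the teardown order above: SSL_shutdown sends the TLS close_notify alert while the socket is still open, SSL_free releases the connection object before SSL_CTX_free drops the context it was created from (the context is reference-counted, so this ordering is conventional rather than mandatory), and the raw socket is closed last.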
@@ -10,18 +10,24 @@ class HttpsReader : public StreamReader
{
public:
HttpsReader( ParsedUrl url_in ) : url( url_in ) { }
HttpsReader( ParsedUrl url_in ) : url( url_in )
{ }
bool request();
bool fillBuffer(char * buf, size_t buf_size);
bool fillBuffer( char *buf, size_t buf_size );
string PageToString();
ParsedUrl getUrl();
void closeReader();
bool checkStatus();
private:
ParsedUrl url;
int sock;
SSL * ssl;
SSL_CTX * ctx;
SSL *ssl;
SSL_CTX *ctx;
};
@@ -12,10 +12,11 @@ bool LocalReader::request()
return true;
}
bool LocalReader::fillBuffer(char * buf, size_t buf_size){
bool LocalReader::fillBuffer( char *buf, size_t buf_size )
{
//FIXME
strcpy(buf, util::getFileMap( fileName )) ;
strcpy( buf, util::getFileMap( fileName ));
return true;
}
@@ -23,14 +24,14 @@ bool LocalReader::fillBuffer(char * buf, size_t buf_size){
string LocalReader::PageToString()
{
//FIXME
string s("fix me");
string s( "fix me" );
return s;
}
ParsedUrl LocalReader::getUrl()
{
//FIXME
ParsedUrl url("");
ParsedUrl url( "" );
return url;
}
@@ -39,6 +40,7 @@ bool LocalReader::checkStatus()
{
return true;
}
void LocalReader::closeReader()
{
//FIXME
...
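
Aside: the FIXME'd fillBuffer above copies whatever util::getFileMap returns without honoring buf_size, so a file longer than the caller's buffer overruns it. A bounded sketch (hypothetical, not part of this commit; assumes getFileMap returns a NUL-terminated C string):

#include <cstring>

bool fillBufferBounded( const char *fileMap, char *buf, size_t buf_size )
{
	size_t len = strlen( fileMap );
	if ( len + 1 > buf_size )
		return false;                    // caller's buffer is too small
	memcpy( buf, fileMap, len + 1 );     // include the terminating NUL
	return true;
}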
@@ -10,13 +10,19 @@ class LocalReader : public StreamReader
{
public:
LocalReader( string url_in ) : fileName( url_in ) { }
LocalReader( string url_in ) : fileName( url_in )
{ }
bool request();
bool fillBuffer(char * buf, size_t buf_size);
bool fillBuffer( char *buf, size_t buf_size );
bool checkStatus();
string PageToString();
ParsedUrl getUrl();
void closeReader();
private:
...
@@ -22,12 +22,19 @@ using namespace std;
class StreamReader
{
public:
StreamReader() {};
StreamReader()
{ };
virtual bool request() = 0;
virtual bool fillBuffer(char * buf, size_t buf_size) = 0;
virtual bool fillBuffer( char *buf, size_t buf_size ) = 0;
virtual bool checkStatus() = 0;
virtual string PageToString() = 0;
virtual ParsedUrl getUrl() =0;
virtual void closeReader() = 0;
};
...
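
StreamReader is the polymorphic seam of the crawler: Spider drives HTTP, HTTPS, and local files through these six virtuals without knowing which reader is behind the pointer. A sketch of the contract a caller follows (hypothetical driver; SR_factory is the factory defined in Spider.cpp later in this commit, and error handling is elided):

void fetchOne( ParsedUrl url, string mode )
{
	StreamReader *reader = SR_factory( url, mode );
	if ( reader != nullptr && reader->request( ) )
	{
		string page = reader->PageToString( );
		// ... hand page to the parser ...
		reader->closeReader( );
	}
	delete reader;
}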
@@ -4,11 +4,12 @@
#include "crawler.h"
void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup , unordered_map < size_t, int > *duplicateUrlMap )
void Crawler::SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup,
unordered_map < size_t, int > *duplicateUrlMap )
{
for ( size_t i = 0; i < num_spiders; i++ )
{
Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap , this->IndexerQueue);
Spider *temp = new Spider( this->mode, this->urlFrontier, docMapLookup, duplicateUrlMap, this->IndexerQueue );
temp->StartThread( );
this->spiders.push_back( temp );
}
@@ -20,8 +21,8 @@ void Crawler::WaitOnAllSpiders()
cout << "Waiting for spiders to finish...\n";
for ( Spider *spider : spiders )
{
spider->WaitForFinish( );
delete spider; //FIXME do this in destructor?
spider->WaitForFinish( );
delete spider; //FIXME do this in destructor?
}
}
...
@@ -11,29 +11,32 @@
*
*/
using namespace std;
using DocIndex = const unordered_map< string, vector< unsigned long > >;
using DocIndex = const unordered_map < string, vector < unsigned long > >;
class Crawler
{
public:
Crawler( string mode_in, ProducerConsumerQueue < ParsedUrl > *url_q_in , ProducerConsumerQueue< DocIndex* > *doc_index_queue_in)
: IndexerQueue (doc_index_queue_in), mode( mode_in ), urlFrontier( url_q_in )
Crawler( string mode_in, ProducerConsumerQueue < ParsedUrl > *url_q_in,
ProducerConsumerQueue < DocIndex * > *doc_index_queue_in )
: IndexerQueue( doc_index_queue_in ), mode( mode_in ), urlFrontier( url_q_in )
{ };
//spawns a number of worker spiders
void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup , unordered_map < size_t, int > *duplicateUrlMap);
void SpawnSpiders( size_t num_spiders, unordered_map < string, int > *docMapLookup,
unordered_map < size_t, int > *duplicateUrlMap );
//Creates a housekeeping thread
void houseKeeper();
void KillAllSpiders( );
void WaitOnAllSpiders( );
void KillAllSpiders();
void WaitOnAllSpiders();
private:
vector < Spider * > spiders;
ProducerConsumerQueue < ParsedUrl > *urlFrontier;
ProducerConsumerQueue< DocIndex* > *IndexerQueue;
ProducerConsumerQueue < DocIndex * > *IndexerQueue;
//CrawlerStatistics housekeeper;
string mode;
...
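
The crawl pipeline is wired together through two ProducerConsumerQueues: spiders pop URLs off the frontier and push one DocIndex per parsed page, which the Indexer consumes. A minimal wiring sketch under the constructor signatures above (hypothetical main; default queue construction, the spider count, and Indexer::StartThread are assumptions, not shown in this commit):

int main( )
{
	ProducerConsumerQueue < ParsedUrl > urlFrontier;
	ProducerConsumerQueue < DocIndex * > indexerQueue;
	unordered_map < string, int > docMapLookup;
	unordered_map < size_t, int > duplicateUrlMap;

	Crawler crawler( "web", &urlFrontier, &indexerQueue );
	crawler.SpawnSpiders( 4, &docMapLookup, &duplicateUrlMap );

	Indexer indexer( &indexerQueue );
	indexer.StartThread( );   // Indexer, like Spider, is assumed to be a ThreadClass

	crawler.WaitOnAllSpiders( );
	return 0;
}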
@@ -10,64 +10,66 @@
#include "Readers/LocalReader.h"
#include "../parser/Parser.h"
using DocIndex = const unordered_map< string, vector< unsigned long > >;
using DocIndex = const unordered_map < string, vector < unsigned long > >;
// FIND A BETTER PLACE TO PUT THIS FUNCTION
StreamReader* SR_factory(ParsedUrl url, string mode)
StreamReader *SR_factory( ParsedUrl url, string mode )
{
string localFile;
StreamReader *newReader = nullptr
;
StreamReader *newReader = nullptr;
if ( mode == "local" )
{
{
newReader = new LocalReader( url.CompleteUrl );
}
else if ( mode == "web" )
{
if(!strcmp(url.Service, "http")) {
newReader = new HttpReader(url);
}
else if(!strcmp(url.Service,"https")){
newReader = new HttpsReader(url);
}
else{
else if ( mode == "web" )
{
if ( !strcmp( url.Service, "http" ))
{
newReader = new HttpReader( url );
}
else if ( !strcmp( url.Service, "https" ))
{
newReader = new HttpsReader( url );
}
else
{
cerr << "Error reading service type\n";
}
}
}
return newReader;
}
void printDocIndex( DocIndex* dict )
void printDocIndex( DocIndex *dict )
{
for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{
{
cout << it->first << " : ";
for ( int i = 0; i < it->second.size( ); ++i )
{
{
cout << it->second[ i ] << " ";
}
}
cout << std::endl;
}
}
cout << std::endl;
}
size_t Spider::hash(const char * s)
size_t Spider::hash( const char *s )
{
// http://www.cse.yorku.ca/~oz/hash.html
size_t h = 5381;
int c;
while ( ( c = *s++ ) )
h = ( ( h << 5 ) + h ) + c;
return h;
// http://www.cse.yorku.ca/~oz/hash.html
size_t h = 5381;
int c;
while ((c = *s++))
h = ((h << 5) + h) + c;
return h;
}
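// Note on the hash above: this is Bernstein's djb2 (h = h * 33 + c, seeded
// with 5381). shouldURLbeCrawled keys the duplicate table on this size_t
// alone, so two distinct URLs that happen to collide would make the second
// one be skipped as a "duplicate" -- an accepted risk with any fixed-width
// hash, std::hash< string > included.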
ParsedUrl Spider::getUrl ( )
ParsedUrl Spider::getUrl()
{
return urlFrontier->Pop( );
}
@@ -79,19 +81,19 @@ void Spider::run()
while ( cond < 50 )
{
ParsedUrl currentUrl = getUrl();
size_t docID = hash(currentUrl.CompleteUrl);
ParsedUrl currentUrl = getUrl( );
size_t docID = hash( currentUrl.CompleteUrl );
if ( shouldURLbeCrawled( docID ))
{
StreamReader *reader = SR_factory( currentUrl, this->mode );
bool success = reader->request();
if(success)
bool success = reader->request( );
if ( success )
{
DocIndex * dict = parser.execute (reader);
IndexerQueue->Push(dict);
DocIndex *dict = parser.execute( reader );
IndexerQueue->Push( dict );
// printDocIndex(dict);
reader->closeReader();
reader->closeReader( );
//delete dict;
cond++;
@@ -120,7 +122,7 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
* Takes in a parsed url, creates a document object, writes information about the document to disk
* returns the begining position of the document on disk, stores that into the in memory lookup hash table
*/
bool Spider::writeDocToDisk ( ParsedUrl url )
bool Spider::writeDocToDisk( ParsedUrl url )
{
Document d( url );
int resultPosition = d.WriteToDocMap( );
@@ -129,10 +131,10 @@ bool Spider::writeDocToDisk ( ParsedUrl url )
return false;
}
this->docMapLookup->insert( std::pair< string, int >( url.CompleteUrl, resultPosition ) );
this->docMapLookup->insert( std::pair < string, int >( url.CompleteUrl, resultPosition ));
for ( auto it = this->docMapLookup->begin( ); it != this->docMapLookup->end( ); ++it )
{
std::cout << it->first << " => " << it->second << '\n';
std::cout << it->first << " => " << it->second << '\n';
}
return true;
@@ -148,12 +150,13 @@ bool Spider::writeDocToDisk ( ParsedUrl url )
bool Spider::shouldURLbeCrawled( size_t docID )
{
if(this->duplicateUrlMap->find(docID) != this->duplicateUrlMap->end()){
if ( this->duplicateUrlMap->find( docID ) != this->duplicateUrlMap->end( ))
{
return false;
}
else
{
this->duplicateUrlMap->insert(std::make_pair(docID, 1));
this->duplicateUrlMap->insert( std::make_pair( docID, 1 ));
return true;
}
}
...
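
Aside: the find-then-insert pair in shouldURLbeCrawled above performs two hash lookups; unordered_map::insert already reports whether the key was new, so the check collapses to one call. An equivalent sketch (not part of this commit):

bool Spider::shouldURLbeCrawled( size_t docID )
{
	// insert( ) returns pair< iterator, bool >; .second is true only
	// when docID was absent, i.e. the URL has not been crawled yet.
	return duplicateUrlMap->insert( std::make_pair( docID, 1 ) ).second;
}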
@@ -13,11 +13,9 @@
#include "../parser/Parser.h"
using namespace std;
using DocIndex = const unordered_map< string, vector< unsigned long > >;
using DocIndex = const unordered_map < string, vector < unsigned long > >;
class Spider : public ThreadClass
{
@@ -27,41 +25,42 @@ public:
Spider( string mode_in,
ProducerConsumerQueue < ParsedUrl > *url_q_in,
unordered_map < string, int > *doc_map_lookup_in,
unordered_map < size_t, int > *duplicate_url_map_in ,
ProducerConsumerQueue < DocIndex* > *doc_index_queue_in
)
unordered_map < size_t, int > *duplicate_url_map_in,
ProducerConsumerQueue < DocIndex * > *doc_index_queue_in
)
: mode( mode_in ),
urlFrontier( url_q_in ),
docMapLookup( doc_map_lookup_in ),
parser ( url_q_in),
duplicateUrlMap(duplicate_url_map_in),
IndexerQueue(doc_index_queue_in)
parser( url_q_in ),
duplicateUrlMap( duplicate_url_map_in ),
IndexerQueue( doc_index_queue_in )
{
};
//Takes a url off of the url frontier
ParsedUrl getUrl();
virtual void run();
bool writeDocToDisk(ParsedUrl url);
bool writeDocToDisk( ParsedUrl url );
bool shouldURLbeCrawled( size_t docId );
size_t hash(const char * s);
size_t hash( const char *s );
//int getRobots(ParsedUrl url );
bool checkRobots(ParsedUrl url);
bool checkRobots( ParsedUrl url );
private:
int locationOnDisk;
ProducerConsumerQueue < ParsedUrl > *urlFrontier;
ProducerConsumerQueue< DocIndex* > *IndexerQueue;
ProducerConsumerQueue < DocIndex * > *IndexerQueue;
unordered_map < size_t, int > *duplicateUrlMap;
string mode;
unordered_map < string, int > *docMapLookup;
Parser parser ;
Parser parser;
};
\ No newline at end of file
@@ -6,17 +6,15 @@
using namespace std;
int main ( )
int main()
{
}
void robotsTest(){
void robotsTest()
{
}
\ No newline at end of file
#include "Indexer.h"
Indexer::Indexer(ProducerConsumerQueue < DocIndex* > *doc_index_queue_in) : pointerToDictionaries( doc_index_queue_in ){
currentFile = 0;
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
Indexer::Indexer( ProducerConsumerQueue < DocIndex * > *doc_index_queue_in ) : pointerToDictionaries(
doc_index_queue_in )
{
currentFile = 0;
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
}
void Indexer::run() {
void Indexer::run()
{
bool cond = true;
while(cond) {
DocIndex * dictionary = pointerToDictionaries->Pop();
cout << "INDEX GOT A NEW dnary" << endl;
DocumentEnding docEnd = DocumentEnding();
size_t indexedCount = 0;
currentBlockNumberDocs++;
for(auto word : *dictionary) {
if(word.first.at(0) == '=') {
docEnd.url = word.first.substr(1, word.first.length());
continue;
}
indexedCount += word.second.size();
currentBlockNumberWords += word.second.size();
for(auto location : word.second) {
masterDictionary[word.first].push_back(currentlyIndexed + location);
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd);
if(currentBlockNumberWords >= 500) {
save();
reset();
}
}
save();
reset();
saveChunkDictionary();
}
void Indexer::verbose_run() {
while ( cond )
{
DocIndex *dictionary = pointerToDictionaries->Pop( );
cout << "INDEX GOT A NEW dnary" << endl;
DocumentEnding docEnd = DocumentEnding( );
size_t indexedCount = 0;
currentBlockNumberDocs++;
for ( auto word : *dictionary )
{
if ( word.first.at( 0 ) == '=' )
{
docEnd.url = word.first.substr( 1, word.first.length( ));
continue;
}
indexedCount += word.second.size( );
currentBlockNumberWords += word.second.size( );
for ( auto location : word.second )
{
masterDictionary[ word.first ].push_back( currentlyIndexed + location );
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back( docEnd );
if ( currentBlockNumberWords >= 500 )
{
save( );
reset( );
}
}
save( );
reset( );
saveChunkDictionary( );
}
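// Flow of run( ) above: each DocIndex popped from the queue is one parsed
// page. A key beginning with '=' smuggles in the page URL; every other key
// is a word whose vector holds its positions within that document. Positions
// are offset by currentlyIndexed so they stay unique across the whole index,
// and the in-memory block is flushed to disk once it accumulates 500 or
// more postings.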
void Indexer::verbose_run()
{
/*
while(pointerToDictionaries.Size() != 0) {
DocIndex *pointerToDictionaries.Pop();
@@ -60,128 +69,146 @@ void Indexer::verbose_run() {
}
}
*/
}
void Indexer::save() {
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
map<string, size_t> seeker;
string fileName = util::GetCurrentWorkingDir() + "/indexer/output/" + to_string(currentFile) + ".txt";
int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
// TODO: these should really be c strings
string statsHeader = "===STATS==="
"\nunique words: " + to_string(masterDictionary.size()) +
"\nnumber words: " + to_string(currentBlockNumberWords) +
"\nnumber docs: " + to_string(currentBlockNumberDocs) +
"\n===========\n";
write(file, statsHeader.c_str(), strlen(statsHeader.c_str()));
// REALLY GROSS HACK
size_t seekOffset = strlen(statsHeader.c_str());
for(auto word : maps) {
seeker[word.first] = seekOffset;
chunkDictionary[word.first].push_back(currentFile);
}
void Indexer::save()
{
map < string, vector < size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ));
map < string, size_t > seeker;
string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + ".txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
// TODO: these should really be c strings
string statsHeader = "===STATS==="
"\nunique words: " + to_string( masterDictionary.size( )) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( )));
// REALLY GROSS HACK
size_t seekOffset = strlen( statsHeader.c_str( ));
for ( auto word : maps )
{
seeker[ word.first ] = seekOffset;
chunkDictionary[ word.first ].push_back( currentFile );
// string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
// seekOffset += strlen(wordBreak.c_str());
bool firstPost = true;
size_t lastOne = 0;
int numIndexed = 0;
for(auto location : word.second) {
numIndexed++;
if(numIndexed >= 100) {
PostingsSeekTableEntry entry = PostingsSeekTableEntry();
entry.offset = seekOffset;
entry.realLocation = location;
postingsSeekTable[word.first].push_back(entry);
numIndexed = 0;
}
if(firstPost) {
string locationSpace = to_string(location) + " ";
write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
seekOffset += strlen(locationSpace.c_str());
firstPost = false;
} else {
size_t delta = location - lastOne;
string deltaSpace = to_string(delta) + " ";
write(file, deltaSpace.c_str(), strlen(deltaSpace.c_str()));
seekOffset += strlen(deltaSpace.c_str());
}
lastOne = location;
}
write(file, "\n", 1);
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write(file, docEndingHeader.c_str(), strlen(docEndingHeader.c_str()));
seekOffset += strlen(docEndingHeader.c_str());
seeker["=docEnding"] = seekOffset;
for(auto ending : docEndings) {
string docEndString = "[" +
ending.url + ", " +
to_string(ending.docEndPosition) + ", " +
to_string(ending.docNumWords) + "]\n";
write(file, docEndString.c_str(), strlen(docEndString.c_str()));
}
// TODO: seek dictionary
string seekFileName = util::GetCurrentWorkingDir() + "/indexer/output/" + to_string(currentFile) + "-seek.txt";
int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
for(auto word : seeker) {
string line = word.first + " " + to_string(word.second) + "\n";
write(seekFile, line.c_str(), strlen(line.c_str()));
if(postingsSeekTable.find(word.first) != postingsSeekTable.end()) {
string offsetLine = "\t";
for(int i = 0; i < postingsSeekTable[word.first].size(); i++) {
offsetLine += "<" +
to_string(postingsSeekTable[word.first][i].realLocation) +
", " +
to_string(postingsSeekTable[word.first][i].offset) +
"> ";
}
offsetLine += "\n";
write(seekFile, offsetLine.c_str(), strlen(offsetLine.c_str()));
}
}
close(file);
currentFile++;
}
void Indexer::saveChunkDictionary() {
string fileName = util::GetCurrentWorkingDir() + "/indexer/output/master-index.txt";
int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
for(auto word : chunkDictionary) {
string wordDictionary = word.first + " ";
for(auto chunk : word.second) {
wordDictionary += to_string(chunk) + " ";
}
wordDictionary += "\n";
write(file, wordDictionary.c_str(), strlen(wordDictionary.c_str()));
}
close(file);
}
void Indexer::verbose_save() {
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
for(auto word : maps) {
cout << word.first << endl;
for(auto location : word.second) {
cout << location << " ";
}
cout << endl;
}
currentFile++;
}
void Indexer::reset() {
masterDictionary.clear();
docEndings.clear();
postingsSeekTable.clear();
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
bool firstPost = true;
size_t lastOne = 0;
int numIndexed = 0;
for ( auto location : word.second )
{
numIndexed++;
if ( numIndexed >= 100 )
{
PostingsSeekTableEntry entry = PostingsSeekTableEntry( );
entry.offset = seekOffset;
entry.realLocation = location;
postingsSeekTable[ word.first ].push_back( entry );
numIndexed = 0;
}
if ( firstPost )
{
string locationSpace = to_string( location ) + " ";
write( file, locationSpace.c_str( ), strlen( locationSpace.c_str( )));
seekOffset += strlen( locationSpace.c_str( ));
firstPost = false;
}
else
{
size_t delta = location - lastOne;
string deltaSpace = to_string( delta ) + " ";
write( file, deltaSpace.c_str( ), strlen( deltaSpace.c_str( )));
seekOffset += strlen( deltaSpace.c_str( ));
}
lastOne = location;
}
write( file, "\n", 1 );
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( )));
seekOffset += strlen( docEndingHeader.c_str( ));
seeker[ "=docEnding" ] = seekOffset;
for ( auto ending : docEndings )
{
string docEndString = "[" +
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( )));
}
// TODO: seek dictionary
string seekFileName = util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt";
int seekFile = open( seekFileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
for ( auto word : seeker )
{
string line = word.first + " " + to_string( word.second ) + "\n";
write( seekFile, line.c_str( ), strlen( line.c_str( )));
if ( postingsSeekTable.find( word.first ) != postingsSeekTable.end( ))
{
string offsetLine = "\t";
for ( int i = 0; i < postingsSeekTable[ word.first ].size( ); i++ )
{
offsetLine += "<" +
to_string( postingsSeekTable[ word.first ][ i ].realLocation ) +
", " +
to_string( postingsSeekTable[ word.first ][ i ].offset ) +
"> ";
}
offsetLine += "\n";
write( seekFile, offsetLine.c_str( ), strlen( offsetLine.c_str( )));
}
}
close( file );
currentFile++;
}
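// Worked example of the posting layout save( ) writes: if "apple" occurs at
// absolute positions 5, 9, and 20, its line in the chunk file is "5 4 11 " --
// the first posting is absolute and each later one is the gap from its
// predecessor (9 - 5 = 4, 20 - 9 = 11). A reader rebuilds positions with a
// running sum, and roughly every 100th posting is also recorded in
// postingsSeekTable (absolute position plus byte offset) so long lists can
// be entered mid-stream without decoding from the start.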
void Indexer::saveChunkDictionary()
{
string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/master-index.txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
for ( auto word : chunkDictionary )
{
string wordDictionary = word.first + " ";
for ( auto chunk : word.second )
{
wordDictionary += to_string( chunk ) + " ";
}
wordDictionary += "\n";
write( file, wordDictionary.c_str( ), strlen( wordDictionary.c_str( )));
}
close( file );
}
void Indexer::verbose_save()
{
map < string, vector < size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ));
for ( auto word : maps )
{
cout << word.first << endl;
for ( auto location : word.second )
{
cout << location << " ";
}
cout << endl;
}
currentFile++;
}
void Indexer::reset()
{
masterDictionary.clear( );
docEndings.clear( );
postingsSeekTable.clear( );
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}