// Indexer.cpp — authored by Nicholas Yang
#include "Indexer.h"
// Construct an Indexer wired to the parser-output queue (per-document word
// indexes) and the anchor-text queue; all chunk/document counters start at 0.
Indexer::Indexer( ProducerConsumerQueue < DocIndex * > *doc_index_queue_in,
                  ProducerConsumerQueue < unordered_map < string, DocIndex * > > *anchor_in ) :
      pointerToDictionaries( doc_index_queue_in ),
      AnchorQueue( anchor_in ),
      currentFile( 0 ),
      currentlyIndexed( 0 ),
      currentBlockNumberWords( 0 ),
      currentBlockNumberDocs( 0 ),
      numberDocsIndexed( 0 )
   { }
// Main indexer loop. Drains per-document word indexes off the queue until
// Kill() has cleared *alive AND the queue is empty, accumulating postings in
// masterDictionary and flushing a chunk to disk whenever the in-memory word
// count crosses chunkSizeLimit. On shutdown it writes the final partial
// chunk, the cross-chunk master dictionary, and the anchor text.
void Indexer::run()
{
    // NOTE(review): when the queue is empty but *alive is still set, this
    // loop busy-spins on Size(); a blocking Pop would avoid that.
    while ( *alive || pointerToDictionaries->Size( ) > 0 )
    {
        if( pointerToDictionaries->Size( ) > 0)
        {
            DocIndex *dictionary = pointerToDictionaries->Pop( );
            numberDocsIndexed++;
            cout << "Number of documents indexed :: " << to_string(numberDocsIndexed) << endl;
            DocumentEnding docEnd = DocumentEnding( );
            size_t indexedCount = 0;
            currentBlockNumberDocs++;
            for ( auto word : *dictionary )
            {
                // A key starting with '=' is the document-URL sentinel, not
                // an indexed word: strip the '=' and remember the URL.
                if ( word.first.at( 0 ) == '=' )
                {
                    docEnd.url = word.first.substr( 1, word.first.length( ));
                    continue;
                }
                chunkDictionary[word.first].docFrequency++;
                indexedCount += word.second.size( );
                currentBlockNumberWords += word.second.size( );
                // Locations from the parser are document-local; rebase them
                // to global index positions with the running total.
                for ( auto location : word.second )
                {
                    masterDictionary[ word.first ].push_back( currentlyIndexed + location );
                }
            }
            currentlyIndexed += indexedCount;
            docEnd.docEndPosition = currentlyIndexed;
            docEnd.docNumWords = indexedCount;
            docEndings.push_back( docEnd );
            //add the url to the ->doc end map
            urlToDocEndings[ docEnd.url ] = docEnd.docEndPosition;
            // Flush the in-memory chunk once enough postings accumulate.
            if ( currentBlockNumberWords >= IndexerConstants::chunkSizeLimit )
            {
                cout << " --- Saving current chunk --- " << endl;
                save( );
                saveWordSeek( );
                reset( );
            }
            delete dictionary;
        }
    }
    cout << "Indexer is shutting down" << endl;
    // Final (possibly partial) chunk, then the cross-chunk master table.
    save( );
    saveWordSeek( );
    reset( );
    saveChunkDictionary( );
    unordered_map < string, DocIndex * > anchorDict = AnchorQueue->Pop( );
    SaveAnchorText( &anchorDict );
    cout << " Indexer has finished running" << endl;
    return;
}
void Indexer::save()
{
MMDiskHashTable seeker( util::GetCurrentWorkingDir( ) +
IndexerConstants::pathToIndex +
to_string( currentFile ) + "-seek.txt",
IndexerConstants::chunkSeekKeySize, IndexerConstants::chunkSeekValueSize );
string fileName = util::GetCurrentWorkingDir( ) +
IndexerConstants::pathToIndex +
to_string( currentFile ) + ".txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
seeker.insert("=numberUniqueWords", to_string(masterDictionary.size()));
seeker.insert("=numberWords", to_string(currentBlockNumberWords));
seeker.insert("=numberDocs", to_string(currentBlockNumberDocs));
// TODO: these should really be c strings
string statsHeader = "===STATS==="
"\nunique words: " + to_string( masterDictionary.size( )) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( )));
// REALLY GROSS HACK
size_t seekOffset = strlen( statsHeader.c_str( ));
size_t chunkEnd = 0;
for ( auto word : masterDictionary )
{
if ( word.first.size( ) > IndexerConstants::maxWordSize )
{
string resized = word.first;
resized.resize( IndexerConstants::maxWordSize );
seeker.insert( resized, to_string( seekOffset ));
}
else
{
seeker.insert( word.first, to_string( seekOffset ));
}
chunkDictionary[ word.first ].chunks.push_back( currentFile );
bool firstPost = true;
size_t lastOne = 0;
int numIndexed = 0;
for ( auto location : word.second )
{
if(chunkEnd < location) {
chunkEnd = location;
}
chunkDictionary[ word.first ].frequency++;
numIndexed++;
if ( firstPost )
{
string locationSpace = to_string( location ) + " ";
write( file, locationSpace.c_str( ), strlen( locationSpace.c_str( )));
seekOffset += strlen( locationSpace.c_str( ));
firstPost = false;
}
else
{
size_t delta = location - lastOne;
string deltaSpace = to_string( delta ) + " ";
write( file, deltaSpace.c_str( ), strlen( deltaSpace.c_str( )));
seekOffset += strlen( deltaSpace.c_str( ));
}
if ( numIndexed == IndexerConstants::saveEveryXEntries )
{
SeekEntry entry = SeekEntry( );
entry.offset = seekOffset;
entry.realLocation = location;
seekDictionary[ word.first ].push_back( entry );
numIndexed = 0;
}
lastOne = location;
}
chunkDictionary[ word.first ].lastLocation = lastOne;
write( file, "\n", 1 );
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( )));
seekOffset += strlen( docEndingHeader.c_str( ));
seeker.insert( "=docEnding", to_string( seekOffset ));
int docEndSeekCounter = 0; // save seek every 100 doc ends in the chunk
for ( auto ending : docEndings )
{
string docEndString = "[" +
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( )));
docEndSeekCounter++;
if ( docEndSeekCounter == IndexerConstants::saveEveryXEntries )
{
docEndSeekCounter = 0;
seekDictionary["=docEnding"].push_back( SeekEntry(ending.docEndPosition, seekOffset ));
}
seekOffset += strlen( docEndString.c_str( ));
}
chunkEndLocation.push_back(chunkEnd);
close( file );
//seeker.CloseFile();
}
// Write the cross-chunk master dictionary ("master.txt"): for every word,
// the chunks that contain it plus aggregate stats, along with global
// counters and the highest location stored in each chunk.
//
// Fix vs. the previous version: const-ref iteration — the old range-for
// copied each (word, chunk-stats) pair, including its chunks vector, on
// every iteration.
void Indexer::saveChunkDictionary()
{
    MMDiskHashTable dhtChunk = MMDiskHashTable( util::GetCurrentWorkingDir( ) +
                                                IndexerConstants::pathToIndex +
                                                "master.txt", IndexerConstants::masterKeySize,
                                                IndexerConstants::masterValueSize );
    for ( const auto &word : chunkDictionary )
    {
        // Keys are fixed width on disk; truncate oversized words.
        string key = word.first;
        if ( key.size( ) > IndexerConstants::maxWordSize )
            key.resize( IndexerConstants::maxWordSize );
        // Value layout: "<chunk ids...> \t<frequency>\t<lastLocation>\t<docFrequency>"
        string value = "";
        for ( auto chunk : word.second.chunks )
            value += to_string( chunk ) + " ";
        value += "\t" + to_string( word.second.frequency );
        value += "\t" + to_string( word.second.lastLocation);
        value += "\t" + to_string( word.second.docFrequency);
        dhtChunk.insert( key, value );
    }
    // Global bookkeeping entries; the '=' prefix cannot collide with real
    // words because run() never indexes '='-prefixed keys.
    dhtChunk.insert("=numberChunks", to_string(currentFile));
    dhtChunk.insert("=totalNumberIndexed", to_string(currentlyIndexed));
    dhtChunk.insert("=totalDocsIndexed", to_string(numberDocsIndexed));
    // Per-chunk end location: "=chunk<i>" -> highest location in chunk i.
    int currentChunk = 0;
    for(auto location : chunkEndLocation)
    {
        string key = "=chunk" + to_string(currentChunk);
        dhtChunk.insert(key, to_string(location));
        currentChunk++;
    }
}
// Write the per-chunk word seek table "<currentFile>-wordseek.txt": for each
// word, the (byte offset, real location) checkpoints recorded by save(), and
// the document-ending checkpoints split across numbered "=docEnding0",
// "=docEnding1", ... partitions sized to fit the table's value width.
// Also advances currentFile to the next chunk number.
//
// BUG FIX: the previous version only inserted a "=docEnding" partition when
// the next entry overflowed it, so the final (partial) partition — including
// the case where everything fit in one partition — was silently dropped.
// The trailing partition is now flushed after the loop.
void Indexer::saveWordSeek()
{
    MMDiskHashTable wordSeek = MMDiskHashTable(util::GetCurrentWorkingDir( ) +
                                               IndexerConstants::pathToIndex +
                                               to_string( currentFile ) + "-wordseek.txt",
                                               IndexerConstants::chunkWordSeekKeySize,
                                               IndexerConstants::chunkWordSeekValueSize );
    for ( const auto &word : seekDictionary )   // const ref: avoid copying entries
    {
        string key = word.first;
        if(key == "=docEnding")
            continue;   // handled separately below — it needs partitioning
        if ( key.size( ) > IndexerConstants::maxWordSize )
            key.resize( IndexerConstants::maxWordSize );
        string value = "";
        for ( const auto &entry : word.second )
            value += ("<" + to_string( entry.offset ) +
                      ", " + to_string( entry.realLocation ) + "> ");
        wordSeek.insert( key, value );
    }
    // Doc-ending checkpoints can exceed one value slot, so they are packed
    // greedily into numbered partitions.
    string key = "=docEnding";
    string value = "";
    int currentEndingPartition = 0;
    const auto &endings = seekDictionary[ key ];
    for ( size_t i = 0; i < endings.size( ); i++ )
    {
        string prospectiveDocEnding = "<" +
            to_string( endings[ i ].offset ) +
            ", " + to_string( endings[ i ].realLocation ) + "> ";
        if ( value.size( ) + prospectiveDocEnding.size( ) <= IndexerConstants::chunkWordSeekValueSize )
        {
            value += prospectiveDocEnding;
        }
        else
        {
            wordSeek.insert( key + to_string( currentEndingPartition ), value );
            currentEndingPartition++;
            value = prospectiveDocEnding;
        }
    }
    // Flush the last, possibly partial, partition.
    if ( !value.empty( ))
        wordSeek.insert( key + to_string( currentEndingPartition ), value );
    currentFile++;
}
// Clear all per-chunk state so the next chunk starts empty. Cross-chunk
// state — chunkDictionary, urlToDocEndings, chunkEndLocation, and running
// counters such as currentlyIndexed — is deliberately left intact.
void Indexer::reset()
{
    masterDictionary.clear( );
    seekDictionary.clear( );
    docEndings.clear( );
    currentBlockNumberWords = 0;
    currentBlockNumberDocs = 0;
}
void Indexer::Kill()
{
*(this->alive) = false;
//currentFile++;
}
// Persist anchor text for crawled documents.
// INCOMPLETE: as written, this only walks the anchor dictionary and looks up
// each URL's document ending — nothing is computed or written to disk; the
// loop bodies below are effectively no-ops (unused locals). The intended
// design is sketched in the TODO comments.
void Indexer::SaveAnchorText( unordered_map < string, DocIndex * > *anchorDict )
{
    //TODO create pointer to anchor
    //pointerToAnchor->Pop();
    //pass a dictionary of
    //map <url string> - > vector<anchor word>
    //for each url in map
    //look up url string in url -> docEnding map
    //for each anchor text in url map
    // create a anchor text - > list of doc endings
    //write to disk
    cout << " -- SAVING ANCHOR TEXT --- " << endl;
    for ( auto const &ent1 : *anchorDict )
    {
        auto const &outer_key = ent1.first;   // URL the anchors point at
        //cout << "url: " << outer_key << endl;
        if ( urlToDocEndings.find( outer_key ) != urlToDocEndings.end( ))
        {
            // Lookup succeeds but the result is currently unused.
            size_t docEndForUrl = urlToDocEndings[ outer_key ];
            //cout << "Urls doc end : " << docEndForUrl << endl;
        }
        DocIndex *inner_map = ent1.second;
        for ( auto const &ent2 : *inner_map )
        {
            auto const &inner_key = ent2.first;     // anchor word (unused)
            auto const &inner_value = ent2.second;  // its offsets (unused)
            //cout << "url: " << outer_key << endl;
            //cout << "anchor text : " << inner_key << endl;
            //for(auto offset :inner_value)
            // cout << "offset " << offset << endl;
        }
    }
}