#include "Indexer.h"

namespace
{

/// Writes every byte of `text` to `fd`.
///
/// write() is allowed to accept fewer bytes than requested, so this loops
/// until the whole string has been handed over.
///
/// @return false if `fd` is negative, or if write() reports an error or makes
///         no progress; true once all bytes have been written.
bool writeAll( int fd, const std::string &text )
{
    if ( fd < 0 )
        return false;

    const char *cursor = text.data( );
    size_t remaining = text.size( );
    while ( remaining > 0 )
    {
        const ssize_t written = write( fd, cursor, remaining );
        if ( written <= 0 )
            return false;
        cursor += written;
        remaining -= static_cast< size_t >( written );
    }
    return true;
}

} // namespace

/// Stores the two input queues and zeroes all chunk / global counters.
///
/// @param doc_index_queue_in queue the indexer pops per-document indexes from.
/// @param anchor_in          queue the anchor-text dictionary is popped from at shutdown.
Indexer::Indexer( ProducerConsumerQueue < DocIndex * > *doc_index_queue_in,
                  ProducerConsumerQueue < unordered_map < string, DocIndex * > > *anchor_in )
        : pointerToDictionaries( doc_index_queue_in ), AnchorQueue( anchor_in )
{
    currentFile = 0;
    currentlyIndexed = 0;
    currentBlockNumberWords = 0;
    currentBlockNumberDocs = 0;
    numberDocsIndexed = 0;
}

/// Main loop: drains the document queue into the in-memory chunk, flushing the
/// chunk to disk whenever it reaches IndexerConstants::chunkSizeLimit words.
///
/// Runs while `*alive` is set or documents remain queued. On exit it flushes
/// the last chunk, writes the master (chunk) dictionary, then pops one anchor
/// dictionary from AnchorQueue and hands it to SaveAnchorText().
void Indexer::run()
{
    while ( *alive || pointerToDictionaries->Size( ) > 0 )
    {
        // NOTE(review): when the queue is empty this loop spins without
        // blocking or yielding — consider a blocking pop if the queue offers one.
        if ( pointerToDictionaries->Size( ) == 0 )
            continue;

        DocIndex *dictionary = pointerToDictionaries->Pop( );
        numberDocsIndexed++;
        cout << "Number of documents indexed :: " << to_string( numberDocsIndexed ) << endl;

        DocumentEnding docEnd = DocumentEnding( );
        size_t indexedCount = 0;
        currentBlockNumberDocs++;

        // Iterate by reference: each entry carries a whole vector of locations.
        for ( const auto &word : *dictionary )
        {
            // An empty key would make the '=' probe below read out of range; skip it.
            if ( word.first.empty( ) )
                continue;

            // Keys starting with '=' carry the document URL rather than a word.
            if ( word.first[ 0 ] == '=' )
            {
                docEnd.url = word.first.substr( 1 );
                continue;
            }

            chunkDictionary[ word.first ].docFrequency++;
            indexedCount += word.second.size( );
            currentBlockNumberWords += word.second.size( );

            // Locations in the document index are document-relative; shift them
            // by the number of tokens indexed so far to make them global.
            auto &postings = masterDictionary[ word.first ];
            for ( const auto location : word.second )
                postings.push_back( currentlyIndexed + location );
        }

        currentlyIndexed += indexedCount;
        docEnd.docEndPosition = currentlyIndexed;
        docEnd.docNumWords = indexedCount;
        docEndings.push_back( docEnd );

        // Remember where this URL's document ends for SaveAnchorText().
        urlToDocEndings[ docEnd.url ] = docEnd.docEndPosition;

        if ( currentBlockNumberWords >= IndexerConstants::chunkSizeLimit )
        {
            cout << " --- Saving current chunk --- " << endl;
            save( );
            saveWordSeek( );
            reset( );
        }

        delete dictionary;
    }

    cout << "Indexer is shutting down" << endl;
    save( );
    saveWordSeek( );
    reset( );
    saveChunkDictionary( );

    unordered_map < string, DocIndex * > anchorDict = AnchorQueue->Pop( );
    SaveAnchorText( &anchorDict );
    cout << " Indexer has finished running" << endl;
}

/// Writes the current in-memory chunk to "<index dir>/<currentFile>.txt" and
/// records byte offsets into "<index dir>/<currentFile>-seek.txt".
///
/// File layout: a stats header, then one line per word holding its
/// delta-encoded, space-separated postings, then a "===Document Endings==="
/// section with one "[url, endPosition, numWords]" line per document.
///
/// Side effects: updates chunkDictionary (chunks, frequency, lastLocation),
/// fills seekDictionary with an entry every IndexerConstants::saveEveryXEntries
/// postings / document endings, and appends the chunk's highest location to
/// chunkEndLocation. A failure to open or write the chunk file is logged to
/// stderr; the in-memory bookkeeping is still performed so chunk numbering
/// stays aligned with currentFile.
void Indexer::save()
{
    MMDiskHashTable seeker(
            util::GetCurrentWorkingDir( ) + IndexerConstants::pathToIndex + to_string( currentFile ) + "-seek.txt",
            IndexerConstants::chunkSeekKeySize,
            IndexerConstants::chunkSeekValueSize );

    const string fileName =
            util::GetCurrentWorkingDir( ) + IndexerConstants::pathToIndex + to_string( currentFile ) + ".txt";

    // O_TRUNC: a previous, longer file of the same name must not leave a stale tail.
    const int file = open( fileName.c_str( ), O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU );
    bool writeOk = file >= 0;

    // Writes `text` to the chunk file and returns its length so callers can
    // advance seekOffset. After the first failure further writes are skipped.
    auto emit = [ & ]( const string &text ) -> size_t
    {
        if ( writeOk && !writeAll( file, text ) )
            writeOk = false;
        return text.size( );
    };

    seeker.insert( "=numberUniqueWords", to_string( masterDictionary.size( ) ) );
    seeker.insert( "=numberWords", to_string( currentBlockNumberWords ) );
    seeker.insert( "=numberDocs", to_string( currentBlockNumberDocs ) );

    const string statsHeader = "===STATS===\nunique words: " + to_string( masterDictionary.size( ) ) +
                               "\nnumber words: " + to_string( currentBlockNumberWords ) +
                               "\nnumber docs: " + to_string( currentBlockNumberDocs ) +
                               "\n===========\n";

    // seekOffset tracks the byte position in the chunk file after everything emitted so far.
    size_t seekOffset = emit( statsHeader );
    size_t chunkEnd = 0;

    // Iterate by reference: copying each entry would duplicate its posting list.
    for ( const auto &word : masterDictionary )
    {
        // Seek-table keys are capped at maxWordSize characters.
        if ( word.first.size( ) > IndexerConstants::maxWordSize )
        {
            string resized = word.first;
            resized.resize( IndexerConstants::maxWordSize );
            seeker.insert( resized, to_string( seekOffset ) );
        }
        else
        {
            seeker.insert( word.first, to_string( seekOffset ) );
        }

        auto &chunkEntry = chunkDictionary[ word.first ];
        chunkEntry.chunks.push_back( currentFile );

        // Starts at 0, so the first posting is written as its absolute location
        // and every later one as the delta from its predecessor.
        size_t lastOne = 0;
        int numIndexed = 0;
        for ( const auto location : word.second )
        {
            if ( chunkEnd < location )
                chunkEnd = location;

            chunkEntry.frequency++;
            numIndexed++;

            seekOffset += emit( to_string( location - lastOne ) + " " );

            // Every saveEveryXEntries postings, remember where we are in the
            // file together with the absolute location just written.
            if ( numIndexed == IndexerConstants::saveEveryXEntries )
            {
                SeekEntry entry = SeekEntry( );
                entry.offset = seekOffset;
                entry.realLocation = location;
                seekDictionary[ word.first ].push_back( entry );
                numIndexed = 0;
            }
            lastOne = location;
        }

        chunkEntry.lastLocation = lastOne;
        seekOffset += emit( "\n" );
    }

    seekOffset += emit( "===Document Endings===\n" );
    seeker.insert( "=docEnding", to_string( seekOffset ) );

    int docEndSeekCounter = 0;
    for ( const auto &ending : docEndings )
    {
        const string docEndString = "[" + ending.url + ", " + to_string( ending.docEndPosition ) + ", " +
                                    to_string( ending.docNumWords ) + "]\n";
        const size_t written = emit( docEndString );

        docEndSeekCounter++;
        if ( docEndSeekCounter == IndexerConstants::saveEveryXEntries )
        {
            docEndSeekCounter = 0;
            // The offset recorded here is the start of this record (seekOffset
            // has not been advanced past it yet).
            // NOTE(review): word entries above set offset = file offset and
            // realLocation = index location; confirm SeekEntry's constructor
            // parameter order matches the (docEndPosition, seekOffset) order used here.
            seekDictionary[ "=docEnding" ].push_back( SeekEntry( ending.docEndPosition, seekOffset ) );
        }
        seekOffset += written;
    }

    chunkEndLocation.push_back( chunkEnd );

    if ( file >= 0 )
        close( file );
    if ( !writeOk )
        std::cerr << "Indexer::save could not write chunk file " << fileName << std::endl;
}

/// Writes the cross-chunk dictionary to "<index dir>/master.txt".
///
/// Each word maps to "<chunk ids, space separated>\t<frequency>\t<lastLocation>\t<docFrequency>".
/// Also stores "=numberChunks", "=totalNumberIndexed", "=totalDocsIndexed" and
/// one "=chunk<i>" key per saved chunk holding that chunk's highest location.
void Indexer::saveChunkDictionary()
{
    MMDiskHashTable dhtChunk = MMDiskHashTable(
            util::GetCurrentWorkingDir( ) + IndexerConstants::pathToIndex + "master.txt",
            IndexerConstants::masterKeySize,
            IndexerConstants::masterValueSize );

    for ( const auto &word : chunkDictionary )
    {
        string key = word.first;
        if ( key.size( ) > IndexerConstants::maxWordSize )
            key.resize( IndexerConstants::maxWordSize );

        string value;
        for ( const auto chunk : word.second.chunks )
            value += to_string( chunk ) + " ";
        value += "\t" + to_string( word.second.frequency );
        value += "\t" + to_string( word.second.lastLocation );
        value += "\t" + to_string( word.second.docFrequency );

        dhtChunk.insert( key, value );
    }

    dhtChunk.insert( "=numberChunks", to_string( currentFile ) );
    dhtChunk.insert( "=totalNumberIndexed", to_string( currentlyIndexed ) );
    dhtChunk.insert( "=totalDocsIndexed", to_string( numberDocsIndexed ) );

    int currentChunk = 0;
    for ( const auto location : chunkEndLocation )
    {
        dhtChunk.insert( "=chunk" + to_string( currentChunk ), to_string( location ) );
        currentChunk++;
    }
}

/// Writes seekDictionary to "<index dir>/<currentFile>-wordseek.txt" and then
/// advances currentFile to the next chunk number.
///
/// Each word maps to a list of "<offset, realLocation> " pairs. The
/// "=docEnding" entries are split across keys "=docEnding0", "=docEnding1", ...
/// so that no single value exceeds IndexerConstants::chunkWordSeekValueSize.
void Indexer::saveWordSeek()
{
    MMDiskHashTable wordSeek = MMDiskHashTable(
            util::GetCurrentWorkingDir( ) + IndexerConstants::pathToIndex + to_string( currentFile ) +
            "-wordseek.txt",
            IndexerConstants::chunkWordSeekKeySize,
            IndexerConstants::chunkWordSeekValueSize );

    const string docEndingKey = "=docEnding";

    for ( const auto &word : seekDictionary )
    {
        // Document endings are partitioned separately below.
        if ( word.first == docEndingKey )
            continue;

        string key = word.first;
        if ( key.size( ) > IndexerConstants::maxWordSize )
            key.resize( IndexerConstants::maxWordSize );

        string value;
        for ( const auto &entry : word.second )
            value += "<" + to_string( entry.offset ) + ", " + to_string( entry.realLocation ) + "> ";

        wordSeek.insert( key, value );
    }

    const auto docEndingIt = seekDictionary.find( docEndingKey );
    if ( docEndingIt != seekDictionary.end( ) )
    {
        string value;
        int currentEndingPartition = 0;
        for ( const auto &entry : docEndingIt->second )
        {
            const string prospectiveDocEnding =
                    "<" + to_string( entry.offset ) + ", " + to_string( entry.realLocation ) + "> ";

            if ( value.size( ) + prospectiveDocEnding.size( ) <= IndexerConstants::chunkWordSeekValueSize )
            {
                value += prospectiveDocEnding;
            }
            else
            {
                // Current partition is full: store it and start the next one.
                wordSeek.insert( docEndingKey + to_string( currentEndingPartition ), value );
                currentEndingPartition++;
                value = prospectiveDocEnding;
            }
        }

        // Store the last, partially filled partition; without this the trailing
        // doc-ending seek entries would never reach disk.
        if ( !value.empty( ) )
            wordSeek.insert( docEndingKey + to_string( currentEndingPartition ), value );
    }

    currentFile++;
}

/// Clears the per-chunk state (postings, document endings, seek entries and
/// the per-chunk word / document counters) ready for the next chunk.
void Indexer::reset()
{
    masterDictionary.clear( );
    docEndings.clear( );
    seekDictionary.clear( );
    currentBlockNumberWords = 0;
    currentBlockNumberDocs = 0;
}

/// Clears the `alive` flag so run() exits once the document queue is drained.
void Indexer::Kill()
{
    *( this->alive ) = false;
}

/// Placeholder for persisting anchor text; currently only logs a banner and
/// walks the dictionary without writing anything.
///
/// Intended design (not yet implemented): for each URL in `anchorDict`, look up
/// its document end in urlToDocEndings, then for each anchor word build an
/// anchor-text -> list-of-document-endings mapping and write it to disk.
///
/// @param anchorDict map from URL to that URL's anchor-word index; a null
///                   pointer or null inner entries are skipped.
void Indexer::SaveAnchorText( unordered_map < string, DocIndex * > *anchorDict )
{
    cout << " -- SAVING ANCHOR TEXT --- " << endl;

    if ( anchorDict == nullptr )
        return;

    for ( const auto &urlEntry : *anchorDict )
    {
        // Only URLs that were actually indexed have a document end to attach anchors to.
        const auto docEndIt = urlToDocEndings.find( urlEntry.first );
        if ( docEndIt == urlToDocEndings.end( ) )
            continue;

        const DocIndex *anchorWords = urlEntry.second;
        if ( anchorWords == nullptr )
            continue;

        // TODO: for each ( anchor word, offsets ) in *anchorWords, append
        // docEndIt->second to that anchor word's list and write the result to disk.
        (void) docEndIt;
        (void) anchorWords;
    }
}