diff --git a/Indexer-twitter-tests b/Indexer-twitter-tests index e8f9569e520c58e316d6380aac0c52b0b6f6ee01..a3dd658a75f1ade4731a15407e50b6517ebd108a 100755 Binary files a/Indexer-twitter-tests and b/Indexer-twitter-tests differ diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index fdddcd9a2cc6b1d5e92eb62216752550aa2eae23..75ec9b8ffa48fbc288de6f24dbcc38590090476c 100755 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -107,7 +107,7 @@ void Indexer::save ( ) { chunkDictionary[word.first].second++; numIndexed++; - if ( numIndexed >= 100 ) + if ( numIndexed == 100 ) { PostingsSeekTableEntry entry = PostingsSeekTableEntry( ); entry.offset = seekOffset; @@ -139,7 +139,7 @@ void Indexer::save ( ) write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( ) ) ); seekOffset += strlen( docEndingHeader.c_str( ) ); seeker.insert("=docEnding", to_string(seekOffset)); - + int docEndSeekCounter = 0; // save seek every 100 doc ends in the chunk for ( auto ending : docEndings ) { string docEndString = "[" + @@ -147,6 +147,13 @@ void Indexer::save ( ) to_string( ending.docEndPosition ) + ", " + to_string( ending.docNumWords ) + "]\n"; write( file, docEndString.c_str( ), strlen( docEndString.c_str( ) ) ); + docEndSeekCounter++; + if(docEndSeekCounter == 100) + { + docEndSeekCounter = 0; + docEndingsSeek.push_back({ ending.docEndPosition, seekOffset }); + } + seekOffset += strlen(docEndString.c_str()); } close( file ); @@ -186,6 +193,21 @@ void Indexer::saveWordSeek() { } wordSeek.insert(key, value); } + string key = "=docEnding"; + string value = ""; + int currentEndingPartition = 0; + for(size_t i = 0; i < docEndingsSeek.size(); i++) { + string prospectiveDocEnding = "<" + + to_string(docEndingsSeek[i].first) + + ", " + to_string(docEndingsSeek[i].second) + "> "; + if(value.size() + prospectiveDocEnding.size() <= 168) { + value += prospectiveDocEnding; + } else { + wordSeek.insert(key + to_string(currentEndingPartition), value); + currentEndingPartition++; + value = prospectiveDocEnding; + } + } } void Indexer::verbose_save ( ) @@ -208,7 +230,7 @@ void Indexer::reset ( ) masterDictionary.clear( ); docEndings.clear( ); postingsSeekTable.clear( ); - + docEndingsSeek.clear(); currentBlockNumberWords = 0; currentBlockNumberDocs = 0; currentFile++; diff --git a/indexer/Indexer.h b/indexer/Indexer.h index 5afd9db0dd64815d0fe7c9ad85fd5271b4dce833..ae169176ea0a7fa7ec8b6464c6b1096142aa669c 100755 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -57,6 +57,7 @@ private: unordered_map< string, vector< PostingsSeekTableEntry > > postingsSeekTable; vector< DocumentEnding > docEndings; + vector< pair<size_t, size_t> > docEndingsSeek; // <realLocation, offset (to the correspond docEnding)> size_t totalWordsIndexed; size_t currentFile; size_t currentlyIndexed;