Skip to content
Snippets Groups Projects
Commit 63580ae8 authored by Nicholas Yang
Browse files

groundwork in index for ending doc seek

parent edeb9b0d
No related branches found
No related tags found
1 merge request: !3 "Indexer"
No preview for this file type
......@@ -107,7 +107,7 @@ void Indexer::save ( )
{
chunkDictionary[word.first].second++;
numIndexed++;
if ( numIndexed >= 100 )
if ( numIndexed == 100 )
{
PostingsSeekTableEntry entry = PostingsSeekTableEntry( );
entry.offset = seekOffset;
......@@ -139,7 +139,7 @@ void Indexer::save ( )
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( ) ) );
seekOffset += strlen( docEndingHeader.c_str( ) );
seeker.insert("=docEnding", to_string(seekOffset));
int docEndSeekCounter = 0; // save seek every 100 doc ends in the chunk
for ( auto ending : docEndings )
{
string docEndString = "[" +
......@@ -147,6 +147,13 @@ void Indexer::save ( )
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( ) ) );
docEndSeekCounter++;
if(docEndSeekCounter == 100)
{
docEndSeekCounter = 0;
docEndingsSeek.push_back({ ending.docEndPosition, seekOffset });
}
seekOffset += strlen(docEndString.c_str());
}
close( file );
......@@ -186,6 +193,21 @@ void Indexer::saveWordSeek() {
}
wordSeek.insert(key, value);
}
string key = "=docEnding";
string value = "";
int currentEndingPartition = 0;
for(size_t i = 0; i < docEndingsSeek.size(); i++) {
string prospectiveDocEnding = "<" +
to_string(docEndingsSeek[i].first) +
", " + to_string(docEndingsSeek[i].second) + "> ";
if(value.size() + prospectiveDocEnding.size() <= 168) {
value += prospectiveDocEnding;
} else {
wordSeek.insert(key + to_string(currentEndingPartition), value);
currentEndingPartition++;
value = prospectiveDocEnding;
}
}
}
void Indexer::verbose_save ( )
......@@ -208,7 +230,7 @@ void Indexer::reset ( )
masterDictionary.clear( );
docEndings.clear( );
postingsSeekTable.clear( );
docEndingsSeek.clear();
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
currentFile++;
......
......@@ -57,6 +57,7 @@ private:
unordered_map< string, vector< PostingsSeekTableEntry > > postingsSeekTable;
vector< DocumentEnding > docEndings;
vector< pair<size_t, size_t> > docEndingsSeek; // <realLocation, offset (to the corresponding docEnding)>
size_t totalWordsIndexed;
size_t currentFile;
size_t currentlyIndexed;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment