diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 4316e317e8340d5913c4fcfdcb925e8c6c1e550b..e9f054278eac388292b73a9dcf4cfe55291adc4f 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -12,21 +12,29 @@ Indexer::Indexer() { void Indexer::run() { while(pointerToDictionaries.Size() != 0) { - unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop(); + unordered_map<string, vector<int> >* dictionary = pointerToDictionaries.Pop(); + DocumentEnding docEnd = DocumentEnding(); currentBlockNumberDocs++; for(auto word : *dictionary) { indexedCount += word.second.size(); totalIndexed += word.second.size(); currentBlockNumberWords += word.second.size(); + if(word.first.at(0) == '=') { + docEnd.url = word.first; + continue; + } for(auto location : word.second) { masterDictionary[word.first].push_back(currentlyIndexed + location); } } + currentlyIndexed += indexedCount; + docEnd.docEndPosition = currentlyIndexed; + docEnd.docNumWords = indexedCount; + docEndings.push_back(docEnd); if(currentBlockNumberWords >= 300000) { save(); reset(); } - currentlyIndexed += indexedCount; indexedCount = 0; } save(); @@ -54,7 +62,7 @@ void Indexer::save() { string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n"; string numberWords = "number words: " + to_string(currentBlockNumberWords) + "\n"; string numberDocs = "number docs: " + to_string(currentBlockNumberDocs) + "\n"; - string footer = "===SEEK===\n"; + string footer = "===========\n"; write(file, header.c_str(), strlen(header.c_str())); write(file, uniqueWords.c_str(), strlen(uniqueWords.c_str())); write(file, numberWords.c_str(), strlen(numberWords.c_str())); @@ -72,6 +80,17 @@ void Indexer::save() { write(file, "\n", 1); } + string docEndingHeader = "===Document Endings===\n"; + write(file, docEndingHeader.c_str(), strlen(docEndingHeader.c_str())); + + for(auto ending : docEndings) { + string docEndString = "[" + + ending.url + ", " + + to_string(ending.docEndPosition) + ", " + + to_string(ending.docNumWords) + "]\n"; + write(file, docEndString.c_str(), strlen(docEndString.c_str())); + } + // TODO: seek dictionary string seekFileName = "index" + to_string(currentFile) + "-seek.txt"; int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); @@ -97,7 +116,7 @@ void Indexer::verbose_save() { } void Indexer::reset() { - unordered_map<string, vector<size_t>> lastOne; + unordered_map<string, vector<size_t> > lastOne; for(auto bucket : masterDictionary) { lastOne[bucket.first].push_back(bucket.second.back()); diff --git a/indexer/Indexer.h b/indexer/Indexer.h index ea359a971a27bca6415826d0ed166345c2b5a45b..b1f655408e2528e7b39bad4aaefc4d0fd7b831da 100644 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -2,6 +2,7 @@ #define indexer_h #include "../ProducerConsumerQueue.h" #include "../ProducerConsumerQueue.cpp" +#include "DocumentEnding.h" #include <unordered_map> #include <map> #include <vector> @@ -40,6 +41,8 @@ class Indexer { unordered_map<string, vector<size_t> > masterDictionary; unordered_map<string, vector<size_t> > lastOne; + vector<DocumentEnding> docEndings; + size_t indexedCount; size_t currentFile; size_t totalIndexed;