#include "Indexer.h" Indexer::Indexer(ProducerConsumerQueue < DocIndex* > *doc_index_queue_in) : pointerToDictionaries( doc_index_queue_in ){ currentFile = 0; currentlyIndexed = 0; currentBlockNumberWords = 0; currentBlockNumberDocs = 0; } void Indexer::run() { bool cond = true; while(cond) { DocIndex * dictionary = pointerToDictionaries->Pop(); cout << "INDEX GOT A NEW dnary" << endl; DocumentEnding docEnd = DocumentEnding(); size_t indexedCount = 0; currentBlockNumberDocs++; for(auto word : *dictionary) { if(word.first.at(0) == '=') { docEnd.url = word.first.substr(1, word.first.length()); continue; } indexedCount += word.second.size(); currentBlockNumberWords += word.second.size(); for(auto location : word.second) { masterDictionary[word.first].push_back(currentlyIndexed + location); } } currentlyIndexed += indexedCount; docEnd.docEndPosition = currentlyIndexed; docEnd.docNumWords = indexedCount; docEndings.push_back(docEnd); if(currentBlockNumberWords >= 100000) { save(); reset(); } } save(); reset(); saveChunkDictionary(); } void Indexer::verbose_run() { /* while(pointerToDictionaries.Size() != 0) { DocIndex *pointerToDictionaries.Pop(); for(auto word : dictionary) { for(auto location : word.second) { // indexedCount++; masterDictionary[word.first].push_back(location); } } } */ } void Indexer::save() { map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); map<string, size_t> seeker; string fileName = "index" + to_string(currentFile) + ".txt"; int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); // TODO: these should really be c strings string statsHeader = "===STATS===" "\nunique words: " + to_string(masterDictionary.size()) + "\nnumber words: " + to_string(currentBlockNumberWords) + "\nnumber docs: " + to_string(currentBlockNumberDocs) + "\n===========\n"; write(file, statsHeader.c_str(), strlen(statsHeader.c_str())); // REALLY GROSS HACK size_t seekOffset = strlen(statsHeader.c_str()); for(auto word : maps) { seeker[word.first] = seekOffset; chunkDictionary[word.first].push_back(currentFile); // string wordBreak = word.first + "\n"; // write(file, wordBreak.c_str(), strlen(wordBreak.c_str())); // seekOffset += strlen(wordBreak.c_str()); bool firstPost = true; size_t lastOne = 0; int numIndexed = 0; for(auto location : word.second) { numIndexed++; if(numIndexed >= 100) { PostingsSeekTableEntry entry = PostingsSeekTableEntry(); entry.offset = seekOffset; entry.realLocation = location; postingsSeekTable[word.first].push_back(entry); numIndexed = 0; } if(firstPost) { string locationSpace = to_string(location) + " "; write(file, locationSpace.c_str(), strlen(locationSpace.c_str())); seekOffset += strlen(locationSpace.c_str()); firstPost = false; } else { size_t delta = location - lastOne; string deltaSpace = to_string(delta) + " "; write(file, deltaSpace.c_str(), strlen(deltaSpace.c_str())); seekOffset += strlen(deltaSpace.c_str()); } lastOne = location; } write(file, "\n", 1); seekOffset += 1; } string docEndingHeader = "===Document Endings===\n"; write(file, docEndingHeader.c_str(), strlen(docEndingHeader.c_str())); seekOffset += strlen(docEndingHeader.c_str()); seeker["=docEnding"] = seekOffset; for(auto ending : docEndings) { string docEndString = "[" + ending.url + ", " + to_string(ending.docEndPosition) + ", " + to_string(ending.docNumWords) + "]\n"; write(file, docEndString.c_str(), strlen(docEndString.c_str())); } // TODO: seek dictionary string seekFileName = "index" + to_string(currentFile) + "-seek.txt"; int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); for(auto word : seeker) { string line = word.first + " " + to_string(word.second) + "\n"; write(seekFile, line.c_str(), strlen(line.c_str())); if(postingsSeekTable.find(word.first) != postingsSeekTable.end()) { string offsetLine = "\t"; for(int i = 0; i < postingsSeekTable[word.first].size(); i++) { offsetLine += "<" + to_string(postingsSeekTable[word.first][i].realLocation) + ", " + to_string(postingsSeekTable[word.first][i].offset) + "> "; } offsetLine += "\n"; write(seekFile, offsetLine.c_str(), strlen(offsetLine.c_str())); } } close(file); currentFile++; } void Indexer::saveChunkDictionary() { string fileName = "master-index.txt"; int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); for(auto word : chunkDictionary) { string wordDictionary = word.first + " "; for(auto chunk : word.second) { wordDictionary += to_string(chunk) + " "; } wordDictionary += "\n"; write(file, wordDictionary.c_str(), strlen(wordDictionary.c_str())); } close(file); } void Indexer::verbose_save() { map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); for(auto word : maps) { cout << word.first << endl; for(auto location : word.second) { cout << location << " "; } cout << endl; } currentFile++; } void Indexer::reset() { masterDictionary.clear(); docEndings.clear(); postingsSeekTable.clear(); currentBlockNumberWords = 0; currentBlockNumberDocs = 0; }