Newer
Older
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
// Drains pointerToDictionaries: each popped dictionary is counted as one
// document (currentBlockNumberDocs is incremented once per pop), its word
// locations are appended to masterDictionary, and a DocumentEnding record is
// pushed onto docEndings. When the current block reaches 300000 words, save()
// and then reset() are called.
// NOTE(review): this excerpt appears to be missing lines. `word`, `location`
// and `indexedCount` are used without a visible declaration (presumably
// range-for loops over *dictionary and word.second, plus a per-document
// counter), and the closing braces of the `if`, the `while` and the function
// are not visible. Confirm against the full file before editing.
void Indexer::run() {
// Loop until the queue reports that no dictionaries are left.
while(pointerToDictionaries.Size() != 0) {
unordered_map<string, vector<int> >* dictionary = pointerToDictionaries.Pop();
DocumentEnding docEnd = DocumentEnding();
currentBlockNumberDocs++;
// A key whose first character is '=' supplies the document URL.
if(word.first.at(0) == '=') {
// substr clamps the count to the end of the string, so this yields
// everything after the leading '='.
docEnd.url = word.first.substr(1, word.first.length());
indexedCount += word.second.size();
currentBlockNumberWords += word.second.size();
// The stored location is shifted by currentlyIndexed, the running total
// that is advanced by indexedCount just below.
masterDictionary[word.first].push_back(currentlyIndexed + location);
currentlyIndexed += indexedCount;
// Record the running total after this document and the document's word count.
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd);
// Block-size threshold: write the block out and start a fresh one.
if(currentBlockNumberWords >= 300000) {
save();
reset();
}
void Indexer::verbose_run() {
while(pointerToDictionaries.Size() != 0) {
unordered_map<string, vector<int>> dictionary = *pointerToDictionaries.Pop();
for(auto word : dictionary) {
for(auto location : word.second) {
masterDictionary[word.first].push_back(location);
}
}
}
}
// NOTE(review): the enclosing function header is not visible in this excerpt.
// These lines look like the body of the routine that writes the current block
// to disk (presumably the save() called from run()) — confirm against the full
// file. Several lines also appear to be missing: the loop headers that declare
// `word` and `location`, the declarations of `seekOffset` and `lastOne`, and
// the `if` that pairs with the `} else {` below.

// Copy masterDictionary into a std::map so iteration is ordered by key.
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
// Word -> size_t table that is written to the seek file further down.
// NOTE(review): nothing in the visible lines populates it.
map<string, size_t> seeker;
// Index file is named index<currentFile>.txt; created if absent, opened
// write-only, with read/write/execute permission for the owner.
// NOTE(review): no O_TRUNC is passed and the result of open() is not checked.
string fileName = "index" + to_string(currentFile) + ".txt";
int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
// TODO: these should really be c strings
// Statistics header: unique word count, word count and document count for
// the current block, framed by the two marker lines.
string header = "===STATS===\n";
string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n";
string numberWords = "number words: " + to_string(currentBlockNumberWords) + "\n";
string numberDocs = "number docs: " + to_string(currentBlockNumberDocs) + "\n";
string footer = "===========\n";
write(file, header.c_str(), strlen(header.c_str()));
write(file, uniqueWords.c_str(), strlen(uniqueWords.c_str()));
write(file, numberWords.c_str(), strlen(numberWords.c_str()));
write(file, numberDocs.c_str(), strlen(numberDocs.c_str()));
write(file, footer.c_str(), strlen(footer.c_str()));
// Tail of an expression that sums the lengths of the stats lines written
// above; the start of the statement is not visible (presumably it
// initialises seekOffset to the size of the header — confirm).
strlen(numberDocs.c_str()) +
strlen(numberWords.c_str()) +
strlen(uniqueWords.c_str()) +
strlen(footer.c_str());
// string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
// seekOffset += strlen(wordBreak.c_str());
// This branch writes the location itself followed by a space and advances
// seekOffset by the number of characters written.
string locationSpace = to_string(location) + " ";
write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
seekOffset += strlen(locationSpace.c_str());
} else {
// Otherwise write the difference from the previously written location
// (lastOne) instead of the absolute value.
size_t delta = location - lastOne;
string deltaSpace = to_string(delta) + " ";
write(file, deltaSpace.c_str(), strlen(deltaSpace.c_str()));
seekOffset += strlen(deltaSpace.c_str());
}
// Remember this location so the next one can be written as a delta.
lastOne = location;
// Document-endings section: one "[url, docEndPosition, docNumWords]" line
// per entry in docEndings.
string docEndingHeader = "===Document Endings===\n";
write(file, docEndingHeader.c_str(), strlen(docEndingHeader.c_str()));
for(auto ending : docEndings) {
string docEndString = "[" +
ending.url + ", " +
to_string(ending.docEndPosition) + ", " +
to_string(ending.docNumWords) + "]\n";
write(file, docEndString.c_str(), strlen(docEndString.c_str()));
}
// TODO: seek dictionary
// Seek file is named index<currentFile>-seek.txt and receives one
// "<word> <value>" line per entry in seeker.
// NOTE(review): neither `file` nor `seekFile` is closed in the visible lines,
// and the return values of write() are ignored.
string seekFileName = "index" + to_string(currentFile) + "-seek.txt";
int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
for(auto word : seeker) {
string line = word.first + " " + to_string(word.second) + "\n";
write(seekFile, line.c_str(), strlen(line.c_str()));
}
// Prints every word, then its locations on one line, to stdout in key order.
// NOTE(review): `maps` is declared a second time here, so these lines likely
// belong to a separate function whose header is missing from the excerpt.
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
for(auto word : maps) {
cout << word.first << endl;
for(auto location : word.second) {
cout << location << " ";
}
cout << endl;
}
// Advance the counter used to build the index file names above.
currentFile++;
}
void Indexer::reset() {
masterDictionary.clear();
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;