Skip to content
Snippets Groups Projects
Commit 6415adf0 authored by Nicholas Yang's avatar Nicholas Yang
Browse files

code cleanup

parent b4f7eadf
No related branches found
No related tags found
No related merge requests found
#include "Indexer.h" #include "Indexer.h"
Indexer::Indexer() { Indexer::Indexer() {
indexedCount = 0;
currentFile = 0; currentFile = 0;
totalIndexed = 0;
currentlyIndexed = 0; currentlyIndexed = 0;
currentBlockNumberWords = 0; currentBlockNumberWords = 0;
...@@ -14,29 +12,34 @@ void Indexer::run() { ...@@ -14,29 +12,34 @@ void Indexer::run() {
while(pointerToDictionaries.Size() != 0) { while(pointerToDictionaries.Size() != 0) {
unordered_map<string, vector<int> >* dictionary = pointerToDictionaries.Pop(); unordered_map<string, vector<int> >* dictionary = pointerToDictionaries.Pop();
DocumentEnding docEnd = DocumentEnding(); DocumentEnding docEnd = DocumentEnding();
size_t indexedCount = 0;
currentBlockNumberDocs++; currentBlockNumberDocs++;
for(auto word : *dictionary) { for(auto word : *dictionary) {
if(word.first.at(0) == '=') { if(word.first.at(0) == '=') {
docEnd.url = word.first.substr(1, word.first.length()); docEnd.url = word.first.substr(1, word.first.length());
continue; continue;
} }
indexedCount += word.second.size(); indexedCount += word.second.size();
totalIndexed += word.second.size();
currentBlockNumberWords += word.second.size(); currentBlockNumberWords += word.second.size();
for(auto location : word.second) { for(auto location : word.second) {
masterDictionary[word.first].push_back(currentlyIndexed + location); masterDictionary[word.first].push_back(currentlyIndexed + location);
} }
} }
currentlyIndexed += indexedCount; currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed; docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount; docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd); docEndings.push_back(docEnd);
if(currentBlockNumberWords >= 300000) { if(currentBlockNumberWords >= 300000) {
save(); save();
reset(); reset();
} }
indexedCount = 0;
} }
save(); save();
reset(); reset();
} }
...@@ -46,7 +49,7 @@ void Indexer::verbose_run() { ...@@ -46,7 +49,7 @@ void Indexer::verbose_run() {
unordered_map<string, vector<int>> dictionary = *pointerToDictionaries.Pop(); unordered_map<string, vector<int>> dictionary = *pointerToDictionaries.Pop();
for(auto word : dictionary) { for(auto word : dictionary) {
for(auto location : word.second) { for(auto location : word.second) {
indexedCount++; // indexedCount++;
masterDictionary[word.first].push_back(location); masterDictionary[word.first].push_back(location);
} }
} }
...@@ -58,6 +61,7 @@ void Indexer::save() { ...@@ -58,6 +61,7 @@ void Indexer::save() {
map<string, size_t> seeker; map<string, size_t> seeker;
string fileName = "index" + to_string(currentFile) + ".txt"; string fileName = "index" + to_string(currentFile) + ".txt";
int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
// TODO: these should really be c strings // TODO: these should really be c strings
string header = "===STATS===\n"; string header = "===STATS===\n";
string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n"; string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n";
...@@ -69,34 +73,27 @@ void Indexer::save() { ...@@ -69,34 +73,27 @@ void Indexer::save() {
write(file, numberWords.c_str(), strlen(numberWords.c_str())); write(file, numberWords.c_str(), strlen(numberWords.c_str()));
write(file, numberDocs.c_str(), strlen(numberDocs.c_str())); write(file, numberDocs.c_str(), strlen(numberDocs.c_str()));
write(file, footer.c_str(), strlen(footer.c_str())); write(file, footer.c_str(), strlen(footer.c_str()));
// REALLY GROSS HACK // REALLY GROSS HACK
int seekOffset = strlen(header.c_str()) + size_t seekOffset = strlen(header.c_str()) +
strlen(numberDocs.c_str()) + strlen(numberDocs.c_str()) +
strlen(numberWords.c_str()) + strlen(numberWords.c_str()) +
strlen(uniqueWords.c_str()) + strlen(uniqueWords.c_str()) +
strlen(footer.c_str()); strlen(footer.c_str());
bool first = true;
for(auto word : maps) { for(auto word : maps) {
if(first) { //REALLY BAD HACKK seeker[word.first] = seekOffset;
first = false;
seeker[word.first] = seekOffset;
} else {
seeker[word.first] = seekOffset;
}
// string wordBreak = word.first + "\n"; // string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str())); // write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
// seekOffset += strlen(wordBreak.c_str()); // seekOffset += strlen(wordBreak.c_str());
bool first = true; bool firstPost = true;
size_t lastOne = 0; size_t lastOne = 0;
for(auto location : word.second) { for(auto location : word.second) {
if(first) { if(firstPost) {
string locationSpace = to_string(location) + " "; string locationSpace = to_string(location) + " ";
write(file, locationSpace.c_str(), strlen(locationSpace.c_str())); write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
seekOffset += strlen(locationSpace.c_str()); seekOffset += strlen(locationSpace.c_str());
first = false; firstPost = false;
} else { } else {
size_t delta = location - lastOne; size_t delta = location - lastOne;
string deltaSpace = to_string(delta) + " "; string deltaSpace = to_string(delta) + " ";
...@@ -146,7 +143,6 @@ void Indexer::verbose_save() { ...@@ -146,7 +143,6 @@ void Indexer::verbose_save() {
void Indexer::reset() { void Indexer::reset() {
masterDictionary.clear(); masterDictionary.clear();
docEndings.clear(); docEndings.clear();
currentBlockNumberWords = 0; currentBlockNumberWords = 0;
......
...@@ -42,9 +42,7 @@ class Indexer { ...@@ -42,9 +42,7 @@ class Indexer {
vector<DocumentEnding> docEndings; vector<DocumentEnding> docEndings;
size_t indexedCount;
size_t currentFile; size_t currentFile;
size_t totalIndexed;
size_t currentlyIndexed; size_t currentlyIndexed;
size_t currentBlockNumberWords; size_t currentBlockNumberWords;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment