diff --git a/indexer/DocumentEnding.h b/indexer/DocumentEnding.h
new file mode 100644
index 0000000000000000000000000000000000000000..41c2d08dbf10dbf6cc34338788f80b1e1f4a2cb8
--- /dev/null
+++ b/indexer/DocumentEnding.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstddef>
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+// Plain record for one document's ending in the index.
+// NOTE(review): the field meanings are inferred from their names -- confirm
+// against the code that fills this record in.
+class DocumentEnding {
+public:
+
+    // Zero counters and an empty url.
+    DocumentEnding() : docEndPosition(0), docNumWords(0) {
+    }
+
+    size_t docEndPosition;
+    size_t docNumWords;
+    string url;
+};
diff --git a/indexer/IndexStreamReader.cpp b/indexer/IndexStreamReader.cpp
deleted file mode 100644
index cc1d222327b430ea6eb22f8fae652e655d570bae..0000000000000000000000000000000000000000
--- a/indexer/IndexStreamReader.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//
-// Created by nick on 2/6/18.
-//
-
-#include "IndexStreamReader.h"
-
-IndexStreamReader::IndexStreamReader(string word) {
-    this->word = word;
-}
-
-int IndexStreamReader::first() {
-
-}
-
-int IndexStreamReader::last() {
-
-}
-
-int IndexStreamReader::next(int location) {
-
-}
\ No newline at end of file
diff --git a/indexer/IndexStreamReader.h b/indexer/IndexStreamReader.h
deleted file mode 100644
index 92e3f16c1f281effe6e5d227e2fa681dd6364668..0000000000000000000000000000000000000000
--- a/indexer/IndexStreamReader.h
+++ /dev/null
@@ -1,22 +0,0 @@
-//
-// Created by nick on 2/6/18.
-//
-
-#ifndef EECS398_SEARCH_INDEXSTREAMREADER_H
-#define EECS398_SEARCH_INDEXSTREAMREADER_H
-
-#include <iostream>
-
-using namespace std;
-
-class IndexStreamReader {
-public:
-    int first();
-    int last();
-    int next(int location);
-private:
-    IndexStreamReader(string word);
-    string word;
-};
-
-#endif //EECS398_SEARCH_INDEXSTREAMREADER_H
diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp
index 9fd454ed35bf1c3ab914117f48fbe3c3d2842c87..4316e317e8340d5913c4fcfdcb925e8c6c1e550b 100644
--- a/indexer/Indexer.cpp
+++ b/indexer/Indexer.cpp
@@ -5,20 +5,29 @@ Indexer::Indexer() {
 	currentFile = 0;
 	totalIndexed = 0;
 	currentlyIndexed = 0;
+
+	currentBlockNumberWords = 0;
+	currentBlockNumberDocs = 0;
 }
 
 void Indexer::run() {
 	while(pointerToDictionaries.Size() != 0) {
-		save();
-		reset();
 		unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop();
+		currentBlockNumberDocs++;
 		for(auto word : *dictionary) {
 			indexedCount += word.second.size();
 			totalIndexed += word.second.size();
+			currentBlockNumberWords += word.second.size();
 			for(auto location : word.second) {
 				masterDictionary[word.first].push_back(currentlyIndexed + location);
 			}
 		}
+		// Flush the in-memory block to disk once it holds at least
+		// 300000 word occurrences, then start a fresh block.
+		if(currentBlockNumberWords >= 300000) {
+			save();
+			reset();
+		}
 		currentlyIndexed += indexedCount;
 		indexedCount = 0;
 	}
@@ -39,8 +48,21 @@ void Indexer::verbose_run() {
 
 void Indexer::save() {
 	map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
+	// Byte offset in the block file at which each word's entry begins.
+	map<string, size_t> seeker;
 	string fileName = "index" + to_string(currentFile) + ".txt";
 	int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
+	// Block header: summary statistics for this block, terminated by the
+	// "===SEEK===" marker. Built as one string so it goes out in one write.
+	string stats = "===STATS===\n";
+	stats += "unique words: " + to_string(masterDictionary.size()) + "\n";
+	stats += "number words: " + to_string(currentBlockNumberWords) + "\n";
+	stats += "number docs: " + to_string(currentBlockNumberDocs) + "\n";
+	stats += "===SEEK===\n";
+	write(file, stats.c_str(), stats.size());
+
 	for(auto word : maps) {
+		// NOTE(review): assumes the seek value is the offset of the word's entry in the block file.
+		seeker[word.first] = static_cast<size_t>(lseek(file, 0, SEEK_CUR));
 		string wordBreak = word.first + "\n";
 		write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
@@ -48,11 +70,25 @@ void Indexer::save() {
 			string locationSpace = to_string(location) + " ";
 			write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
 		}
 		write(file, "\n", 1);
 	}
+
+	// Write the seek dictionary next to the block file, one "word offset" pair per line.
+	// O_TRUNC: a longer seek file left by an earlier run must not keep trailing entries.
+	string seekFileName = "index" + to_string(currentFile) + "-seek.txt";
+	int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
+	if(seekFile != -1) {
+		for(const auto& word : seeker) {
+			string line = word.first + " " + to_string(word.second) + "\n";
+			write(seekFile, line.c_str(), line.size());
+		}
+		// save() opens a new descriptor on every call; close it so none are leaked.
+		close(seekFile);
+	}
+
 	close(file);
 	currentFile++;
-	}
+}
 
 void Indexer::verbose_save() {
 	map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
@@ -67,5 +103,19 @@ void Indexer::verbose_save() {
 }
 
 void Indexer::reset() {
+	// Keep only the last recorded position of every word from the block being
+	// discarded. Replaces whatever lastOne held from the previous block.
+	lastOne.clear();
+	for(const auto& bucket : masterDictionary) {
+		// const reference: copying the bucket would copy its whole position list
+		if(!bucket.second.empty()) {
+			lastOne[bucket.first].push_back(bucket.second.back());
+		}
+	}
+
 	masterDictionary.clear();
+
+	// Start the next block's statistics from zero.
+	currentBlockNumberWords = 0;
+	currentBlockNumberDocs = 0;
 }
diff --git a/indexer/Indexer.h b/indexer/Indexer.h
index b92d1fc667b447814ec986aa7b3c1a2c6c6716cf..ea359a971a27bca6415826d0ed166345c2b5a45b 100644
--- a/indexer/Indexer.h
+++ b/indexer/Indexer.h
@@ -19,7 +19,7 @@ master index.
 	TODO:
 		Use deltas between the offsets
 		Save with UTF-8 encoding
-		Concrete block size - 500MB per block?
+		Concrete block size - 100MB per block?
 		Save document endings and other relevant metadata?
 */
 
@@ -36,11 +36,19 @@ class Indexer {
 	private:
 		void save();
 		void reset();
+
 		unordered_map<string, vector<size_t> > masterDictionary;
+		// Last position of each word in the most recently reset block; filled by reset().
+		unordered_map<string, vector<size_t> > lastOne;
+
 		size_t indexedCount;
 		size_t currentFile;
 		size_t totalIndexed;
 		size_t currentlyIndexed;
+
+		// Statistics for the block currently being built; written out by save(), zeroed by reset().
+		size_t currentBlockNumberWords;
+		size_t currentBlockNumberDocs;
 	};
 
 #endif /*indexer_h*/
diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp
index da9a2110eead0693856389a5d2e6e795e4cd2bec..09ca76c0c8d4400382710e99dae7c48bd5dd75e5 100644
--- a/indexer/IndexerTests.cpp
+++ b/indexer/IndexerTests.cpp
@@ -30,6 +30,7 @@ int main() {
 			id++;
 		}
 	}
+	test1["=tests/test1.txt"].push_back(0);
 	id = 0;
 	while(ifstream2 >> word) {
 		std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -39,6 +40,7 @@ int main() {
 			id++;
 		}
 	}
+	test2["=tests/test2.txt"].push_back(0);
 	id = 0;
 	while(ifstream3 >> word) {
 		std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -48,6 +50,7 @@ int main() {
 			id++;
 		}
 	}
+	test3["=tests/test3.txt"].push_back(0);
 	id = 0;
 	while(ifstream4 >> word) {
 		std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -57,6 +60,7 @@ int main() {
 			id++;
 		}
 	}
+	test4["=tests/test4.txt"].push_back(0);
 	indexer.pointerToDictionaries.Push(&test1);
 	indexer.pointerToDictionaries.Push(&test2);
 	indexer.pointerToDictionaries.Push(&test3);
diff --git a/indexer/SeekFileTests.cpp b/indexer/SeekFileTests.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..337d98d27692809a4c3fea135ca6521b3f9a32d5
--- /dev/null
+++ b/indexer/SeekFileTests.cpp
@@ -0,0 +1,31 @@
+//
+// Created by nick on 3/13/18.
+//
+
+#include <iostream>
+#include <string>
+#include <fcntl.h>
+#include <unistd.h>
+
+using namespace std;
+
+// Manual check that bytes can be written at a fixed offset of index0.txt
+// (created if missing): seeks to byte 25 and writes "hello" there.
+// Prints "ERROR" and exits non-zero if any step fails.
+int main() {
+    int index1 = open("index0.txt", O_CREAT|O_WRONLY, S_IRWXU);
+    if(index1 == -1) {
+        cout << "ERROR" << endl;
+        return 1;
+    }
+    int status = 0;
+    string fs = "hello";
+    // write() returns a signed count (-1 on failure), so compare in the signed domain.
+    if(lseek(index1, 25, SEEK_SET) == -1 ||
+       write(index1, fs.c_str(), fs.size()) != static_cast<ssize_t>(fs.size())) {
+        cout << "ERROR" << endl;
+        status = 1;
+    }
+    close(index1);
+    return status;
+}
diff --git a/indexer/testUniqueness.cpp b/indexer/UniquenessTests.cpp
similarity index 100%
rename from indexer/testUniqueness.cpp
rename to indexer/UniquenessTests.cpp