From 30338cd9e478f189549ba5119dcce653f37c58b5 Mon Sep 17 00:00:00 2001 From: Nicholas Yang <parablank@gmail.com> Date: Tue, 13 Mar 2018 17:34:27 -0400 Subject: [PATCH] very simple statistics for each block + doc endings, TODO: SEEK FILE --- indexer/DocumentEnding.h | 19 ++++++++ indexer/IndexStreamReader.cpp | 21 --------- indexer/IndexStreamReader.h | 22 --------- indexer/Indexer.cpp | 46 +++++++++++++++++-- indexer/Indexer.h | 8 +++- indexer/IndexerTests.cpp | 4 ++ indexer/SeekFileTests.cpp | 20 ++++++++ ...testUniqueness.cpp => UniquenessTests.cpp} | 0 8 files changed, 93 insertions(+), 47 deletions(-) create mode 100644 indexer/DocumentEnding.h delete mode 100644 indexer/IndexStreamReader.cpp delete mode 100644 indexer/IndexStreamReader.h create mode 100644 indexer/SeekFileTests.cpp rename indexer/{testUniqueness.cpp => UniquenessTests.cpp} (100%) diff --git a/indexer/DocumentEnding.h b/indexer/DocumentEnding.h new file mode 100644 index 0000000..41c2d08 --- /dev/null +++ b/indexer/DocumentEnding.h @@ -0,0 +1,19 @@ +#pragma once + +#include <iostream> + +using namespace std; + +class DocumentEnding { +public: + + DocumentEnding() { + docEndPosition = 0; + docNumWords = 0; + url = ""; + } + + size_t docEndPosition; + size_t docNumWords; + string url; +}; \ No newline at end of file diff --git a/indexer/IndexStreamReader.cpp b/indexer/IndexStreamReader.cpp deleted file mode 100644 index cc1d222..0000000 --- a/indexer/IndexStreamReader.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// -// Created by nick on 2/6/18. -// - -#include "IndexStreamReader.h" - -IndexStreamReader::IndexStreamReader(string word) { - this->word = word; -} - -int IndexStreamReader::first() { - -} - -int IndexStreamReader::last() { - -} - -int IndexStreamReader::next(int location) { - -} \ No newline at end of file diff --git a/indexer/IndexStreamReader.h b/indexer/IndexStreamReader.h deleted file mode 100644 index 92e3f16..0000000 --- a/indexer/IndexStreamReader.h +++ /dev/null @@ -1,22 +0,0 @@ -// -// Created by nick on 2/6/18. -// - -#ifndef EECS398_SEARCH_INDEXSTREAMREADER_H -#define EECS398_SEARCH_INDEXSTREAMREADER_H - -#include <iostream> - -using namespace std; - -class IndexStreamReader { -public: - int first(); - int last(); - int next(int location); -private: - IndexStreamReader(string word); - string word; -}; - -#endif //EECS398_SEARCH_INDEXSTREAMREADER_H diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 9fd454e..4316e31 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -5,20 +5,27 @@ Indexer::Indexer() { currentFile = 0; totalIndexed = 0; currentlyIndexed = 0; + + currentBlockNumberWords = 0; + currentBlockNumberDocs = 0; } void Indexer::run() { while(pointerToDictionaries.Size() != 0) { - save(); - reset(); unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop(); + currentBlockNumberDocs++; for(auto word : *dictionary) { indexedCount += word.second.size(); totalIndexed += word.second.size(); + currentBlockNumberWords += word.second.size(); for(auto location : word.second) { masterDictionary[word.first].push_back(currentlyIndexed + location); } } + if(currentBlockNumberWords >= 300000) { + save(); + reset(); + } currentlyIndexed += indexedCount; indexedCount = 0; } @@ -39,8 +46,21 @@ void Indexer::verbose_run() { void Indexer::save() { map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); + map<string, size_t> seeker; string fileName = "index" + to_string(currentFile) + ".txt"; int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); + // TODO: these should really be c strings + string header = "===STATS===\n"; + string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n"; + string numberWords = "number words: " + to_string(currentBlockNumberWords) + "\n"; + string numberDocs = "number docs: " + to_string(currentBlockNumberDocs) + "\n"; + string footer = "===SEEK===\n"; + write(file, header.c_str(), strlen(header.c_str())); + write(file, uniqueWords.c_str(), strlen(uniqueWords.c_str())); + write(file, numberWords.c_str(), strlen(numberWords.c_str())); + write(file, numberDocs.c_str(), strlen(numberDocs.c_str())); + write(file, footer.c_str(), strlen(footer.c_str())); + for(auto word : maps) { string wordBreak = word.first + "\n"; write(file, wordBreak.c_str(), strlen(wordBreak.c_str())); @@ -48,11 +68,21 @@ void Indexer::save() { string locationSpace = to_string(location) + " "; write(file, locationSpace.c_str(), strlen(locationSpace.c_str())); } + seeker[word.first] = 013; write(file, "\n", 1); } + + // TODO: seek dictionary + string seekFileName = "index" + to_string(currentFile) + "-seek.txt"; + int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); + for(auto word : seeker) { + string line = word.first + " " + to_string(word.second) + "\n"; + write(seekFile, line.c_str(), strlen(line.c_str())); + } + close(file); currentFile++; - } +} void Indexer::verbose_save() { map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); @@ -67,5 +97,15 @@ void Indexer::verbose_save() { } void Indexer::reset() { + unordered_map<string, vector<size_t>> lastOne; + + for(auto bucket : masterDictionary) { + lastOne[bucket.first].push_back(bucket.second.back()); + } + + this->lastOne = lastOne; masterDictionary.clear(); + + currentBlockNumberWords = 0; + currentBlockNumberDocs = 0; } diff --git a/indexer/Indexer.h b/indexer/Indexer.h index b92d1fc..ea359a9 100644 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -19,7 +19,7 @@ master index. TODO: Use deltas between the offsets Save with UTF-8 encoding - Concrete block size - 500MB per block? + Concrete block size - 100MB per block? Save document endings and other relevant metadata? */ @@ -36,11 +36,17 @@ class Indexer { private: void save(); void reset(); + unordered_map<string, vector<size_t> > masterDictionary; + unordered_map<string, vector<size_t> > lastOne; + size_t indexedCount; size_t currentFile; size_t totalIndexed; size_t currentlyIndexed; + + size_t currentBlockNumberWords; + size_t currentBlockNumberDocs; }; #endif /*indexer_h*/ diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp index da9a211..09ca76c 100644 --- a/indexer/IndexerTests.cpp +++ b/indexer/IndexerTests.cpp @@ -30,6 +30,7 @@ int main() { id++; } } + test1["=tests/test1.txt"].push_back(0); id = 0; while(ifstream2 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); @@ -39,6 +40,7 @@ int main() { id++; } } + test2["=tests/test2.txt"].push_back(0); id = 0; while(ifstream3 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); @@ -48,6 +50,7 @@ int main() { id++; } } + test3["=tests/test3.txt"].push_back(0); id = 0; while(ifstream4 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); @@ -57,6 +60,7 @@ int main() { id++; } } + test4["=tests/test4.txt"].push_back(0); indexer.pointerToDictionaries.Push(&test1); indexer.pointerToDictionaries.Push(&test2); indexer.pointerToDictionaries.Push(&test3); diff --git a/indexer/SeekFileTests.cpp b/indexer/SeekFileTests.cpp new file mode 100644 index 0000000..337d98d --- /dev/null +++ b/indexer/SeekFileTests.cpp @@ -0,0 +1,20 @@ +// +// Created by nick on 3/13/18. +// + +#include <iostream> +#include <fcntl.h> +#include <unistd.h> + +using namespace std; + +int main() { + int index1 = open("index0.txt", O_CREAT|O_WRONLY, S_IRWXU); + lseek(index1, 25, SEEK_SET); + string fs = "hello"; + if(write(index1, fs.c_str(), strlen(fs.c_str())) != strlen(fs.c_str())) { + cout << "ERROR" << endl; + } + close(index1); + return 0; +} \ No newline at end of file diff --git a/indexer/testUniqueness.cpp b/indexer/UniquenessTests.cpp similarity index 100% rename from indexer/testUniqueness.cpp rename to indexer/UniquenessTests.cpp -- GitLab