From 30338cd9e478f189549ba5119dcce653f37c58b5 Mon Sep 17 00:00:00 2001
From: Nicholas Yang <parablank@gmail.com>
Date: Tue, 13 Mar 2018 17:34:27 -0400
Subject: [PATCH] very simple statistics for each block + doc endings, TODO:
 SEEK FILE

---
 indexer/DocumentEnding.h                      | 19 ++++++++
 indexer/IndexStreamReader.cpp                 | 21 ---------
 indexer/IndexStreamReader.h                   | 22 ---------
 indexer/Indexer.cpp                           | 46 +++++++++++++++++--
 indexer/Indexer.h                             |  8 +++-
 indexer/IndexerTests.cpp                      |  4 ++
 indexer/SeekFileTests.cpp                     | 20 ++++++++
 ...testUniqueness.cpp => UniquenessTests.cpp} |  0
 8 files changed, 93 insertions(+), 47 deletions(-)
 create mode 100644 indexer/DocumentEnding.h
 delete mode 100644 indexer/IndexStreamReader.cpp
 delete mode 100644 indexer/IndexStreamReader.h
 create mode 100644 indexer/SeekFileTests.cpp
 rename indexer/{testUniqueness.cpp => UniquenessTests.cpp} (100%)

diff --git a/indexer/DocumentEnding.h b/indexer/DocumentEnding.h
new file mode 100644
index 0000000..41c2d08
--- /dev/null
+++ b/indexer/DocumentEnding.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <cstddef>
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+// Plain record for one document ending: an end position, a word count and
+// a url. A default-constructed instance holds 0, 0 and the empty string.
+class DocumentEnding {
+public:
+    // End position of the document; 0 by default.
+    size_t docEndPosition = 0;
+    // Word count of the document; 0 by default.
+    size_t docNumWords = 0;
+    // Document url; empty by default.
+    string url;
+};
\ No newline at end of file
diff --git a/indexer/IndexStreamReader.cpp b/indexer/IndexStreamReader.cpp
deleted file mode 100644
index cc1d222..0000000
--- a/indexer/IndexStreamReader.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//
-// Created by nick on 2/6/18.
-//
-
-#include "IndexStreamReader.h"
-
-IndexStreamReader::IndexStreamReader(string word) {
-    this->word = word;
-}
-
-int IndexStreamReader::first() {
-    
-}
-
-int IndexStreamReader::last() {
-
-}
-
-int IndexStreamReader::next(int location) {
-
-}
\ No newline at end of file
diff --git a/indexer/IndexStreamReader.h b/indexer/IndexStreamReader.h
deleted file mode 100644
index 92e3f16..0000000
--- a/indexer/IndexStreamReader.h
+++ /dev/null
@@ -1,22 +0,0 @@
-//
-// Created by nick on 2/6/18.
-//
-
-#ifndef EECS398_SEARCH_INDEXSTREAMREADER_H
-#define EECS398_SEARCH_INDEXSTREAMREADER_H
-
-#include <iostream>
-
-using namespace std;
-
-class IndexStreamReader {
-public:
-    int first();
-    int last();
-    int next(int location);
-private:
-    IndexStreamReader(string word);
-    string word;
-};
-
-#endif //EECS398_SEARCH_INDEXSTREAMREADER_H
diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp
index 9fd454e..4316e31 100644
--- a/indexer/Indexer.cpp
+++ b/indexer/Indexer.cpp
@@ -5,20 +5,27 @@ Indexer::Indexer() {
     currentFile = 0;
     totalIndexed = 0;
     currentlyIndexed = 0;
+
+    currentBlockNumberWords = 0;
+    currentBlockNumberDocs = 0;
 }
 
 void Indexer::run() {
     while(pointerToDictionaries.Size() != 0) {
-        save();
-        reset();
         unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop();
+        currentBlockNumberDocs++;
         for(auto word : *dictionary) {
             indexedCount += word.second.size();
             totalIndexed += word.second.size();
+            currentBlockNumberWords += word.second.size();
             for(auto location : word.second) {
                 masterDictionary[word.first].push_back(currentlyIndexed + location);
             }
         }
+        if(currentBlockNumberWords >= 300000) {
+            save();
+            reset();
+        }
         currentlyIndexed += indexedCount;
         indexedCount = 0;
     }
@@ -39,8 +46,21 @@ void Indexer::verbose_run() {
 
 void Indexer::save() {
     map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
+    map<string, size_t> seeker;
     string fileName = "index" + to_string(currentFile) + ".txt";
     int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
+    // TODO: these should really be c strings
+    // Block header: a ===STATS=== section with the counters of the block being
+    // saved, then the ===SEEK=== marker. It is assembled in one buffer so it
+    // costs a single write() instead of five.
+    string stats = "===STATS===\n";
+    stats += "unique words: " + to_string(masterDictionary.size()) + "\n";
+    stats += "number words: " + to_string(currentBlockNumberWords) + "\n";
+    stats += "number docs: " + to_string(currentBlockNumberDocs) + "\n";
+    stats += "===SEEK===\n";
+    // size() is the length strlen(c_str()) would find by rescanning the bytes.
+    write(file, stats.c_str(), stats.size());
+
     for(auto word : maps) {
         string wordBreak = word.first + "\n";
         write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
@@ -48,11 +68,21 @@ void Indexer::save() {
             string locationSpace = to_string(location) + " ";
             write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
         }
+        seeker[word.first] = 013;
         write(file, "\n", 1);
     }
+
+    // TODO: seek dictionary. For now every word maps to a placeholder value.
+    string seekFileName = "index" + to_string(currentFile) + "-seek.txt";
+    int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
+    for(const auto& word : seeker) {
+        string line = word.first + " " + to_string(word.second) + "\n";
+        write(seekFile, line.c_str(), line.size());
+    }
+    close(seekFile);  // was never closed: one descriptor leaked per saved block
     close(file);
     currentFile++;
-    }
+}
 
 void Indexer::verbose_save() {
     map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
@@ -67,5 +97,15 @@ void Indexer::verbose_save() {
     }
 
 void Indexer::reset() {
+    // Keep only the last stored location of every word from the block being
+    // dropped, replacing whatever an earlier reset() had remembered.
+    unordered_map<string, vector<size_t>> lastOne;
+    for(const auto& bucket : masterDictionary) {  // by reference: no vector copies
+        lastOne[bucket.first].push_back(bucket.second.back());
+    }
+    this->lastOne = std::move(lastOne);
     masterDictionary.clear();
+
+    currentBlockNumberWords = 0;
+    currentBlockNumberDocs = 0;
 }
diff --git a/indexer/Indexer.h b/indexer/Indexer.h
index b92d1fc..ea359a9 100644
--- a/indexer/Indexer.h
+++ b/indexer/Indexer.h
@@ -19,7 +19,7 @@ master index.
 TODO:
  Use deltas between the offsets
  Save with UTF-8 encoding
- Concrete block size - 500MB per block?
+ Concrete block size - 100MB per block?
  Save document endings and other relevant metadata?
 
 */
@@ -36,11 +36,17 @@ class Indexer {
     private:
         void save();
         void reset();
+
         unordered_map<string, vector<size_t> > masterDictionary;
+		unordered_map<string, vector<size_t> > lastOne;
+
         size_t indexedCount;
         size_t currentFile;
         size_t totalIndexed;
         size_t currentlyIndexed;
+
+        size_t currentBlockNumberWords;
+        size_t currentBlockNumberDocs;
 };
 
 #endif /*indexer_h*/
diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp
index da9a211..09ca76c 100644
--- a/indexer/IndexerTests.cpp
+++ b/indexer/IndexerTests.cpp
@@ -30,6 +30,7 @@ int main() {
             id++;
         }
     }
+    test1["=tests/test1.txt"].push_back(0);
     id = 0;
     while(ifstream2 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -39,6 +40,7 @@ int main() {
             id++;
         }
     }
+    test2["=tests/test2.txt"].push_back(0);
     id = 0;
     while(ifstream3 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -48,6 +50,7 @@ int main() {
             id++;
         }
     }
+    test3["=tests/test3.txt"].push_back(0);
     id = 0;
     while(ifstream4 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -57,6 +60,7 @@ int main() {
             id++;
         }
     }
+    test4["=tests/test4.txt"].push_back(0);
     indexer.pointerToDictionaries.Push(&test1);
     indexer.pointerToDictionaries.Push(&test2);
     indexer.pointerToDictionaries.Push(&test3);
diff --git a/indexer/SeekFileTests.cpp b/indexer/SeekFileTests.cpp
new file mode 100644
index 0000000..337d98d
--- /dev/null
+++ b/indexer/SeekFileTests.cpp
@@ -0,0 +1,20 @@
+//
+// Created by nick on 3/13/18.
+// Seeks to byte 25 of index0.txt and writes "hello" there; exits 1 on failure.
+//
+#include <iostream>
+#include <string>
+#include <fcntl.h>
+#include <unistd.h>
+
+using namespace std;
+
+int main() {
+    const string fs = "hello";
+    int index1 = open("index0.txt", O_CREAT|O_WRONLY, S_IRWXU);
+    bool ok = index1 != -1 && lseek(index1, 25, SEEK_SET) != -1
+        && write(index1, fs.c_str(), fs.size()) == static_cast<ssize_t>(fs.size());
+    if(!ok) cout << "ERROR" << endl;
+    if(index1 != -1) close(index1);
+    return ok ? 0 : 1;
+}
\ No newline at end of file
diff --git a/indexer/testUniqueness.cpp b/indexer/UniquenessTests.cpp
similarity index 100%
rename from indexer/testUniqueness.cpp
rename to indexer/UniquenessTests.cpp
-- 
GitLab