From 7dea3986a5824fbcf0de1b600acf21b195e4ab36 Mon Sep 17 00:00:00 2001
From: Nicholas Yang <parablank@gmail.com>
Date: Wed, 21 Feb 2018 21:18:35 -0500
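Subject: [PATCH] Allow document-local dictionaries to be merged

Word positions in each incoming dictionary start at 0 for that document.
Indexer::run() now offsets every position by the number of positions
already indexed before pushing it into the master dictionary, so that
positions from different documents do not collide after merging.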

---
 indexer/Indexer.cpp       | 16 ++++++---
 indexer/Indexer.h         |  4 ++-
 indexer/IndexerTests.cpp  | 31 ++++++++++-------
 indexer/index0.txt        | 56 ++++++++++++++++++++++++++++++
 indexer/index1.txt        | 56 ++++++++++++++++++++++++++++++
 indexer/index2.txt        | 56 ++++++++++++++++++++++++++++++
 indexer/index3.txt        | 72 +++++++++++++++++++++++++++++++++++++++
 indexer/tests/s-test1.txt |  1 +
 indexer/tests/s-test2.txt |  2 ++
 indexer/tests/s-test3.txt |  2 ++
 indexer/tests/s-test4.txt |  1 +
 11 files changed, 279 insertions(+), 18 deletions(-)
 create mode 100755 indexer/index0.txt
 create mode 100755 indexer/index1.txt
 create mode 100755 indexer/index2.txt
 create mode 100755 indexer/index3.txt
 create mode 100644 indexer/tests/s-test1.txt
 create mode 100644 indexer/tests/s-test2.txt
 create mode 100644 indexer/tests/s-test3.txt
 create mode 100644 indexer/tests/s-test4.txt

diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp
index 1f4cbd4..c2e4b97 100644
--- a/indexer/Indexer.cpp
+++ b/indexer/Indexer.cpp
@@ -3,26 +3,32 @@
 Indexer::Indexer() {
     indexedCount = 0;
     currentFile = 0;
+    totalIndexed = 0;
+    currentlyIndexed = 0;
 }
 
 void Indexer::run() {
     while(pointerToDictionaries.Size() != 0) {
-        if(indexedCount > 100000) {
+        if(totalIndexed > 5) {
             save();
             reset();
         }
         unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop();
         for(auto word : *dictionary) {
+            indexedCount += word.second.size();
+            totalIndexed += word.second.size();
             for(auto location : word.second) {
-                indexedCount++;
-                masterDictionary[word.first].push_back(location);
+                masterDictionary[word.first].push_back(currentlyIndexed + location);
             }
         }
+        currentlyIndexed += indexedCount;
+        indexedCount = 0;
     }
+    save();
 }
 
 void Indexer::save() {
-    map<string, vector<int> > maps(masterDictionary.begin(), masterDictionary.end());
+    map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
     string fileName = "index" + to_string(currentFile) + ".txt";
     int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
     for(auto word : maps) {
@@ -40,5 +46,5 @@ void Indexer::save() {
 
 void Indexer::reset() {
     masterDictionary.clear();
-    indexedCount = 0;
+    totalIndexed = 0;
 }
diff --git a/indexer/Indexer.h b/indexer/Indexer.h
index 7b0c748..7bd1257 100644
--- a/indexer/Indexer.h
+++ b/indexer/Indexer.h
@@ -25,7 +25,9 @@ class Indexer {
     private:
         void save();
         void reset();
-        unordered_map<string, vector<int> > masterDictionary;
+        unordered_map<string, vector<size_t> > masterDictionary;
         size_t indexedCount;
         size_t currentFile;
+        size_t totalIndexed;
+        size_t currentlyIndexed;
 };
\ No newline at end of file
diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp
index e50a81b..e64e57a 100644
--- a/indexer/IndexerTests.cpp
+++ b/indexer/IndexerTests.cpp
@@ -16,39 +16,46 @@ int main() {
     unordered_map<string, vector<int>> test2;
     unordered_map<string, vector<int>> test3;
     unordered_map<string, vector<int>> test4;
-    ifstream ifstream1("tests/test1.txt");
-    ifstream ifstream2("tests/test2.txt");
-    ifstream ifstream3("tests/test3.txt");
-    ifstream ifstream4("tests/test4.txt");
+    ifstream ifstream1("tests/s-test1.txt");
+    ifstream ifstream2("tests/s-test2.txt");
+    ifstream ifstream3("tests/s-test3.txt");
+    ifstream ifstream4("tests/s-test4.txt");
     string word = "";
     int id = 0;
     while(ifstream1 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
         word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
-        if(word != "")
+        if(word != "") {
             test1[word].push_back(id);
-        id++;
+            id++;
+        }
     }
+    id = 0;
     while(ifstream2 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
         word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
-        if(word != "")
+        if(word != "") {
             test2[word].push_back(id);
-        id++;
+            id++;
+        }
     }
+    id = 0;
     while(ifstream3 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
         word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
-        if(word != "")
+        if(word != "") {
             test3[word].push_back(id);
-        id++;
+            id++;
+        }
     }
+    id = 0;
     while(ifstream4 >> word) {
         std::transform(word.begin(), word.end(), word.begin(), ::tolower);
         word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
-        if(word != "")
+        if(word != "") {
             test4[word].push_back(id);
-        id++;
+            id++;
+        }
     }
     indexer.pointerToDictionaries.Push(&test1);
     indexer.pointerToDictionaries.Push(&test2);
diff --git a/indexer/index0.txt b/indexer/index0.txt
new file mode 100755
index 0000000..14dfe24
--- /dev/null
+++ b/indexer/index0.txt
@@ -0,0 +1,56 @@
+ai
+6 
+believe
+1 
+but
+17 
+by
+8 
+can
+3 21 
+conversational
+5 
+human
+29 
+improve
+4 
+internet
+16 
+letting
+9 
+nature
+30 
+of
+28 
+on
+14 
+people
+13 
+reflect
+25 
+researchers
+0 
+say
+22 
+sometimes
+18 
+systems
+7 20 
+talk
+11 
+that
+24 
+the
+15 26 
+them
+10 
+these
+19 
+they
+2 
+things
+23 
+to
+12 
+worst
+27 
diff --git a/indexer/index1.txt b/indexer/index1.txt
new file mode 100755
index 0000000..124f3b6
--- /dev/null
+++ b/indexer/index1.txt
@@ -0,0 +1,56 @@
+a
+34 
+an
+55 
+and
+58 
+by
+47 
+dictionaries
+43 
+dictionary
+52 
+document
+51 
+each
+50 
+end
+62 
+from
+39 54 
+generated
+42 
+hello
+31 
+i
+45 
+index
+56 
+indexing
+38 
+is
+33 49 
+locally
+41 
+mean
+46 
+of
+37 57 
+quick
+35 
+runs
+59 
+starts
+53 
+test
+36 
+the
+61 
+this
+32 48 
+till
+60 
+what
+44 
+with
+40 
diff --git a/indexer/index2.txt b/indexer/index2.txt
new file mode 100755
index 0000000..1146c2a
--- /dev/null
+++ b/indexer/index2.txt
@@ -0,0 +1,56 @@
+all
+64 
+and
+87 
+because
+90 
+bike
+81 
+bikers
+66 
+do
+69 
+down
+78 
+dry
+86 
+ever
+71 
+feel
+72 
+for
+63 
+hate
+92 
+i
+91 
+its
+88 
+like
+73 
+looks
+84 
+not
+89 
+on
+79 
+out
+67 
+perfectly
+85 
+seat
+82 
+shit
+74 
+sit
+77 
+that
+83 93 
+there
+68 
+when
+75 
+you
+65 70 76 
+your
+80 
diff --git a/indexer/index3.txt b/indexer/index3.txt
new file mode 100755
index 0000000..172f889
--- /dev/null
+++ b/indexer/index3.txt
@@ -0,0 +1,72 @@
+and
+111 116 133 
+application
+110 
+changes
+125 
+code
+130 
+components
+121 
+create
+99 
+data
+124 
+debug
+136 
+declarative
+126 
+design
+102 
+each
+106 
+easier
+134 
+efficiently
+114 
+for
+105 
+in
+108 
+interactive
+100 
+it
+96 
+just
+118 
+make
+128 
+makes
+95 
+more
+131 
+painless
+97 
+predictable
+132 
+react
+94 112 
+render
+117 
+right
+120 
+simple
+103 
+state
+107 
+the
+119 
+to
+98 135 
+uis
+101 
+update
+115 
+views
+104 127 
+when
+122 
+will
+113 
+your
+109 123 129 
diff --git a/indexer/tests/s-test1.txt b/indexer/tests/s-test1.txt
new file mode 100644
index 0000000..7b5ddba
--- /dev/null
+++ b/indexer/tests/s-test1.txt
@@ -0,0 +1 @@
+Researchers believe they can improve conversational A.I. systems by letting them talk to people on the internet. But sometimes, these systems can say things that reflect the worst of human nature.
\ No newline at end of file
diff --git a/indexer/tests/s-test2.txt b/indexer/tests/s-test2.txt
new file mode 100644
index 0000000..faa6dae
--- /dev/null
+++ b/indexer/tests/s-test2.txt
@@ -0,0 +1,2 @@
+hello this is a quick test of indexing from 0 with locally generated dictionaries - what I mean by this is
+each document dictionary starts from an index of 0 and runs till the end.
\ No newline at end of file
diff --git a/indexer/tests/s-test3.txt b/indexer/tests/s-test3.txt
new file mode 100644
index 0000000..d11ea4f
--- /dev/null
+++ b/indexer/tests/s-test3.txt
@@ -0,0 +1,2 @@
+for all you bikers out there do you ever feel like shit when you sit down on your bike seat that looks perfectly dry
+and it's not because i hate that
\ No newline at end of file
diff --git a/indexer/tests/s-test4.txt b/indexer/tests/s-test4.txt
new file mode 100644
index 0000000..1ecba50
--- /dev/null
+++ b/indexer/tests/s-test4.txt
@@ -0,0 +1 @@
+React makes it painless to create interactive UIs. Design simple views for each state in your application, and React will efficiently update and render just the right components when your data changes. Declarative views make your code more predictable and easier to debug.
\ No newline at end of file
-- 
GitLab