From 7dea3986a5824fbcf0de1b600acf21b195e4ab36 Mon Sep 17 00:00:00 2001 From: Nicholas Yang <parablank@gmail.com> Date: Wed, 21 Feb 2018 21:18:35 -0500 Subject: [PATCH] allows dictionaries that are local to the document to be merged --- indexer/Indexer.cpp | 16 ++++++--- indexer/Indexer.h | 4 ++- indexer/IndexerTests.cpp | 31 ++++++++++------- indexer/index0.txt | 56 ++++++++++++++++++++++++++++++ indexer/index1.txt | 56 ++++++++++++++++++++++++++++++ indexer/index2.txt | 56 ++++++++++++++++++++++++++++++ indexer/index3.txt | 72 +++++++++++++++++++++++++++++++++++++++ indexer/tests/s-test1.txt | 1 + indexer/tests/s-test2.txt | 2 ++ indexer/tests/s-test3.txt | 2 ++ indexer/tests/s-test4.txt | 1 + 11 files changed, 279 insertions(+), 18 deletions(-) create mode 100755 indexer/index0.txt create mode 100755 indexer/index1.txt create mode 100755 indexer/index2.txt create mode 100755 indexer/index3.txt create mode 100644 indexer/tests/s-test1.txt create mode 100644 indexer/tests/s-test2.txt create mode 100644 indexer/tests/s-test3.txt create mode 100644 indexer/tests/s-test4.txt diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 1f4cbd4..c2e4b97 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -3,26 +3,32 @@ Indexer::Indexer() { indexedCount = 0; currentFile = 0; + totalIndexed = 0; + currentlyIndexed = 0; } void Indexer::run() { while(pointerToDictionaries.Size() != 0) { - if(indexedCount > 100000) { + if(totalIndexed > 5) { save(); reset(); } unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop(); for(auto word : *dictionary) { + indexedCount += word.second.size(); + totalIndexed += word.second.size(); for(auto location : word.second) { - indexedCount++; - masterDictionary[word.first].push_back(location); + masterDictionary[word.first].push_back(currentlyIndexed + location); } } + currentlyIndexed += indexedCount; + indexedCount = 0; } + save(); } void Indexer::save() { - map<string, vector<int> > maps(masterDictionary.begin(), masterDictionary.end()); + map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); string fileName = "index" + to_string(currentFile) + ".txt"; int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); for(auto word : maps) { @@ -40,5 +46,5 @@ void Indexer::save() { void Indexer::reset() { masterDictionary.clear(); - indexedCount = 0; + totalIndexed = 0; } diff --git a/indexer/Indexer.h b/indexer/Indexer.h index 7b0c748..7bd1257 100644 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -25,7 +25,9 @@ class Indexer { private: void save(); void reset(); - unordered_map<string, vector<int> > masterDictionary; + unordered_map<string, vector<size_t> > masterDictionary; size_t indexedCount; size_t currentFile; + size_t totalIndexed; + size_t currentlyIndexed; }; \ No newline at end of file diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp index e50a81b..e64e57a 100644 --- a/indexer/IndexerTests.cpp +++ b/indexer/IndexerTests.cpp @@ -16,39 +16,46 @@ int main() { unordered_map<string, vector<int>> test2; unordered_map<string, vector<int>> test3; unordered_map<string, vector<int>> test4; - ifstream ifstream1("tests/test1.txt"); - ifstream ifstream2("tests/test2.txt"); - ifstream ifstream3("tests/test3.txt"); - ifstream ifstream4("tests/test4.txt"); + ifstream ifstream1("tests/s-test1.txt"); + ifstream ifstream2("tests/s-test2.txt"); + ifstream ifstream3("tests/s-test3.txt"); + ifstream ifstream4("tests/s-test4.txt"); string word = ""; int id = 0; while(ifstream1 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test1[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream2 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test2[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream3 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test3[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream4 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test4[word].push_back(id); - id++; + id++; + } } indexer.pointerToDictionaries.Push(&test1); indexer.pointerToDictionaries.Push(&test2); diff --git a/indexer/index0.txt b/indexer/index0.txt new file mode 100755 index 0000000..14dfe24 --- /dev/null +++ b/indexer/index0.txt @@ -0,0 +1,56 @@ +ai +6 +believe +1 +but +17 +by +8 +can +3 21 +conversational +5 +human +29 +improve +4 +internet +16 +letting +9 +nature +30 +of +28 +on +14 +people +13 +reflect +25 +researchers +0 +say +22 +sometimes +18 +systems +7 20 +talk +11 +that +24 +the +15 26 +them +10 +these +19 +they +2 +things +23 +to +12 +worst +27 diff --git a/indexer/index1.txt b/indexer/index1.txt new file mode 100755 index 0000000..124f3b6 --- /dev/null +++ b/indexer/index1.txt @@ -0,0 +1,56 @@ +a +34 +an +55 +and +58 +by +47 +dictionaries +43 +dictionary +52 +document +51 +each +50 +end +62 +from +39 54 +generated +42 +hello +31 +i +45 +index +56 +indexing +38 +is +33 49 +locally +41 +mean +46 +of +37 57 +quick +35 +runs +59 +starts +53 +test +36 +the +61 +this +32 48 +till +60 +what +44 +with +40 diff --git a/indexer/index2.txt b/indexer/index2.txt new file mode 100755 index 0000000..1146c2a --- /dev/null +++ b/indexer/index2.txt @@ -0,0 +1,56 @@ +all +64 +and +87 +because +90 +bike +81 +bikers +66 +do +69 +down +78 +dry +86 +ever +71 +feel +72 +for +63 +hate +92 +i +91 +its +88 +like +73 +looks +84 +not +89 +on +79 +out +67 +perfectly +85 +seat +82 +shit +74 +sit +77 +that +83 93 +there +68 +when +75 +you +65 70 76 +your +80 diff --git a/indexer/index3.txt b/indexer/index3.txt new file mode 100755 index 0000000..172f889 --- /dev/null +++ b/indexer/index3.txt @@ -0,0 +1,72 @@ +and +111 116 133 +application +110 +changes +125 +code +130 +components +121 +create +99 +data +124 +debug +136 +declarative +126 +design +102 +each +106 +easier +134 +efficiently +114 +for +105 +in +108 +interactive +100 +it +96 +just +118 +make +128 +makes +95 +more +131 +painless +97 +predictable +132 +react +94 112 +render +117 +right +120 +simple +103 +state +107 +the +119 +to +98 135 +uis +101 +update +115 +views +104 127 +when +122 +will +113 +your +109 123 129 diff --git a/indexer/tests/s-test1.txt b/indexer/tests/s-test1.txt new file mode 100644 index 0000000..7b5ddba --- /dev/null +++ b/indexer/tests/s-test1.txt @@ -0,0 +1 @@ +Researchers believe they can improve conversational A.I. systems by letting them talk to people on the internet. But sometimes, these systems can say things that reflect the worst of human nature. \ No newline at end of file diff --git a/indexer/tests/s-test2.txt b/indexer/tests/s-test2.txt new file mode 100644 index 0000000..faa6dae --- /dev/null +++ b/indexer/tests/s-test2.txt @@ -0,0 +1,2 @@ +hello this is a quick test of indexing from 0 with locally generated dictionaries - what I mean by this is +each document dictionary starts from an index of 0 and runs till the end. \ No newline at end of file diff --git a/indexer/tests/s-test3.txt b/indexer/tests/s-test3.txt new file mode 100644 index 0000000..d11ea4f --- /dev/null +++ b/indexer/tests/s-test3.txt @@ -0,0 +1,2 @@ +for all you bikers out there do you ever feel like shit when you sit down on your bike seat that looks perfectly dry +and it's not because i hate that \ No newline at end of file diff --git a/indexer/tests/s-test4.txt b/indexer/tests/s-test4.txt new file mode 100644 index 0000000..1ecba50 --- /dev/null +++ b/indexer/tests/s-test4.txt @@ -0,0 +1 @@ +React makes it painless to create interactive UIs. Design simple views for each state in your application, and React will efficiently update and render just the right components when your data changes. Declarative views make your code more predictable and easier to debug. \ No newline at end of file -- GitLab