diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp index 1f4cbd42383b23ddfa24c37bf119d8f34701d721..c2e4b970feb3997004b84c13ec480ea6dd1a475d 100644 --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -3,26 +3,32 @@ Indexer::Indexer() { indexedCount = 0; currentFile = 0; + totalIndexed = 0; + currentlyIndexed = 0; } void Indexer::run() { while(pointerToDictionaries.Size() != 0) { - if(indexedCount > 100000) { + if(totalIndexed > 5) { save(); reset(); } unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop(); for(auto word : *dictionary) { + indexedCount += word.second.size(); + totalIndexed += word.second.size(); for(auto location : word.second) { - indexedCount++; - masterDictionary[word.first].push_back(location); + masterDictionary[word.first].push_back(currentlyIndexed + location); } } + currentlyIndexed += indexedCount; + indexedCount = 0; } + save(); } void Indexer::save() { - map<string, vector<int> > maps(masterDictionary.begin(), masterDictionary.end()); + map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end()); string fileName = "index" + to_string(currentFile) + ".txt"; int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU); for(auto word : maps) { @@ -40,5 +46,5 @@ void Indexer::save() { void Indexer::reset() { masterDictionary.clear(); - indexedCount = 0; + totalIndexed = 0; } diff --git a/indexer/Indexer.h b/indexer/Indexer.h index 7b0c74883a4b5337fcd039f3b97d82227b8806d3..7bd12573ee6b9fda8f7522e3d248b9b5df5c3c08 100644 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -25,7 +25,9 @@ class Indexer { private: void save(); void reset(); - unordered_map<string, vector<int> > masterDictionary; + unordered_map<string, vector<size_t> > masterDictionary; size_t indexedCount; size_t currentFile; + size_t totalIndexed; + size_t currentlyIndexed; }; \ No newline at end of file diff --git a/indexer/IndexerTests.cpp b/indexer/IndexerTests.cpp index e50a81b84d55a917edf09480219278373388d20d..e64e57a591902026ff301483bd4cc0c29dd22088 100644 --- a/indexer/IndexerTests.cpp +++ b/indexer/IndexerTests.cpp @@ -16,39 +16,46 @@ int main() { unordered_map<string, vector<int>> test2; unordered_map<string, vector<int>> test3; unordered_map<string, vector<int>> test4; - ifstream ifstream1("tests/test1.txt"); - ifstream ifstream2("tests/test2.txt"); - ifstream ifstream3("tests/test3.txt"); - ifstream ifstream4("tests/test4.txt"); + ifstream ifstream1("tests/s-test1.txt"); + ifstream ifstream2("tests/s-test2.txt"); + ifstream ifstream3("tests/s-test3.txt"); + ifstream ifstream4("tests/s-test4.txt"); string word = ""; int id = 0; while(ifstream1 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test1[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream2 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test2[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream3 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test3[word].push_back(id); - id++; + id++; + } } + id = 0; while(ifstream4 >> word) { std::transform(word.begin(), word.end(), word.begin(), ::tolower); word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end()); - if(word != "") + if(word != "") { test4[word].push_back(id); - id++; + id++; + } } indexer.pointerToDictionaries.Push(&test1); indexer.pointerToDictionaries.Push(&test2); diff --git a/indexer/index0.txt b/indexer/index0.txt new file mode 100755 index 0000000000000000000000000000000000000000..14dfe2426914747b83423267d97656b9463b617f --- /dev/null +++ b/indexer/index0.txt @@ -0,0 +1,56 @@ +ai +6 +believe +1 +but +17 +by +8 +can +3 21 +conversational +5 +human +29 +improve +4 +internet +16 +letting +9 +nature +30 +of +28 +on +14 +people +13 +reflect +25 +researchers +0 +say +22 +sometimes +18 +systems +7 20 +talk +11 +that +24 +the +15 26 +them +10 +these +19 +they +2 +things +23 +to +12 +worst +27 diff --git a/indexer/index1.txt b/indexer/index1.txt new file mode 100755 index 0000000000000000000000000000000000000000..124f3b639cde77e602ad3dd454fb1e73f14769f4 --- /dev/null +++ b/indexer/index1.txt @@ -0,0 +1,56 @@ +a +34 +an +55 +and +58 +by +47 +dictionaries +43 +dictionary +52 +document +51 +each +50 +end +62 +from +39 54 +generated +42 +hello +31 +i +45 +index +56 +indexing +38 +is +33 49 +locally +41 +mean +46 +of +37 57 +quick +35 +runs +59 +starts +53 +test +36 +the +61 +this +32 48 +till +60 +what +44 +with +40 diff --git a/indexer/index2.txt b/indexer/index2.txt new file mode 100755 index 0000000000000000000000000000000000000000..1146c2ad62fdb36d5f0fa651f0a598cc235e3856 --- /dev/null +++ b/indexer/index2.txt @@ -0,0 +1,56 @@ +all +64 +and +87 +because +90 +bike +81 +bikers +66 +do +69 +down +78 +dry +86 +ever +71 +feel +72 +for +63 +hate +92 +i +91 +its +88 +like +73 +looks +84 +not +89 +on +79 +out +67 +perfectly +85 +seat +82 +shit +74 +sit +77 +that +83 93 +there +68 +when +75 +you +65 70 76 +your +80 diff --git a/indexer/index3.txt b/indexer/index3.txt new file mode 100755 index 0000000000000000000000000000000000000000..172f889a490e0d762f157bdb4494baf7190ea7f5 --- /dev/null +++ b/indexer/index3.txt @@ -0,0 +1,72 @@ +and +111 116 133 +application +110 +changes +125 +code +130 +components +121 +create +99 +data +124 +debug +136 +declarative +126 +design +102 +each +106 +easier +134 +efficiently +114 +for +105 +in +108 +interactive +100 +it +96 +just +118 +make +128 +makes +95 +more +131 +painless +97 +predictable +132 +react +94 112 +render +117 +right +120 +simple +103 +state +107 +the +119 +to +98 135 +uis +101 +update +115 +views +104 127 +when +122 +will +113 +your +109 123 129 diff --git a/indexer/tests/s-test1.txt b/indexer/tests/s-test1.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b5ddba89f9be3818cb87e00c849bd69d9854b94 --- /dev/null +++ b/indexer/tests/s-test1.txt @@ -0,0 +1 @@ +Researchers believe they can improve conversational A.I. systems by letting them talk to people on the internet. But sometimes, these systems can say things that reflect the worst of human nature. \ No newline at end of file diff --git a/indexer/tests/s-test2.txt b/indexer/tests/s-test2.txt new file mode 100644 index 0000000000000000000000000000000000000000..faa6dae49313956ba2729b86dcbc026799118762 --- /dev/null +++ b/indexer/tests/s-test2.txt @@ -0,0 +1,2 @@ +hello this is a quick test of indexing from 0 with locally generated dictionaries - what I mean by this is +each document dictionary starts from an index of 0 and runs till the end. \ No newline at end of file diff --git a/indexer/tests/s-test3.txt b/indexer/tests/s-test3.txt new file mode 100644 index 0000000000000000000000000000000000000000..d11ea4f13427ddd1b554089603d98951a5a357fb --- /dev/null +++ b/indexer/tests/s-test3.txt @@ -0,0 +1,2 @@ +for all you bikers out there do you ever feel like shit when you sit down on your bike seat that looks perfectly dry +and it's not because i hate that \ No newline at end of file diff --git a/indexer/tests/s-test4.txt b/indexer/tests/s-test4.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ecba50f53362d5a2966a14fb392c184998d245e --- /dev/null +++ b/indexer/tests/s-test4.txt @@ -0,0 +1 @@ +React makes it painless to create interactive UIs. Design simple views for each state in your application, and React will efficiently update and render just the right components when your data changes. Declarative views make your code more predictable and easier to debug. \ No newline at end of file