From 97b9eaae7df0db5929084c02190c2df1e4fc59a4 Mon Sep 17 00:00:00 2001
From: Nicholas Yang <parablank@gmail.com>
Date: Tue, 27 Mar 2018 23:13:11 -0400
Subject: [PATCH] updating dhts and indexer to use mmdht

---
 DataStructures/DiskHashTable/DiskHashTable.h  |  25 +-
 .../DiskHashTable/DiskHashTableTests.cpp      |  66 +++++-
 .../DiskHashTable/MMDiskHashTable.h           | 216 ++++++++++++++++++
 indexer/Indexer.cpp                           |  75 ++----
 indexer/Indexer.h                             |   2 +-
 5 files changed, 319 insertions(+), 65 deletions(-)
 mode change 100644 => 100755 DataStructures/DiskHashTable/DiskHashTable.h
 mode change 100644 => 100755 DataStructures/DiskHashTable/DiskHashTableTests.cpp
 create mode 100644 DataStructures/DiskHashTable/MMDiskHashTable.h
 mode change 100644 => 100755 indexer/Indexer.cpp
 mode change 100644 => 100755 indexer/Indexer.h

diff --git a/DataStructures/DiskHashTable/DiskHashTable.h b/DataStructures/DiskHashTable/DiskHashTable.h
old mode 100644
new mode 100755
index 1933098..7ca9612
--- a/DataStructures/DiskHashTable/DiskHashTable.h
+++ b/DataStructures/DiskHashTable/DiskHashTable.h
@@ -64,6 +64,7 @@ public:
             lseek(file, 0, SEEK_SET);
             read(file, numKeys, 10);
             size = stoll(numKeys);
+            fileSize = FileSize1(file) - 10;
             capacity = floor(fileSize / nodeSize);
         }
     }
@@ -119,7 +120,7 @@ public:
         }
         size++;
         lseek(file, 0, SEEK_SET);
-        string sizeString = to_string(size) + "\n";
+        string sizeString = to_string(size);
         sizeString.resize(10);
         write(file, sizeString.c_str(), 10);
         return true;
@@ -135,9 +136,7 @@ public:
         lseek(file, loc, SEEK_SET);
         char buffer[nodeSize];
         pair<string, string> result;
-        size_t searched = 0;
         do {
-            searched++;
             buffer[0] = '\0';
             size_t bytes = read(file, buffer, nodeSize);
             if(bytes == 0) {
@@ -145,11 +144,10 @@ public:
                 read(file, buffer, nodeSize);
             }
             result = extractKeyValueFromBuffer(buffer);
-            if(searched == size) {
+            if(buffer[0] == '\0') {
                 return "";
             }
         } while(strcmp(result.first.c_str(), query.c_str()) != 0);
-        std::cout << searched << std::endl;
         return result.second;
     }
 
@@ -187,7 +185,7 @@ private:
     void rehash() {
         string tempRehashedFileName = fileName + "_rehashed.txt";
         int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR, S_IRWXU);
-        ssize_t doubledFileSize = fileSize * 2 + 9;
+        ssize_t doubledFileSize = (fileSize * 2) + 9;
         lseek(rehashFile, doubledFileSize, SEEK_SET);
         write(rehashFile, "", 1);
         fileSize = FileSize1(rehashFile) - 10;
@@ -206,10 +204,23 @@ private:
                 size_t newLocation = 10 + (hasher(result.first) % newCapacity) * nodeSize;
                 lseek(rehashFile, newLocation, SEEK_SET);
                 char buffer[nodeSize];
+                bool rewindToStart = true;
                 while(read(rehashFile, buffer, nodeSize)) {
                     lseek(rehashFile, -nodeSize, SEEK_CUR);
                     if(buffer[0] == '\0') {
                         write(rehashFile, entry, strlen(entry));
+                        rewindToStart = false;
+                        break;
+                    } else {
+                        lseek(rehashFile, nodeSize, SEEK_CUR);
+                    }
+                }
+                lseek(rehashFile, 10, SEEK_SET);
+                while(rewindToStart && read(rehashFile, buffer, nodeSize)) {
+                    lseek(rehashFile, -nodeSize, SEEK_CUR);
+                    if(buffer[0] == '\0') {
+                        write(rehashFile, entry, strlen(entry));
+                        rewindToStart = false;
                         break;
                     } else {
                         lseek(rehashFile, nodeSize, SEEK_CUR);
@@ -230,4 +241,4 @@ private:
         return st.st_size;
     }
 
-};
\ No newline at end of file
+};
diff --git a/DataStructures/DiskHashTable/DiskHashTableTests.cpp b/DataStructures/DiskHashTable/DiskHashTableTests.cpp
old mode 100644
new mode 100755
index afb739a..e7d9a3d
--- a/DataStructures/DiskHashTable/DiskHashTableTests.cpp
+++ b/DataStructures/DiskHashTable/DiskHashTableTests.cpp
@@ -1,6 +1,9 @@
 #include <iostream>
 #include <vector>
+#include <chrono>
 #include <cassert>
+#include <unordered_map>
+#include "MMDiskHashTable.h"
 #include "DiskHashTable.h"
 
 using namespace std;
@@ -15,7 +18,8 @@ string randomString(int length) {
 }
 
 int main() {
-    DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8);
+    const size_t NUMBER_OF_ELEMENTS = 10000;
+    DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test1.txt", 10, 8);
 
     vector<pair<string, string>> data;
 //    data.push_back({"sherlock", "holmes"});
@@ -56,17 +60,65 @@ int main() {
 //    data.push_back({"lana del", "rey"});
 //    data.push_back({"system of", "a down"});
 
-    for(int i = 0; i < 5000; i++) {
-        data.push_back({randomString(rand() % 8 + 3), randomString(rand() % 6 + 3)});
+    double totalInsertionTime = 0.0;
+    for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        dht.insert(to_string(i), to_string(i));
+        auto end = clock();
+        totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
     }
+    cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into DHT: " << totalInsertionTime << endl;
+    cout << "Average insertion time for DHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
 
-    for(auto entry : data) {
-        dht.insert(entry.first, entry.second);
+    double totalLookupTime = 0.0;
+    for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        assert(dht.find(to_string(i)) == to_string(i));
+        auto end = clock();
+        totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
     }
+    cout << "Average lookup time for DHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl;
 
-    for(auto entry : data) {
-        assert(dht.find(entry.first) == entry.second);
+    MMDiskHashTable mmdht = MMDiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8);
+    totalInsertionTime = 0.0;
+    for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        mmdht.insert(to_string(i), to_string(i));
+        auto end = clock();
+        totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
     }
+    cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into MMDHT: " << totalInsertionTime << endl;
+    cout << "Average insertion time for MMDHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
+
+    totalLookupTime = 0.0;
+    for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        assert(mmdht.find(to_string(i)) == to_string(i));
+        auto end = clock();
+        totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
+    }
+    cout << "Average lookup time for MMDHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl;
+
+    unordered_map<string, string> stlTest;
+    totalInsertionTime = 0.0;
+    for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        stlTest[to_string(i)] = to_string(i);
+        auto end = clock();
+        totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
+    }
+    cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into unordered_map: " << totalInsertionTime << endl;
+    cout << "Average insertion time for STL unordered_map: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
+
+    totalLookupTime = 0.0;
+    for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
+        auto start = clock();
+        assert(stlTest[to_string(i)] == to_string(i));
+        auto end = clock();
+        totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
+    }
+    cout << "Average lookup time for STL unordered_map: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl;
 
     assert(dht.find("macos") == "");
+    assert(mmdht.find("macos") == "");
 }
diff --git a/DataStructures/DiskHashTable/MMDiskHashTable.h b/DataStructures/DiskHashTable/MMDiskHashTable.h
new file mode 100644
index 0000000..7b3426b
--- /dev/null
+++ b/DataStructures/DiskHashTable/MMDiskHashTable.h
@@ -0,0 +1,216 @@
+//
+// Created by nick on 3/23/18.
+//
+
+#pragma once
+
+#include <iostream>
+#include <fcntl.h>
+#include <string>
+#include <unistd.h>
+#include <cmath>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+using namespace std;
+
+/*
+ *
+ * A very simple hash table stored on disk and accessed through a memory mapping of its backing file.
+ *
+ * This implementation supports only insertion and lookup. Once a key is inserted, one should abstain from inserting
+ * the same key. There is no error checking for this: if a duplicate key is inserted, it will permanently destroy the
+ * integrity of the hash table. In addition, one cannot delete key-value pairs from the table.
+ *
+ * The file begins with a 10-byte header that stores the number of keys in the hash table as decimal text. The key and
+ * value sizes are not stored in the file; the caller must pass the same sizes every time the table is opened.
+ *
+ */
+
+class MMDiskHashTable {
+
+public:
+
+    /**
+     * Opens (creating if necessary) the file at the path and memory-maps it as a hash table. A missing or empty file
+     * is extended to 1010 bytes: a 10-byte key-count header followed by 1000 bytes of node storage. Exits with
+     * status 1 when 1000 is not a multiple of the node size (maxKeySize_in + maxValueSize_in + 2).
+     *
+     * @param path Path of the backing file.
+     * @param maxKeySize_in Longest key, in bytes, that insert() accepts.
+     * @param maxValueSize_in Longest value, in bytes, that insert() accepts.
+     */
+    MMDiskHashTable(string path, size_t maxKeySize_in, size_t maxValueSize_in) {
+        file = open(path.c_str(), O_CREAT | O_RDWR, S_IRWXU);
+        fileName = path;
+        fileSize = FileSize1(file);
+        maxKeySize = maxKeySize_in;
+        maxValueSize = maxValueSize_in;
+        nodeSize = maxKeySize + maxValueSize + 2;  // key, tab separator, value, and at least one padding NUL
+        if(1000 % nodeSize != 0) {
+            cerr << "The sum of key size + value size + 2 must divide a multiple of 1000!";
+            exit(1);
+        }
+        if(fileSize <= 0) {             // no file, or empty file
+            lseek(file, 1009, SEEK_SET);  // writing one byte at offset 1009 makes the file 1010 bytes long
+            write(file, "", 1);
+            fileSize = FileSize1(file) - 10;
+            capacity = floor(fileSize / nodeSize);
+            size = 0;
+        } else {                        // pre-existing diskhashtable
+            char numKeys[10];
+            lseek(file, 0, SEEK_SET);
+            read(file, numKeys, 10);
+            size = stoll(numKeys);  // NOTE(review): numKeys has no terminator of its own; relies on a NUL inside the header
+            fileSize = FileSize1(file) - 10;
+            capacity = floor(fileSize / nodeSize);
+        }
+        map = (char*) mmap(nullptr, FileSize1(file), PROT_READ | PROT_WRITE, MAP_SHARED, file, 0);  // whole file, header included
+    }
+
+    /**
+     * Inserts a key-value pair, rehashing first once size / capacity has reached 0.75. Exits on oversized input.
+     * @param key Key to store; at most maxKeySize bytes.
+     * @param value Value to store; at most maxValueSize bytes.
+     * @return NOTE(review): declared bool, but the body contains no return statement.
+     */
+    bool insert(string key, string value) {
+        if(key.size() > maxKeySize) {
+            cerr << "A key you tried to insert into a disk hash table was larger than the set max key size!";
+            exit(1);
+        }
+        if(value.size() > maxValueSize) {
+            cerr << "A value you tried to insert into a disk hash table was larger than the set max value size!";
+            exit(1);
+        }
+        if((double) size / capacity >= 0.75) {
+            rehash();
+        }
+
+        size_t loc = 10 + (hasher(key) % capacity) * nodeSize;  // home slot; the 10 skips the header
+        string node = key + '\t' + value;
+        node.resize(nodeSize);  // pad the node to its fixed width
+
+        while(map[loc] != '\0') {  // linear probe: step to the next node until an empty one is found
+            loc += nodeSize;
+            if(loc >= FileSize1(file)) {  // ran off the end of the file: wrap to the first node
+                loc = 10;
+            }
+        }
+
+        for(size_t i = 0; i < nodeSize; i++) {
+            map[loc++] = node[i];
+        }
+
+        size++;
+        string sizeString = to_string(size);
+        sizeString.resize(10);  // fit the count to the 10-byte header
+        for(size_t i = 0; i < 10; i++) {
+            map[i] = sizeString[i];
+        }
+    }
+
+    /**
+     * Looks up the key, probing node by node from its hashed slot, and returns the value.
+     * @param query The key to look up.
+     * @return The value corresponding to the key in the hash table. Returns an empty string if not found.
+     */
+    string find(string query) {
+        size_t loc = 10 + (hasher(query) % capacity) * nodeSize;
+        string key = "";  // NOTE(review): not referenced again in this function
+        char* searchMap = map + loc;
+        while(*searchMap != '\0') {  // an empty node ends the probe
+            auto q = extractKeyValueFromBuffer(searchMap);
+            if(q.first == query) {
+                return q.second;
+            }
+            searchMap += nodeSize;
+            if(searchMap >= map + FileSize1(file)) {  // past the end of the mapping: wrap to the first node
+                searchMap = map + 10;
+            }
+        }
+        return "";
+    }
+
+private:
+
+    int file;  // descriptor of the backing file
+    string fileName;  // path the table was opened with
+    char* map;  // start of the memory-mapped file
+
+    size_t size;  // number of stored keys
+    size_t capacity;  // number of node slots
+    ssize_t fileSize;  // bytes of node storage (file size minus the 10-byte header)
+
+    size_t maxKeySize;
+    size_t maxValueSize;
+    size_t nodeSize;  // bytes per node: maxKeySize + maxValueSize + 2
+
+    std::hash<string> hasher;
+
+    pair<string, string> extractKeyValueFromBuffer(char* buffer) {  // splits a node at its first tab into {key, value}
+        string key = "";
+        string value = "";
+        bool midVal = false;
+        for (int i = 0; i < strlen(buffer); i++) {  // scans up to the first NUL byte
+            if (midVal) {
+                value += buffer[i];
+            } else if (buffer[i] == '\t') {
+                midVal = true;
+            } else {
+                key += buffer[i];
+            }
+        }
+        return {key, value};
+    }
+
+    void rehash() {  // rebuilds the table in a new file with twice the node storage, then swaps it in
+        string tempRehashedFileName = fileName + "_rehashed.txt";
+        int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR, S_IRWXU);
+        ssize_t doubledFileSize = (fileSize * 2) + 9;  // offset of the last byte: header + doubled storage - 1
+        lseek(rehashFile, doubledFileSize, SEEK_SET);
+        write(rehashFile, "", 1);
+        fileSize = FileSize1(rehashFile) - 10;
+        char* newMap = (char*) mmap(nullptr, FileSize1(rehashFile), PROT_READ | PROT_WRITE, MAP_SHARED, rehashFile, 0);
+        size_t newCapacity = floor(doubledFileSize / nodeSize);  // NOTE(review): divides the last-byte offset, not fileSize; confirm intended
+        string sizeString = to_string(size);
+        sizeString.resize(10);
+        for(size_t i = 0; i < 10; i++) {
+            newMap[i] = sizeString[i];
+        }
+        for(int i = 0; i < capacity; i++) {  // visit every slot of the old table
+            size_t oldLocation = 10 + i * nodeSize;
+            pair<string, string> result = extractKeyValueFromBuffer(map + oldLocation);
+            if (result.first != "") {  // skip slots with an empty key
+                size_t newLocation = 10 + (hasher(result.first) % newCapacity) * nodeSize;
+
+                while (newMap[newLocation] != '\0') {
+                    newLocation += nodeSize;
+                    if (newLocation >= FileSize1(rehashFile)) {
+                        newLocation = 10;
+                    }
+                }
+
+                string node = result.first + '\t' + result.second;
+                node.resize(nodeSize);
+                for (int i = 0; i < nodeSize; i++) {  // NOTE(review): this i shadows the outer loop variable
+                    newMap[newLocation++] = node[i];
+                }
+            }
+        }
+        capacity = newCapacity;
+        close(file);  // NOTE(review): no munmap of the old mapping is visible before map is reassigned below
+        remove(fileName.c_str());
+        rename(tempRehashedFileName.c_str(), fileName.c_str());
+        file = rehashFile;
+        map = newMap;
+    }
+
+    ssize_t FileSize1(int file) {  // returns st_size as reported by fstat for the descriptor
+        struct stat st;
+        fstat(file, &st);
+        return st.st_size;
+    }
+
+};
diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp
old mode 100644
new mode 100755
index f78cee7..51eca4c
--- a/indexer/Indexer.cpp
+++ b/indexer/Indexer.cpp
@@ -69,8 +69,7 @@ void Indexer::verbose_run() {
 
 void Indexer::save ( )
 	{
-	map< string, vector< size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ) );
-	DiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8);
+	MMDiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8 );
 	string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + ".txt";
 	int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
 
@@ -85,15 +84,16 @@ void Indexer::save ( )
 	// REALLY GROSS HACK
 	size_t seekOffset = strlen( statsHeader.c_str( ) );
 
-	for ( auto word : maps )
+	for ( auto word : masterDictionary )
 		{
-        if(word.first.size() > 30) {
-            string resized = word.first;
-            resized.resize(30);
-            seeker.insert(resized, to_string(seekOffset));
-        } else {
-            seeker.insert(word.first, to_string(seekOffset));
-        }
+			if(word.first.size() > 30) {
+				string resized = word.first;
+				resized.resize(30);
+				seeker.insert(resized, to_string(seekOffset));
+			} else {
+				seeker.insert(word.first, to_string(seekOffset));
+			}
+
 		chunkDictionary[ word.first ].push_back( currentFile );
 //        string wordBreak = word.first + "\n";
 //        write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
@@ -129,21 +129,8 @@ void Indexer::save ( )
 			lastOne = location;
 			}
 		write( file, "\n", 1 );
-        seekOffset += 1;
-//        if(postingsSeekTable.find(word.first) != postingsSeekTable.end()) {
-//            string offsetLine = "\t";
-//            for (int i = 0; i < postingsSeekTable[word.first].size(); i++) {
-//                offsetLine += "<" +
-//                              to_string( postingsSeekTable[word.first][i].realLocation) +
-//                              ", " +
-//                              to_string( postingsSeekTable[word.first][i].offset) +
-//                              "> ";
-//            }
-//            offsetLine += "\n";
-//            write( file, offsetLine.c_str( ), strlen( offsetLine.c_str( ) ) );
-//            seekOffset += strlen(offsetLine.c_str());
-//        }
-//        }
+		seekOffset += 1;
+		}
 
 	string docEndingHeader = "===Document Endings===\n";
 	write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( ) ) );
@@ -165,32 +152,20 @@ void Indexer::save ( )
 
 void Indexer::saveChunkDictionary ( )
 	{
-    DiskHashTable dhtChunk = DiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168);
-    for(auto word : chunkDictionary) {
-        string key = word.first;
-        if(key.size() > 30) {
-            key.resize(30);
-        }
-		string value = "";
-		for (auto chunk : word.second) {
-			value += to_string(chunk) + " ";
+		MMDiskHashTable dhtChunk = MMDiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168);
+		for ( auto word : chunkDictionary )
+		{
+			string key = word.first;
+			if(key.size() > 30) {
+				key.resize(30);
+			}
+			string value = "";
+		for ( auto chunk : word.second )
+			{
+			value += to_string( chunk ) + " ";
+			}
+			dhtChunk.insert(key, value);
 		}
-		dhtChunk.insert(word.first, value);
-	}
-//	string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/master-index.txt";
-//
-//	int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
-//	for ( auto word : chunkDictionary )
-//		{
-//		string wordDictionary = word.first + " ";
-//		for ( auto chunk : word.second )
-//			{
-//			wordDictionary += to_string( chunk ) + " ";
-//			}
-//		wordDictionary += "\n";
-//		write( file, wordDictionary.c_str( ), strlen( wordDictionary.c_str( ) ) );
-//		}
-//	close( file );
 	}
 
 void Indexer::verbose_save ( )
diff --git a/indexer/Indexer.h b/indexer/Indexer.h
old mode 100644
new mode 100755
index 5d21f7e..376e385
--- a/indexer/Indexer.h
+++ b/indexer/Indexer.h
@@ -5,7 +5,7 @@
 #include "../shared/ThreadClass.h"
 #include "DocumentEnding.h"
 #include "PostingsSeekTableEntry.h"
-#include "../DataStructures/DiskHashTable/DiskHashTable.h"
+#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
 #include "../util/util.h"
 #include <unordered_map>
 #include <map>
-- 
GitLab