From 97b9eaae7df0db5929084c02190c2df1e4fc59a4 Mon Sep 17 00:00:00 2001 From: Nicholas Yang <parablank@gmail.com> Date: Tue, 27 Mar 2018 23:13:11 -0400 Subject: [PATCH] updating dhts and indexer to use mmdht --- DataStructures/DiskHashTable/DiskHashTable.h | 25 +- .../DiskHashTable/DiskHashTableTests.cpp | 66 +++++- .../DiskHashTable/MMDiskHashTable.h | 216 ++++++++++++++++++ indexer/Indexer.cpp | 75 ++---- indexer/Indexer.h | 2 +- 5 files changed, 319 insertions(+), 65 deletions(-) mode change 100644 => 100755 DataStructures/DiskHashTable/DiskHashTable.h mode change 100644 => 100755 DataStructures/DiskHashTable/DiskHashTableTests.cpp create mode 100644 DataStructures/DiskHashTable/MMDiskHashTable.h mode change 100644 => 100755 indexer/Indexer.cpp mode change 100644 => 100755 indexer/Indexer.h diff --git a/DataStructures/DiskHashTable/DiskHashTable.h b/DataStructures/DiskHashTable/DiskHashTable.h old mode 100644 new mode 100755 index 1933098..7ca9612 --- a/DataStructures/DiskHashTable/DiskHashTable.h +++ b/DataStructures/DiskHashTable/DiskHashTable.h @@ -64,6 +64,7 @@ public: lseek(file, 0, SEEK_SET); read(file, numKeys, 10); size = stoll(numKeys); + fileSize = FileSize1(file) - 10; capacity = floor(fileSize / nodeSize); } } @@ -119,7 +120,7 @@ public: } size++; lseek(file, 0, SEEK_SET); - string sizeString = to_string(size) + "\n"; + string sizeString = to_string(size); sizeString.resize(10); write(file, sizeString.c_str(), 10); return true; @@ -135,9 +136,7 @@ public: lseek(file, loc, SEEK_SET); char buffer[nodeSize]; pair<string, string> result; - size_t searched = 0; do { - searched++; buffer[0] = '\0'; size_t bytes = read(file, buffer, nodeSize); if(bytes == 0) { @@ -145,11 +144,10 @@ public: read(file, buffer, nodeSize); } result = extractKeyValueFromBuffer(buffer); - if(searched == size) { + if(buffer[0] == '\0') { return ""; } } while(strcmp(result.first.c_str(), query.c_str()) != 0); - std::cout << searched << std::endl; return result.second; } @@ -187,7 +185,7 @@ private: void rehash() { string tempRehashedFileName = fileName + "_rehashed.txt"; int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR, S_IRWXU); - ssize_t doubledFileSize = fileSize * 2 + 9; + ssize_t doubledFileSize = (fileSize * 2) + 9; lseek(rehashFile, doubledFileSize, SEEK_SET); write(rehashFile, "", 1); fileSize = FileSize1(rehashFile) - 10; @@ -206,10 +204,23 @@ private: size_t newLocation = 10 + (hasher(result.first) % newCapacity) * nodeSize; lseek(rehashFile, newLocation, SEEK_SET); char buffer[nodeSize]; + bool rewindToStart = true; while(read(rehashFile, buffer, nodeSize)) { lseek(rehashFile, -nodeSize, SEEK_CUR); if(buffer[0] == '\0') { write(rehashFile, entry, strlen(entry)); + rewindToStart = false; + break; + } else { + lseek(rehashFile, nodeSize, SEEK_CUR); + } + } + lseek(rehashFile, 10, SEEK_SET); + while(rewindToStart && read(rehashFile, buffer, nodeSize)) { + lseek(rehashFile, -nodeSize, SEEK_CUR); + if(buffer[0] == '\0') { + write(rehashFile, entry, strlen(entry)); + rewindToStart = false; break; } else { lseek(rehashFile, nodeSize, SEEK_CUR); @@ -230,4 +241,4 @@ private: return st.st_size; } -}; \ No newline at end of file +}; diff --git a/DataStructures/DiskHashTable/DiskHashTableTests.cpp b/DataStructures/DiskHashTable/DiskHashTableTests.cpp old mode 100644 new mode 100755 index afb739a..e7d9a3d --- a/DataStructures/DiskHashTable/DiskHashTableTests.cpp +++ b/DataStructures/DiskHashTable/DiskHashTableTests.cpp @@ -1,6 +1,9 @@ #include <iostream> #include <vector> +#include <chrono> #include <cassert> +#include <unordered_map> +#include "MMDiskHashTable.h" #include "DiskHashTable.h" using namespace std; @@ -15,7 +18,8 @@ string randomString(int length) { } int main() { - DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8); + const size_t NUMBER_OF_ELEMENTS = 10000; + DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test1.txt", 10, 8); vector<pair<string, string>> data; // data.push_back({"sherlock", "holmes"}); @@ -56,17 +60,65 @@ int main() { // data.push_back({"lana del", "rey"}); // data.push_back({"system of", "a down"}); - for(int i = 0; i < 5000; i++) { - data.push_back({randomString(rand() % 8 + 3), randomString(rand() % 6 + 3)}); + double totalInsertionTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + dht.insert(to_string(i), to_string(i)); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into DHT: " << totalInsertionTime << endl; + cout << "Average insertion time for DHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; - for(auto entry : data) { - dht.insert(entry.first, entry.second); + double totalLookupTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(dht.find(to_string(i)) == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC; } + cout << "Average lookup time for DHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl; - for(auto entry : data) { - assert(dht.find(entry.first) == entry.second); + MMDiskHashTable mmdht = MMDiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8); + totalInsertionTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + mmdht.insert(to_string(i), to_string(i)); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into MMDHT: " << totalInsertionTime << endl; + cout << "Average insertion time for MMDHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; + + totalLookupTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(mmdht.find(to_string(i)) == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Average lookup time for MMDHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl; + + unordered_map<string, string> stlTest; + totalInsertionTime = 0.0; + for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + stlTest[to_string(i)] = to_string(i); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into unordered_map: " << totalInsertionTime << endl; + cout << "Average insertion time for STL unordered_map: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; + + totalLookupTime = 0.0; + for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(stlTest[to_string(i)] == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Average lookup time for STL unordered_map: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl; assert(dht.find("macos") == ""); + assert(mmdht.find("macos") == ""); } diff --git a/DataStructures/DiskHashTable/MMDiskHashTable.h b/DataStructures/DiskHashTable/MMDiskHashTable.h new file mode 100644 index 0000000..7b3426b --- /dev/null +++ b/DataStructures/DiskHashTable/MMDiskHashTable.h @@ -0,0 +1,216 @@ +// +// Created by nick on 3/23/18. +// + +#pragma once + +#include <iostream> +#include <fcntl.h> +#include <string> +#include <unistd.h> +#include <cmath> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> + +using namespace std; + +/* + * + * A very simple implementation of a hash table: stored on disk though! :) + * + * This implementation supports only insertion and lookup. Once a key is inserted, one should abstain from inserting + * the same key. There is no error checking for this: if a duplicate key is inserted, it will permanently destroy the + * integrity of the hash table. In addition, one cannot delete key-value pairs from the table. + * + * The header of the file consists of 10 bytes. These 10 bytes correspond to the number of keys in the hash table. One + * must manually deduce or preset the key and value size. + * + */ + +class MMDiskHashTable { + +public: + + /** + * Constructs a disk hash table. If the file at the path is not found, or has a file size of 0, it will + * automatically spawn a file for the disk hash table. The initial number of bytes that the hash table spawns with + * is 1000. + * + * @param path + * @param maxKeySize_in + * @param maxValueSize_in + */ + MMDiskHashTable(string path, size_t maxKeySize_in, size_t maxValueSize_in) { + file = open(path.c_str(), O_CREAT | O_RDWR, S_IRWXU); + fileName = path; + fileSize = FileSize1(file); + maxKeySize = maxKeySize_in; + maxValueSize = maxValueSize_in; + nodeSize = maxKeySize + maxValueSize + 2; + if(1000 % nodeSize != 0) { + cerr << "The sum of key size + value size + 2 must divide a multiple of 1000!"; + exit(1); + } + if(fileSize <= 0) { // no file, or empty file + lseek(file, 1009, SEEK_SET); + write(file, "", 1); + fileSize = FileSize1(file) - 10; + capacity = floor(fileSize / nodeSize); + size = 0; + } else { // pre-existing diskhashtable + char numKeys[10]; + lseek(file, 0, SEEK_SET); + read(file, numKeys, 10); + size = stoll(numKeys); + fileSize = FileSize1(file) - 10; + capacity = floor(fileSize / nodeSize); + } + map = (char*) mmap(nullptr, FileSize1(file), PROT_READ | PROT_WRITE, MAP_SHARED, file, 0); + } + + /** + * Inserts a key-value pair into the disk hash table. + * @param key + * @param value + * @return + */ + bool insert(string key, string value) { + if(key.size() > maxKeySize) { + cerr << "A key you tried to insert into a disk hash table was larger than the set max key size!"; + exit(1); + } + if(value.size() > maxValueSize) { + cerr << "A value you tried to insert into a disk hash table was larger than the set max value size!"; + exit(1); + } + if((double) size / capacity >= 0.75) { + rehash(); + } + + size_t loc = 10 + (hasher(key) % capacity) * nodeSize; + string node = key + '\t' + value; + node.resize(nodeSize); + + while(map[loc] != '\0') { + loc += nodeSize; + if(loc >= FileSize1(file)) { + loc = 10; + } + } + + for(size_t i = 0; i < nodeSize; i++) { + map[loc++] = node[i]; + } + + size++; + string sizeString = to_string(size); + sizeString.resize(10); + for(size_t i = 0; i < 10; i++) { + map[i] = sizeString[i]; + } + } + + /** + * Looks up the key and returns the value. + * @param query The key to look up. + * @return The value corresponding to the key in the hash table. Returns an empty string if not found. + */ + string find(string query) { + size_t loc = 10 + (hasher(query) % capacity) * nodeSize; + string key = ""; + char* searchMap = map + loc; + while(*searchMap != '\0') { + auto q = extractKeyValueFromBuffer(searchMap); + if(q.first == query) { + return q.second; + } + searchMap += nodeSize; + if(searchMap >= map + FileSize1(file)) { + searchMap = map + 10; + } + } + return ""; + } + +private: + + int file; + string fileName; + char* map; + + size_t size; + size_t capacity; + ssize_t fileSize; + + size_t maxKeySize; + size_t maxValueSize; + size_t nodeSize; + + std::hash<string> hasher; + + pair<string, string> extractKeyValueFromBuffer(char* buffer) { + string key = ""; + string value = ""; + bool midVal = false; + for (int i = 0; i < strlen(buffer); i++) { + if (midVal) { + value += buffer[i]; + } else if (buffer[i] == '\t') { + midVal = true; + } else { + key += buffer[i]; + } + } + return {key, value}; + } + + void rehash() { + string tempRehashedFileName = fileName + "_rehashed.txt"; + int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR, S_IRWXU); + ssize_t doubledFileSize = (fileSize * 2) + 9; + lseek(rehashFile, doubledFileSize, SEEK_SET); + write(rehashFile, "", 1); + fileSize = FileSize1(rehashFile) - 10; + char* newMap = (char*) mmap(nullptr, FileSize1(rehashFile), PROT_READ | PROT_WRITE, MAP_SHARED, rehashFile, 0); + size_t newCapacity = floor(doubledFileSize / nodeSize); + string sizeString = to_string(size); + sizeString.resize(10); + for(size_t i = 0; i < 10; i++) { + newMap[i] = sizeString[i]; + } + for(int i = 0; i < capacity; i++) { + size_t oldLocation = 10 + i * nodeSize; + pair<string, string> result = extractKeyValueFromBuffer(map + oldLocation); + if (result.first != "") { + size_t newLocation = 10 + (hasher(result.first) % newCapacity) * nodeSize; + + while (newMap[newLocation] != '\0') { + newLocation += nodeSize; + if (newLocation >= FileSize1(rehashFile)) { + newLocation = 10; + } + } + + string node = result.first + '\t' + result.second; + node.resize(nodeSize); + for (int i = 0; i < nodeSize; i++) { + newMap[newLocation++] = node[i]; + } + } + } + capacity = newCapacity; + close(file); + remove(fileName.c_str()); + rename(tempRehashedFileName.c_str(), fileName.c_str()); + file = rehashFile; + map = newMap; + } + + ssize_t FileSize1(int file) { + struct stat st; + fstat(file, &st); + return st.st_size; + } + +}; diff --git a/indexer/Indexer.cpp b/indexer/Indexer.cpp old mode 100644 new mode 100755 index f78cee7..51eca4c --- a/indexer/Indexer.cpp +++ b/indexer/Indexer.cpp @@ -69,8 +69,7 @@ void Indexer::verbose_run() { void Indexer::save ( ) { - map< string, vector< size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ) ); - DiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8); + MMDiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8 ); string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + ".txt"; int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU ); @@ -85,15 +84,16 @@ void Indexer::save ( ) // REALLY GROSS HACK size_t seekOffset = strlen( statsHeader.c_str( ) ); - for ( auto word : maps ) + for ( auto word : masterDictionary ) { - if(word.first.size() > 30) { - string resized = word.first; - resized.resize(30); - seeker.insert(resized, to_string(seekOffset)); - } else { - seeker.insert(word.first, to_string(seekOffset)); - } + if(word.first.size() > 30) { + string resized = word.first; + resized.resize(30); + seeker.insert(resized, to_string(seekOffset)); + } else { + seeker.insert(word.first, to_string(seekOffset)); + } + chunkDictionary[ word.first ].push_back( currentFile ); // string wordBreak = word.first + "\n"; // write(file, wordBreak.c_str(), strlen(wordBreak.c_str())); @@ -129,21 +129,8 @@ void Indexer::save ( ) lastOne = location; } write( file, "\n", 1 ); - seekOffset += 1; -// if(postingsSeekTable.find(word.first) != postingsSeekTable.end()) { -// string offsetLine = "\t"; -// for (int i = 0; i < postingsSeekTable[word.first].size(); i++) { -// offsetLine += "<" + -// to_string( postingsSeekTable[word.first][i].realLocation) + -// ", " + -// to_string( postingsSeekTable[word.first][i].offset) + -// "> "; -// } -// offsetLine += "\n"; -// write( file, offsetLine.c_str( ), strlen( offsetLine.c_str( ) ) ); -// seekOffset += strlen(offsetLine.c_str()); -// } -// } + seekOffset += 1; + } string docEndingHeader = "===Document Endings===\n"; write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( ) ) ); @@ -165,32 +152,20 @@ void Indexer::save ( ) void Indexer::saveChunkDictionary ( ) { - DiskHashTable dhtChunk = DiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168); - for(auto word : chunkDictionary) { - string key = word.first; - if(key.size() > 30) { - key.resize(30); - } - string value = ""; - for (auto chunk : word.second) { - value += to_string(chunk) + " "; + MMDiskHashTable dhtChunk = MMDiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168); + for ( auto word : chunkDictionary ) + { + string key = word.first; + if(key.size() > 30) { + key.resize(30); + } + string value = ""; + for ( auto chunk : word.second ) + { + value += to_string( chunk ) + " "; + } + dhtChunk.insert(key, value); } - dhtChunk.insert(word.first, value); - } -// string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/master-index.txt"; -// -// int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU ); -// for ( auto word : chunkDictionary ) -// { -// string wordDictionary = word.first + " "; -// for ( auto chunk : word.second ) -// { -// wordDictionary += to_string( chunk ) + " "; -// } -// wordDictionary += "\n"; -// write( file, wordDictionary.c_str( ), strlen( wordDictionary.c_str( ) ) ); -// } -// close( file ); } void Indexer::verbose_save ( ) diff --git a/indexer/Indexer.h b/indexer/Indexer.h old mode 100644 new mode 100755 index 5d21f7e..376e385 --- a/indexer/Indexer.h +++ b/indexer/Indexer.h @@ -5,7 +5,7 @@ #include "../shared/ThreadClass.h" #include "DocumentEnding.h" #include "PostingsSeekTableEntry.h" -#include "../DataStructures/DiskHashTable/DiskHashTable.h" +#include "../DataStructures/DiskHashTable/MMDiskHashTable.h" #include "../util/util.h" #include <unordered_map> #include <map> -- GitLab