Skip to content
Snippets Groups Projects
Commit 97b9eaae authored by Nicholas Yang's avatar Nicholas Yang
Browse files

updating dhts and indexer to use mmdht

parent 94ced410
No related branches found
No related tags found
1 merge request!3Indexer
......@@ -64,6 +64,7 @@ public:
lseek(file, 0, SEEK_SET);
read(file, numKeys, 10);
size = stoll(numKeys);
fileSize = FileSize1(file) - 10;
capacity = floor(fileSize / nodeSize);
}
}
......@@ -119,7 +120,7 @@ public:
}
size++;
lseek(file, 0, SEEK_SET);
string sizeString = to_string(size) + "\n";
string sizeString = to_string(size);
sizeString.resize(10);
write(file, sizeString.c_str(), 10);
return true;
......@@ -135,9 +136,7 @@ public:
lseek(file, loc, SEEK_SET);
char buffer[nodeSize];
pair<string, string> result;
size_t searched = 0;
do {
searched++;
buffer[0] = '\0';
size_t bytes = read(file, buffer, nodeSize);
if(bytes == 0) {
......@@ -145,11 +144,10 @@ public:
read(file, buffer, nodeSize);
}
result = extractKeyValueFromBuffer(buffer);
if(searched == size) {
if(buffer[0] == '\0') {
return "";
}
} while(strcmp(result.first.c_str(), query.c_str()) != 0);
std::cout << searched << std::endl;
return result.second;
}
......@@ -187,7 +185,7 @@ private:
void rehash() {
string tempRehashedFileName = fileName + "_rehashed.txt";
int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR, S_IRWXU);
ssize_t doubledFileSize = fileSize * 2 + 9;
ssize_t doubledFileSize = (fileSize * 2) + 9;
lseek(rehashFile, doubledFileSize, SEEK_SET);
write(rehashFile, "", 1);
fileSize = FileSize1(rehashFile) - 10;
......@@ -206,10 +204,23 @@ private:
size_t newLocation = 10 + (hasher(result.first) % newCapacity) * nodeSize;
lseek(rehashFile, newLocation, SEEK_SET);
char buffer[nodeSize];
bool rewindToStart = true;
while(read(rehashFile, buffer, nodeSize)) {
lseek(rehashFile, -nodeSize, SEEK_CUR);
if(buffer[0] == '\0') {
write(rehashFile, entry, strlen(entry));
rewindToStart = false;
break;
} else {
lseek(rehashFile, nodeSize, SEEK_CUR);
}
}
lseek(rehashFile, 10, SEEK_SET);
while(rewindToStart && read(rehashFile, buffer, nodeSize)) {
lseek(rehashFile, -nodeSize, SEEK_CUR);
if(buffer[0] == '\0') {
write(rehashFile, entry, strlen(entry));
rewindToStart = false;
break;
} else {
lseek(rehashFile, nodeSize, SEEK_CUR);
......@@ -230,4 +241,4 @@ private:
return st.st_size;
}
};
\ No newline at end of file
};
#include <iostream>
#include <vector>
#include <chrono>
#include <cassert>
#include <unordered_map>
#include "MMDiskHashTable.h"
#include "DiskHashTable.h"
using namespace std;
......@@ -15,7 +18,8 @@ string randomString(int length) {
}
int main() {
DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8);
const size_t NUMBER_OF_ELEMENTS = 10000;
DiskHashTable dht = DiskHashTable("DataStructures/DiskHashTable/test1.txt", 10, 8);
vector<pair<string, string>> data;
// data.push_back({"sherlock", "holmes"});
......@@ -56,17 +60,65 @@ int main() {
// data.push_back({"lana del", "rey"});
// data.push_back({"system of", "a down"});
for(int i = 0; i < 5000; i++) {
data.push_back({randomString(rand() % 8 + 3), randomString(rand() % 6 + 3)});
double totalInsertionTime = 0.0;
for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
dht.insert(to_string(i), to_string(i));
auto end = clock();
totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into DHT: " << totalInsertionTime << endl;
cout << "Average insertion time for DHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
for(auto entry : data) {
dht.insert(entry.first, entry.second);
double totalLookupTime = 0.0;
for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
assert(dht.find(to_string(i)) == to_string(i));
auto end = clock();
totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Average lookup time for DHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl;
for(auto entry : data) {
assert(dht.find(entry.first) == entry.second);
MMDiskHashTable mmdht = MMDiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8);
totalInsertionTime = 0.0;
for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
mmdht.insert(to_string(i), to_string(i));
auto end = clock();
totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into MMDHT: " << totalInsertionTime << endl;
cout << "Average insertion time for MMDHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
totalLookupTime = 0.0;
for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
assert(mmdht.find(to_string(i)) == to_string(i));
auto end = clock();
totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Average lookup time for MMDHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl;
unordered_map<string, string> stlTest;
totalInsertionTime = 0.0;
for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
stlTest[to_string(i)] = to_string(i);
auto end = clock();
totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into unordered_map: " << totalInsertionTime << endl;
cout << "Average insertion time for STL unordered_map: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl;
totalLookupTime = 0.0;
for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) {
auto start = clock();
assert(stlTest[to_string(i)] == to_string(i));
auto end = clock();
totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC;
}
cout << "Average lookup time for STL unordered_map: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl;
assert(dht.find("macos") == "");
assert(mmdht.find("macos") == "");
}
//
// Created by nick on 3/23/18.
//
#pragma once
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
using namespace std;
/*
*
* A very simple implementation of a hash table: stored on disk though! :)
*
* This implementation supports only insertion and lookup. Once a key is inserted, one should abstain from inserting
* the same key. There is no error checking for this: if a duplicate key is inserted, it will permanently destroy the
* integrity of the hash table. In addition, one cannot delete key-value pairs from the table.
*
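* The header of the file consists of 10 bytes holding the number of keys in the hash table, written as decimal text
* and padded with NUL bytes. The key and value sizes are not stored in the file: the caller must pass the same
* maximum key and value sizes every time the same file is opened.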
*
*/
/**
 * A fixed-slot hash table whose storage is a memory-mapped file.
 *
 * File layout: a 10-byte header holding the number of stored keys as decimal text padded with NUL bytes,
 * followed by `capacity` slots of (maxKeySize + maxValueSize + 2) bytes each. An occupied slot contains
 * "key\tvalue" padded with NUL bytes; a slot whose first byte is NUL is empty. Collisions are resolved by
 * linear probing that wraps from the last slot back to the first.
 *
 * Limitations: only insertion and lookup are supported. Keys must not contain '\t' or '\0' and values must not
 * contain '\0'. Inserting a key that is already present stores a second, unreachable entry. Slot positions come
 * from std::hash<std::string>, whose values are implementation-defined, so a file is only guaranteed to be
 * readable by a build that hashes strings the same way.
 *
 * The object owns the file descriptor and the mapping; it is movable but not copyable.
 */
class MMDiskHashTable {
public:
    /**
     * Opens the disk hash table stored at `path`, creating it when the file is missing or empty. A new table
     * starts with 1000 bytes of slot storage. Unrecoverable errors (bad sizes, I/O failures, a file too small
     * to be a table) print a message to stderr and terminate the process with exit(1).
     *
     * @param path            File that backs the table.
     * @param maxKeySize_in   Largest key length, in bytes, that may be inserted.
     * @param maxValueSize_in Largest value length, in bytes, that may be inserted.
     */
    MMDiskHashTable(const std::string& path, size_t maxKeySize_in, size_t maxValueSize_in) {
        fileName = path;
        maxKeySize = maxKeySize_in;
        maxValueSize = maxValueSize_in;
        nodeSize = maxKeySize + maxValueSize + 2;  // +1 for the '\t' separator, +1 for a guaranteed NUL
        if (kInitialDataBytes % nodeSize != 0) {
            std::cerr << "The sum of key size + value size + 2 must divide a multiple of 1000!";
            std::exit(1);
        }

        file = open(path.c_str(), O_CREAT | O_RDWR, S_IRWXU);
        if (file < 0) {
            fail("MMDiskHashTable: could not open the table file");
        }
        const ssize_t existingBytes = FileSize1(file);
        if (existingBytes < 0) {
            fail("MMDiskHashTable: could not determine the size of the table file");
        }

        const bool isNewTable = (existingBytes == 0);
        if (isNewTable) {
            capacity = kInitialDataBytes / nodeSize;
            mappedBytes = kHeaderBytes + capacity * nodeSize;
            if (ftruncate(file, static_cast<off_t>(mappedBytes)) != 0) {
                fail("MMDiskHashTable: could not allocate space for a new table file");
            }
        } else {
            if (static_cast<size_t>(existingBytes) < kHeaderBytes + nodeSize) {
                fail("MMDiskHashTable: existing file is too small to be a disk hash table");
            }
            mappedBytes = static_cast<size_t>(existingBytes);
            capacity = (mappedBytes - kHeaderBytes) / nodeSize;
        }

        map = mapFile(file, mappedBytes);
        if (isNewTable) {
            // Write an explicit "0" so the header is well-formed even if nothing is ever inserted.
            writeHeader(map, 0);
        } else {
            size = readHeader(map);
        }
    }

    /** Unmaps the file and closes the descriptor. */
    ~MMDiskHashTable() {
        release();
    }

    // Copying would leave two objects unmapping/closing the same resources.
    MMDiskHashTable(const MMDiskHashTable&) = delete;
    MMDiskHashTable& operator=(const MMDiskHashTable&) = delete;

    /** Takes over the file and mapping of `other`, leaving `other` empty. */
    MMDiskHashTable(MMDiskHashTable&& other) noexcept {
        takeFrom(other);
    }

    /** Releases the current file and mapping, then takes over those of `other`. */
    MMDiskHashTable& operator=(MMDiskHashTable&& other) noexcept {
        if (this != &other) {
            release();
            takeFrom(other);
        }
        return *this;
    }

    /**
     * Inserts a key-value pair into the disk hash table, growing the table first when the load factor has
     * reached 0.75. A key or value longer than the configured maximum prints a message and exits.
     *
     * @param key   Key to store; at most maxKeySize bytes.
     * @param value Value to store; at most maxValueSize bytes.
     * @return true when the pair was stored, false when no empty slot could be found.
     */
    bool insert(const std::string& key, const std::string& value) {
        if (key.size() > maxKeySize) {
            std::cerr << "A key you tried to insert into a disk hash table was larger than the set max key size!";
            std::exit(1);
        }
        if (value.size() > maxValueSize) {
            std::cerr << "A value you tried to insert into a disk hash table was larger than the set max value size!";
            std::exit(1);
        }
        if (capacity != 0 && static_cast<double>(size) / static_cast<double>(capacity) >= 0.75) {
            rehash();
        }

        char* slot = findFreeSlot(map, capacity, key);
        if (slot == nullptr) {
            return false;
        }
        std::string node = key + '\t' + value;
        node.resize(nodeSize);  // pads with NUL bytes up to the slot size
        std::memcpy(slot, node.data(), nodeSize);

        ++size;
        writeHeader(map, size);
        return true;
    }

    /**
     * Looks up the key and returns the value.
     *
     * @param query The key to look up.
     * @return The value corresponding to the key in the hash table. Returns an empty string if not found.
     */
    std::string find(const std::string& query) const {
        if (capacity == 0 || map == nullptr) {
            return "";
        }
        size_t index = hasher(query) % capacity;
        // Bounding the walk by `capacity` guarantees termination even if the table has no empty slot.
        for (size_t probes = 0; probes < capacity; ++probes) {
            const char* node = slotAt(map, index);
            if (*node == '\0') {
                return "";  // an empty slot ends the probe chain
            }
            const std::pair<std::string, std::string> entry = extractKeyValueFromBuffer(node);
            if (entry.first == query) {
                return entry.second;
            }
            index = (index + 1 == capacity) ? 0 : index + 1;
        }
        return "";
    }

private:
    static constexpr size_t kHeaderBytes = 10;         // size of the key-count header
    static constexpr size_t kInitialDataBytes = 1000;  // slot storage of a freshly created table

    int file = -1;             // descriptor of the backing file, -1 when none is owned
    std::string fileName;      // path of the backing file
    char* map = nullptr;       // start of the mapping (header first, then slots)
    size_t mappedBytes = 0;    // length passed to mmap, needed again for munmap
    size_t size = 0;           // number of stored entries
    size_t capacity = 0;       // number of slots
    size_t maxKeySize = 0;
    size_t maxValueSize = 0;
    size_t nodeSize = 0;       // bytes per slot
    std::hash<std::string> hasher;

    /** Prints `message` to stderr and terminates the process, matching the class's other error paths. */
    [[noreturn]] static void fail(const char* message) {
        std::cerr << message << std::endl;
        std::exit(1);
    }

    /** Maps `bytes` bytes of `fd` read/write and shared; exits on failure. */
    static char* mapFile(int fd, size_t bytes) {
        void* region = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (region == MAP_FAILED) {
            fail("MMDiskHashTable: could not memory-map the table file");
        }
        return static_cast<char*>(region);
    }

    /** Writes `count` as NUL-padded decimal text into the 10-byte header at `base`. */
    static void writeHeader(char* base, size_t count) {
        std::string text = std::to_string(count);
        text.resize(kHeaderBytes);
        std::memcpy(base, text.data(), kHeaderBytes);
    }

    /** Parses the key count from the header at `base`; an all-NUL header counts as zero. */
    static size_t readHeader(const char* base) {
        char text[kHeaderBytes + 1];
        std::memcpy(text, base, kHeaderBytes);
        text[kHeaderBytes] = '\0';  // the header itself is not guaranteed to be NUL-terminated
        if (text[0] == '\0') {
            return 0;
        }
        char* end = nullptr;
        const unsigned long long count = std::strtoull(text, &end, 10);
        if (end == text) {
            fail("MMDiskHashTable: the table header does not contain a key count");
        }
        return static_cast<size_t>(count);
    }

    /** Address of slot `index` inside the mapping that starts at `base`. */
    char* slotAt(char* base, size_t index) const {
        return base + kHeaderBytes + index * nodeSize;
    }

    /**
     * Walks the probe sequence for `key` in a table of `slots` slots starting at `base`.
     * @return the first empty slot, or nullptr when every slot is occupied.
     */
    char* findFreeSlot(char* base, size_t slots, const std::string& key) const {
        if (base == nullptr || slots == 0) {
            return nullptr;
        }
        size_t index = hasher(key) % slots;
        for (size_t probes = 0; probes < slots; ++probes) {
            char* candidate = slotAt(base, index);
            if (*candidate == '\0') {
                return candidate;
            }
            index = (index + 1 == slots) ? 0 : index + 1;
        }
        return nullptr;
    }

    /**
     * Splits a slot into its key and value at the first '\t'. Reading never goes past the slot: the text ends
     * at the first NUL byte or at nodeSize bytes, whichever comes first.
     */
    std::pair<std::string, std::string> extractKeyValueFromBuffer(const char* buffer) const {
        const void* terminator = std::memchr(buffer, '\0', nodeSize);
        const size_t length = (terminator != nullptr)
                                  ? static_cast<size_t>(static_cast<const char*>(terminator) - buffer)
                                  : nodeSize;
        const void* tab = std::memchr(buffer, '\t', length);
        if (tab == nullptr) {
            return {std::string(buffer, length), std::string()};
        }
        const size_t keyLength = static_cast<size_t>(static_cast<const char*>(tab) - buffer);
        return {std::string(buffer, keyLength), std::string(buffer + keyLength + 1, length - keyLength - 1)};
    }

    /**
     * Doubles the number of slots: builds the larger table in "<fileName>_rehashed.txt", re-places every
     * occupied slot by its key's hash, then renames the new file over the old one and switches to it.
     */
    void rehash() {
        const std::string tempRehashedFileName = fileName + "_rehashed.txt";
        // O_TRUNC discards leftovers from an earlier, interrupted rehash.
        const int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR | O_TRUNC, S_IRWXU);
        if (rehashFile < 0) {
            fail("MMDiskHashTable: could not create the temporary rehash file");
        }
        const size_t newCapacity = capacity * 2;
        const size_t newBytes = kHeaderBytes + newCapacity * nodeSize;
        if (ftruncate(rehashFile, static_cast<off_t>(newBytes)) != 0) {
            fail("MMDiskHashTable: could not allocate space for the rehashed table");
        }
        char* newMap = mapFile(rehashFile, newBytes);
        writeHeader(newMap, size);

        for (size_t index = 0; index < capacity; ++index) {
            const char* oldNode = slotAt(map, index);
            if (*oldNode == '\0') {
                continue;  // empty slot
            }
            const std::string key = extractKeyValueFromBuffer(oldNode).first;
            char* destination = findFreeSlot(newMap, newCapacity, key);
            if (destination == nullptr) {
                fail("MMDiskHashTable: rehashed table has no free slot");
            }
            std::memcpy(destination, oldNode, nodeSize);  // slots are already padded, copy them verbatim
        }

        munmap(map, mappedBytes);
        close(file);
        // rename() replaces an existing destination, so the old file does not need removing first.
        if (std::rename(tempRehashedFileName.c_str(), fileName.c_str()) != 0) {
            fail("MMDiskHashTable: could not replace the table file with the rehashed one");
        }
        file = rehashFile;
        map = newMap;
        mappedBytes = newBytes;
        capacity = newCapacity;
    }

    /** Unmaps and closes whatever this object currently owns. */
    void release() noexcept {
        if (map != nullptr) {
            munmap(map, mappedBytes);
            map = nullptr;
        }
        if (file >= 0) {
            close(file);
            file = -1;
        }
        mappedBytes = 0;
        capacity = 0;
        size = 0;
    }

    /** Moves every field out of `other` and leaves it owning nothing. */
    void takeFrom(MMDiskHashTable& other) noexcept {
        file = other.file;
        fileName = std::move(other.fileName);
        map = other.map;
        mappedBytes = other.mappedBytes;
        size = other.size;
        capacity = other.capacity;
        maxKeySize = other.maxKeySize;
        maxValueSize = other.maxValueSize;
        nodeSize = other.nodeSize;
        other.file = -1;
        other.map = nullptr;
        other.mappedBytes = 0;
        other.size = 0;
        other.capacity = 0;
    }

    /** Size in bytes of the file behind descriptor `fd`, or -1 when fstat fails. */
    static ssize_t FileSize1(int fd) {
        struct stat st;
        if (fstat(fd, &st) != 0) {
            return -1;
        }
        return st.st_size;
    }
};
......@@ -69,8 +69,7 @@ void Indexer::verbose_run() {
void Indexer::save ( )
{
map< string, vector< size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ) );
DiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8);
MMDiskHashTable seeker(util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + "-seek.txt", 30, 8 );
string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/" + to_string( currentFile ) + ".txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
......@@ -85,15 +84,16 @@ void Indexer::save ( )
// REALLY GROSS HACK
size_t seekOffset = strlen( statsHeader.c_str( ) );
for ( auto word : maps )
for ( auto word : masterDictionary )
{
if(word.first.size() > 30) {
string resized = word.first;
resized.resize(30);
seeker.insert(resized, to_string(seekOffset));
} else {
seeker.insert(word.first, to_string(seekOffset));
}
if(word.first.size() > 30) {
string resized = word.first;
resized.resize(30);
seeker.insert(resized, to_string(seekOffset));
} else {
seeker.insert(word.first, to_string(seekOffset));
}
chunkDictionary[ word.first ].push_back( currentFile );
// string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
......@@ -129,21 +129,8 @@ void Indexer::save ( )
lastOne = location;
}
write( file, "\n", 1 );
seekOffset += 1;
// if(postingsSeekTable.find(word.first) != postingsSeekTable.end()) {
// string offsetLine = "\t";
// for (int i = 0; i < postingsSeekTable[word.first].size(); i++) {
// offsetLine += "<" +
// to_string( postingsSeekTable[word.first][i].realLocation) +
// ", " +
// to_string( postingsSeekTable[word.first][i].offset) +
// "> ";
// }
// offsetLine += "\n";
// write( file, offsetLine.c_str( ), strlen( offsetLine.c_str( ) ) );
// seekOffset += strlen(offsetLine.c_str());
// }
// }
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( ) ) );
......@@ -165,32 +152,20 @@ void Indexer::save ( )
void Indexer::saveChunkDictionary ( )
{
DiskHashTable dhtChunk = DiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168);
for(auto word : chunkDictionary) {
string key = word.first;
if(key.size() > 30) {
key.resize(30);
}
string value = "";
for (auto chunk : word.second) {
value += to_string(chunk) + " ";
MMDiskHashTable dhtChunk = MMDiskHashTable(util::GetCurrentWorkingDir() + "/indexer/output/index-master.txt", 30, 168);
for ( auto word : chunkDictionary )
{
string key = word.first;
if(key.size() > 30) {
key.resize(30);
}
string value = "";
for ( auto chunk : word.second )
{
value += to_string( chunk ) + " ";
}
dhtChunk.insert(key, value);
}
dhtChunk.insert(word.first, value);
}
// string fileName = util::GetCurrentWorkingDir( ) + "/indexer/output/master-index.txt";
//
// int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
// for ( auto word : chunkDictionary )
// {
// string wordDictionary = word.first + " ";
// for ( auto chunk : word.second )
// {
// wordDictionary += to_string( chunk ) + " ";
// }
// wordDictionary += "\n";
// write( file, wordDictionary.c_str( ), strlen( wordDictionary.c_str( ) ) );
// }
// close( file );
}
void Indexer::verbose_save ( )
......
......@@ -5,7 +5,7 @@
#include "../shared/ThreadClass.h"
#include "DocumentEnding.h"
#include "PostingsSeekTableEntry.h"
#include "../DataStructures/DiskHashTable/DiskHashTable.h"
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../util/util.h"
#include <unordered_map>
#include <map>
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment