From 2bcbf5e610171d7548c3d21c0559706156ce25c6 Mon Sep 17 00:00:00 2001 From: Nicholas Yang <parablank@gmail.com> Date: Thu, 29 Mar 2018 01:20:01 -0400 Subject: [PATCH] adding data structs to constraint solver --- .DS_Store | Bin 8196 -> 0 bytes DataStructures/DiskHashTable/DiskHashTable.h | 244 +++++++++++++++++ .../DiskHashTable/DiskHashTableTests.cpp | 124 +++++++++ .../DiskHashTable/MMDiskHashTable.h | 216 +++++++++++++++ DataStructures/HashTable/HashTable.h | 228 ++++++++++++++++ DataStructures/HashTable/HashTableTests.cpp | 20 ++ DataStructures/Vector/Vector.h | 248 ++++++++++++++++++ DataStructures/Vector/VectorTests.cpp | 28 ++ 8 files changed, 1108 insertions(+) delete mode 100644 .DS_Store create mode 100644 DataStructures/DiskHashTable/DiskHashTable.h create mode 100644 DataStructures/DiskHashTable/DiskHashTableTests.cpp create mode 100644 DataStructures/DiskHashTable/MMDiskHashTable.h create mode 100644 DataStructures/HashTable/HashTable.h create mode 100644 DataStructures/HashTable/HashTableTests.cpp create mode 100644 DataStructures/Vector/Vector.h create mode 100644 DataStructures/Vector/VectorTests.cpp diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 407ff4f142413934bf7b250f292da7433a0250ce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMU2GLa6rS(4g<ZRFw-u#@yW!R<MN**rwME2S+VWel$St(|NZEULp)1?nvb*=T zLWl*8#)KH-UyMEw4M>z1HTVc9`T**Su@8cYPiW$c(Fc6ML_IURm(Z5_Xh^z~?3|f% z=A4=N_M4raTgDjL3dT~#Y8Yb@JzT1lRNbX;JAWpWM5L)83G!zwU>?(%#T;tIB}O1d zAVwfYAVwfY;9f+4?rdJ9N$&m78rLxbF#=N(0sehR)5B#Zk|UuMpAM=-3P7@))S^(E z@&MtJj$|g1BcT+o6f?{o5V;~sF%Y_wJ<W%c%tUe|l%YFA=*|$$j3`5an4SF6VmL!4 zlyMy+5F;=V0iL_7EYDoZ+5Iv3dzN%ID>9RL2Mj--b6uOePZ#OELF!>{$_@F0Ior*V z<z{Zsw)~)}=?5gr$}6T#m*o2@<;t0r^3&OVcPJa^LBVg+y*@RSwG1<-dp(vB^mm)i zI@>IoPM5bu4@_?lAKXo|(SWe6=XP4=P@g)h-E{)pa!gOe3uEl<u^hu4YI6&Y;fwK6 zaM9kT2mKUBs?Y2n;j3q|B=@N`M~^OBxw0|6Y-RJY#`MvamZrw^isjA6j>(c#+pvE7 
zzQT!D-g)=*nGeoh{!(}`M4ug274~n}Ur}ouB(`5{4<SihEkSbq4oH+4+UCy9Ta+hz zt-R}Gts`cin)GSYo2a|ZGmEs@p1f-p1{}Xnt;*|mzF-qFYi*kkCSB6cd#-J7bA2nY zTxVBrU=9bRKx0h3yM5OS+C{KFRW11Dc9UXRMYDq<QmAS^Th|~D<L@**pJaWeDcqiX zZszQH4=-4>tmW~{y3wRsRek>~McGZbJArk;GQAx``~~jP{k-WImUFPDX!%yoHoF~t zz^p=}ERQB@YUj*NY1(8%dkZ=1aKQ?mRqG{bv|6o7Ds@`DtbQi#HE4xfU4zeL2#Z#y zHAvDPu5b5DstvKIM;B^Jw@=8Zx<nJwG8ZpVl<q^s3)SaKAkFd>inhbnbEd5tvLsZk zs}!xL7;ahZjFf9;DOzvd9T=c3`lRCv(%Q5RNxdTN;}dVu{h({mbZG85HML1v(nd+z z#Vu<M!!)G&5;N*E3&N%VLst0(r?L60l`gbyc7P7oadwiOV&~Zfc9DI}zGpwN8|*js zC;N;2&He#{j3jDNhlOawA}mG=TJacGV<Vozc4X0qehk3HAP(aQp2KmR!1FkX*YF13 z#9KIpv-l7n;T+E60xsebuHY)Z!MFGkH}Erl!SDD3w-RLuDItezO_XTfRoX_R=PANO znnx!;@#@`ap4zZ+!zQKuo~QY>nIX3ycyM<5k%dcEta_sDHV4MN^gGEXI2V#ikgY@} z@xsTlH#`K&ywU*~e;RfWSIuFG9~pJ}=-k>lHFXcAQYvSiB)an(sGFcuA&v4_N@HrV zOmtVvL^rimSuPQ=)r!#CthCBReYHHau2$Aa#Bg<5Xw{UpvaB)@x4j{ysF37IdHbKN z{f=E{zp|U`7O}MgGf<6rSb!!pBZKwWfX(Q{R_sJCc4H6r!o)#Xcm{_MpnxF^6K|i# z3wRMP;br3O>w>p$<2}5O(>O!?{TQF%Q^DU$_#9v0GVym5*T%86b1YkNeIi?nS<kUu z=O9^193Sg61$$-To@@BwUN`yg|5Ff|c*`*YQwIT*cVs)-Xup$v8F{Xqre{Aryzp@& vp%gAu@gZC<(s7cv|6xe&bY#m!awL>wq4K}~5HRu2-1z>F?|;F?sZ{+7V;5h9 diff --git a/DataStructures/DiskHashTable/DiskHashTable.h b/DataStructures/DiskHashTable/DiskHashTable.h new file mode 100644 index 0000000..916cbf2 --- /dev/null +++ b/DataStructures/DiskHashTable/DiskHashTable.h @@ -0,0 +1,244 @@ +// +// Created by nick on 3/23/18. +// + +#pragma once + +#include <iostream> +#include <fcntl.h> +#include <string> +#include <unistd.h> +#include <cmath> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> + +using namespace std; + +/* + * + * A very simple implementation of a hash table: stored on disk though! :) + * + * This implementation supports only insertion and lookup. Once a key is inserted, one should abstain from inserting + * the same key. There is no error checking for this: if a duplicate key is inserted, it will permanently destroy the + * integrity of the hash table. 
/*
 * DiskHashTable: a very simple hash table that lives in a file on disk.
 *
 * Layout of the backing file:
 *   - a 10-byte header holding the number of stored keys as decimal text, NUL padded;
 *   - `capacity` fixed-size records of `nodeSize` (= max key size + max value size + 2) bytes.
 *     An occupied record is "key\tvalue" padded with NUL bytes; a record whose first byte is
 *     NUL is empty. Collisions are resolved by probing the following records, wrapping around
 *     to the first record after the last one.
 *
 * Only insertion and lookup are supported; key-value pairs cannot be deleted. Duplicate keys
 * are not detected, so callers must not insert the same key twice. Keys must not contain '\t'
 * or '\0' and values must not contain '\0', otherwise a record cannot be parsed back.
 * The key and value sizes are not stored in the file: the caller must pass the same sizes
 * every time an existing file is reopened.
 *
 * NOTE(review): slots are chosen with std::hash<std::string>, which the standard only requires
 * to be stable within one execution of a program; confirm this is acceptable for files that
 * outlive the process that wrote them.
 */
class DiskHashTable {

public:

    /**
     * Opens the disk hash table stored at `path`, creating it when the file does not exist or
     * is empty. A new table starts with 1000 bytes of record space after the 10-byte header.
     * Terminates the process (exit code 1) when the sizes are invalid or the file is unusable.
     *
     * @param path             file that backs the table
     * @param maxKeySize_in    largest key length, in bytes, that may be inserted
     * @param maxValueSize_in  largest value length, in bytes, that may be inserted
     */
    DiskHashTable(std::string path, size_t maxKeySize_in, size_t maxValueSize_in)
        : file(-1), fileName(path), size(0), capacity(0), fileSize(0),
          maxKeySize(maxKeySize_in), maxValueSize(maxValueSize_in),
          nodeSize(maxKeySize_in + maxValueSize_in + 2) {
        if(1000 % nodeSize != 0) {
            fail("The sum of key size + value size + 2 must divide a multiple of 1000!");
        }
        file = open(fileName.c_str(), O_CREAT | O_RDWR, S_IRWXU);
        if(file < 0) {
            fail("Could not open or create the disk hash table file!");
        }
        if(FileSize1(file) <= 0) { // no file, or empty file
            // ftruncate zero-fills, so every record starts out empty; the header is written
            // immediately so that reopening a never-used table finds a parsable count.
            if(ftruncate(file, HEADER_SIZE + INITIAL_DATA_SIZE) != 0 || !writeHeader(file, 0)) {
                fail("Could not initialise the disk hash table file!");
            }
        } else { // pre-existing diskhashtable
            size = readHeader(file);
        }
        fileSize = FileSize1(file) - HEADER_SIZE;
        capacity = fileSize > 0 ? static_cast<size_t>(fileSize) / nodeSize : 0;
        if(capacity == 0) {
            fail("The disk hash table file is too small to hold a single entry!");
        }
    }

    /** Takes over the descriptor of `other`, which is left without an open file. */
    DiskHashTable(DiskHashTable&& other)
        : file(other.file), fileName(other.fileName), size(other.size), capacity(other.capacity),
          fileSize(other.fileSize), maxKeySize(other.maxKeySize), maxValueSize(other.maxValueSize),
          nodeSize(other.nodeSize) {
        other.file = -1;
    }

    // Two objects sharing one descriptor would close it twice, so copying is not allowed.
    DiskHashTable(const DiskHashTable&) = delete;
    DiskHashTable& operator=(const DiskHashTable&) = delete;

    ~DiskHashTable() {
        if(file >= 0) {
            close(file);
        }
    }

    /**
     * Inserts a key-value pair into the disk hash table, growing the file first when the load
     * factor has reached 0.75. Terminates the process when the key or value is too long.
     *
     * @param key    key to store; must not already be present
     * @param value  value to associate with the key
     * @return true when the pair was stored, false when no free record could be written
     */
    bool insert(std::string key, std::string value) {
        if(key.size() > maxKeySize) {
            fail("A key you tried to insert into a disk hash table was larger than the set max key size!");
        }
        if(value.size() > maxValueSize) {
            fail("A value you tried to insert into a disk hash table was larger than the set max value size!");
        }
        if(static_cast<double>(size) / capacity >= 0.75) {
            rehash(); // on failure the old table is still intact; keep using it
        }
        std::string node = key + '\t' + value;
        node.resize(nodeSize); // NUL padding marks the end of the value
        if(!placeNode(file, hasher(key) % capacity, capacity, node)) {
            return false;
        }
        size++;
        writeHeader(file, size);
        return true;
    }

    /**
     * Looks up the key and returns the value.
     * @param query The key to look up.
     * @return The value corresponding to the key in the hash table. Returns an empty string if not found.
     */
    std::string find(std::string query) {
        std::string record(nodeSize, '\0');
        const size_t firstSlot = hasher(query) % capacity;
        // At most `capacity` probes: terminates even if the table is completely full.
        for(size_t probe = 0; probe < capacity; probe++) {
            if(!readRecord(file, (firstSlot + probe) % capacity, record) || record[0] == '\0') {
                return ""; // an empty record ends the probe sequence
            }
            const std::pair<std::string, std::string> entry = extractKeyValueFromBuffer(&record[0]);
            if(entry.first == query) {
                return entry.second;
            }
        }
        return "";
    }

private:

    enum { HEADER_SIZE = 10, INITIAL_DATA_SIZE = 1000 };

    int file;
    std::string fileName;

    size_t size;      // number of stored keys
    size_t capacity;  // number of records in the file
    ssize_t fileSize; // bytes of record space, i.e. file size minus the header

    size_t maxKeySize;
    size_t maxValueSize;
    size_t nodeSize;

    std::hash<std::string> hasher;

    /** Prints `message` to stderr and terminates the process with exit code 1. */
    static void fail(const char* message) {
        std::cerr << message;
        exit(1);
    }

    /** Byte offset of record number `slot` inside the file. */
    off_t slotOffset(size_t slot) const {
        return static_cast<off_t>(HEADER_SIZE + slot * nodeSize);
    }

    /** Reads record `slot` of `fd` into `record` (which must hold nodeSize bytes). */
    bool readRecord(int fd, size_t slot, std::string& record) const {
        return pread(fd, &record[0], nodeSize, slotOffset(slot)) == static_cast<ssize_t>(nodeSize);
    }

    /** Stores `count` in the 10-byte header of `fd` as NUL-padded decimal text. */
    bool writeHeader(int fd, size_t count) const {
        std::string text = std::to_string(count);
        text.resize(HEADER_SIZE);
        return pwrite(fd, text.data(), HEADER_SIZE, 0) == HEADER_SIZE;
    }

    /** Parses the key count from the header of `fd`; an empty or unreadable header counts as 0. */
    size_t readHeader(int fd) const {
        char text[HEADER_SIZE] = {};
        const ssize_t got = pread(fd, text, HEADER_SIZE, 0);
        size_t count = 0;
        for(ssize_t i = 0; i < got && text[i] >= '0' && text[i] <= '9'; i++) {
            count = count * 10 + static_cast<size_t>(text[i] - '0');
        }
        return count;
    }

    /**
     * Writes `node` (nodeSize bytes) into the first empty record of `fd`, probing linearly from
     * `firstSlot` and wrapping around. Returns false on I/O failure or when no record is free.
     */
    bool placeNode(int fd, size_t firstSlot, size_t slotCount, const std::string& node) const {
        for(size_t probe = 0; probe < slotCount; probe++) {
            const off_t offset = slotOffset((firstSlot + probe) % slotCount);
            char firstByte = '\0';
            if(pread(fd, &firstByte, 1, offset) != 1) {
                return false;
            }
            if(firstByte == '\0') {
                return pwrite(fd, node.data(), node.size(), offset) == static_cast<ssize_t>(node.size());
            }
        }
        return false;
    }

    /** Splits a NUL-terminated "key\tvalue" record into its key and value. */
    std::pair<std::string, std::string> extractKeyValueFromBuffer(char* buffer) {
        std::string key;
        std::string value;
        bool midVal = false;
        for(size_t i = 0; i < nodeSize && buffer[i] != '\0'; i++) {
            if(midVal) {
                value += buffer[i];
            } else if(buffer[i] == '\t') {
                midVal = true;
            } else {
                key += buffer[i];
            }
        }
        return {key, value};
    }

    /**
     * Doubles the record space: builds the larger table in "<fileName>_rehashed.txt", re-inserts
     * every stored record and then renames the new file over the old one. The current table is
     * left untouched when anything fails.
     */
    bool rehash() {
        const std::string tempRehashedFileName = fileName + "_rehashed.txt";
        // O_TRUNC: a stale temp file left by an interrupted run must not leak old records.
        const int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR | O_TRUNC, S_IRWXU);
        if(rehashFile < 0) {
            return false;
        }
        const ssize_t newDataSize = fileSize * 2;
        const size_t newCapacity = static_cast<size_t>(newDataSize) / nodeSize;
        bool ok = ftruncate(rehashFile, HEADER_SIZE + newDataSize) == 0 && writeHeader(rehashFile, size);

        std::string record(nodeSize, '\0');
        for(size_t slot = 0; ok && slot < capacity; slot++) {
            ok = readRecord(file, slot, record);
            if(ok && record[0] != '\0') {
                const std::string key = extractKeyValueFromBuffer(&record[0]).first;
                ok = placeNode(rehashFile, hasher(key) % newCapacity, newCapacity, record);
            }
        }
        // rename() replaces the destination atomically, so the old file needs no separate removal.
        if(!ok || rename(tempRehashedFileName.c_str(), fileName.c_str()) != 0) {
            close(rehashFile);
            unlink(tempRehashedFileName.c_str());
            return false;
        }
        close(file);
        file = rehashFile;
        fileSize = newDataSize;
        capacity = newCapacity;
        return true;
    }

    /** Size of the file behind descriptor `file` in bytes, or -1 when it cannot be determined. */
    ssize_t FileSize1(int file) {
        struct stat st;
        if(fstat(file, &st) != 0) {
            return -1;
        }
        return st.st_size;
    }

};
data.push_back({"anderson", "paak"}); +// data.push_back({"dunder", "mifflin"}); +// data.push_back({"university", "michigan"}); +// data.push_back({"abel", "tesfaye"}); +// data.push_back({"vince", "staples"}); +// data.push_back({"danny", "brown"}); +// data.push_back({"ann", "arbor"}); +// data.push_back({"tame", "impala"}); +// data.push_back({"machine", "learning"}); +// data.push_back({"north", "face"}); +// data.push_back({"eecs", "398"}); +// data.push_back({"intel", "corei7"}); +// data.push_back({"constraint", "solver"}); +// data.push_back({"multi", "threaded"}); +// data.push_back({"march", "madness"}); +// data.push_back({"sister", "nation"}); +// data.push_back({"daft", "punk"}); +// data.push_back({"the god", "anddevil"}); +// data.push_back({"are raging", "insideme"}); +// data.push_back({"hiatus", "kaiyote"}); +// data.push_back({"jai", "wolf"}); +// data.push_back({"griz", "psgfy"}); +// data.push_back({"stack", "overflow"}); +// data.push_back({"carpenter", "brut"}); +// data.push_back({"harry", "potter"}); +// data.push_back({"fall out", "boy"}); +// data.push_back({"red hot", "chili"}); +// data.push_back({"after", "laughter"}); +// data.push_back({"carly rae", "jepsen"}); +// data.push_back({"lana del", "rey"}); +// data.push_back({"system of", "a down"}); + + double totalInsertionTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + dht.insert(to_string(i), to_string(i)); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into DHT: " << totalInsertionTime << endl; + cout << "Average insertion time for DHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; + + double totalLookupTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(dht.find(to_string(i)) == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) 
CLOCKS_PER_SEC; + } + cout << "Average lookup time for DHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl; + + MMDiskHashTable mmdht = MMDiskHashTable("DataStructures/DiskHashTable/test.txt", 10, 8); + totalInsertionTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + mmdht.insert(to_string(i), to_string(i)); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into MMDHT: " << totalInsertionTime << endl; + cout << "Average insertion time for MMDHT: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; + + totalLookupTime = 0.0; + for(size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(mmdht.find(to_string(i)) == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Average lookup time for MMDHT: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl << endl; + + unordered_map<string, string> stlTest; + totalInsertionTime = 0.0; + for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + stlTest[to_string(i)] = to_string(i); + auto end = clock(); + totalInsertionTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Total time to insert " << NUMBER_OF_ELEMENTS << " key-value pairs into unordered_map: " << totalInsertionTime << endl; + cout << "Average insertion time for STL unordered_map: " << totalInsertionTime / NUMBER_OF_ELEMENTS << endl; + + totalLookupTime = 0.0; + for (size_t i = 0; i < NUMBER_OF_ELEMENTS; i++) { + auto start = clock(); + assert(stlTest[to_string(i)] == to_string(i)); + auto end = clock(); + totalLookupTime += (end - start) / (double) CLOCKS_PER_SEC; + } + cout << "Average lookup time for STL unordered_map: " << totalLookupTime / NUMBER_OF_ELEMENTS << endl; + + assert(dht.find("macos") == ""); + assert(mmdht.find("macos") == ""); +} diff --git 
// File: DataStructures/DiskHashTable/MMDiskHashTable.h
//
// Created by nick on 3/23/18.
//

#pragma once

#include <iostream>
#include <fcntl.h>
#include <string>
#include <unistd.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <utility>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>

using namespace std;

/*
 * MMDiskHashTable: the same on-disk hash table as DiskHashTable, accessed through a shared
 * memory mapping of the file instead of read/write calls.
 *
 * Layout of the backing file:
 *   - a 10-byte header holding the number of stored keys as decimal text, NUL padded;
 *   - `capacity` fixed-size records of `nodeSize` (= max key size + max value size + 2) bytes.
 *     An occupied record is "key\tvalue" padded with NUL bytes; a record whose first byte is
 *     NUL is empty. Collisions are resolved by probing the following records, wrapping around.
 *
 * Only insertion and lookup are supported; key-value pairs cannot be deleted. Duplicate keys
 * are not detected, so callers must not insert the same key twice. Keys must not contain '\t'
 * or '\0' and values must not contain '\0'. The key and value sizes are not stored in the
 * file: the caller must pass the same sizes every time an existing file is reopened.
 */
class MMDiskHashTable {

public:

    /**
     * Opens and maps the disk hash table stored at `path`, creating it when the file does not
     * exist or is empty. A new table starts with 1000 bytes of record space after the 10-byte
     * header. Terminates the process (exit code 1) when the sizes are invalid or the file
     * cannot be opened or mapped.
     *
     * @param path             file that backs the table
     * @param maxKeySize_in    largest key length, in bytes, that may be inserted
     * @param maxValueSize_in  largest value length, in bytes, that may be inserted
     */
    MMDiskHashTable(std::string path, size_t maxKeySize_in, size_t maxValueSize_in)
        : file(-1), fileName(path), map(nullptr), size(0), capacity(0), fileSize(0),
          maxKeySize(maxKeySize_in), maxValueSize(maxValueSize_in),
          nodeSize(maxKeySize_in + maxValueSize_in + 2) {
        if(1000 % nodeSize != 0) {
            fail("The sum of key size + value size + 2 must divide a multiple of 1000!");
        }
        file = open(fileName.c_str(), O_CREAT | O_RDWR, S_IRWXU);
        if(file < 0) {
            fail("Could not open or create the disk hash table file!");
        }
        const bool isNew = FileSize1(file) <= 0; // no file, or empty file
        if(isNew && ftruncate(file, HEADER_SIZE + INITIAL_DATA_SIZE) != 0) {
            fail("Could not initialise the disk hash table file!");
        }
        fileSize = FileSize1(file) - HEADER_SIZE;
        capacity = fileSize > 0 ? static_cast<size_t>(fileSize) / nodeSize : 0;
        if(capacity == 0) {
            fail("The disk hash table file is too small to hold a single entry!");
        }
        map = mapFile(file, HEADER_SIZE + fileSize);
        if(map == nullptr) {
            fail("Could not map the disk hash table file into memory!");
        }
        if(isNew) {
            writeHeader(map, 0); // so that reopening a never-used table finds a parsable count
        } else {
            size = readHeader(map);
        }
    }

    /** Takes over the mapping and descriptor of `other`, which is left empty. */
    MMDiskHashTable(MMDiskHashTable&& other)
        : file(other.file), fileName(other.fileName), map(other.map), size(other.size),
          capacity(other.capacity), fileSize(other.fileSize), maxKeySize(other.maxKeySize),
          maxValueSize(other.maxValueSize), nodeSize(other.nodeSize) {
        other.file = -1;
        other.map = nullptr;
    }

    // Two objects sharing one mapping would unmap and close it twice, so copying is not allowed.
    MMDiskHashTable(const MMDiskHashTable&) = delete;
    MMDiskHashTable& operator=(const MMDiskHashTable&) = delete;

    ~MMDiskHashTable() {
        if(map != nullptr) {
            munmap(map, HEADER_SIZE + fileSize);
        }
        if(file >= 0) {
            close(file);
        }
    }

    /**
     * Inserts a key-value pair into the disk hash table, growing the file first when the load
     * factor has reached 0.75. Terminates the process when the key or value is too long.
     *
     * @param key    key to store; must not already be present
     * @param value  value to associate with the key
     * @return true when the pair was stored, false when no free record was available
     */
    bool insert(std::string key, std::string value) {
        if(key.size() > maxKeySize) {
            fail("A key you tried to insert into a disk hash table was larger than the set max key size!");
        }
        if(value.size() > maxValueSize) {
            fail("A value you tried to insert into a disk hash table was larger than the set max value size!");
        }
        if(static_cast<double>(size) / capacity >= 0.75) {
            rehash(); // on failure the old table is still intact; keep using it
        }

        std::string node = key + '\t' + value;
        node.resize(nodeSize); // NUL padding marks the end of the value
        if(!placeNode(map, hasher(key) % capacity, capacity, node.data())) {
            return false;
        }

        size++;
        writeHeader(map, size);
        return true;
    }

    /**
     * Looks up the key and returns the value.
     * @param query The key to look up.
     * @return The value corresponding to the key in the hash table. Returns an empty string if not found.
     */
    std::string find(std::string query) {
        const size_t firstSlot = hasher(query) % capacity;
        // At most `capacity` probes: terminates even if the table is completely full.
        for(size_t probe = 0; probe < capacity; probe++) {
            char* record = slotAddress(map, (firstSlot + probe) % capacity);
            if(*record == '\0') {
                return ""; // an empty record ends the probe sequence
            }
            const std::pair<std::string, std::string> entry = extractKeyValueFromBuffer(record);
            if(entry.first == query) {
                return entry.second;
            }
        }
        return "";
    }

private:

    enum { HEADER_SIZE = 10, INITIAL_DATA_SIZE = 1000 };

    int file;
    std::string fileName;
    char* map;        // start of the mapped file (header included), or nullptr

    size_t size;      // number of stored keys
    size_t capacity;  // number of records in the file
    ssize_t fileSize; // bytes of record space, i.e. file size minus the header

    size_t maxKeySize;
    size_t maxValueSize;
    size_t nodeSize;

    std::hash<std::string> hasher;

    /** Prints `message` to stderr and terminates the process with exit code 1. */
    static void fail(const char* message) {
        std::cerr << message;
        exit(1);
    }

    /** Maps the first `length` bytes of `fd` read/write and shared; nullptr on failure. */
    static char* mapFile(int fd, size_t length) {
        void* mapped = mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        return mapped == MAP_FAILED ? nullptr : static_cast<char*>(mapped);
    }

    /** Address of record number `slot` inside the mapping that starts at `base`. */
    char* slotAddress(char* base, size_t slot) const {
        return base + HEADER_SIZE + slot * nodeSize;
    }

    /** Stores `count` in the 10-byte header at `base` as NUL-padded decimal text. */
    static void writeHeader(char* base, size_t count) {
        std::string text = std::to_string(count);
        text.resize(HEADER_SIZE);
        std::memcpy(base, text.data(), HEADER_SIZE);
    }

    /** Parses the key count from the header at `base`; an empty header counts as 0. */
    static size_t readHeader(const char* base) {
        size_t count = 0;
        for(size_t i = 0; i < HEADER_SIZE && base[i] >= '0' && base[i] <= '9'; i++) {
            count = count * 10 + static_cast<size_t>(base[i] - '0');
        }
        return count;
    }

    /**
     * Copies the nodeSize bytes at `node` into the first empty record of the mapping at `base`,
     * probing linearly from `firstSlot` and wrapping around. Returns false when no record is free.
     */
    bool placeNode(char* base, size_t firstSlot, size_t slotCount, const char* node) const {
        for(size_t probe = 0; probe < slotCount; probe++) {
            char* record = slotAddress(base, (firstSlot + probe) % slotCount);
            if(*record == '\0') {
                std::memcpy(record, node, nodeSize);
                return true;
            }
        }
        return false;
    }

    /** Splits a "key\tvalue" record of at most nodeSize bytes into its key and value. */
    std::pair<std::string, std::string> extractKeyValueFromBuffer(char* buffer) {
        std::string key;
        std::string value;
        bool midVal = false;
        // Bounded by nodeSize so a damaged record can never be read past its own end.
        for(size_t i = 0; i < nodeSize && buffer[i] != '\0'; i++) {
            if(midVal) {
                value += buffer[i];
            } else if(buffer[i] == '\t') {
                midVal = true;
            } else {
                key += buffer[i];
            }
        }
        return {key, value};
    }

    /**
     * Doubles the record space: builds the larger table in "<fileName>_rehashed.txt", re-inserts
     * every stored record, renames the new file over the old one and switches to the new
     * mapping. The current table is left untouched when anything fails.
     */
    bool rehash() {
        const std::string tempRehashedFileName = fileName + "_rehashed.txt";
        // O_TRUNC: a stale temp file left by an interrupted run must not leak old records.
        const int rehashFile = open(tempRehashedFileName.c_str(), O_CREAT | O_RDWR | O_TRUNC, S_IRWXU);
        if(rehashFile < 0) {
            return false;
        }
        const ssize_t newDataSize = fileSize * 2;
        const size_t newCapacity = static_cast<size_t>(newDataSize) / nodeSize;
        char* newMap = nullptr;
        if(ftruncate(rehashFile, HEADER_SIZE + newDataSize) == 0) {
            newMap = mapFile(rehashFile, HEADER_SIZE + newDataSize);
        }
        if(newMap == nullptr) {
            close(rehashFile);
            unlink(tempRehashedFileName.c_str());
            return false;
        }

        writeHeader(newMap, size);
        bool ok = true;
        for(size_t slot = 0; ok && slot < capacity; slot++) {
            char* record = slotAddress(map, slot);
            if(*record != '\0') {
                const std::string key = extractKeyValueFromBuffer(record).first;
                ok = placeNode(newMap, hasher(key) % newCapacity, newCapacity, record);
            }
        }
        // rename() replaces the destination atomically, so the old file needs no separate removal.
        if(!ok || rename(tempRehashedFileName.c_str(), fileName.c_str()) != 0) {
            munmap(newMap, HEADER_SIZE + newDataSize);
            close(rehashFile);
            unlink(tempRehashedFileName.c_str());
            return false;
        }

        munmap(map, HEADER_SIZE + fileSize); // release the mapping of the replaced file
        close(file);
        file = rehashFile;
        map = newMap;
        fileSize = newDataSize;
        capacity = newCapacity;
        return true;
    }

    /** Size of the file behind descriptor `file` in bytes, or -1 when it cannot be determined. */
    ssize_t FileSize1(int file) {
        struct stat st;
        if(fstat(file, &st) != 0) {
            return -1;
        }
        return st.st_size;
    }

};
'\t' + result.second; + node.resize(nodeSize); + for (int i = 0; i < nodeSize; i++) { + newMap[newLocation++] = node[i]; + } + } + } + capacity = newCapacity; + close(file); + remove(fileName.c_str()); + rename(tempRehashedFileName.c_str(), fileName.c_str()); + file = rehashFile; + map = newMap; + } + + ssize_t FileSize1(int file) { + struct stat st; + fstat(file, &st); + return st.st_size; + } + +}; \ No newline at end of file diff --git a/DataStructures/HashTable/HashTable.h b/DataStructures/HashTable/HashTable.h new file mode 100644 index 0000000..4c1b04b --- /dev/null +++ b/DataStructures/HashTable/HashTable.h @@ -0,0 +1,228 @@ +// +// hashTable.hpp +// hw2 +// +// Created by Jake Close on 1/23/18. +// Copyright © 2018 Jake Close. All rights reserved. +// + +#pragma once + +#include <string> +#include <stdio.h> +#include <cassert> +#include <cmath> +#include "../Vector/Vector.h" + +template<typename K, typename V> +class Node + { +public: + + Node(K key_in, V val_in) : key(key_in), val(val_in), next(nullptr) + { } + + Node(K key_in) : key(key_in), val(V()), next(nullptr) + { } + + ~Node() + { } + + K key; + Node<K, V> *next; + V val; + }; + + +//Implementation of a Hash Table +template<typename K, typename V> +class HashTable + { +private: + + Vector<Node<K, V> *> array; + + size_t sizeVector; + int numElements; + int numBuckets; + + const double LOAD_FACTOR = 0.5; + + + //(djb2) hash function + int hash(K key) + { + long long hash = 5381; + int c; + + for (int i = 0; i < key.size( ); ++i) + { + c = key[ i ]; + hash = (( hash << 5 ) + hash ) + c; /* hash * 33 + c */ + + } + + + return llabs( hash ) % ( int ) sizeVector; + } + + void rehash( ) { + if (this->exceedsLoadFactor()) { + Vector<Node<K, V> *> *newArray = new Vector<Node<K, V> *>; + sizeVector *= 2; + newArray->resize(sizeVector); + numBuckets = 0; + + for (auto bucket : array) { + //used to delete old linked list + auto iterBucket = bucket; + while (iterBucket != nullptr) { + Node<K, V> *newNode = new 
Node<K, V>(iterBucket->key); + newNode->val = iterBucket->val; + + int rehash = hash(iterBucket->key); + //if collision + if (newArray->at(rehash) != nullptr) { + auto current = newArray->at(rehash); + while (current->next != nullptr) { + current = current->next; + } + current->next = newNode; + } else { + numBuckets++; + newArray->at(rehash) = newNode; + } + iterBucket = iterBucket->next; + //Delete linked list + while (bucket != nullptr) { + auto next = bucket->next; + delete bucket; + bucket = next; + } + } + array = *newArray; + } + } + } + + bool exceedsLoadFactor() + { + return ((double) this->numElements / (double) this->sizeVector >= LOAD_FACTOR); + } + +public: + + HashTable() + { + numElements = 0; + numBuckets = 0; + sizeVector = 16; + array.resize(sizeVector); + } + + HashTable ( const HashTable & copy ) + { + this->numElements = copy.numElements; + this->numCollisions = copy.numCollisions; + this->maxCollisions = copy.maxCollisions; + this->numBuckets = copy.numBuckets; + this->sizeVector = copy.sizeVector; + array.resize ( sizeVector ); + int size = sizeof ( array ); + + for ( size_t i = 0; i < sizeVector; ++i ) + { + auto iter = copy.array[ i ]; + while ( iter != nullptr ) + { + this->operator [] ( iter->key ) = iter->val; + iter = iter->next; + } + } + } + + + V &operator[](const K &key) + { + if (this->exceedsLoadFactor( )) + this->rehash(); + //return seek( key )->val; + Node<K, V> *value = find( key ); + if ( value == nullptr ) + value = insert(key); + + return value->val; + + } + + Node<K, V> *insert(K key) + { + + if ( this->exceedsLoadFactor( ) ) + this->rehash( ); + + int hash_val = hash( key ); + Node<K, V> *current = array[ hash_val ]; + if ( !current ) + { + array[ hash_val ] = new Node<K, V>( key ); + current = array[ hash_val ]; + ++numBuckets; + ++numElements; + return current; + } + + while ( current->next != nullptr && current->key != key ) + { + current = current->next; + } + + if ( current->key == key ) + { + return current; + } + + 
+ numBuckets++; + //gets to end of chain, creates new element + current->next = new Node<K, V>( key ); + numElements++; + return current->next; + } + + /* + * Returns Pointer to Node if found at Key + * Otherwise node pointer + */ + Node<K, V> *find( K key ) + { + int hash_val = hash( key ); + Node<K, V> *current = array[ hash_val ]; + + while ( current != nullptr && current->key != key ) + current = current->next; + + return current; + + } + + //returns the number of keys + int size() + { + return numElements; + } + + //returns the size of the underlying array + int capacity() + { + return sizeVector; + } + + int buckets() + { + return numBuckets; + } + +}; + + diff --git a/DataStructures/HashTable/HashTableTests.cpp b/DataStructures/HashTable/HashTableTests.cpp new file mode 100644 index 0000000..08d921b --- /dev/null +++ b/DataStructures/HashTable/HashTableTests.cpp @@ -0,0 +1,20 @@ +// +// Created by nick on 3/23/18. +// + +#include <iostream> +#include "HashTable.h" + +using namespace std; + +int main() { + HashTable<string, string> ht; + + ht["hello"] = "goodbye"; + assert(ht["hello"] == "goodbye"); + ht["i want you to feel"] = "surreal"; + assert(ht["i want you to feel"] == "surreal"); + + + return 0; +} \ No newline at end of file diff --git a/DataStructures/Vector/Vector.h b/DataStructures/Vector/Vector.h new file mode 100644 index 0000000..46a266c --- /dev/null +++ b/DataStructures/Vector/Vector.h @@ -0,0 +1,248 @@ +// +// Created by Zane Dunnings on 1/25/18. 
// File: DataStructures/Vector/Vector.h
//
// Created by Zane Dunnings on 1/25/18.
//

#pragma once

#include <cstddef>
#include <cstdlib>
#include <iostream>

/// A minimal growable array.
///
/// Storage is always value-initialised, so slots between size() and the capacity hold T().
/// operator[] performs no bounds check; at() terminates the process on an index >= size().
template< typename T >
class Vector
{
public:
    Vector( );
    Vector( size_t size );
    Vector( const Vector &other );
    ~Vector( );
    size_t size( ) const;
    const T & operator [] ( size_t loc ) const;
    T &operator[ ]( size_t index );
    Vector& operator=( const Vector &rhs);
    T &at( size_t index);
    void push_back( T );
    void reserve( size_t );
    void resize( size_t size );

    /// Forward/backward iterator over the stored elements; a thin wrapper around T*.
    class Iterator
    {
    public:
        Iterator()
            :currentIndex( nullptr ){ }
        Iterator( T* inputIndex )
            :currentIndex( inputIndex ){ }

        T &operator*( )
        {
            return *currentIndex;
        }

        Iterator &operator++( )
        {
            ++this->currentIndex;
            return *this;
        }

        Iterator &operator--( )
        {
            --this->currentIndex;
            return *this;
        }

        bool operator==( const Iterator &rhs ) const
        {
            return this->currentIndex == rhs.currentIndex;
        }

        bool operator!=(const Iterator &rhs) const
        {
            return this->currentIndex != rhs.currentIndex;
        }

    private:

        T* currentIndex;
        friend class Vector;
    };

    Iterator begin( )
    {
        if( this->start == nullptr )
        {
            return Iterator( );
        }
        return Iterator( this->start );
    }

    Iterator end( )
    {
        return Iterator( &start[ numberOfItems ] );
    }

private:

    //points to the beginning of the array
    T* start;
    //the actual size of the array in memory (its capacity)
    size_t totalSize;
    //the number of objects in the array and its "size" to the user
    size_t numberOfItems;
    //The standard starting size of the array
    const size_t defaultSize;

};

template< typename T>
///Default constructor array size is 16
Vector<T>::Vector()
    : start( new T[ 16 ]( ) ), totalSize( 16 ), numberOfItems( 0 ), defaultSize( 16 )
{
}

template< typename T>
/// creates an empty Vector with room for 'size' elements
Vector<T>::Vector( size_t size )
    : start( new T[ size ]( ) ), totalSize( size ), numberOfItems( 0 ), defaultSize( size )
{
}

template< typename T>
/// Copy constructor: allocates its own array so the two vectors never share storage
/// \tparam T
/// \param other : vector to copy
Vector<T>::Vector( const Vector<T> &other )
    : start( new T[ other.totalSize ]( ) ), totalSize( other.totalSize ),
      numberOfItems( other.numberOfItems ), defaultSize( other.defaultSize )
{
    for( size_t i = 0; i < totalSize; ++i )
    {
        this->start[ i ] = other.start[ i ];
    }
}

template< typename T>
/// Destructor deletes the array
/// \tparam T
Vector<T>::~Vector()
{
    delete [ ] this->start;
}

template< typename T>
/// \tparam T
/// \param item : item to be pushed to the back of the array
void Vector<T>::push_back(T item)
{
    //extend the array when it is full; a capacity of 0 must still grow to something
    if( this->numberOfItems >= this->totalSize )
        this->reserve( this->totalSize == 0 ? 1 : this->totalSize * 2 );

    this->start[ this->numberOfItems ] = item;
    ++this->numberOfItems;
}

template< typename T>
/// \tparam T
/// \param loc : index to be accessed (not bounds checked)
/// \return : reference to the index, arr[ loc ]
T &Vector<T>::operator[ ]( size_t loc)
{
    return this->start[ loc ];
}


template< typename T >
/// \tparam T
/// \param loc : index to be accessed (not bounds checked)
/// \return : reference to the index, arr[ loc ]
const T & Vector< T >::operator [] ( size_t loc ) const
{
    return this->start[ loc ];
}


template< typename T>
/// Bounds-checked access: prints an error and exits when loc is not below size()
/// \tparam T
/// \param loc : index to be accessed
/// \return : reference to the index, arr[ loc ]
T &Vector<T>::at( size_t loc)
{
    if( loc >= this->numberOfItems )
    {
        std::cerr << "Index out of Range";
        exit(1);
    }
    return this->start[ loc ];
}

template< typename T>
/// \tparam T
/// \return the amount of elements in the array
size_t Vector<T>::size() const {
    return this->numberOfItems;
}

template< typename T>
/// Sets the capacity to 'size' containers, but doesn't change the number of objects.
/// Exits when 'size' is smaller than the number of stored objects.
/// \tparam T
/// \param size : new capacity of the arr
void Vector<T>::reserve( size_t size )
{
    if( size < this->numberOfItems )
    {
        std::cerr << "Vector.reserve( size ) :: 'size' must be larger than the previous size of the array\n";
        exit(1);
    }
    //allocate first, so the vector is unchanged if the allocation throws
    T* newArr = new T[ size ]( );

    for( size_t i = 0; i < this->numberOfItems; ++i )
    {
        newArr[ i ] = this->start[ i ];
    }
    delete [ ] this->start;
    this->start = newArr;
    this->totalSize = size;
}

template< typename T>
/// Will resize the array to be 'size', and fill the new spots with default values. Can access these,
/// and push_back will push to arr[ size ]. Exits when 'size' is smaller than the current size.
/// \tparam T
/// \param size : new size of the arr
void Vector<T>::resize( size_t size )
{
    if( size < this->numberOfItems )
    {
        std::cerr << "Vector.resize( size ) :: 'size' must be larger than the previous size of the array\n";
        exit( 1 );
    }

    //value-initialised, so the spots past the old elements already hold T( )
    T* newArr = new T[ size ]( );

    for( size_t i = 0; i < this->numberOfItems; ++i )
    {
        newArr[ i ] = this->start[ i ];
    }

    delete [ ] this->start;
    this->start = newArr;
    this->totalSize = size;
    this->numberOfItems = size;
}

template< typename T>
/// Copy assignment: replaces the contents with a deep copy of 'copy'. Safe for self-assignment.
/// \tparam T
/// \param copy : vector to copy from
/// \return : reference to this vector
Vector<T>& Vector<T>::operator=( const Vector<T> &copy )
{
    if( this == &copy )
    {
        return *this;
    }

    //build the new array before releasing the old one
    T* newArr = new T[ copy.totalSize ]( );
    for( size_t i = 0; i < copy.totalSize; ++i )
    {
        newArr[ i ] = copy.start[ i ];
    }

    delete [ ] this->start;
    this->start = newArr;
    this->numberOfItems = copy.numberOfItems;
    this->totalSize = copy.totalSize;

    return *this;
}
and fill the new spots with default values. Can access these, +/// and push_back will push to arr[ size ] +/// \tparam T +/// \param size : new size of the arr +void Vector<T>::resize( size_t size ) +{ + if( size < this->numberOfItems ) + { + std::cerr << "Vector.reserve( size ) :: 'size' must be larger than the previous size of the array\n"; + exit( 1 ); + } + + this->totalSize = size; + T* newArr = new T[ this->totalSize ]; + + for( int i = 0; i < this->numberOfItems; ++i ) + { + newArr[ i ] = this->start[ i ]; + } + + for( int i = this->numberOfItems; i < size; ++i ) + { + newArr[ i ] = T( ); + } + + this->numberOfItems = size; + delete [ ] this->start; + this->start = newArr; +} + +template< typename T> +///Default constructor array size is 16 +Vector<T>& Vector<T>::operator=( const Vector<T> © ) +{ + delete[] start; + this->numberOfItems = copy.numberOfItems; + this->totalSize = copy.totalSize; + this->start = new T[ this->totalSize ]; + + for(int i = 0; i < totalSize; ++i) + { + this->start[ i ] = copy.start[ i ]; + } + + return *this; +} \ No newline at end of file diff --git a/DataStructures/Vector/VectorTests.cpp b/DataStructures/Vector/VectorTests.cpp new file mode 100644 index 0000000..5f2e732 --- /dev/null +++ b/DataStructures/Vector/VectorTests.cpp @@ -0,0 +1,28 @@ +// +// Created by nick on 3/22/18. +// + +#include <iostream> +#include <cassert> +#include "Vector.h" +#include <vector> + +using namespace std; + +// TOOO: COMPREHENSIVE TESTS +int main() { + Vector<int> test1; + test1.reserve(20); + vector<int> test62; + test62.reserve(20); + test1[51] = 52; + test62[51] = 352; + test1.push_back(50); + assert(test1[0] == 50); + assert(test1.size() == 1); + test1[0] = 150; + assert(test1[0] == 150); + assert(test1.size() == 1); + cout << "ALL VECTOR TESTS PASS :)" << endl; +} + -- GitLab