diff --git a/CMakeLists.txt b/CMakeLists.txt index d0cadfc23205c4a58b0b6e3258b36c4ee00a8be8..7a341bbc57bae5806a207fceaf7e29a1a2441ce9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ add_executable(URLTEST shared/url.h shared/urlTest.cpp) add_executable(search-engine search.cpp query/Query.cpp) -add_executable(ISRWord-tests util/util.cpp constraintSolver/ISRWord.cpp constraintSolver/tests/ISRWordTests.cpp ) +add_executable(ISRWord-tests constraintSolver/ISRWord.cpp constraintSolver/tests/ISRWordTests.cpp ) find_package(OpenSSL REQUIRED) diff --git a/constraintSolver/ISRWord.cpp b/constraintSolver/ISRWord.cpp index a3497302d8e5c355e91e5fb0a03a2bf887af7e04..a4def3c08c11cd8446fadcd831b47802b58fe8fd 100644 --- a/constraintSolver/ISRWord.cpp +++ b/constraintSolver/ISRWord.cpp @@ -2,9 +2,14 @@ // Created by Jake Close on 3/13/18. // +#include <string> #include "ISRWord.h" -using namespace std; +size_t FileSize(int f) { + struct stat fileInfo; + fstat( f, &fileInfo); + return fileInfo.st_size; +} ISRWord::ISRWord(char* word) : term(word) { getChunks(); @@ -24,15 +29,28 @@ vector<size_t> ISRWord::getSeekContents(string fileName) { string word = ""; bool midWord = false; bool midFind = false; + WordSeek wordDictionaryEntry; if(memMap != MAP_FAILED) { for(char* map = memMap; map < memMap + fileSize; map++) { if(midFind && isalpha(*map)) { break; } switch(*map) { - case '\t': + if(midFind) { + case '<': + wordDictionaryEntry = WordSeek(); + break; + case '>': + wordDictionaryEntry.seekOffset = stoll(word); + wordSeekLookupTable.push_back(wordDictionaryEntry); + break; + case ',': + wordDictionaryEntry.realLocation = stoll(word); + break; + } case '\n': case '\r': + case '\t': case ' ': if (midFind && word != "") { contents.push_back(stoll(word)); @@ -151,6 +169,25 @@ Location ISRWord::next() { //go to next chunk Location ISRWord::seek( Location target ) { - + if(!wordSeekLookupTable.empty()) { + auto best = wordSeekLookupTable.front(); + for(auto entry : wordSeekLookupTable) { + if(entry.realLocation < target) { + best = entry; + } else { + string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt"; + int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY); + ssize_t currentChunkFileSize = FileSize(currentChunkFile); + currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0); + currentMemMap += best.seekOffset; + currentLocation = best.realLocation; + return best.realLocation; + } + } + } else { + while(next() <= target) { + } + return currentLocation; + } } diff --git a/constraintSolver/ISRWord.h b/constraintSolver/ISRWord.h index e0eb8c86f7660a412d3d5ba8f39500dd441f705c..eface9310c6a8550888ef7f73805c0d843845501 100644 --- a/constraintSolver/ISRWord.h +++ b/constraintSolver/ISRWord.h @@ -5,6 +5,7 @@ #pragma once //#include "ISR.h" +#include <iostream> #include <vector> #include <fcntl.h> #include <stdio.h> @@ -13,18 +14,13 @@ #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> +#include "WordSeek.h" //#include "../util/util.h" - -size_t FileSize(int f) { - struct stat fileInfo; - fstat( f, &fileInfo); - return fileInfo.st_size; -} - using namespace std; + //Find occurrences of individual words typedef size_t Location; @@ -50,6 +46,7 @@ public: char* term; char* masterIndex; vector<size_t> listOfChunks; + vector<WordSeek> wordSeekLookupTable; size_t currentChunk; char* currentMemMap; diff --git a/constraintSolver/WordSeek.h b/constraintSolver/WordSeek.h new file mode 100644 index 0000000000000000000000000000000000000000..e0f44b945acfbe9b73ff12df138f179be96d2b2a --- /dev/null +++ b/constraintSolver/WordSeek.h @@ -0,0 +1,7 @@ +#pragma once + +class WordSeek { +public: + ssize_t seekOffset; + size_t realLocation; +}; \ No newline at end of file diff --git a/constraintSolver/tests/ISRWordTests.cpp b/constraintSolver/tests/ISRWordTests.cpp index d34c5ea37b28347d43470c73532d048e25f61cf6..e9501ac45b27d95e48183be143fcad9866c09c27 100644 --- a/constraintSolver/tests/ISRWordTests.cpp +++ b/constraintSolver/tests/ISRWordTests.cpp @@ -4,7 +4,6 @@ #include <iostream> #include "../ISRWord.h" -#include "../ISRWord.cpp" using namespace std;