Skip to content
Snippets Groups Projects
Commit 9f77422e authored by Nicholas Yang's avatar Nicholas Yang
Browse files

getting isrword to behave kind of properly - need to make wordseek mmhash

parent 2bcbf5e6
No related branches found
No related tags found
No related merge requests found
Showing
with 68555 additions and 133 deletions
.DS_Store 0 → 100644
File added
No preview for this file type
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include "ISR.h"
#include "WordSeek.h" #include "WordSeek.h"
#include "../util/util.h" #include "../util/util.h"
#include "../indexer/DocumentEnding.h" #include "../indexer/DocumentEnding.h"
......
...@@ -8,117 +8,29 @@ size_t FileSize(int f) { ...@@ -8,117 +8,29 @@ size_t FileSize(int f) {
return fileInfo.st_size; return fileInfo.st_size;
} }
ISRWord::ISRWord ( char *word ) ISRWord::ISRWord ( char *word ) {
{
term = new char[strlen(word)]; term = new char[strlen(word)];
strcpy(term, word); strcpy(term, word);
getChunks( ); getChunks( );
currentChunk = 0; currentChunk = 0;
currentLocation = First( ); currentLocation = First( );
} }
// put into util file
vector<size_t> ISRWord::getSeekContents(string fileName) {
int file = open(fileName.c_str(), O_RDONLY);
vector<size_t> contents;
if(file == -1)
{
cerr << "Was not able to open master index file";
exit(1);
}
ssize_t fileSize = FileSize(file);
char* memMap = (char*) mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0); void ISRWord::getChunks() {
// char* memMap = util::getFileMap(fileName); MMDiskHashTable diskHashTable(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt", 30, 168);
string word = ""; string value = diskHashTable.find(term);
bool midWord = false; string chunkInput = "";
bool midFind = false; for(char val : value) {
WordSeek wordDictionaryEntry; if(isnumber(val)) {
if(memMap != MAP_FAILED) { chunkInput += val;
for(char* map = memMap; map < memMap + fileSize; map++) { } else {
if(midFind && isalpha(*map)) { listOfChunks.push_back(stoll(chunkInput));
break; chunkInput = "";
}
switch(*map) {
if(midFind) {
case '<':
wordDictionaryEntry = WordSeek();
break;
case '>':
wordDictionaryEntry.seekOffset = stoll(word);
wordSeekLookupTable.push_back(wordDictionaryEntry);
break;
case ',':
wordDictionaryEntry.realLocation = stoll(word);
break;
}
case '\n':
case '\r':
case '\t':
case ' ':
if (midFind && word != "") {
contents.push_back(stoll(word));
word = "";
} else if (midWord) {
midWord = false;
if(word == term) {
midFind = true;
}
word = "";
}
break;
default:
word += *map;
midWord = true;
}
} }
} }
return contents;
} }
void ISRWord::getChunks() {
listOfChunks = getSeekContents(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt");
// int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
// ssize_t chunkFileSize = FileSize(chunkFile);
// char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
// string word = "";
// bool midWord = false;
// bool midChunkFind = false;
// if(chunkMemMap != MAP_FAILED) {
// for(char* map = chunkMemMap; map < chunkMemMap + chunkFileSize; map++) {
// if(midChunkFind && isalpha(*map)) {
// break;
// }
// switch(*map) {
// case '\t':
// case '\n':
// case '\r':
// case ' ':
// if (midChunkFind && word != "") {
// listOfChunks.push_back(stoll(word));
// word = "";
// } else if (midWord) {
// midWord = false;
// if(word == term) {
// midChunkFind = true;
// }
// word = "";
// }
// break;
// default:
// word += *map;
// midWord = true;
// }
// }
// }
}
//Go to current chunk //Go to current chunk
//Look in seek dictionary for chunk (mem map, binary search) //Look in seek dictionary for chunk (mem map, binary search)
//Have offset into chunk, find post seek to post, return value //Have offset into chunk, find post seek to post, return value
...@@ -129,22 +41,22 @@ void ISRWord::getChunks() { ...@@ -129,22 +41,22 @@ void ISRWord::getChunks() {
Location ISRWord::First ( ) Location ISRWord::First ( )
{ {
string currentChunkSeekFileLocation = string currentChunkSeekFileLocation =
util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/" + to_string( listOfChunks[ currentChunk ] ) +
"-seek.txt"; "-seek.txt";
vector< size_t > location = getSeekContents( currentChunkSeekFileLocation ); MMDiskHashTable currentChunkSeekFileHashTable = MMDiskHashTable(currentChunkSeekFileLocation, 30, 8);
string loc = currentChunkSeekFileHashTable.find(term);
string currentChunkFileLocation = string currentChunkFileLocation =
util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/" + to_string( listOfChunks[ currentChunk ] ) +
".txt"; ".txt";
int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY ); int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
ssize_t currentChunkFileSize = FileSize( currentChunkFile ); ssize_t currentChunkFileSize = FileSize( currentChunkFile );
currentMemMap = ( char * ) mmap( nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0 ); currentMemMap = ( char * ) mmap( nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0 );
currentMemMap += location[ 0 ]; currentMemMap += stoll(loc);
string firstLoc = ""; string firstLoc = "";
while ( *currentMemMap != ' ' ) while ( *currentMemMap != ' ' ) {
{
firstLoc += *currentMemMap; firstLoc += *currentMemMap;
currentMemMap++; currentMemMap++;
} }
currentMemMap++; currentMemMap++;
return stoll( firstLoc ); return stoll( firstLoc );
} }
...@@ -185,10 +97,9 @@ Location ISRWord::Next ( ) ...@@ -185,10 +97,9 @@ Location ISRWord::Next ( )
return currentLocation; return currentLocation;
} }
Location ISRWord::getCurrentLocation() Location ISRWord::getCurrentLocation() {
{
return currentLocation; return currentLocation;
} }
//look thru each chunk //look thru each chunk
//check if absolute position at offset in chunk is less than chunk, //check if absolute position at offset in chunk is less than chunk,
...@@ -196,26 +107,26 @@ Location ISRWord::getCurrentLocation() ...@@ -196,26 +107,26 @@ Location ISRWord::getCurrentLocation()
//if so, set location to that big chunk //if so, set location to that big chunk
//go to next chunk //go to next chunk
Location ISRWord::Seek( Location target ) { Location ISRWord::Seek( Location target ) {
if(!wordSeekLookupTable.empty()) { // if(!wordSeekLookupTable.empty()) {
auto best = wordSeekLookupTable.front(); // auto best = wordSeekLookupTable.front();
for(auto entry : wordSeekLookupTable) { // for(auto entry : wordSeekLookupTable) {
if(entry.realLocation < target) { // if(entry.realLocation < target) {
best = entry; // best = entry;
} else { // } else {
string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt"; // string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY); // int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
ssize_t currentChunkFileSize = FileSize(currentChunkFile); // ssize_t currentChunkFileSize = FileSize(currentChunkFile);
currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0); // currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
currentMemMap += best.seekOffset; // currentMemMap += best.seekOffset;
currentLocation = best.realLocation; // currentLocation = best.realLocation;
return best.realLocation; // return best.realLocation;
} // }
} // }
} else { // } else {
while(Next() <= target) { // while(Next() <= target) {
} // }
return currentLocation; // return currentLocation;
} // }
} }
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <sys/types.h> #include <sys/types.h>
#include "WordSeek.h" #include "WordSeek.h"
#include "../util/util.h" #include "../util/util.h"
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
using namespace std; using namespace std;
...@@ -30,8 +31,6 @@ class ISRWord : public ISR ...@@ -30,8 +31,6 @@ class ISRWord : public ISR
Location Seek ( Location target ) override; Location Seek ( Location target ) override;
Location GetEndDocument ( ) override; Location GetEndDocument ( ) override;
vector< size_t > getSeekContents ( string fileName );
unsigned GetDocumentCount ( ); unsigned GetDocumentCount ( );
unsigned GetNumberOfOccurrences ( ); unsigned GetNumberOfOccurrences ( );
......
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment