Newer
Older
//
// Created by Jake Close on 3/13/18.
//
jsclose
committed
#include "ISRWord.h"
ISRWord::ISRWord(char* word) : term(word) {
getChunks();
currentChunk = 0;
currentLocation = first();
}
// put into util file
vector<size_t> ISRWord::getSeekContents(string fileName) {
int file = open(fileName.c_str(), O_RDONLY);
ssize_t fileSize = FileSize(file);
vector<size_t> contents;
char* memMap = (char*) mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0);
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
string word = "";
bool midWord = false;
bool midFind = false;
if(memMap != MAP_FAILED) {
for(char* map = memMap; map < memMap + fileSize; map++) {
if(midFind && isalpha(*map)) {
break;
}
switch(*map) {
case '\t':
case '\n':
case '\r':
case ' ':
if (midFind && word != "") {
contents.push_back(stoll(word));
word = "";
} else if (midWord) {
midWord = false;
if(word == term) {
midFind = true;
}
word = "";
}
break;
default:
word += *map;
midWord = true;
}
}
}
return contents;
}
void ISRWord::getChunks() {
string path = util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index-master.txt";
listOfChunks = getSeekContents(path);
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
// ssize_t chunkFileSize = FileSize(chunkFile);
// char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
// string word = "";
// bool midWord = false;
// bool midChunkFind = false;
// if(chunkMemMap != MAP_FAILED) {
// for(char* map = chunkMemMap; map < chunkMemMap + chunkFileSize; map++) {
// if(midChunkFind && isalpha(*map)) {
// break;
// }
// switch(*map) {
// case '\t':
// case '\n':
// case '\r':
// case ' ':
// if (midChunkFind && word != "") {
// listOfChunks.push_back(stoll(word));
// word = "";
// } else if (midWord) {
// midWord = false;
// if(word == term) {
// midChunkFind = true;
// }
// word = "";
// }
// break;
// default:
// word += *map;
// midWord = true;
// }
// }
// }
}
//Go to current chunk
//Look in seek dictionary for chunk (mem map, binary search)
//Have offset into chunk, find post seek to post, return value
//update ISR currentLocation
//set current memory map
//returns offset into corpus
Location ISRWord::first() {
if(listOfChunks.size() <= currentChunk) {
exit(0);
}
string currentChunkSeekFileLocation = util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + "-seek.txt";
vector<size_t> location = getSeekContents(currentChunkSeekFileLocation);
string currentChunkFileLocation = util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
ssize_t currentChunkFileSize = FileSize(currentChunkFile);
currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
currentMemMap += location[0];
string firstLoc = "";
while(*currentMemMap != ' ') {
firstLoc += *currentMemMap;
currentMemMap++;
}
currentMemMap++;
return stoll(firstLoc);
}
//returns next absolute location in corpus
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
//looks at memory map
//if new line ( end of current list for that file
//move to next chunk, update chunk
//find new offset, return first location
//else
//find way to increment to next delta
//return new location
Location ISRWord::next() {
if(*currentMemMap == '\n') {
currentChunk++;
currentLocation = first();
} else {
string delta = "";
while(*currentMemMap != ' ') {
delta += *currentMemMap;
currentMemMap++;
}
currentLocation += stoll(delta);
currentMemMap++;
}
return currentLocation;
}
//look thru each chunk
//check if absolute position at offset in chunk is less then chunk,
//check seek lookup table to find if offset+absulte is bigger than target
//if so, set location to that big chunk
//go to next chunk
Location ISRWord::seek( Location target ) {
}