Skip to content
Snippets Groups Projects
Commit 66abad67 authored by Nicholas Yang, committed by jsclose
Browse files

changed to use indexerconstants + MUCH BETTER ISRENDDOC

parent cd4c7b33
Branches
No related tags found
1 merge request: !12 Searcher
File deleted
......@@ -16,9 +16,7 @@
#include <limits>
#include "ISREndDoc.h"
#include "../indexer/Corpus.h"
#define pathToIndex "/constraintSolver/index-test-files/twitter/"
//#define pathToIndex "/buildIndex/"
#include "../indexer/IndexerConstants.h"
typedef size_t Location; // Location 0 is the null location.
const Location MAX_Location = std::numeric_limits<unsigned>::max();
......
......@@ -3,30 +3,18 @@
//
#include "ISREndDoc.h"
#define pathToIndex "/constraintSolver/index-test-files/twitter/"
//#define pathToIndex "/buildIndex/"
// Default constructor: starts at chunk 0 with no chunk mapped (memMap is null),
// so the first call to next() sees memMap == nullptr and takes its
// "nothing mapped yet" branch.
ISREndDoc::ISREndDoc() {
currentChunk = 0;
memMap = nullptr;
// NOTE(review): this repeats the assignment above. It looks like the removed
// and added sides of a whitespace-only diff line shown together; confirm
// against the real file before deleting either one.
memMap = nullptr;
}
DocumentEnding ISREndDoc::next() {
if(memMap == nullptr || *memMap == '\0' ) {
string fileName = util::GetCurrentWorkingDir() + pathToIndex + to_string(currentChunk) + ".txt";
currentFile = open(fileName.c_str(), O_RDONLY);
string seekFileName = util::GetCurrentWorkingDir() + pathToIndex + to_string(currentChunk) + "-seek.txt";
if(0 != access(seekFileName.c_str(), 0)) {
DocumentEnding a = DocumentEnding();
a.url = "aaa";
return a;
}
MMDiskHashTable de(seekFileName, 30, 8);
memMap = (char*) mmap(nullptr, util::FileSize(currentFile), PROT_READ, MAP_PRIVATE, currentFile, 0);
memMap += stoll(de.find("=docEnding"));
openChunk(++currentChunk);
}
string currentOne;
for(char* map = memMap; map < memMap + util::FileSize(currentFile); map++) {
for(char* map = memMap; map < memMap + util::FileSize(currentFileHandle); map++) {
if(*map == '\0') {
currentChunk++;
memMap = nullptr;
......@@ -66,97 +54,155 @@ DocumentEnding ISREndDoc::next() {
return currentDoc;
}
// open up current chunk wordseek mem map
// seek all possible keys for doc ending
// check bounds
void ISREndDoc::seek(Location target) {
string key = "=docEnding";
string value = "";
bool found = false;
pair<size_t, size_t> docEndingWordSeek = {0, 0}; // location, offset
size_t tempLocation = 0;
// Makes `chunk` the current chunk and rebuilds seekTable for it.
//
// What the corpus/seekTable statements in this span do:
//  - assert that 0 <= chunk < corpus.chunks.size();
//  - record the chunk number and clear seekTable;
//  - take memMap and currentFileHandle from corpus.chunks[chunk], then advance
//    memMap by the numeric value stored under "=docEnding" in that chunk's
//    seeker table;
//  - look up "=docEnding0", "=docEnding1", ... in the chunk's wordSeek table
//    until a lookup returns "", parsing every value as a run of
//    <realLocation,seekOffset> records that are appended to seekTable.
//
// NOTE(review): in this view the function is interleaved with statements that
// use names not declared here (found, key, value, target, tempLocation,
// docEndingWordSeek, currentFile, pathToIndex). They appear to be removed-side
// lines of the previous seek() implementation shown without diff markers;
// confirm against the real file.
void ISREndDoc::openChunk(int chunk) {
assert(chunk >= 0 && chunk < corpus.chunks.size());
currentChunk = chunk;
seekTable.clear();
memMap = corpus.chunks[chunk].chunkMap;
currentFileHandle = corpus.chunks[chunk].chunkFileHandle;
// Jump to the offset recorded under the "=docEnding" key of the seeker table.
memMap += stoll(corpus.chunks[chunk].seeker.find("=docEnding"));
WordSeek entry = WordSeek();
int currentSeekLookup = 0;
// Accumulates the digits of the number currently being read from a value string.
string input = "";
bool init = false;
bool breakout = false;
bool between = false;
size_t foundChunk;
while(!found) {
string fileName = util::GetCurrentWorkingDir() +
pathToIndex +
to_string(currentChunk) + "-wordseek.txt";
if(0 != access(fileName.c_str(), 0)) {
currentChunk--;
break;
}
MMDiskHashTable currentWordSeek = MMDiskHashTable(fileName, 30, 168);
int currentValueChunk = 0;
value = currentWordSeek.find(key + to_string(currentValueChunk));
while(value.compare("") != 0) {
//cout << "searching through " << key + to_string(currentValueChunk) << endl;
for (auto comp : value) {
switch (comp) {
case '<':
break;
case '>':
if (target < tempLocation && target > docEndingWordSeek.first)
{
if(!init) {
breakout = true;
break;
}
breakout = true;
found = true;
foundChunk = between ? currentChunk - 1 : currentChunk ;
break;
}
between = false;
init = true;
docEndingWordSeek.first = tempLocation;
docEndingWordSeek.second = stoll(input);
input = "";
break;
case ',':
tempLocation = stoll(input);
input = "";
break;
default:
input += comp;
break;
}
if (found) {
string fileName = util::GetCurrentWorkingDir() + pathToIndex + to_string(foundChunk) + ".txt";
currentFile = open(fileName.c_str(), O_RDONLY);
memMap = (char *) mmap(nullptr, util::FileSize(currentFile), PROT_READ, MAP_PRIVATE, currentFile,
0);
memMap += docEndingWordSeek.second;
}
if(breakout) {
// First word-seek bucket for this chunk; later buckets are fetched at the bottom of the loop.
string value = corpus.chunks[chunk].wordSeek.find("=docEnding" + to_string(currentSeekLookup));
while(value != "") {
for (auto comp : value) {
switch (comp) {
case '<':
// '<' opens a record: start from a fresh entry.
entry = WordSeek();
break;
case '>':
// '>' closes a record: the digits since ',' are the seek offset.
entry.seekOffset = stoll(input);
seekTable.push_back(entry);
input = "";
break;
case ',':
// ',' separates the two fields: the digits since '<' are the location.
entry.realLocation = stoll(input);
input = "";
break;
default:
input += comp;
break;
}
}
if(breakout) {
}
currentSeekLookup++;
value = corpus.chunks[chunk].wordSeek.find("=docEnding" + to_string(currentSeekLookup));
}
}
// Moves the ISR toward `target`: picks a chunk, opens it if it is not the
// current one, optionally advances memMap using seekTable, and then calls
// next() repeatedly until target is no longer greater than the returned
// docEndPosition - 1.
//
// NOTE(review): this span mixes the new implementation with statements that
// use names not declared here (currentValueChunk, value, currentWordSeek, key,
// breakout, between). They appear to be removed-side lines of the previous
// implementation shown without diff markers; confirm against the real file.
void ISREndDoc::seek(Location target) {
int chunk = 0;
// NOTE(review): this advances while target is *below* the chunk's
// lastLocation and never compares chunk with corpus.chunks.size(). Both the
// direction of the comparison and the missing bound look suspect; confirm
// what lastLocation holds before relying on this loop.
while(target < corpus.chunks[chunk].lastLocation)
chunk++;
if(chunk != currentChunk)
openChunk(chunk);
if(!seekTable.empty()) {
WordSeek last = WordSeek();
// Walks the table from the front while entries have realLocation greater than
// target, remembering the most recent one, and stops at the first entry whose
// realLocation is <= target.
// NOTE(review): if the table is in ascending order one would expect the
// opposite test (keep entries at or below target); confirm the ordering.
for(int i = 0; i < seekTable.size(); i++) {
if(target < seekTable[i].realLocation) {
last = seekTable[i];
} else if(target >= seekTable[i].realLocation) {
break;
}
currentValueChunk++;
value = currentWordSeek.find(key + to_string(currentValueChunk));
}
if(breakout) {
break;
}
currentChunk++;
between = true;
}
while(target > (next().docEndPosition - 1)) {
// Skip ahead inside the mapped chunk by the offset of the chosen table entry.
memMap += last.seekOffset;
}
//next();
// Empty-bodied loop: each test calls next(), so documents are consumed until
// target <= docEndPosition - 1.
while(target > (next().docEndPosition - 1));
}
//void ISREndDoc::seek(Location target) {
// string key = "=docEnding";
// string value = "";
// bool found = false;
// pair<size_t, size_t> docEndingWordSeek = {0, 0}; // location, offset
// size_t tempLocation = 0;
// string input = "";
// bool init = false;
// bool breakout = false;
// bool between = false;
// size_t foundChunk;
// while(!found) {
// string fileName = util::GetCurrentWorkingDir() +
// IndexerConstants::pathToIndex +
// to_string(currentChunk) + "-wordseek.txt";
// if(0 != access(fileName.c_str(), 0)) {
// currentChunk--;
// break;
// }
// MMDiskHashTable currentWordSeek = MMDiskHashTable(fileName, 30, 168);
// int currentValueChunk = 0;
// value = currentWordSeek.find(key + to_string(currentValueChunk));
// while(value.compare("") != 0) {
// //cout << "searching through " << key + to_string(currentValueChunk) << endl;
// for (auto comp : value) {
// switch (comp) {
// case '<':
// break;
// case '>':
// if (target < tempLocation && target > docEndingWordSeek.first)
// {
// if(!init) {
// breakout = true;
// break;
// }
// breakout = true;
// found = true;
// foundChunk = between ? currentChunk - 1 : currentChunk ;
// break;
// }
// between = false;
// init = true;
// docEndingWordSeek.first = tempLocation;
// docEndingWordSeek.second = stoll(input);
// input = "";
// break;
// case ',':
// tempLocation = stoll(input);
// input = "";
// break;
// default:
// input += comp;
// break;
// }
// if (found) {
// string fileName = util::GetCurrentWorkingDir() + IndexerConstants::pathToIndex + to_string(foundChunk) + ".txt";
// currentFile = open(fileName.c_str(), O_RDONLY);
// memMap = (char *) mmap(nullptr, util::FileSize(currentFile), PROT_READ, MAP_PRIVATE, currentFile,
// 0);
// memMap += docEndingWordSeek.second;
// }
// if(breakout) {
// break;
// }
// }
// if(breakout) {
// break;
// }
// currentValueChunk++;
// value = currentWordSeek.find(key + to_string(currentValueChunk));
// }
// if(breakout) {
// break;
// }
// currentChunk++;
// between = true;
// }
//
// while(target > (next().docEndPosition - 1)) {
// }
// //next();
//}
// Accessor: hands back, by value, the DocumentEnding held in currentDoc.
DocumentEnding ISREndDoc::getCurrentDoc() {
    return this->currentDoc;
}
// Returns currentDoc.docEndPosition - currentDoc.docNumWords - 1.
// NOTE(review): presumably the Location at which the current document starts;
// confirm the meaning of docEndPosition/docNumWords in DocumentEnding.
// The signature and closing brace appear twice below because the old form
// (brace on its own line) and the new form are shown together in this diff
// view, along with the "\ No newline at end of file" marker.
Location ISREndDoc::GetStartingPositionOfDoc( )
{
Location ISREndDoc::GetStartingPositionOfDoc( ) {
return currentDoc.docEndPosition - currentDoc.docNumWords - 1;
}
\ No newline at end of file
}
\ No newline at end of file
......@@ -7,6 +7,7 @@
#include <iostream>
#include <vector>
#include <fcntl.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
......@@ -17,6 +18,8 @@
#include "../util/util.h"
#include "../indexer/DocumentEnding.h"
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../indexer/IndexerConstants.h"
#include "../indexer/Corpus.h"
// Find occurrences of document ends.
typedef size_t Location;
......@@ -26,20 +29,21 @@ public:
ISREndDoc();
DocumentEnding next();
void openChunk(int chunk);
void seek(Location target);
DocumentEnding getCurrentDoc();
Location GetStartingPositionOfDoc( );
unsigned GetDocumentLength ( );
unsigned GetTitleLength ( );
string getURL ( );
size_t currentChunk;
private:
DocumentEnding currentDoc;
char* memMap;
int currentFileHandle;
int currentChunk;
char* memMap;
int currentFile;
vector<WordSeek> seekTable;
Corpus corpus;
};
......
......@@ -4,6 +4,7 @@
#include <iostream>
#include <vector>
#include <fcntl.h>
#include <string>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
......@@ -43,13 +44,11 @@ class ISRWord : public ISR
// ISR *GetDocumentISR( );
string term;
vector< size_t > listOfChunks;
vector< WordSeek > wordSeekLookupTable;
size_t currentChunk;
char *currentMemMap;
//set member variables to all of the chunks that occur, update current chunk
void getChunks ( );
Location getCurrentLocation();
size_t getFrequency();
size_t getDocFrequency();
......@@ -59,10 +58,8 @@ class ISRWord : public ISR
private:
void getWordSeek();
size_t lastLocation;
size_t docFrequency;
size_t frequency;
WordInfo info;
Corpus corpus = Corpus();
......
......@@ -35,6 +35,7 @@ int main ( )
auto url = queryAnd->GetEndDocument()->getCurrentDoc().url;
urls.insert(url);
cout << url << endl;
queryAnd->NextDocument();
}
......
......@@ -32,7 +32,7 @@ int main ( ) {
queries.push_back(queryWord);
}
*/
ISRWord queryWord( "moment" ) ;
ISRWord queryWord( "#penetr" ) ;
queries.push_back(queryWord);
vector<size_t> locations;
set<string> urls;
......@@ -44,7 +44,6 @@ int main ( ) {
while(query.getCurrentLocation() != MAX_Location) {
auto url = query.DocumentEnd->getCurrentDoc().url;
urls.insert( url );
cout << url << endl;
query.NextDocument();
}
......
......@@ -11,6 +11,11 @@ Chunk::Chunk(int number) {
to_string(number) + "-wordseek.txt",
IndexerConstants::chunkWordSeekKeySize,
IndexerConstants::chunkWordSeekValueSize);
string chunkFileName = util::GetCurrentWorkingDir() +
IndexerConstants::pathToIndex +
to_string(number) + ".txt";
chunkFileHandle = open(chunkFileName.c_str(), O_RDONLY);
chunkMap = util::getFileMap(chunkFileHandle);
numberUniqueWords = stoll(seeker.find("=numberUniqueWords"));
numberWords = stoll(seeker.find("=numberWords"));
numberDocs = stoll(seeker.find("=numberDocs"));
......
......@@ -9,6 +9,9 @@ class Chunk {
public:
Chunk(int number);
int chunkFileHandle;
char* chunkMap;
MMDiskHashTable seeker;
MMDiskHashTable wordSeek;
size_t numberUniqueWords;
......
#pragma once
namespace IndexerConstants {
// const string pathToIndex = "/buildIndex/";
const string pathToIndex = "/constraintSolver/index-test-files/twitter/";
const string pathToIndex = "/buildIndex/";
// const string pathToIndex = "/constraintSolver/index-test-files/twitter/";
const size_t maxWordSize = 30;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment