Skip to content
Snippets Groups Projects
ISRWord.cpp 5.71 KiB

#include <string>
#include "ISRWord.h"


size_t FileSize(int f) {
    struct stat fileInfo;
    fstat( f, &fileInfo);
    return fileInfo.st_size;
}

ISRWord::ISRWord ( string word ) {
	term = word;

	getChunks( );
	currentChunk = 0;
	currentLocation = First( );
	DocumentEnd->seek( currentLocation );



}

void ISRWord::getChunks() {
    MMDiskHashTable diskHashTable(util::GetCurrentWorkingDir() + pathToIndex + "master.txt" , 30, 168);
	string value = diskHashTable.find(term);
    string chunkInput = "";
    for(char val : value) {
        if(isnumber(val)) {
            chunkInput += val;
        } else if(val != '\t') {
            listOfChunks.push_back(stoll(chunkInput));
            chunkInput = "";
        }
    }
    if(chunkInput != "") {
        frequency = stoll(chunkInput);
    }
}

//Go to current chunk
//Look in seek dictionary for chunk (mem map, binary search)
//Have offset into chunk, find post seek to post, return value
//update ISR currentLocation
//set current memory map
//returns offset into corpus

Location ISRWord::First ( )
	{
	if(listOfChunks.empty()) {
		currentLocation = MAX_Location;
		return MAX_Location;

	}
	string currentChunkSeekFileLocation =
			util::GetCurrentWorkingDir( ) + pathToIndex + to_string( listOfChunks[ currentChunk ] ) +
			"-seek.txt";
    MMDiskHashTable currentChunkSeekFileHashTable = MMDiskHashTable(currentChunkSeekFileLocation, 30, 8);
    string loc = currentChunkSeekFileHashTable.find(term);
	string currentChunkFileLocation =
			util::GetCurrentWorkingDir( ) + pathToIndex + to_string( listOfChunks[ currentChunk ] ) +
			".txt";
	int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
	ssize_t currentChunkFileSize = FileSize( currentChunkFile );
	currentMemMap = ( char * ) mmap( nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0 );
	currentMemMap += stoll(loc);
    string firstLoc = "";
	while ( *currentMemMap != ' ' ) {
		firstLoc += *currentMemMap;
		currentMemMap++;
    }
	currentMemMap++;
	getWordSeek();
	return stoll( firstLoc );
	}

//returns next absolute location in corpus
//looks at memory map
//if new line ( end of current list for that file
//move to next chunk, update chunk
//find new offset, return first location
//else
//find way to increment to next delta
//return new location

Location ISRWord::Next ( )
	{
	if ( *currentMemMap == '\n' )
		{
		currentChunk++;
        if(listOfChunks.size( ) <= currentChunk)
            {
            currentLocation = MAX_Location;
            return currentLocation;
            }

            currentLocation = First( );
        }
	else
		{
		string delta = "";
		while ( *currentMemMap != ' ' )
			{
			delta += *currentMemMap;
			currentMemMap++;
			}
		currentLocation += stoll( delta );
		currentMemMap++;
		}


	//DocumentEnd->seek( currentLocation );
	return currentLocation;
	}

Location ISRWord::getCurrentLocation() {
    return currentLocation;
}

size_t ISRWord::getFrequency() {
	return frequency;
}

void ISRWord::getWordSeek() {
	string currentChunkWordSeekFileLocation =
			util::GetCurrentWorkingDir( ) + pathToIndex + to_string( listOfChunks[ currentChunk ] ) +
			"-wordseek.txt";
	MMDiskHashTable wordSeek = MMDiskHashTable(currentChunkWordSeekFileLocation, 30, 168);
	string result = wordSeek.find(term);
	WordSeek wordDictionaryEntry;
	string token = "";
	for(char comp : result) {
		switch(comp) {
			case '<':
				wordDictionaryEntry = WordSeek();
				break;
			case '>':
				wordDictionaryEntry.seekOffset = stoll(token);
				wordSeekLookupTable.push_back(wordDictionaryEntry);
				token = "";
				break;
			case ',':
				wordDictionaryEntry.realLocation = stoll(token);
				token = "";
				break;
			default:
				token += comp;
		}
	}
}

//look thru each chunk
//check if absolute position at offset in chunk is less then chunk,
//check seek lookup table to find if offset+absulte is bigger than target
//if so, set location to that big chunk
//go to next chunk
Location ISRWord::Seek( Location target ) {
	 if(target <= currentLocation)
		 return currentLocation;

    if(!wordSeekLookupTable.empty()) {
        auto best = wordSeekLookupTable.front();
        for(auto entry : wordSeekLookupTable) {
            if(entry.realLocation < target) {
                best = entry;
            } else {
                string currentChunkFileLocation = util::GetCurrentWorkingDir() + pathToIndex + to_string(listOfChunks[currentChunk]) + ".txt";
                int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
                ssize_t currentChunkFileSize = FileSize(currentChunkFile);
                currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
                currentMemMap += best.seekOffset;
                currentLocation = best.realLocation;
					 DocumentEnd->seek( currentLocation );
                return best.realLocation;
            }
        }
    } else {
        while(Next() <= target) {

        }
		if( currentLocation == MAX_Location)
			return MAX_Location;

		 DocumentEnd->seek( currentLocation );
		 return currentLocation;
    }
}


Location  ISRWord::NextDocument()
	{
	//FixMe
	//seek the isr to the first location after the doc end
	 currentLocation = Seek( DocumentEnd->getCurrentDoc().docEndPosition + 1);
	//update the doc end to the next doc end after the new seek position
	return DocumentEnd->getCurrentDoc().docEndPosition;



	}

ISREndDoc * ISRWord::GetEndDocument()
	{
	//Fixme
	return DocumentEnd;
	}

ISR * ISRWord::GetISRToBeginningOfDocument ( ) {

	Location beginningOfDoc = DocumentEnd->getCurrentDoc().docEndPosition - DocumentEnd->getCurrentDoc().docNumWords;
	ISR* BeginngISR = new ISRWord(this->term);
	BeginngISR->Seek(beginningOfDoc);


	}

string ISRWord::GetTerm()
	{

	return string(this->term);
	}