Skip to content
Snippets Groups Projects
ISRWord.cpp 4.59 KiB
Newer Older
  • Learn to ignore specific revisions
  • //
    // Created by Jake Close on 3/13/18.
    //
    
    
    jsclose's avatar
    jsclose committed
    #include <unistd.h>

    using namespace std;
    
    
    vcday's avatar
    vcday committed
    // Constructs a word ISR for `word`.
    //
    // Stores the word in `term`, loads the list of chunks via getChunks( ), and positions the
    // ISR on the first location reported by first( ) for chunk index 0.
    ISRWord::ISRWord ( char *word ) : term( word )
    	{
    	// first( ) indexes listOfChunks with currentChunk, so both must be set before it runs.
    	currentChunk = 0;
    	getChunks( );
    	currentLocation = first( );
    	}
    
    vcday's avatar
    vcday committed
    // Scans the whitespace-delimited text file `fileName` for the first token equal to `term`
    // and returns the numbers that follow it.
    //
    // Tokens are separated by tab, newline, carriage return or space. Once the term has been
    // matched, every following token is converted with stoll and appended to the result; the
    // scan stops at the first alphabetic character seen after the match.
    //
    // Returns an empty vector when the file cannot be opened or mapped, or when the term is
    // not found. stoll may throw if a token after the match is not a valid number.
    vector< size_t > ISRWord::getSeekContents ( string fileName )
    	{
    	vector< size_t > contents;

    	int file = open( fileName.c_str( ), O_RDONLY );
    	if ( file < 0 )
    		{
    		return contents;
    		}

    	// FileSize is defined elsewhere; a non-positive size cannot be mapped, so bail out early.
    	ssize_t fileSize = FileSize( file );
    	if ( fileSize <= 0 )
    		{
    		close( file );
    		return contents;
    		}

    	char *memMap = static_cast< char * >( mmap( nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0 ) );
    	// The mapping stays valid after the descriptor is closed, so release it right away.
    	close( file );
    	if ( memMap == MAP_FAILED )
    		{
    		return contents;
    		}

    	// Unmaps on every exit path, including when stoll throws.
    	struct MapGuard
    		{
    		void *address;
    		size_t length;
    		~MapGuard ( )
    			{
    			munmap( address, length );
    			}
    		} mapGuard{ memMap, static_cast< size_t >( fileSize ) };

    	const char *mapEnd = memMap + fileSize;
    	const char *map = memMap;
    	string word = "";
    	bool midWord = false;
    	bool midFind = false;
    	for ( ; map < mapEnd; map++ )
    		{
    		// isalpha requires a value representable as unsigned char; plain char may be negative.
    		if ( midFind && isalpha( static_cast< unsigned char >( *map ) ) )
    			{
    			break;
    			}
    		switch ( *map )
    			{
    			case '\t':
    			case '\n':
    			case '\r':
    			case ' ':
    				if ( midFind && word != "" )
    					{
    					contents.push_back( stoll( word ) );
    					word = "";
    					}
    				else if ( midWord )
    					{
    					midWord = false;
    					if ( word == term )
    						{
    						midFind = true;
    						}
    					word = "";
    					}
    				break;
    			default:
    				word += *map;
    				midWord = true;
    			}
    		}

    	// If the file ends without trailing whitespace, the last number is still pending.
    	if ( map == mapEnd && midFind && word != "" )
    		{
    		contents.push_back( stoll( word ) );
    		}
    	return contents;
    	}
    
    vcday's avatar
    vcday committed
    void ISRWord::getChunks ( )
    	{
    	string path = util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index-master.txt";
    	listOfChunks = getSeekContents( path );
    
    //    int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
    //    ssize_t chunkFileSize = FileSize(chunkFile);
    //    char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
    //    string word = "";
    //    bool midWord = false;
    //    bool midChunkFind = false;
    //    if(chunkMemMap != MAP_FAILED) {
    //        for(char* map = chunkMemMap; map < chunkMemMap + chunkFileSize; map++) {
    //            if(midChunkFind && isalpha(*map)) {
    //                break;
    //            }
    //            switch(*map) {
    //                case '\t':
    //                case '\n':
    //                case '\r':
    //                case ' ':
    //                    if (midChunkFind && word != "") {
    //                        listOfChunks.push_back(stoll(word));
    //                        word = "";
    //                    } else if (midWord) {
    //                        midWord = false;
    //                        if(word == term) {
    //                            midChunkFind = true;
    //                        }
    //                        word = "";
    //                    }
    //                    break;
    //                default:
    //                    word += *map;
    //                    midWord = true;
    //            }
    //        }
    //    }
    
    vcday's avatar
    vcday committed
    	}
    
    
    //Go to the current chunk
    //Look up the term in the seek dictionary for that chunk (mem map, binary search)
    //Use the resulting offset into the chunk to seek to the first post and return its value
    //Update the ISR's currentLocation
    //Set the current memory map
    //Returns the offset into the corpus
    
    
    vcday's avatar
    vcday committed
    // Positions the ISR on the first location of `term` in the chunk selected by currentChunk
    // and returns that location.
    //
    // Looks up the term's offset with getSeekContents( ) in the chunk's "-seek.txt" file, maps
    // the chunk's ".txt" file, and reads the number at that offset up to the next space.
    // currentMemMap is left pointing just past that space.
    //
    // Terminates the process with exit( 0 ) when currentChunk is past the end of listOfChunks
    // (unchanged from the original), and with exit( EXIT_FAILURE ) when the seek entry or
    // chunk file is missing, unmappable or truncated. stoll may throw on a malformed number.
    //
    // NOTE(review): the mapping created here is never unmapped, because only the advancing
    // cursor currentMemMap is kept; releasing it needs the base address and length stored as
    // members.
    Location ISRWord::first ( )
    	{
    	if ( listOfChunks.size( ) <= currentChunk )
    		{
    		exit( 0 );
    		}

    	// Both files share the same path prefix; build it once.
    	const string chunkPathPrefix =
    			util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] );

    	vector< size_t > location = getSeekContents( chunkPathPrefix + "-seek.txt" );
    	if ( location.empty( ) )
    		{
    		// No seek entry for the term: there is no offset to jump to.
    		exit( EXIT_FAILURE );
    		}

    	const string currentChunkFileLocation = chunkPathPrefix + ".txt";
    	int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
    	if ( currentChunkFile < 0 )
    		{
    		exit( EXIT_FAILURE );
    		}
    	ssize_t currentChunkFileSize = FileSize( currentChunkFile );
    	if ( currentChunkFileSize <= 0 || location[ 0 ] >= static_cast< size_t >( currentChunkFileSize ) )
    		{
    		// Empty or unreadable chunk, or a seek offset that lies outside the file.
    		close( currentChunkFile );
    		exit( EXIT_FAILURE );
    		}

    	char *chunkMap = static_cast< char * >(
    			mmap( nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0 ) );
    	// The mapping stays valid after the descriptor is closed.
    	close( currentChunkFile );
    	if ( chunkMap == MAP_FAILED )
    		{
    		exit( EXIT_FAILURE );
    		}

    	const char *chunkEnd = chunkMap + currentChunkFileSize;
    	currentMemMap = chunkMap + location[ 0 ];
    	string firstLoc = "";
    	while ( currentMemMap < chunkEnd && *currentMemMap != ' ' )
    		{
    		firstLoc += *currentMemMap;
    		currentMemMap++;
    		}
    	if ( currentMemMap >= chunkEnd )
    		{
    		// The number was not terminated by a space before the end of the file.
    		exit( EXIT_FAILURE );
    		}
    	currentMemMap++;
    	return stoll( firstLoc );
    	}
    
    
    //returns next absolute location in corpus
    
    //looks at memory map
    //if new line (end of current list for that file):
    //    move to next chunk, update chunk
    //    find new offset, return first location
    //else:
    //    find way to increment to next delta
    //    return new location
    
    
    vcday's avatar
    vcday committed
    // Advances the ISR by one posting and returns the updated currentLocation.
    //
    // If the cursor is on a newline, moves to the next chunk and takes its first location from
    // first( ). Otherwise reads the characters up to the next space as a delta, adds it to
    // currentLocation, and steps the cursor past the space.
    Location ISRWord::next ( )
    	{
    	if ( *currentMemMap != '\n' )
    		{
    		string deltaText;
    		for ( ; *currentMemMap != ' '; ++currentMemMap )
    			{
    			deltaText.push_back( *currentMemMap );
    			}
    		currentLocation += stoll( deltaText );
    		++currentMemMap;
    		return currentLocation;
    		}

    	// Newline: the current chunk's list is finished.
    	++currentChunk;
    	currentLocation = first( );
    	return currentLocation;
    	}
    
    
    //look through each chunk
    //check if absolute position at offset in chunk is less than chunk,
    //check seek lookup table to find if offset+absolute is bigger than target
    //if so, set location to that big chunk
    //go to next chunk
    
    
    vcday's avatar
    vcday committed
    // Moves the ISR forward until currentLocation is at or past `target` and returns it.
    //
    // The original body was empty, so any call flowed off the end of a non-void function.
    // This version is a forward-only linear scan built on next( ): if currentLocation is
    // already at or past `target`, it is returned unchanged and the ISR is not rewound.
    //
    // Running out of postings is handled by whatever next( ) does in that case.
    //
    // TODO: use the per-chunk seek tables to skip ahead instead of scanning every posting.
    Location ISRWord::seek ( Location target )
    	{
    	while ( currentLocation < target )
    		{
    		next( );
    		}
    	return currentLocation;
    	}