Skip to content
Snippets Groups Projects
Commit c2667b49 authored by Nicholas Yang's avatar Nicholas Yang
Browse files

hacked into making queries

parent 3d5f1a5e
No related branches found
No related tags found
No related merge requests found
......@@ -106,6 +106,7 @@ add_executable(ISRWord-tests
util/util.cpp
constraintSolver/ISRWord.cpp
constraintSolver/tests/ISRWordTests.cpp
constraintSolver/ISREndDoc.cpp
util/stringProcessing.cpp
util/Stemmer.cpp )
......
File added
......@@ -3,3 +3,106 @@
//
#include "ISREndDoc.h"
// Creates a reader positioned at chunk 0 with no chunk file mapped yet.
//
// memMap is explicitly set to nullptr because next() compares it against
// nullptr to decide whether the chunk file still has to be opened; without
// this the first comparison would read an indeterminate pointer.
// currentFile starts at -1 so it never aliases a real descriptor before the
// first open().
ISREndDoc::ISREndDoc() {
    currentChunk = 0;
    memMap = nullptr;
    currentFile = -1;
}
// Parses and returns the next document-ending record of the current chunk.
//
// When no chunk is mapped (memMap == nullptr) the file
// "<cwd>/constraintSolver/index-test-files/twitter/index<currentChunk>.txt"
// is opened and mapped, and the cursor is moved to the first offset reported
// by getSeekContents(). One line is then consumed per call: '[' starts a new
// record, the first ','-terminated field becomes url, the second becomes
// docEndPosition, the field closed by ']' becomes docNumWords, and spaces are
// ignored.
//
// Returns:
//   - the parsed record (also stored in currentDoc) after a line is consumed;
//   - a default-constructed DocumentEnding when the chunk is exhausted (a
//     '\0' byte is reached) or the chunk cannot be opened, located or mapped;
//     currentChunk is advanced so the next call moves on to the next chunk;
//   - a DocumentEnding whose url is "aaa" once currentChunk reaches 8, which
//     is the end-of-index sentinel.
//
// stoll() may throw if a numeric field is empty or malformed.
DocumentEnding ISREndDoc::next() {
    // Report the sentinel before touching the filesystem. currentChunk only
    // changes on paths that return, so checking once per call is equivalent
    // to checking inside the scan loop, and it avoids opening a chunk file
    // that may not exist.
    if (currentChunk == 8) {
        DocumentEnding a = DocumentEnding();
        a.url = "aaa";
        return a;
    }

    ssize_t fileSize = 0;
    if (memMap == nullptr) {
        string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + ".txt";
        currentFile = open(fileName.c_str(), O_RDONLY);
        if (currentFile < 0) {
            // Unreadable chunk: treat it like an exhausted one.
            currentChunk++;
            return DocumentEnding();
        }

        vector<size_t> contents = getSeekContents();
        fileSize = util::FileSize(currentFile);

        // Only map when there is a usable start offset inside the file.
        void* mapped = MAP_FAILED;
        if (!contents.empty() && fileSize > 0 && contents[0] < static_cast<size_t>(fileSize)) {
            mapped = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, currentFile, 0);
        }
        if (mapped == MAP_FAILED) {
            close(currentFile);
            currentFile = -1;
            currentChunk++;
            return DocumentEnding();
        }
        memMap = static_cast<char*>(mapped) + contents[0];
    } else {
        fileSize = util::FileSize(currentFile);
    }

    // NOTE(review): this bound is measured from the cursor, not from the start
    // of the mapping, so it can reach past the mapped file. Tightening it (and
    // unmapping the chunk when it is exhausted) requires remembering the
    // mapping base in the class.
    char* const scanEnd = memMap + fileSize;
    string currentOne;
    for (char* map = memMap; map < scanEnd; map++) {
        if (*map == '\0') {
            // End of this chunk: release the descriptor and move on.
            close(currentFile);
            currentFile = -1;
            currentChunk++;
            memMap = nullptr;
            return DocumentEnding();
        }
        if (*map == '\n') {
            // Leave the cursor on the first byte of the following line.
            memMap = map + 1;
            break;
        }
        switch (*map) {
            case '[':
                currentDoc = DocumentEnding();
                break;
            case ']':
                currentDoc.docNumWords = stoll(currentOne);
                currentOne = "";
                break;
            case ',':
                if (currentDoc.url == "") {
                    currentDoc.url = currentOne;
                    currentOne = "";
                } else if (currentDoc.docEndPosition == 0) {
                    currentDoc.docEndPosition = stoll(currentOne);
                    currentOne = "";
                }
                break;
            case ' ':
                break;
            default:
                currentOne += *map;
                break;
        }
    }
    return currentDoc;
}
// Returns a copy of the record currently stored in currentDoc (the one most
// recently assembled by next()); the reader position is not changed.
DocumentEnding ISREndDoc::getCurrentDoc() {
    return this->currentDoc;
}
// Reads the seek file of the current chunk
// ("<cwd>/constraintSolver/index-test-files/twitter/index<currentChunk>-seek.txt")
// and returns the numeric values that follow the "=docEnding" key.
//
// Tokens are separated by '\n', '\r', '\t' or ' '. Once "=docEnding" has been
// seen, every following token is converted with stoll() and appended to the
// result, until an alphabetic character is met, which ends the scan.
//
// Returns an empty vector when the file cannot be opened, is empty, cannot be
// mapped, or does not contain the key. stoll() may throw on a malformed value.
vector<size_t> ISREndDoc::getSeekContents() {
    vector<size_t> contents;

    string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + "-seek.txt";
    int file = open(fileName.c_str(), O_RDONLY);
    if (file < 0) {
        return contents;
    }

    ssize_t fileSize = util::FileSize(file);
    if (fileSize <= 0) {
        close(file);
        return contents;
    }

    void* mapping = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0);
    // The mapping stays valid after the descriptor is closed, so release the
    // descriptor right away instead of leaking it on every call.
    close(file);
    if (mapping == MAP_FAILED) {
        return contents;
    }

    const char* const begin = static_cast<const char*>(mapping);
    const char* const end = begin + fileSize;

    string word = "";
    bool midWord = false;
    bool midFind = false;
    for (const char* map = begin; map < end; map++) {
        // An alphabetic character after the key means the next entry started.
        // The cast keeps isalpha() defined for bytes with the high bit set.
        if (midFind && isalpha(static_cast<unsigned char>(*map))) {
            break;
        }
        switch (*map) {
            case '\n':
            case '\r':
            case '\t':
            case ' ':
                if (midFind && word != "") {
                    contents.push_back(stoll(word));
                    word = "";
                } else if (midWord) {
                    midWord = false;
                    if (word == "=docEnding") {
                        midFind = true;
                    }
                    word = "";
                }
                break;
            default:
                word += *map;
                midWord = true;
        }
    }

    munmap(mapping, fileSize);
    return contents;
}
\ No newline at end of file
......@@ -4,22 +4,37 @@
#pragma once
#include "ISR.h"
#include <iostream>
#include <vector>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "WordSeek.h"
#include "../util/util.h"
#include "../indexer/DocumentEnding.h"
// Find occurrences of document ends.
class ISREndDoc : ISR
class ISREndDoc
{
public:
Location GetCurrentLocation ( );
Location GetPreviousLocation ( );
ISREndDoc();
DocumentEnding next();
DocumentEnding getCurrentDoc();
unsigned GetDocumentLength ( );
unsigned GetTitleLength ( );
unsigned GetUrlLength ( );
};
string getURL ( );
private:
DocumentEnding currentDoc;
char* memMap;
int currentChunk;
int currentFile;
vector<size_t> getSeekContents();
};
......@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) {
void ISRWord::getChunks() {
listOfChunks = getSeekContents("index-test-files/twitter/index-master.txt");
listOfChunks = getSeekContents(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt");
// int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
// ssize_t chunkFileSize = FileSize(chunkFile);
// char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
......@@ -117,16 +117,12 @@ void ISRWord::getChunks() {
Location ISRWord::first ( )
{
if ( listOfChunks.size( ) <= currentChunk )
{
exit( 0 );
}
string currentChunkSeekFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
"-seek.txt";
vector< size_t > location = getSeekContents( currentChunkSeekFileLocation );
string currentChunkFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
".txt";
int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
ssize_t currentChunkFileSize = FileSize( currentChunkFile );
......@@ -156,8 +152,14 @@ Location ISRWord::next ( )
if ( *currentMemMap == '\n' )
{
currentChunk++;
currentLocation = first( );
}
if(listOfChunks.size( ) <= currentChunk)
{
currentLocation = 9999999999999;
return currentLocation;
}
currentLocation = first( );
}
else
{
string delta = "";
......@@ -172,6 +174,11 @@ Location ISRWord::next ( )
return currentLocation;
}
// Accessor for currentLocation: reports the stored position without
// advancing the reader.
Location ISRWord::getCurrentLocation()
{
    return this->currentLocation;
}
//look through each chunk
//check if absolute position at offset in chunk is less than chunk,
//check seek lookup table to find if offset+absolute is bigger than target
......@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) {
if(entry.realLocation < target) {
best = entry;
} else {
string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
ssize_t currentChunkFileSize = FileSize(currentChunkFile);
currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
......
......@@ -54,9 +54,9 @@ public:
//set member variables to all of the chunks that occur, update current chunk
void getChunks ( );
Location getCurrentLocation();
private:
};
};
......@@ -3,20 +3,36 @@
//
#include <iostream>
#include <set>
#include "../../indexer/DocumentEnding.h"
#include "../ISRWord.h"
#include "../ISREndDoc.h"
using namespace std;
int main ( )
{
char *w = new char[10];
strcpy( w, "hello" );
ISRWord word = ISRWord( w );
{
char* query;
ISRWord queryWord("iphone");
ISREndDoc endDocs;
vector<size_t> locations;
vector<DocumentEnding> docEnds;
set<string> urls;
while(queryWord.getCurrentLocation() != 9999999999999) {
locations.push_back(queryWord.next());
}
while(endDocs.next().url != "aaa") {
for(auto locs : locations) {
if(locs < endDocs.getCurrentDoc().docEndPosition &&
locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) {
urls.insert(endDocs.getCurrentDoc().url);
}
}
while ( 1 )
{
cout << word.next( ) << endl;
}
return 0;
}
\ No newline at end of file
}
for(auto urrl : urls) {
cout << urrl << endl;
}
return 0;
}
\ No newline at end of file
No preview for this file type
......@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] )
crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( );
string aa;
cin >> aa;
if(aa == "q") {
return 0;
}
auto f = urlFrontier->Pop( );
int x = 0;
......
......@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
{
set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '=' };
string codedURL = "=";
codedURL += originalText;
(*docIndex)[codedURL].push_back(0);
return tokenize( splitStr( originalText, split, true ), offset, decorator );
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment