Skip to content
Snippets Groups Projects
Commit c2667b49 authored by Nicholas Yang's avatar Nicholas Yang
Browse files

hacked into making queries

parent 3d5f1a5e
No related branches found
No related tags found
No related merge requests found
...@@ -106,6 +106,7 @@ add_executable(ISRWord-tests ...@@ -106,6 +106,7 @@ add_executable(ISRWord-tests
util/util.cpp util/util.cpp
constraintSolver/ISRWord.cpp constraintSolver/ISRWord.cpp
constraintSolver/tests/ISRWordTests.cpp constraintSolver/tests/ISRWordTests.cpp
constraintSolver/ISREndDoc.cpp
util/stringProcessing.cpp util/stringProcessing.cpp
util/Stemmer.cpp ) util/Stemmer.cpp )
......
File added
...@@ -3,3 +3,106 @@ ...@@ -3,3 +3,106 @@
// //
#include "ISREndDoc.h" #include "ISREndDoc.h"
// Constructs a reader positioned before the first chunk (chunk 0) with no
// chunk file opened or mapped yet.
//
// memMap must start out as nullptr: next() compares it against nullptr to
// decide whether the current chunk still has to be opened and mapped, so
// leaving it uninitialized makes that first test read an indeterminate pointer.
// currentFile starts at -1 (no descriptor); next() assigns it before use.
ISREndDoc::ISREndDoc()
    : memMap(nullptr),
      currentChunk(0),
      currentFile(-1) {
}
// Advances to the next document-ending record and returns it.
//
// Returns one of:
//   - a record whose url is "aaa" once currentChunk has reached the hard-coded
//     chunk count below (the end-of-index sentinel callers compare against);
//   - a default-constructed DocumentEnding when the current chunk is exhausted
//     (a NUL byte is reached) or cannot be opened / mapped; in both cases
//     currentChunk is advanced so the following call moves to the next chunk;
//   - otherwise currentDoc, updated from the line that was just scanned.
//
// Parsing, per character of the line: '[' resets currentDoc; the text before
// the first ',' becomes url; the text before the next ',' becomes
// docEndPosition (only while it is still 0); the text before ']' becomes
// docNumWords; spaces are skipped; '\n' ends the record.
// stoll throws if a numeric field is empty or malformed.
DocumentEnding ISREndDoc::next() {
    // Number of chunks to read before reporting end-of-index.
    // NOTE(review): hard-coded for the twitter test index -- confirm / derive from the master index.
    const int kChunkCount = 8;

    // Checked before any file is touched: currentChunk only changes when a
    // chunk is finished, so testing it once per call is sufficient.
    if (currentChunk >= kChunkCount) {
        DocumentEnding a = DocumentEnding();
        a.url = "aaa";
        return a;
    }

    if (memMap == nullptr) {
        const string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + ".txt";
        currentFile = open(fileName.c_str(), O_RDONLY);

        bool mapped = false;
        if (currentFile >= 0) {
            // contents[0] is used as the byte offset at which scanning starts.
            const vector<size_t> contents = getSeekContents();
            const ssize_t mapSize = util::FileSize(currentFile);
            if (!contents.empty() && mapSize > 0 && contents[0] < static_cast<size_t>(mapSize)) {
                void* base = mmap(nullptr, mapSize, PROT_READ, MAP_PRIVATE, currentFile, 0);
                if (base != MAP_FAILED) {
                    memMap = static_cast<char*>(base) + contents[0];
                    mapped = true;
                }
            }
            if (!mapped) {
                close(currentFile);
                currentFile = -1;
            }
        }

        if (!mapped) {
            // Treat an unreadable chunk like an exhausted one so callers keep
            // making progress toward the end sentinel instead of crashing.
            currentChunk++;
            return DocumentEnding();
        }
    }

    // Query the size once per call rather than once per scanned character.
    const ssize_t fileSize = util::FileSize(currentFile);
    // NOTE(review): memMap has already been advanced past the start of the
    // mapping, so this bound lies beyond the mapped file; the scan is expected
    // to stop at '\n' or a NUL byte first. A strict bound (and a munmap of the
    // finished chunk) needs the mapping's base address kept as a member.
    char* const scanEnd = memMap + fileSize;

    string currentOne;
    for (char* map = memMap; map < scanEnd; map++) {
        if (*map == '\0') {
            // Chunk finished: release its descriptor and move to the next chunk.
            close(currentFile);
            currentFile = -1;
            currentChunk++;
            memMap = nullptr;
            return DocumentEnding();
        }
        if (*map == '\n') {
            // Remember where the next record starts.
            memMap = map + 1;
            break;
        }
        switch (*map) {
            case '[':
                currentDoc = DocumentEnding();
                break;
            case ']':
                currentDoc.docNumWords = stoll(currentOne);
                currentOne = "";
                break;
            case ',':
                if (currentDoc.url == "") {
                    currentDoc.url = currentOne;
                    currentOne = "";
                } else if (currentDoc.docEndPosition == 0) {
                    currentDoc.docEndPosition = stoll(currentOne);
                    currentOne = "";
                }
                break;
            case ' ':
                break;
            default:
                currentOne += *map;
                break;
        }
    }
    return currentDoc;
}
// Accessor for the document-ending record this reader is currently holding
// (the one next() fills in). The record is returned by value.
DocumentEnding ISREndDoc::getCurrentDoc() {
    return this->currentDoc;
}
// Reads the seek file of the current chunk ("index<currentChunk>-seek.txt")
// and returns the numbers listed after its "=docEnding" key.
//
// The file is scanned as whitespace-separated tokens. Once the token
// "=docEnding" has been seen, every following token is converted with stoll
// and appended to the result; scanning stops at the first alphabetic
// character after that, or at end of file.
//
// Returns an empty vector when the file cannot be opened or mapped, or when
// the key is not present. stoll throws if a token after the key is not
// numeric (the mapping is not released on that path).
vector<size_t> ISREndDoc::getSeekContents() {
    vector<size_t> contents;

    const string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + "-seek.txt";
    const int file = open(fileName.c_str(), O_RDONLY);
    if (file < 0) {
        return contents;
    }

    const ssize_t fileSize = util::FileSize(file);
    if (fileSize <= 0) {
        close(file);
        return contents;
    }

    // Named seekMap (not memMap) so the member of the same name is not shadowed.
    void* seekMap = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0);
    // The descriptor is no longer needed: an established mapping stays valid
    // after its file descriptor is closed.
    close(file);
    if (seekMap == MAP_FAILED) {
        return contents;
    }

    const char* const begin = static_cast<const char*>(seekMap);
    const char* const end = begin + fileSize;

    string word = "";
    bool midWord = false;
    bool midFind = false;
    bool reachedNextKey = false;
    for (const char* cur = begin; cur < end; ++cur) {
        // Cast first: passing a negative char value to isalpha is undefined.
        if (midFind && isalpha(static_cast<unsigned char>(*cur))) {
            reachedNextKey = true;
            break;
        }
        switch (*cur) {
            case '\n':
            case '\r':
            case '\t':
            case ' ':
                if (midFind && word != "") {
                    contents.push_back(stoll(word));
                    word = "";
                } else if (midWord) {
                    midWord = false;
                    if (word == "=docEnding") {
                        midFind = true;
                    }
                    word = "";
                }
                break;
            default:
                word += *cur;
                midWord = true;
        }
    }

    // A final number that runs up to end of file has no trailing whitespace to
    // trigger the push above; keep it instead of silently dropping it.
    if (!reachedNextKey && midFind && word != "") {
        contents.push_back(stoll(word));
    }

    munmap(seekMap, fileSize);
    return contents;
}
\ No newline at end of file
...@@ -4,22 +4,37 @@ ...@@ -4,22 +4,37 @@
#pragma once #pragma once
#include "ISR.h" #include <iostream>
#include <vector>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "WordSeek.h"
#include "../util/util.h"
#include "../indexer/DocumentEnding.h"
// Find occurrences of document ends. // Find occurrences of document ends.
class ISREndDoc : ISR class ISREndDoc
{ {
public: public:
Location GetCurrentLocation ( ); ISREndDoc();
DocumentEnding next();
Location GetPreviousLocation ( ); DocumentEnding getCurrentDoc();
unsigned GetDocumentLength ( ); unsigned GetDocumentLength ( );
unsigned GetTitleLength ( ); unsigned GetTitleLength ( );
string getURL ( );
unsigned GetUrlLength ( ); private:
}; DocumentEnding currentDoc;
char* memMap;
int currentChunk;
int currentFile;
vector<size_t> getSeekContents();
};
...@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) { ...@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) {
void ISRWord::getChunks() { void ISRWord::getChunks() {
listOfChunks = getSeekContents("index-test-files/twitter/index-master.txt"); listOfChunks = getSeekContents(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt");
// int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY); // int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
// ssize_t chunkFileSize = FileSize(chunkFile); // ssize_t chunkFileSize = FileSize(chunkFile);
// char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0); // char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
...@@ -117,16 +117,12 @@ void ISRWord::getChunks() { ...@@ -117,16 +117,12 @@ void ISRWord::getChunks() {
Location ISRWord::first ( ) Location ISRWord::first ( )
{ {
if ( listOfChunks.size( ) <= currentChunk )
{
exit( 0 );
}
string currentChunkSeekFileLocation = string currentChunkSeekFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
"-seek.txt"; "-seek.txt";
vector< size_t > location = getSeekContents( currentChunkSeekFileLocation ); vector< size_t > location = getSeekContents( currentChunkSeekFileLocation );
string currentChunkFileLocation = string currentChunkFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
".txt"; ".txt";
int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY ); int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
ssize_t currentChunkFileSize = FileSize( currentChunkFile ); ssize_t currentChunkFileSize = FileSize( currentChunkFile );
...@@ -156,8 +152,14 @@ Location ISRWord::next ( ) ...@@ -156,8 +152,14 @@ Location ISRWord::next ( )
if ( *currentMemMap == '\n' ) if ( *currentMemMap == '\n' )
{ {
currentChunk++; currentChunk++;
currentLocation = first( ); if(listOfChunks.size( ) <= currentChunk)
} {
currentLocation = 9999999999999;
return currentLocation;
}
currentLocation = first( );
}
else else
{ {
string delta = ""; string delta = "";
...@@ -172,6 +174,11 @@ Location ISRWord::next ( ) ...@@ -172,6 +174,11 @@ Location ISRWord::next ( )
return currentLocation; return currentLocation;
} }
Location ISRWord::getCurrentLocation()
{
return currentLocation;
}
//look thru each chunk //look thru each chunk
//check if absolute position at offset in chunk is less than chunk, //check if absolute position at offset in chunk is less than chunk,
//check seek lookup table to find if offset+absolute is bigger than target //check seek lookup table to find if offset+absolute is bigger than target
...@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) { ...@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) {
if(entry.realLocation < target) { if(entry.realLocation < target) {
best = entry; best = entry;
} else { } else {
string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt"; string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY); int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
ssize_t currentChunkFileSize = FileSize(currentChunkFile); ssize_t currentChunkFileSize = FileSize(currentChunkFile);
currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0); currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
......
...@@ -54,9 +54,9 @@ public: ...@@ -54,9 +54,9 @@ public:
//set member variables to all of the chunks that occur, update current chunk //set member variables to all of the chunks that occur, update current chunk
void getChunks ( ); void getChunks ( );
Location getCurrentLocation();
private: private:
}; };
...@@ -3,20 +3,36 @@ ...@@ -3,20 +3,36 @@
// //
#include <iostream> #include <iostream>
#include <set>
#include "../../indexer/DocumentEnding.h"
#include "../ISRWord.h" #include "../ISRWord.h"
#include "../ISREndDoc.h"
using namespace std; using namespace std;
int main ( ) int main ( )
{ {
char *w = new char[10]; char* query;
strcpy( w, "hello" ); ISRWord queryWord("iphone");
ISRWord word = ISRWord( w ); ISREndDoc endDocs;
vector<size_t> locations;
vector<DocumentEnding> docEnds;
set<string> urls;
while(queryWord.getCurrentLocation() != 9999999999999) {
locations.push_back(queryWord.next());
}
while(endDocs.next().url != "aaa") {
for(auto locs : locations) {
if(locs < endDocs.getCurrentDoc().docEndPosition &&
locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) {
urls.insert(endDocs.getCurrentDoc().url);
}
}
while ( 1 ) }
{ for(auto urrl : urls) {
cout << word.next( ) << endl; cout << urrl << endl;
} }
return 0; return 0;
} }
\ No newline at end of file \ No newline at end of file
No preview for this file type
...@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] ) ...@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] )
crawler.WaitOnAllSpiders( ); crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( ); indexer.WaitForFinish( );
string aa;
cin >> aa;
if(aa == "q") {
return 0;
}
auto f = urlFrontier->Pop( ); auto f = urlFrontier->Pop( );
int x = 0; int x = 0;
......
...@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch ...@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
{ {
set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '=' }; '(', ')', '*', '+', ',', ';', '=' };
string codedURL = "=";
codedURL += originalText;
(*docIndex)[codedURL].push_back(0);
return tokenize( splitStr( originalText, split, true ), offset, decorator ); return tokenize( splitStr( originalText, split, true ), offset, decorator );
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment