Skip to content
Snippets Groups Projects
Commit 89159931 authored by benbergk's avatar benbergk
Browse files

Merge branch 'milestone2' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into milestone2

parents 1ab1c97d 129bba17
No related branches found
No related tags found
No related merge requests found
...@@ -120,6 +120,7 @@ add_executable(ISRWord-tests ...@@ -120,6 +120,7 @@ add_executable(ISRWord-tests
util/util.cpp util/util.cpp
constraintSolver/ISRWord.cpp constraintSolver/ISRWord.cpp
constraintSolver/tests/ISRWordTests.cpp constraintSolver/tests/ISRWordTests.cpp
constraintSolver/ISREndDoc.cpp
util/stringProcessing.cpp util/stringProcessing.cpp
util/Stemmer.cpp ) util/Stemmer.cpp )
......
File added
...@@ -3,3 +3,106 @@ ...@@ -3,3 +3,106 @@
// //
#include "ISREndDoc.h" #include "ISREndDoc.h"
// Constructs a reader positioned before the first record of chunk 0.
//
// memMap must start out as nullptr: next() compares it against nullptr to
// decide whether a chunk file still has to be opened and mapped, so leaving it
// uninitialised would make the first call read an indeterminate pointer.
// currentFile is set to -1 so that it never holds an indeterminate descriptor
// value before the first open().
ISREndDoc::ISREndDoc() {
    memMap = nullptr;
    currentChunk = 0;
    currentFile = -1;
}
// Returns the next document-ending record from the on-disk index, opening and
// mapping chunk files lazily.
//
// Return value is one of:
//   - the record parsed from the next line of the current chunk;
//   - a default-constructed DocumentEnding when the current chunk is exhausted
//     or could not be opened/mapped (the following call moves on to the next
//     chunk);
//   - a DocumentEnding whose url is "aaa" once currentChunk reaches 8, which is
//     the end-of-iteration marker.
//
// May throw whatever stoll throws if a numeric field of a record is malformed.
DocumentEnding ISREndDoc::next() {
    // Chunk index at which iteration stops. Checked before any file is touched
    // so that the marker is produced even when no file exists for this chunk.
    const int endChunk = 8;
    if(currentChunk == endChunk) {
        DocumentEnding sentinel = DocumentEnding();
        sentinel.url = "aaa";
        return sentinel;
    }

    // Abandons the current chunk: releases its descriptor, schedules the next
    // chunk and reports "nothing here" to the caller.
    auto finishChunk = [this]() -> DocumentEnding {
        if(currentFile != -1) {
            close(currentFile);
            currentFile = -1;
        }
        currentChunk++;
        memMap = nullptr;
        return DocumentEnding();
    };

    if(memMap == nullptr) {
        string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + ".txt";
        currentFile = open(fileName.c_str(), O_RDONLY);
        if(currentFile == -1) {
            return finishChunk();
        }
        vector<size_t> contents = getSeekContents();
        const ssize_t fileSize = util::FileSize(currentFile);
        // The first seek entry is the byte offset where scanning starts; it has
        // to exist and has to lie inside the file.
        if(contents.empty() || fileSize <= 0 || contents[0] >= static_cast<size_t>(fileSize)) {
            return finishChunk();
        }
        void* mapping = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, currentFile, 0);
        if(mapping == MAP_FAILED) {
            return finishChunk();
        }
        memMap = static_cast<char*>(mapping) + contents[0];
    }

    // NOTE(review): this limit is measured from the current cursor, not from the
    // start of the mapping, so it lies past the real end of the file. The class
    // does not store the mapping's base address, so a tight bound (and a
    // matching munmap) needs an extra member in the header. Until then the scan
    // relies on meeting '\n' or '\0' before leaving the mapping.
    const char* const scanLimit = memMap + util::FileSize(currentFile);
    string currentOne;
    for(char* map = memMap; map < scanLimit; map++) {
        if(*map == '\0') {
            // A NUL byte marks the end of this chunk's data.
            return finishChunk();
        }
        if(*map == '\n') {
            // Record complete: remember where the next one starts.
            memMap = map;
            memMap++;
            break;
        }
        switch(*map) {
            case '[':
                // Start of a record: discard whatever was parsed before.
                currentDoc = DocumentEnding();
                break;
            case ']':
                // End of a record: the pending token is the word count.
                currentDoc.docNumWords = stoll(currentOne);
                currentOne = "";
                break;
            case ',':
                // Fields arrive in order: url first, then the end position.
                if(currentDoc.url == "") {
                    currentDoc.url = currentOne;
                    currentOne = "";
                } else if(currentDoc.docEndPosition == 0) {
                    currentDoc.docEndPosition = stoll(currentOne);
                    currentOne = "";
                }
                break;
            case ' ':
                break;
            default:
                currentOne += *map;
                break;
        }
    }
    return currentDoc;
}
// Accessor for the record that next() most recently filled in.
// Returns a copy, so the caller's value is unaffected by later next() calls.
DocumentEnding ISREndDoc::getCurrentDoc()
{
    return this->currentDoc;
}
// Reads the seek file that accompanies the current chunk and returns the
// numbers listed after the "=docEnding" key.
//
// The file is scanned as whitespace-separated tokens. Once the token
// "=docEnding" has been seen, every following token is converted with stoll
// and appended to the result; scanning stops at the first alphabetic character
// encountered after that point. A trailing token that is not followed by
// whitespace is not reported.
//
// Returns an empty vector when the seek file cannot be opened or mapped, or
// when the key is absent. May throw whatever stoll throws if a token after the
// key is not numeric.
vector<size_t> ISREndDoc::getSeekContents() {
    vector<size_t> contents;

    string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + "-seek.txt";
    int file = open(fileName.c_str(), O_RDONLY);
    if(file == -1) {
        return contents;
    }

    ssize_t fileSize = util::FileSize(file);
    if(fileSize <= 0) {
        close(file);
        return contents;
    }

    void* mapping = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0);
    // The mapping stays valid after the descriptor is closed, so release the
    // descriptor right away instead of leaking one per call.
    close(file);
    if(mapping == MAP_FAILED) {
        return contents;
    }

    const char* const seekBegin = static_cast<const char*>(mapping);
    const char* const seekEnd = seekBegin + fileSize;

    string word = "";
    bool midWord = false;   // a token is currently being accumulated
    bool midFind = false;   // the "=docEnding" key has been seen
    for(const char* map = seekBegin; map < seekEnd; map++) {
        // isalpha requires a value representable as unsigned char.
        if(midFind && isalpha(static_cast<unsigned char>(*map))) {
            break;
        }
        switch(*map) {
            case '\n':
            case '\r':
            case '\t':
            case ' ':
                if (midFind && word != "") {
                    contents.push_back(stoll(word));
                    word = "";
                } else if (midWord) {
                    midWord = false;
                    if(word == "=docEnding") {
                        midFind = true;
                    }
                    word = "";
                }
                break;
            default:
                word += *map;
                midWord = true;
        }
    }

    munmap(mapping, fileSize);
    return contents;
}
\ No newline at end of file
...@@ -4,22 +4,37 @@ ...@@ -4,22 +4,37 @@
#pragma once #pragma once
#include "ISR.h" #include <iostream>
#include <vector>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "WordSeek.h"
#include "../util/util.h"
#include "../indexer/DocumentEnding.h"
// Find occurrences of document ends. // Find occurrences of document ends.
class ISREndDoc : ISR class ISREndDoc
{ {
public: public:
Location GetCurrentLocation ( ); ISREndDoc();
DocumentEnding next();
Location GetPreviousLocation ( ); DocumentEnding getCurrentDoc();
unsigned GetDocumentLength ( ); unsigned GetDocumentLength ( );
unsigned GetTitleLength ( ); unsigned GetTitleLength ( );
string getURL ( );
unsigned GetUrlLength ( ); private:
}; DocumentEnding currentDoc;
char* memMap;
int currentChunk;
int currentFile;
vector<size_t> getSeekContents();
};
...@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) { ...@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) {
void ISRWord::getChunks() { void ISRWord::getChunks() {
listOfChunks = getSeekContents("index-test-files/twitter/index-master.txt"); listOfChunks = getSeekContents(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt");
// int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY); // int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
// ssize_t chunkFileSize = FileSize(chunkFile); // ssize_t chunkFileSize = FileSize(chunkFile);
// char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0); // char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
...@@ -117,16 +117,12 @@ void ISRWord::getChunks() { ...@@ -117,16 +117,12 @@ void ISRWord::getChunks() {
Location ISRWord::first ( ) Location ISRWord::first ( )
{ {
if ( listOfChunks.size( ) <= currentChunk )
{
exit( 0 );
}
string currentChunkSeekFileLocation = string currentChunkSeekFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
"-seek.txt"; "-seek.txt";
vector< size_t > location = getSeekContents( currentChunkSeekFileLocation ); vector< size_t > location = getSeekContents( currentChunkSeekFileLocation );
string currentChunkFileLocation = string currentChunkFileLocation =
util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) + util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
".txt"; ".txt";
int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY ); int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
ssize_t currentChunkFileSize = FileSize( currentChunkFile ); ssize_t currentChunkFileSize = FileSize( currentChunkFile );
...@@ -156,8 +152,14 @@ Location ISRWord::next ( ) ...@@ -156,8 +152,14 @@ Location ISRWord::next ( )
if ( *currentMemMap == '\n' ) if ( *currentMemMap == '\n' )
{ {
currentChunk++; currentChunk++;
currentLocation = first( ); if(listOfChunks.size( ) <= currentChunk)
} {
currentLocation = 9999999999999;
return currentLocation;
}
currentLocation = first( );
}
else else
{ {
string delta = ""; string delta = "";
...@@ -172,6 +174,11 @@ Location ISRWord::next ( ) ...@@ -172,6 +174,11 @@ Location ISRWord::next ( )
return currentLocation; return currentLocation;
} }
Location ISRWord::getCurrentLocation()
{
return currentLocation;
}
//look thru each chunk //look thru each chunk
//check if absolute position at offset in chunk is less than chunk, //check if absolute position at offset in chunk is less than chunk,
//check seek lookup table to find if offset+absolute is bigger than target //check seek lookup table to find if offset+absolute is bigger than target
...@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) { ...@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) {
if(entry.realLocation < target) { if(entry.realLocation < target) {
best = entry; best = entry;
} else { } else {
string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt"; string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY); int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
ssize_t currentChunkFileSize = FileSize(currentChunkFile); ssize_t currentChunkFileSize = FileSize(currentChunkFile);
currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0); currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
......
...@@ -54,9 +54,9 @@ public: ...@@ -54,9 +54,9 @@ public:
//set member variables to all of the chunks that occur, update current chunk //set member variables to all of the chunks that occur, update current chunk
void getChunks ( ); void getChunks ( );
Location getCurrentLocation();
private: private:
}; };
...@@ -3,20 +3,36 @@ ...@@ -3,20 +3,36 @@
// //
#include <iostream> #include <iostream>
#include <set>
#include "../../indexer/DocumentEnding.h"
#include "../ISRWord.h" #include "../ISRWord.h"
#include "../ISREndDoc.h"
using namespace std; using namespace std;
int main ( ) int main ( )
{ {
char *w = new char[10]; char* query;
strcpy( w, "hello" ); ISRWord queryWord("iphone");
ISRWord word = ISRWord( w ); ISREndDoc endDocs;
vector<size_t> locations;
vector<DocumentEnding> docEnds;
set<string> urls;
while(queryWord.getCurrentLocation() != 9999999999999) {
locations.push_back(queryWord.next());
}
while(endDocs.next().url != "aaa") {
for(auto locs : locations) {
if(locs < endDocs.getCurrentDoc().docEndPosition &&
locs >= (endDocs.getCurrentDoc().docEndPosition - endDocs.getCurrentDoc().docNumWords)) {
urls.insert(endDocs.getCurrentDoc().url);
}
}
while ( 1 ) }
{ for(auto urrl : urls) {
cout << word.next( ) << endl; cout << urrl << endl;
} }
return 0; return 0;
} }
\ No newline at end of file \ No newline at end of file
No preview for this file type
...@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] ) ...@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] )
crawler.WaitOnAllSpiders( ); crawler.WaitOnAllSpiders( );
indexer.WaitForFinish( ); indexer.WaitForFinish( );
string aa;
cin >> aa;
if(aa == "q") {
return 0;
}
auto f = urlFrontier->Pop( ); auto f = urlFrontier->Pop( );
int x = 0; int x = 0;
......
...@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch ...@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
{ {
set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'', set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
'(', ')', '*', '+', ',', ';', '=' }; '(', ')', '*', '+', ',', ';', '=' };
string codedURL = "=";
codedURL += originalText;
(*docIndex)[codedURL].push_back(0);
return tokenize( splitStr( originalText, split, true ), offset, decorator ); return tokenize( splitStr( originalText, split, true ), offset, decorator );
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment