Skip to content
Snippets Groups Projects
Commit 5495317c authored by aanvi's avatar aanvi
Browse files

Merge branch 'milestone1' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into milestone1

parents a73589cb 76386b83
No related branches found
No related tags found
1 merge request: !2 WIP: Crawler parser 2 merge into duplicate url-crawler
...@@ -54,7 +54,7 @@ add_executable(URLTEST shared/url.h shared/urlTest.cpp) ...@@ -54,7 +54,7 @@ add_executable(URLTEST shared/url.h shared/urlTest.cpp)
add_executable(search-engine search.cpp query/Query.cpp) add_executable(search-engine search.cpp query/Query.cpp)
add_executable(ISRWord-tests util/util.cpp constraintSolver/ISRWord.cpp constraintSolver/tests/ISRWordTests.cpp ) add_executable(ISRWord-tests constraintSolver/ISRWord.cpp constraintSolver/tests/ISRWordTests.cpp )
find_package(OpenSSL REQUIRED) find_package(OpenSSL REQUIRED)
......
...@@ -2,9 +2,14 @@ ...@@ -2,9 +2,14 @@
// Created by Jake Close on 3/13/18. // Created by Jake Close on 3/13/18.
// //
#include <unistd.h>
#include <string>
#include "ISRWord.h"
using namespace std;

/**
 * Returns the size in bytes of the file referred to by an open descriptor.
 *
 * @param f  Open file descriptor to query with fstat().
 * @return   The st_size reported by fstat(), or 0 when fstat() fails
 *           (for example when f is not a valid descriptor).
 */
size_t FileSize(int f) {
    struct stat fileInfo;
    // fstat() leaves fileInfo unspecified on failure, so never read it then.
    if (fstat(f, &fileInfo) != 0) {
        return 0;
    }
    return static_cast<size_t>(fileInfo.st_size);
}
ISRWord::ISRWord(char* word) : term(word) { ISRWord::ISRWord(char* word) : term(word) {
getChunks(); getChunks();
...@@ -24,15 +29,28 @@ vector<size_t> ISRWord::getSeekContents(string fileName) { ...@@ -24,15 +29,28 @@ vector<size_t> ISRWord::getSeekContents(string fileName) {
string word = ""; string word = "";
bool midWord = false; bool midWord = false;
bool midFind = false; bool midFind = false;
WordSeek wordDictionaryEntry;
if(memMap != MAP_FAILED) { if(memMap != MAP_FAILED) {
for(char* map = memMap; map < memMap + fileSize; map++) { for(char* map = memMap; map < memMap + fileSize; map++) {
if(midFind && isalpha(*map)) { if(midFind && isalpha(*map)) {
break; break;
} }
switch(*map) { switch(*map) {
case '\t': if(midFind) {
case '<':
wordDictionaryEntry = WordSeek();
break;
case '>':
wordDictionaryEntry.seekOffset = stoll(word);
wordSeekLookupTable.push_back(wordDictionaryEntry);
break;
case ',':
wordDictionaryEntry.realLocation = stoll(word);
break;
}
case '\n': case '\n':
case '\r': case '\r':
case '\t':
case ' ': case ' ':
if (midFind && word != "") { if (midFind && word != "") {
contents.push_back(stoll(word)); contents.push_back(stoll(word));
...@@ -151,6 +169,25 @@ Location ISRWord::next() { ...@@ -151,6 +169,25 @@ Location ISRWord::next() {
//go to next chunk //go to next chunk
/**
 * Repositions this ISR relative to `target` and returns the resulting location.
 *
 * When wordSeekLookupTable is empty, next() is called repeatedly until it
 * returns a value greater than `target`, and currentLocation is returned.
 *
 * Otherwise the table is scanned in order for the last entry whose
 * realLocation is below `target`, stopping at the first entry that is not
 * below it; the first table entry is used when none qualifies. The current
 * chunk file is then memory-mapped, currentMemMap is set to the mapping start
 * plus that entry's seekOffset, and currentLocation is set to the entry's
 * realLocation.
 *
 * If the chunk file cannot be opened or mapped, the cursor is left untouched
 * and the existing currentLocation is returned.
 *
 * NOTE(review): any mapping previously referenced by currentMemMap is not
 * unmapped here, because its base address and length are not tracked in this
 * function. Confirm where that ownership should live.
 *
 * @param target  Location to seek toward.
 * @return        The location the ISR is positioned at after the call.
 */
Location ISRWord::seek( Location target ) {
    if(wordSeekLookupTable.empty()) {
        while(next() <= target) {
        }
        return currentLocation;
    }

    // Pick the last entry strictly below target; the scan stops at the first
    // entry at or past target, exactly as the in-loop jump did before.
    const WordSeek* best = &wordSeekLookupTable.front();
    for(const auto& entry : wordSeekLookupTable) {
        if(entry.realLocation >= target) {
            break;
        }
        best = &entry;
    }

    // The jump now happens after the scan, so a table whose entries are all
    // below target still repositions and returns a value.
    const string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
    const int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
    if(currentChunkFile < 0) {
        return currentLocation;
    }

    const size_t currentChunkFileSize = FileSize(currentChunkFile);
    void* mapping = mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);
    // The mapping stays valid after the descriptor is closed, so release the
    // descriptor right away instead of leaking one per seek.
    close(currentChunkFile);
    if(mapping == MAP_FAILED) {
        return currentLocation;
    }

    currentMemMap = static_cast<char*>(mapping) + best->seekOffset;
    currentLocation = best->realLocation;
    return currentLocation;
}
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#pragma once #pragma once
//#include "ISR.h" //#include "ISR.h"
#include <iostream>
#include <vector> #include <vector>
#include <fcntl.h> #include <fcntl.h>
#include <stdio.h> #include <stdio.h>
...@@ -13,18 +14,13 @@ ...@@ -13,18 +14,13 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include "WordSeek.h"
//#include "../util/util.h" //#include "../util/util.h"
/**
 * Returns the size in bytes of the file referred to by an open descriptor.
 *
 * Declared inline because this definition lives in a header: without it,
 * every translation unit that includes the header emits its own external
 * definition and the link fails with a multiple-definition error.
 *
 * @param f  Open file descriptor to query with fstat().
 * @return   The st_size reported by fstat(), or 0 when fstat() fails
 *           (for example when f is not a valid descriptor).
 */
inline size_t FileSize(int f) {
    struct stat fileInfo;
    // fstat() leaves fileInfo unspecified on failure, so never read it then.
    if (fstat(f, &fileInfo) != 0) {
        return 0;
    }
    return static_cast<size_t>(fileInfo.st_size);
}
using namespace std; using namespace std;
//Find occurrences of individual words //Find occurrences of individual words
typedef size_t Location; typedef size_t Location;
...@@ -50,6 +46,7 @@ public: ...@@ -50,6 +46,7 @@ public:
char* term; char* term;
char* masterIndex; char* masterIndex;
vector<size_t> listOfChunks; vector<size_t> listOfChunks;
vector<WordSeek> wordSeekLookupTable;
size_t currentChunk; size_t currentChunk;
char* currentMemMap; char* currentMemMap;
......
#pragma once
// One entry of a word's seek lookup table: pairs a Location with the offset
// to jump to inside a mapped chunk file.
class WordSeek {
public:
    // Offset that ISRWord::seek adds to the start of the mapped chunk file.
    // Zero-initialized so a default-constructed entry never holds garbage.
    ssize_t seekOffset = 0;
    // Location that ISRWord::seek compares against its target and adopts as
    // the current location after jumping.
    size_t realLocation = 0;
};
\ No newline at end of file
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include <iostream> #include <iostream>
#include "../ISRWord.h" #include "../ISRWord.h"
#include "../ISRWord.cpp"
using namespace std; using namespace std;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment