Commit 30338cd9 authored by Nicholas Yang

very simple statistics for each block + doc endings, TODO: SEEK FILE

parent 96e1e7d1
#pragma once

#include <iostream>
#include <string>

using namespace std;

class DocumentEnding {
public:
    DocumentEnding() {
        docEndPosition = 0;
        docNumWords = 0;
        url = "";
    }

    size_t docEndPosition;
    size_t docNumWords;
    string url;
};
\ No newline at end of file
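A minimal sketch (not part of this commit) of how such a record might be filled in when the indexer reaches the end of a document; the endDoc helper and its parameters are assumptions for illustration, and it assumes the DocumentEnding class above is in scope.

// Hypothetical helper: build one DocumentEnding per parsed document.
// `position` and `wordsInDoc` would come from the indexer's running counters.
DocumentEnding endDoc(size_t position, size_t wordsInDoc, const string &docUrl) {
    DocumentEnding ending;
    ending.docEndPosition = position;
    ending.docNumWords = wordsInDoc;
    ending.url = docUrl;
    return ending;
}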
//
// Created by nick on 2/6/18.
//

#include "IndexStreamReader.h"

IndexStreamReader::IndexStreamReader(string word) {
    this->word = word;
}

int IndexStreamReader::first() {
    // TODO: return the location of this word's first posting
    return -1;   // placeholder so the stub compiles and returns a defined value
}

int IndexStreamReader::last() {
    // TODO: return the location of this word's last posting
    return -1;   // placeholder
}

int IndexStreamReader::next(int location) {
    // TODO: return the first posting location after `location`
    return -1;   // placeholder
}
\ No newline at end of file
//
// Created by nick on 2/6/18.
//

#ifndef EECS398_SEARCH_INDEXSTREAMREADER_H
#define EECS398_SEARCH_INDEXSTREAMREADER_H

#include <iostream>
#include <string>

using namespace std;

class IndexStreamReader {
public:
    int first();
    int last();
    int next(int location);

private:
    // Note: with the constructor private (and no friends), callers cannot
    // construct a reader yet; this skeleton is still a work in progress.
    IndexStreamReader(string word);
    string word;
};

#endif //EECS398_SEARCH_INDEXSTREAMREADER_H
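A hedged sketch of how the first()/next() interface is typically driven to walk a word's postings list. It assumes a usable reader instance and that a negative return value means "no more postings"; neither is pinned down by the skeleton above.

#include <iostream>
#include "IndexStreamReader.h"
using namespace std;

// Hypothetical: print every posting location for the word behind `reader`.
void printPostings(IndexStreamReader &reader) {
    for (int loc = reader.first(); loc >= 0; loc = reader.next(loc)) {
        cout << loc << " ";
    }
    cout << endl;
}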
@@ -5,20 +5,27 @@ Indexer::Indexer() {
currentFile = 0;
totalIndexed = 0;
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
void Indexer::run() {
while(pointerToDictionaries.Size() != 0) {
save();
reset();
unordered_map<string, vector<int>>* dictionary = pointerToDictionaries.Pop();
currentBlockNumberDocs++;
for(auto word : *dictionary) {
indexedCount += word.second.size();
totalIndexed += word.second.size();
currentBlockNumberWords += word.second.size();
for(auto location : word.second) {
masterDictionary[word.first].push_back(currentlyIndexed + location);
}
}
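// flush the current block to disk once it has accumulated roughly 300,000 word occurrences (postings)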
if(currentBlockNumberWords >= 300000) {
save();
reset();
}
currentlyIndexed += indexedCount;
indexedCount = 0;
}
@@ -39,8 +46,21 @@ void Indexer::verbose_run() {
void Indexer::save() {
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
map<string, size_t> seeker;
string fileName = "index" + to_string(currentFile) + ".txt";
int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
// TODO: these should really be c strings
string header = "===STATS===\n";
string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n";
string numberWords = "number words: " + to_string(currentBlockNumberWords) + "\n";
string numberDocs = "number docs: " + to_string(currentBlockNumberDocs) + "\n";
string footer = "===SEEK===\n";
write(file, header.c_str(), strlen(header.c_str()));
write(file, uniqueWords.c_str(), strlen(uniqueWords.c_str()));
write(file, numberWords.c_str(), strlen(numberWords.c_str()));
write(file, numberDocs.c_str(), strlen(numberDocs.c_str()));
write(file, footer.c_str(), strlen(footer.c_str()));
for(auto word : maps) {
string wordBreak = word.first + "\n";
write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
@@ -48,11 +68,21 @@ void Indexer::save() {
string locationSpace = to_string(location) + " ";
write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
}
seeker[word.first] = 013; // placeholder offset (octal 013); real seek positions are still TODO
write(file, "\n", 1);
}
// TODO: seek dictionary
string seekFileName = "index" + to_string(currentFile) + "-seek.txt";
int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
for(auto word : seeker) {
string line = word.first + " " + to_string(word.second) + "\n";
write(seekFile, line.c_str(), strlen(line.c_str()));
}
close(file);
close(seekFile);
currentFile++;
}
}
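For the seek-file TODO, one possible approach (a sketch, not this commit's code) is to capture the index file's current write position with lseek(fd, 0, SEEK_CUR) just before each word's postings are written, and store that byte offset in the seek dictionary instead of the 013 placeholder. The writePostings helper below is hypothetical.

#include <string>
#include <unistd.h>
#include <vector>

using namespace std;

// Hypothetical helper: write one word's postings to an already-open index fd
// and return the byte offset at which that word starts, suitable for the
// seek dictionary entry.
size_t writePostings(int fd, const string &word, const vector<size_t> &locations) {
    size_t offset = static_cast<size_t>(lseek(fd, 0, SEEK_CUR));  // current write position
    string line = word + "\n";
    write(fd, line.c_str(), line.size());
    for (size_t location : locations) {
        string entry = to_string(location) + " ";
        write(fd, entry.c_str(), entry.size());
    }
    write(fd, "\n", 1);
    return offset;
}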
void Indexer::verbose_save() {
map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
@@ -67,5 +97,15 @@ void Indexer::verbose_save() {
}
void Indexer::reset() {
unordered_map<string, vector<size_t>> lastOne;
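// keep only each word's most recent posting from the block being flushed, then clear the dictionary for the next block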
for(auto bucket : masterDictionary) {
lastOne[bucket.first].push_back(bucket.second.back());
}
this->lastOne = lastOne;
masterDictionary.clear();
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
@@ -19,7 +19,7 @@ master index.
TODO:
Use deltas between the offsets
Save with UTF-8 encoding
Concrete block size - 500MB per block?
Concrete block size - 100MB per block?
Save document endings and other relevant metadata?
*/
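One way to read the "use deltas between the offsets" TODO: store each posting as the gap from the previous posting so the numbers stay small and compress well. A minimal sketch of the encode/decode pair, assuming each word's locations are appended in ascending order (as run() does); none of this is in the commit.

#include <cstddef>
#include <vector>

using namespace std;

// Encode absolute posting locations as gaps from the previous location.
vector<size_t> toDeltas(const vector<size_t> &locations) {
    vector<size_t> deltas;
    size_t previous = 0;
    for (size_t location : locations) {
        deltas.push_back(location - previous);
        previous = location;
    }
    return deltas;
}

// Decode gaps back into absolute posting locations.
vector<size_t> fromDeltas(const vector<size_t> &deltas) {
    vector<size_t> locations;
    size_t running = 0;
    for (size_t gap : deltas) {
        running += gap;
        locations.push_back(running);
    }
    return locations;
}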
@@ -36,11 +36,17 @@ class Indexer {
private:
void save();
void reset();
unordered_map<string, vector<size_t> > masterDictionary;
unordered_map<string, vector<size_t> > lastOne;
size_t indexedCount;
size_t currentFile;
size_t totalIndexed;
size_t currentlyIndexed;
size_t currentBlockNumberWords;
size_t currentBlockNumberDocs;
};
#endif /*indexer_h*/
@@ -30,6 +30,7 @@ int main() {
id++;
}
}
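// hedged reading: the "=" + file-path key looks like a per-document marker entry added for doc endings; the pushed 0 appears to be a dummy location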
test1["=tests/test1.txt"].push_back(0);
id = 0;
while(ifstream2 >> word) {
std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -39,6 +40,7 @@ int main() {
id++;
}
}
test2["=tests/test2.txt"].push_back(0);
id = 0;
while(ifstream3 >> word) {
std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -48,6 +50,7 @@ int main() {
id++;
}
}
test3["=tests/test3.txt"].push_back(0);
id = 0;
while(ifstream4 >> word) {
std::transform(word.begin(), word.end(), word.begin(), ::tolower);
@@ -57,6 +60,7 @@ int main() {
id++;
}
}
test4["=tests/test4.txt"].push_back(0);
indexer.pointerToDictionaries.Push(&test1);
indexer.pointerToDictionaries.Push(&test2);
indexer.pointerToDictionaries.Push(&test3);
//
// Created by nick on 3/13/18.
//
#include <iostream>
#include <string>
#include <fcntl.h>
#include <unistd.h>

using namespace std;

int main() {
    // Experiment for the seek-file TODO: open an existing index block,
    // jump to a fixed byte offset, and overwrite a few bytes in place.
    int index1 = open("index0.txt", O_CREAT | O_WRONLY, S_IRWXU);
    lseek(index1, 25, SEEK_SET);
    string fs = "hello";
    if (write(index1, fs.c_str(), fs.size()) != (ssize_t)fs.size()) {
        cout << "ERROR" << endl;
    }
    close(index1);
    return 0;
}
\ No newline at end of file
File moved