Skip to content
Snippets Groups Projects
Commit f16b76a2 authored by Nicholas Yang's avatar Nicholas Yang
Browse files

making new twitter test files with mmdht

parent 97b9eaae
No related branches found
No related tags found
1 merge request: !3 Indexer
......@@ -125,11 +125,14 @@ add_executable(DataStructures-DiskHashTable-tests
DataStructures/DiskHashTable/DiskHashTable.h
DataStructures/DiskHashTable/DiskHashTableTests.cpp)
add_executable(Indexer-tests
DataStructures/DiskHashTable/DiskHashTable.h
Indexer/Indexer.cpp
add_executable(Indexer-twitter-tests
DataStructures/DiskHashTable/MMDiskHashTable.h
indexer/Indexer.cpp
indexer/json.hpp
Indexer/IndexerTwitterTests.cpp)
util/stringProcessing.cpp
util/Stemmer.cpp
util/util.cpp
indexer/IndexerTwitterTests.cpp)
find_package(OpenSSL REQUIRED)
......
File added
File added
......@@ -18,7 +18,6 @@ void Indexer::run ( )
while(cond) {
DocIndex * dictionary = pointerToDictionaries->Pop();
cout << "INDEX GOT A NEW Dictionary" << endl;
DocumentEnding docEnd = DocumentEnding();
size_t indexedCount = 0;
currentBlockNumberDocs++;
......@@ -42,7 +41,7 @@ void Indexer::run ( )
docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd);
if(currentBlockNumberWords >= 10000) {
if(currentBlockNumberWords >= 20000) {
save();
reset();
}
......
......@@ -53,7 +53,7 @@ private:
ProducerConsumerQueue< DocIndex * > *pointerToDictionaries;
unordered_map< string, vector< size_t > > masterDictionary;
map< string, vector< size_t > > chunkDictionary;
unordered_map< string, vector< size_t > > chunkDictionary;
unordered_map< string, vector< PostingsSeekTableEntry > > postingsSeekTable;
vector< DocumentEnding > docEndings;
......
......@@ -5,67 +5,73 @@
#include <vector>
#include <fstream>
#include <unordered_map>
#include "../shared/ProducerConsumerQueue.h"
#include "json.hpp"
#include "../util/util.h"
#include "Indexer.h"
using namespace std;
using json = nlohmann::json;
int main ( )
{
// Indexer indexer = Indexer();
//
// vector<ifstream*> files;
// for(int i = 0; i < 60; i++) {
// string fileName = "tests/twitter/" + to_string(i) + ".json";
// if(i < 10) {
// fileName = "tests/twitter/0" + to_string(i) + ".json";
// }
// files.push_back(new ifstream(fileName));
// }
// string line = "";
// for(int i = 0; i < 60; i++) {
// int tweetId = 0;
// while(getline(*files[i], line)) {
// json j = json::parse(line);
// auto local = new unordered_map<string, vector<int> >();
// int id = 0;
// if(j.find("text") != j.end()) {
// string text = j.at("text");
// string word = "";
// bool midword = false;
// for(auto character : text) {
// switch(character) {
// case '\n':
// case ' ':
// if(midword) {
// std::transform(word.begin(), word.end(), word.begin(), ::tolower);
// word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
// if(word != "") {
// local->operator[](word).push_back(id);
// id++;
// }
// word = "";
// }
// midword = false;
// break;
// default:
// word += character;
// midword = true;
// }
// }
// if(local->size() != 0) {
// size_t id = j.at("id");
// local->operator[]("=file " + to_string(i) + " tweet " + to_string(id)).push_back(0);
// tweetId++;
// }
// }
// if(local->size() != 0) {
// indexer.pointerToDictionaries.Push(local);
// }
// }
// }
// indexer.run();
using DocIndex = const unordered_map< string, vector< unsigned long > >;
int main ( ) {
vector<ifstream *> files;
ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
for (int i = 0; i < 60; i++) {
string fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/" + to_string(i) + ".json";
if (i < 10) {
fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/0" + to_string(i) + ".json";
}
files.push_back(new ifstream(fileName));
}
string line = "";
for (int i = 0; i < 60; i++) {
int tweetId = 0;
while (getline(*files[i], line)) {
json j = json::parse(line);
auto local = new unordered_map< string, vector< unsigned long > >();
int id = 0;
if (j.find("text") != j.end()) {
string text = j.at("text");
string word = "";
bool midword = false;
for (auto character : text) {
switch (character) {
case '\n':
case ' ':
if (midword) {
std::transform(word.begin(), word.end(), word.begin(), ::tolower);
word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); }),
word.end());
if (word != "") {
local->operator[](word).push_back(id);
id++;
}
word = "";
}
midword = false;
break;
default:
word += character;
midword = true;
}
}
if (local->size() != 0) {
size_t id = j.at("id");
local->operator[]("=file " + to_string(i) + " tweet " + to_string(id)).push_back(0);
tweetId++;
}
}
if (local->size() != 0) {
IndexerQueue->Push(local);
}
}
}
Indexer indexer = Indexer(IndexerQueue);
indexer.StartThread( );
indexer.WaitForFinish();
/*
string query;
cout << "What is your query?" << endl;
getline( cin, query );
......@@ -236,3 +242,5 @@ int main ( )
}
}
*/
}
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.