Commit f16b76a2 authored by Nicholas Yang

making new twitter test files with mmdht

parent 97b9eaae
1 merge request: !3 Indexer
@@ -125,11 +125,14 @@ add_executable(DataStructures-DiskHashTable-tests
         DataStructures/DiskHashTable/DiskHashTable.h
         DataStructures/DiskHashTable/DiskHashTableTests.cpp)
-add_executable(Indexer-tests
-        DataStructures/DiskHashTable/DiskHashTable.h
-        Indexer/Indexer.cpp
+add_executable(Indexer-twitter-tests
+        DataStructures/DiskHashTable/MMDiskHashTable.h
+        indexer/Indexer.cpp
         indexer/json.hpp
-        Indexer/IndexerTwitterTests.cpp)
+        util/stringProcessing.cpp
+        util/Stemmer.cpp
+        util/util.cpp
+        indexer/IndexerTwitterTests.cpp)
 find_package(OpenSSL REQUIRED)
...
(Two new files added by this commit; contents not shown.)
@@ -18,7 +18,6 @@ void Indexer::run ( )
     while(cond) {
         DocIndex * dictionary = pointerToDictionaries->Pop();
-        cout << "INDEX GOT A NEW Dictionary" << endl;
         DocumentEnding docEnd = DocumentEnding();
         size_t indexedCount = 0;
         currentBlockNumberDocs++;
@@ -42,7 +41,7 @@ void Indexer::run ( )
         docEnd.docNumWords = indexedCount;
         docEndings.push_back(docEnd);
-        if(currentBlockNumberWords >= 10000) {
+        if(currentBlockNumberWords >= 20000) {
             save();
             reset();
         }
...
@@ -53,7 +53,7 @@ private:
     ProducerConsumerQueue< DocIndex * > *pointerToDictionaries;
     unordered_map< string, vector< size_t > > masterDictionary;
-    map< string, vector< size_t > > chunkDictionary;
+    unordered_map< string, vector< size_t > > chunkDictionary;
     unordered_map< string, vector< PostingsSeekTableEntry > > postingsSeekTable;
     vector< DocumentEnding > docEndings;
...
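A side note on the chunkDictionary change above: std::map keeps its keys in sorted order while std::unordered_map does not, so any code that writes a chunk in lexicographic term order now has to sort the terms itself. The sketch below only illustrates that pattern under this assumption; the helper name sortedTerms is hypothetical and is not part of the project's actual save path.

#include <algorithm>
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical helper: collect the keys of an unordered_map and sort them,
// so a chunk can still be written out in lexicographic term order even
// though the container no longer keeps its keys sorted.
std::vector<std::string> sortedTerms(
        const std::unordered_map<std::string, std::vector<std::size_t>> &chunk) {
    std::vector<std::string> terms;
    terms.reserve(chunk.size());
    for (const auto &entry : chunk)
        terms.push_back(entry.first);
    std::sort(terms.begin(), terms.end());
    return terms;
}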
@@ -5,67 +5,73 @@
 #include <vector>
 #include <fstream>
 #include <unordered_map>
+#include "../shared/ProducerConsumerQueue.h"
 #include "json.hpp"
+#include "../util/util.h"
 #include "Indexer.h"
 using namespace std;
 using json = nlohmann::json;
-int main ( )
-{
-//    Indexer indexer = Indexer();
-//
-//    vector<ifstream*> files;
-//    for(int i = 0; i < 60; i++) {
-//        string fileName = "tests/twitter/" + to_string(i) + ".json";
-//        if(i < 10) {
-//            fileName = "tests/twitter/0" + to_string(i) + ".json";
-//        }
-//        files.push_back(new ifstream(fileName));
-//    }
-//    string line = "";
-//    for(int i = 0; i < 60; i++) {
-//        int tweetId = 0;
-//        while(getline(*files[i], line)) {
-//            json j = json::parse(line);
-//            auto local = new unordered_map<string, vector<int> >();
-//            int id = 0;
-//            if(j.find("text") != j.end()) {
-//                string text = j.at("text");
-//                string word = "";
-//                bool midword = false;
-//                for(auto character : text) {
-//                    switch(character) {
-//                        case '\n':
-//                        case ' ':
-//                            if(midword) {
-//                                std::transform(word.begin(), word.end(), word.begin(), ::tolower);
-//                                word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); } ), word.end());
-//                                if(word != "") {
-//                                    local->operator[](word).push_back(id);
-//                                    id++;
-//                                }
-//                                word = "";
-//                            }
-//                            midword = false;
-//                            break;
-//                        default:
-//                            word += character;
-//                            midword = true;
-//                    }
-//                }
-//                if(local->size() != 0) {
-//                    size_t id = j.at("id");
-//                    local->operator[]("=file " + to_string(i) + " tweet " + to_string(id)).push_back(0);
-//                    tweetId++;
-//                }
-//            }
-//            if(local->size() != 0) {
-//                indexer.pointerToDictionaries.Push(local);
-//            }
-//        }
-//    }
-//    indexer.run();
+using DocIndex = const unordered_map< string, vector< unsigned long > >;
+
+int main ( ) {
+    vector<ifstream *> files;
+    ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
+    for (int i = 0; i < 60; i++) {
+        string fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/" + to_string(i) + ".json";
+        if (i < 10) {
+            fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/0" + to_string(i) + ".json";
+        }
+        files.push_back(new ifstream(fileName));
+    }
+    string line = "";
+    for (int i = 0; i < 60; i++) {
+        int tweetId = 0;
+        while (getline(*files[i], line)) {
+            json j = json::parse(line);
+            auto local = new unordered_map< string, vector< unsigned long > >();
+            int id = 0;
+            if (j.find("text") != j.end()) {
+                string text = j.at("text");
+                string word = "";
+                bool midword = false;
+                for (auto character : text) {
+                    switch (character) {
+                        case '\n':
+                        case ' ':
+                            if (midword) {
+                                std::transform(word.begin(), word.end(), word.begin(), ::tolower);
+                                word.erase(remove_if(word.begin(), word.end(), [](char c) { return !isalpha(c); }),
+                                           word.end());
+                                if (word != "") {
+                                    local->operator[](word).push_back(id);
+                                    id++;
+                                }
+                                word = "";
+                            }
+                            midword = false;
+                            break;
+                        default:
+                            word += character;
+                            midword = true;
+                    }
+                }
+                if (local->size() != 0) {
+                    size_t id = j.at("id");
+                    local->operator[]("=file " + to_string(i) + " tweet " + to_string(id)).push_back(0);
+                    tweetId++;
+                }
+            }
+            if (local->size() != 0) {
+                IndexerQueue->Push(local);
+            }
+        }
+    }
+    Indexer indexer = Indexer(IndexerQueue);
+    indexer.StartThread( );
+    indexer.WaitForFinish();
+    /*
     string query;
     cout << "What is your query?" << endl;
     getline( cin, query );
@@ -236,3 +242,5 @@ int main ( )
     }
 }
+*/
+}
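For readers skimming the large diff above, the core of the new test driver is the tweet-text normalization loop: split on spaces and newlines, lowercase each token, strip non-alphabetic characters, and record the position of every surviving word. The sketch below is a condensed, standalone version of that logic using only the standard library; the function name tokenize, the flush helper, and the sample text in main are illustrative, and unlike the test's loop it also indexes a trailing word that is not followed by whitespace.

#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Tokenize a piece of tweet text the way the test does: words are delimited
// by spaces and newlines, lowercased, stripped of non-alphabetic characters,
// and mapped to the positions at which they occur.
std::unordered_map<std::string, std::vector<unsigned long>>
tokenize(const std::string &text) {
    std::unordered_map<std::string, std::vector<unsigned long>> index;
    unsigned long position = 0;
    std::string word;

    auto flush = [&]() {
        std::transform(word.begin(), word.end(), word.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        word.erase(std::remove_if(word.begin(), word.end(),
                                  [](unsigned char c) { return !std::isalpha(c); }),
                   word.end());
        if (!word.empty())
            index[word].push_back(position++);
        word.clear();
    };

    for (char character : text) {
        if (character == ' ' || character == '\n')
            flush();
        else
            word += character;
    }
    flush();  // also index a trailing word with no whitespace after it
    return index;
}

int main() {
    auto index = tokenize("Hello hello, indexer!\nhello again");
    for (const auto &entry : index)
        std::cout << entry.first << " -> " << entry.second.size() << " occurrence(s)\n";
    return 0;
}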
(The diff for one additional file in this commit is collapsed and not shown.)