Commit c1db25c5 authored by Nicholas Yang

doc frequency 4 words + total number of docs in corpus

parent 2a9bed04
2 merge requests: !8 Origin/constraint solver, !4 doc frequency 4 words + total number of docs in corpus
@@ -132,7 +132,15 @@ add_executable(Indexer-twitter-tests
util/stringProcessing.cpp
util/Stemmer.cpp
util/util.cpp
indexer/IndexerTwitterTests.cpp indexer/WordInfo.h)
add_executable(MasterReader-tests
DataStructures/DiskHashTable/MMDiskHashTable.h
util/stringProcessing.cpp
util/Stemmer.cpp
util/util.cpp
indexer/MasterReader.cpp
)
find_package(OpenSSL REQUIRED)
#include "Indexer.h"
#define pathToIndex "/build/"
Indexer::Indexer ( ProducerConsumerQueue< DocIndex * > *doc_index_queue_in,
                   ProducerConsumerQueue< unordered_map< string, DocIndex * > > *anchor_in ) :
      pointerToDictionaries( doc_index_queue_in ), AnchorQueue( anchor_in )
{
totalWordsIndexed = 0;
currentFile = 0;
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
numberDocsIndexed = 0;
}
void Indexer::run()
{
bool cond = true;
while(cond) {
DocIndex * dictionary = pointerToDictionaries->Pop();
DocumentEnding docEnd = DocumentEnding();
size_t indexedCount = 0;
currentBlockNumberDocs++;
for(auto word : *dictionary) {
if(word.first.at(0) == '=') {
docEnd.url = word.first.substr(1, word.first.length());
continue;
}
indexedCount += word.second.size();
currentBlockNumberWords += word.second.size();
totalWordsIndexed += word.second.size();
for(auto location : word.second) {
masterDictionary[word.first].push_back(currentlyIndexed + location);
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd);
if(currentBlockNumberWords >= 20000) {
save();
saveWordSeek();
reset();
}
}
save();
saveWordSeek();
reset();
saveChunkDictionary();
}
void Indexer::verbose_run() {
/*
while(pointerToDictionaries.Size() != 0) {
DocIndex *pointerToDictionaries.Pop();
for(auto word : dictionary) {
for(auto location : word.second) {
// indexedCount++;
masterDictionary[word.first].push_back(location);
}
}
}
*/
while ( *alive || pointerToDictionaries->Size( ) > 0 )
{
if( pointerToDictionaries->Size( ) > 0)
{
DocIndex *dictionary = pointerToDictionaries->Pop( );
numberDocsIndexed++;
DocumentEnding docEnd = DocumentEnding( );
size_t indexedCount = 0;
currentBlockNumberDocs++;
for ( auto word : *dictionary )
{
if ( word.first.at( 0 ) == '=' )
{
docEnd.url = word.first.substr( 1, word.first.length( ));
continue;
}
chunkDictionary[word.first].docFrequency++;
indexedCount += word.second.size( );
currentBlockNumberWords += word.second.size( );
for ( auto location : word.second )
{
masterDictionary[ word.first ].push_back( currentlyIndexed + location );
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back( docEnd );
// add the url -> doc ending mapping
urlToDocEndings[ docEnd.url ] = docEnd.docEndPosition;
if ( currentBlockNumberWords >= 20000 )
{
cout << " --- Saving current chunk --- " << endl;
save( );
saveWordSeek( );
reset( );
}
delete dictionary;
}
}
cout << "Indexer is shutting down" << endl;
save( );
saveWordSeek( );
reset( );
saveChunkDictionary( );
unordered_map < string, DocIndex * > anchorDict = AnchorQueue->Pop( );
SaveAnchorText( &anchorDict );
cout << " Indexer has finished running" << endl;
return;
}
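// --- Editor's sketch, not part of the original commit ---
// Postings in masterDictionary live in one global position space: each
// document's word locations are shifted by currentlyIndexed, and docEndings
// records where each document stops. Assuming docEndings stays sorted by
// docEndPosition (it is appended in indexing order), a reader could map a
// global posting back to its document with a binary search like this:
static const DocumentEnding *docForPosition ( const vector< DocumentEnding > &endings,
                                              size_t position )
   {
   size_t low = 0, high = endings.size( );
   while ( low < high )
      {
      size_t mid = low + ( high - low ) / 2;
      if ( endings[ mid ].docEndPosition <= position )
         low = mid + 1;   // this document ends at or before the posting
      else
         high = mid;      // first ending strictly past the posting wins
      }
   return low < endings.size( ) ? &endings[ low ] : nullptr;
   }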
void Indexer::save()
{
MMDiskHashTable seeker( util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + "-seek.txt", 30, 8 );
string fileName = util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + ".txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
// TODO: these should really be c strings
string statsHeader = "===STATS==="
"\nunique words: " + to_string( masterDictionary.size( ) ) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( ) ) );
"\nunique words: " + to_string( masterDictionary.size( )) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( )));
// REALLY GROSS HACK
size_t seekOffset = strlen( statsHeader.c_str( ));
size_t chunkEnd = 0;
for ( auto word : masterDictionary )
{
if ( word.first.size( ) > 30 )
   {
   // MMDiskHashTable keys are fixed at 30 bytes, so longer words are truncated
   string resized = word.first;
   resized.resize( 30 );
   seeker.insert( resized, to_string( seekOffset ));
   }
else
   {
   seeker.insert( word.first, to_string( seekOffset ));
   }
chunkDictionary[ word.first ].chunks.push_back( currentFile );
// string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
// seekOffset += strlen(wordBreak.c_str());
bool firstPost = true;
size_t lastOne = 0;
int numIndexed = 0;
for ( auto location : word.second )
{
if ( chunkEnd < location )
   {
   chunkEnd = location;
   }
chunkDictionary[ word.first ].frequency++;
numIndexed++;
if ( numIndexed == 100 )
{
SeekEntry entry = SeekEntry( );
entry.offset = seekOffset;
entry.realLocation = location;
seekDictionary[ word.first ].push_back( entry );
numIndexed = 0;
}
if ( firstPost )
{
string locationSpace = to_string( location ) + " ";
write( file, locationSpace.c_str( ), strlen( locationSpace.c_str( )));
seekOffset += strlen( locationSpace.c_str( ));
firstPost = false;
}
else
{
size_t delta = location - lastOne;
string deltaSpace = to_string( delta ) + " ";
write( file, deltaSpace.c_str( ), strlen( deltaSpace.c_str( )));
seekOffset += strlen( deltaSpace.c_str( ));
}
lastOne = location;
}
chunkDictionary[ word.first ].lastLocation = lastOne;
write( file, "\n", 1 );
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( )));
seekOffset += strlen( docEndingHeader.c_str( ));
seeker.insert( "=docEnding", to_string( seekOffset ));
int docEndSeekCounter = 0; // save a seek entry every 100 doc endings in the chunk
for ( auto ending : docEndings )
{
string docEndString = "[" +
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( ) ) );
docEndSeekCounter++;
if(docEndSeekCounter == 100)
{
docEndSeekCounter = 0;
docEndingsSeek.push_back({ ending.docEndPosition, seekOffset });
}
seekOffset += strlen(docEndString.c_str());
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( )));
docEndSeekCounter++;
if ( docEndSeekCounter == 100 )
{
docEndSeekCounter = 0;
seekDictionary["=docEnding"].push_back( SeekEntry(ending.docEndPosition, seekOffset ));
}
seekOffset += strlen( docEndString.c_str( ));
}
chunkEndLocation.push_back(chunkEnd);
close( file );
//seeker.CloseFile();
}
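// --- Editor's sketch, not part of the original commit ---
// save( ) writes each word's postings as an absolute first position followed
// by space-separated gaps, so "12 3 40 " decodes to 12, 15, 55. A reader
// reverses the delta encoding with a running total (assumes <sstream>):
static vector< size_t > decodePostingsLine ( const string &line )
   {
   vector< size_t > positions;
   size_t runningTotal = 0;
   stringstream lineStream( line );
   size_t gap;
   while ( lineStream >> gap )
      {
      runningTotal += gap;   // first token is absolute, the rest are deltas
      positions.push_back( runningTotal );
      }
   return positions;
   }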
void Indexer::saveChunkDictionary()
{
MMDiskHashTable dhtChunk = MMDiskHashTable( util::GetCurrentWorkingDir( ) + pathToIndex + "master.txt", 30, 168 );
for ( auto word : chunkDictionary )
{
string key = word.first;
if ( key.size( ) > 30 )
   {
   key.resize( 30 );
   }
string value = "";
for ( auto chunk : word.second.first )
string value = "";
for ( auto chunk : word.second.chunks )
{
value += to_string( chunk ) + " ";
}
value += "\t" + to_string(word.second.second);
dhtChunk.insert(key, value);
value += "\t" + to_string( word.second.frequency );
value += "\t" + to_string( word.second.lastLocation);
value += "\t" + to_string( word.second.docFrequency);
dhtChunk.insert( key, value );
}
dhtChunk.insert("=totalNumberIndexed", to_string(totalWordsIndexed));
}
dhtChunk.insert( "=totalNumberIndexed", to_string( currentlyIndexed ));
dhtChunk.insert("=totalDocsIndexed", to_string(numberDocsIndexed));
int currentChunk = 0;
for(auto location : chunkEndLocation) {
string key = "=chunk" + to_string(currentChunk);
dhtChunk.insert(key, to_string(location));
}
}
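// --- Editor's sketch, not part of the original commit ---
// Each master.txt value packs four tab-separated fields:
// "<chunk list>\t<frequency>\t<lastLocation>\t<docFrequency>". Combined with
// the "=totalDocsIndexed" entry, the new docFrequency field is enough to
// score words by inverse document frequency (assumes <cmath>):
static double inverseDocFrequency ( const string &masterValue, double totalDocs )
   {
   size_t lastTab = masterValue.rfind( '\t' );   // docFrequency is the final field
   double docFrequency = stod( masterValue.substr( lastTab + 1 ) );
   if ( docFrequency <= 0 )
      return 0;
   return log( totalDocs / docFrequency );       // idf = log(N / df)
   }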
void Indexer::saveWordSeek()
{
MMDiskHashTable wordSeek = MMDiskHashTable(
      util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + "-wordseek.txt", 30, 168 );
for ( auto word : seekDictionary )
   {
string key = word.first;
if ( key == "=docEnding" )
   {
   continue; // doc endings are written as their own partitioned entries below
   }
if ( key.size( ) > 30 )
   {
   key.resize( 30 );
   }
string value = "";
for ( auto entry : word.second )
   {
   value += ("<" + to_string( entry.offset ) + ", " + to_string( entry.realLocation ) + "> ");
   }
wordSeek.insert( key, value );
}
string key = "=docEnding";
string value = "";
int currentEndingPartition = 0;
for(size_t i = 0; i < docEndingsSeek.size(); i++) {
string prospectiveDocEnding = "<" +
to_string(docEndingsSeek[i].first) +
", " + to_string(docEndingsSeek[i].second) + "> ";
if(value.size() + prospectiveDocEnding.size() <= 168) {
value += prospectiveDocEnding;
} else {
wordSeek.insert(key + to_string(currentEndingPartition), value);
currentEndingPartition++;
value = prospectiveDocEnding;
}
}
}
void Indexer::verbose_save ( )
{
map< string, vector< size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ) );
for ( auto word : maps )
string key = "=docEnding";
string value = "";
int currentEndingPartition = 0;
for ( size_t i = 0; i < seekDictionary["=docEnding"].size( ); i++ )
{
cout << word.first << endl;
for ( auto location : word.second )
string prospectiveDocEnding = "<" +
to_string( seekDictionary["=docEnding"][ i ].offset ) +
", " + to_string( seekDictionary["=docEnding"][ i ].realLocation ) + "> ";
if ( value.size( ) + prospectiveDocEnding.size( ) <= 168 )
{
value += prospectiveDocEnding;
}
else
{
cout << location << " ";
wordSeek.insert( key + to_string( currentEndingPartition ), value );
currentEndingPartition++;
value = prospectiveDocEnding;
}
cout << endl;
}
currentFile++;
currentFile++;
}
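// --- Editor's sketch, not part of the original commit ---
// Each "-wordseek.txt" value is a list of "<offset, realLocation>" pairs,
// one per 100 postings: offset is a byte position inside the chunk file and
// realLocation is the global position already reached there. A reader can
// jump near the posting it wants instead of decoding the whole line
// (assumes <unistd.h>, which this file already needs for write/close):
static void seekToPostingBatch ( int chunkFileDescriptor, const SeekEntry &entry )
   {
   // land at the recorded byte offset; delta decoding then resumes with
   // entry.realLocation as the running total instead of starting from zero
   lseek( chunkFileDescriptor, ( off_t ) entry.offset, SEEK_SET );
   }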
void Indexer::reset()
{
masterDictionary.clear( );
docEndings.clear( );
seekDictionary.clear( );
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
void Indexer::Kill()
{
*(this->alive) = false;
}
void Indexer::SaveAnchorText( unordered_map < string, DocIndex * > *anchorDict )
{
// TODO: create pointer to anchor
// pointerToAnchor->Pop();
// pass a dictionary of map<url string> -> vector<anchor word>
// for each url in the map:
//    look up the url string in the url -> docEnding map
//    for each anchor text in the url's map:
//       create an anchor text -> list of doc endings
// write to disk
cout << " -- SAVING ANCHOR TEXT --- " << endl;
for ( auto const &ent1 : *anchorDict )
{
auto const &outer_key = ent1.first;
cout << "url: " << outer_key << endl;
if ( urlToDocEndings.find( outer_key ) != urlToDocEndings.end( ))
{
size_t docEndForUrl = urlToDocEndings[ outer_key ];
cout << "Urls doc end : " << docEndForUrl << endl;
}
DocIndex *inner_map = ent1.second;
for ( auto const &ent2 : *inner_map )
{
auto const &inner_key = ent2.first;
auto const &inner_value = ent2.second;
//cout << "url: " << outer_key << endl;
//cout << "anchor text : " << inner_key << endl;
//for(auto offset :inner_value)
// cout << "offset " << offset << endl;
}
}
}
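// --- Editor's sketch, not part of the original commit ---
// One hypothetical way to finish the TODO above: invert the popped anchor
// dictionary from url -> (anchor word -> offsets) into anchor word -> list of
// doc endings, skipping urls that were never indexed. The names below are
// illustrative, not part of the codebase:
static unordered_map< string, vector< size_t > > invertAnchorText (
      const unordered_map< string, DocIndex * > &anchorDict,
      const unordered_map< string, size_t > &urlToDocEnds )
   {
   unordered_map< string, vector< size_t > > anchorWordToDocEnds;
   for ( auto const &urlEntry : anchorDict )
      {
      auto found = urlToDocEnds.find( urlEntry.first );
      if ( found == urlToDocEnds.end( ) )
         continue;   // anchor points at a page this indexer never saw
      for ( auto const &wordEntry : *urlEntry.second )
         anchorWordToDocEnds[ wordEntry.first ].push_back( found->second );
      }
   return anchorWordToDocEnds;
   }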
@@ -4,9 +4,10 @@
#include "../shared/ProducerConsumerQueue.h"
#include "../shared/ThreadClass.h"
#include "DocumentEnding.h"
#include "PostingsSeekTableEntry.h"
#include "SeekEntry.h"
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../util/util.h"
#include "WordInfo.h"
#include <unordered_map>
#include <map>
#include <vector>
@@ -34,35 +35,39 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >;
class Indexer : public ThreadClass
{
public:
Indexer ( ProducerConsumerQueue< DocIndex * > *doc_index_queue_in,
          ProducerConsumerQueue< unordered_map< string, DocIndex * > > *anchor_in );
void run ( );
void verbose_run ( );
void verbose_save ( );
void Kill ( );
private:
void save ( );
void saveWordSeek();
void saveChunkDictionary ( );
void SaveAnchorText( unordered_map<string , DocIndex*> * anchorDict );
void reset ( );
ProducerConsumerQueue< DocIndex * > *pointerToDictionaries;
ProducerConsumerQueue< unordered_map< string, DocIndex * > > *AnchorQueue;
// for the master.txt file - the chunks each word appears in, its last real location,
// total frequency, and document frequency
unordered_map< string, WordInfo > chunkDictionary;
vector<size_t> chunkEndLocation;
unordered_map< string, vector< size_t > > masterDictionary;
unordered_map< string, vector< SeekEntry > > seekDictionary;
vector< DocumentEnding > docEndings;
size_t totalWordsIndexed;
unordered_map< string, size_t> urlToDocEndings;
size_t currentFile;
size_t currentlyIndexed;
size_t currentBlockNumberWords;
size_t currentBlockNumberDocs;
size_t numberDocsIndexed;
atomic_bool* alive = new atomic_bool(true);
};
@@ -18,6 +18,7 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >;
int main ( ) {
vector<ifstream *> files;
ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
ProducerConsumerQueue< unordered_map<string , DocIndex * > > *AnchorQueue = new ProducerConsumerQueue< unordered_map<string , DocIndex * > >( );
for (int i = 0; i < 60; i++) {
string fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/" + to_string(i) + ".json";
if (i < 10) {
@@ -68,9 +69,10 @@ int main ( ) {
}
}
}
Indexer indexer = Indexer(IndexerQueue, AnchorQueue);
indexer.StartThread( );
indexer.WaitForFinish();
return 0;
/*
string query;
cout << "What is your query?" << endl;
#include <iostream>
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../util/util.h"
int main() {
string fileName = util::GetCurrentWorkingDir() + "/build/master.txt";
MMDiskHashTable master = MMDiskHashTable(fileName, 30, 168);
int currentChunk = 0;
string key = "=totalDocsIndexed";
string value = master.find(key);
cout << value << endl;
}
#pragma once
class SeekEntry
{
public:
SeekEntry ( )
{
offset = 0;
realLocation = 0;
}
SeekEntry (size_t offset_in, size_t realLocation_in) : offset(offset_in),
realLocation(realLocation_in)
{}
size_t offset;
size_t realLocation;
};
#pragma once
#include <cstddef>
#include <vector>
class WordInfo {
public:
    std::vector<int> chunks;   // chunk files this word appears in
    size_t frequency = 0;      // total occurrences of the word across the corpus
    size_t docFrequency = 0;   // number of documents containing the word
    size_t lastLocation = 0;   // last global location recorded for the word
};