Commit c1db25c5 authored by Nicholas Yang

doc frequency 4 words + total number of docs in corpus

parent 2a9bed04
2 merge requests: !8 Origin/constraint solver, !4 doc frequency 4 words + total number of docs in corpus
@@ -132,7 +132,15 @@ add_executable(Indexer-twitter-tests
util/stringProcessing.cpp
util/Stemmer.cpp
util/util.cpp
indexer/IndexerTwitterTests.cpp indexer/WordInfo.h)
add_executable(MasterReader-tests
DataStructures/DiskHashTable/MMDiskHashTable.h
util/stringProcessing.cpp
util/Stemmer.cpp
util/util.cpp
indexer/MasterReader.cpp
)
find_package(OpenSSL REQUIRED)
#include "Indexer.h"
#define pathToIndex "/build/"
Indexer::Indexer ( ProducerConsumerQueue< DocIndex * > *doc_index_queue_in,
                   ProducerConsumerQueue< unordered_map< string, DocIndex * > > *anchor_in ) :
      pointerToDictionaries( doc_index_queue_in ), AnchorQueue( anchor_in )
{
totalWordsIndexed = 0;
currentFile = 0;
currentlyIndexed = 0;
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
numberDocsIndexed = 0;
}
void Indexer::run()
{
bool cond = true;
while(cond) {
DocIndex * dictionary = pointerToDictionaries->Pop();
DocumentEnding docEnd = DocumentEnding();
size_t indexedCount = 0;
currentBlockNumberDocs++;
for(auto word : *dictionary) {
if(word.first.at(0) == '=') {
docEnd.url = word.first.substr(1, word.first.length());
continue;
}
indexedCount += word.second.size();
currentBlockNumberWords += word.second.size();
totalWordsIndexed += word.second.size();
for(auto location : word.second) {
masterDictionary[word.first].push_back(currentlyIndexed + location);
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back(docEnd);
if(currentBlockNumberWords >= 20000) {
save();
saveWordSeek();
reset();
}
}
save();
saveWordSeek();
reset();
saveChunkDictionary();
}
void Indexer::verbose_run() {
/*
while(pointerToDictionaries.Size() != 0) {
DocIndex *pointerToDictionaries.Pop();
for(auto word : dictionary) {
for(auto location : word.second) {
// indexedCount++;
masterDictionary[word.first].push_back(location);
}
}
}
*/
while ( *alive || pointerToDictionaries->Size( ) > 0 )
{
if( pointerToDictionaries->Size( ) > 0)
{
DocIndex *dictionary = pointerToDictionaries->Pop( );
numberDocsIndexed++;
DocumentEnding docEnd = DocumentEnding( );
size_t indexedCount = 0;
currentBlockNumberDocs++;
for ( auto word : *dictionary )
{
if ( word.first.at( 0 ) == '=' )
{
docEnd.url = word.first.substr( 1, word.first.length( ));
continue;
}
chunkDictionary[word.first].docFrequency++;
indexedCount += word.second.size( );
currentBlockNumberWords += word.second.size( );
for ( auto location : word.second )
{
masterDictionary[ word.first ].push_back( currentlyIndexed + location );
}
}
currentlyIndexed += indexedCount;
docEnd.docEndPosition = currentlyIndexed;
docEnd.docNumWords = indexedCount;
docEndings.push_back( docEnd );
// add the url -> doc ending mapping
urlToDocEndings[ docEnd.url ] = docEnd.docEndPosition;
if ( currentBlockNumberWords >= 20000 )
{
cout << " --- Saving current chunk --- " << endl;
save( );
saveWordSeek( );
reset( );
}
delete dictionary;
}
}
cout << "Indexer is shutting down" << endl;
save( );
saveWordSeek( );
reset( );
saveChunkDictionary( );
unordered_map < string, DocIndex * > anchorDict = AnchorQueue->Pop( );
SaveAnchorText( &anchorDict );
cout << " Indexer has finished running" << endl;
return;
}
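// --- Editor's sketch, not part of the original commit ---
// Postings in masterDictionary live in one global position space: each
// document's word locations are shifted by currentlyIndexed, and docEndings
// records where each document stops. Assuming docEndings stays sorted by
// docEndPosition (it is appended in indexing order), a reader could map a
// global posting back to its document with a binary search like this:
static const DocumentEnding *docForPosition ( const vector< DocumentEnding > &endings,
                                              size_t position )
   {
   size_t low = 0, high = endings.size( );
   while ( low < high )
      {
      size_t mid = low + ( high - low ) / 2;
      if ( endings[ mid ].docEndPosition <= position )
         low = mid + 1;   // this document ends at or before the posting
      else
         high = mid;      // first ending strictly past the posting wins
      }
   return low < endings.size( ) ? &endings[ low ] : nullptr;
   }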
void Indexer::save()
{
MMDiskHashTable seeker( util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + "-seek.txt", 30, 8 );
string fileName = util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + ".txt";
int file = open( fileName.c_str( ), O_CREAT | O_WRONLY, S_IRWXU );
// TODO: these should really be c strings
string statsHeader = "===STATS==="
"\nunique words: " + to_string( masterDictionary.size( ) ) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( ) ) );
"\nunique words: " + to_string( masterDictionary.size( )) +
"\nnumber words: " + to_string( currentBlockNumberWords ) +
"\nnumber docs: " + to_string( currentBlockNumberDocs ) +
"\n===========\n";
write( file, statsHeader.c_str( ), strlen( statsHeader.c_str( )));
// REALLY GROSS HACK
size_t seekOffset = strlen( statsHeader.c_str( ));
size_t chunkEnd = 0;
for ( auto word : masterDictionary )
{
if ( word.first.size( ) > 30 )
   {
   // MMDiskHashTable keys are fixed at 30 bytes, so longer words are truncated
   string resized = word.first;
   resized.resize( 30 );
   seeker.insert( resized, to_string( seekOffset ));
   }
else
   {
   seeker.insert( word.first, to_string( seekOffset ));
   }
chunkDictionary[ word.first ].chunks.push_back( currentFile );
// string wordBreak = word.first + "\n";
// write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
// seekOffset += strlen(wordBreak.c_str());
bool firstPost = true;
size_t lastOne = 0;
int numIndexed = 0;
for ( auto location : word.second )
{
if ( chunkEnd < location )
   {
   chunkEnd = location;
   }
chunkDictionary[ word.first ].frequency++;
numIndexed++;
if ( numIndexed == 100 )
{
SeekEntry entry = SeekEntry( );
entry.offset = seekOffset;
entry.realLocation = location;
seekDictionary[ word.first ].push_back( entry );
numIndexed = 0;
}
if ( firstPost )
{
string locationSpace = to_string( location ) + " ";
write( file, locationSpace.c_str( ), strlen( locationSpace.c_str( )));
seekOffset += strlen( locationSpace.c_str( ));
firstPost = false;
}
else
{
size_t delta = location - lastOne;
string deltaSpace = to_string( delta ) + " ";
write( file, deltaSpace.c_str( ), strlen( deltaSpace.c_str( )));
seekOffset += strlen( deltaSpace.c_str( ));
}
lastOne = location;
}
chunkDictionary[ word.first ].lastLocation = lastOne;
write( file, "\n", 1 );
seekOffset += 1;
}
string docEndingHeader = "===Document Endings===\n";
write( file, docEndingHeader.c_str( ), strlen( docEndingHeader.c_str( )));
seekOffset += strlen( docEndingHeader.c_str( ));
seeker.insert( "=docEnding", to_string( seekOffset ));
int docEndSeekCounter = 0; // save a seek entry every 100 doc endings in the chunk
for ( auto ending : docEndings )
{
string docEndString = "[" +
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( ) ) );
docEndSeekCounter++;
if(docEndSeekCounter == 100)
{
docEndSeekCounter = 0;
docEndingsSeek.push_back({ ending.docEndPosition, seekOffset });
}
seekOffset += strlen(docEndString.c_str());
ending.url + ", " +
to_string( ending.docEndPosition ) + ", " +
to_string( ending.docNumWords ) + "]\n";
write( file, docEndString.c_str( ), strlen( docEndString.c_str( )));
docEndSeekCounter++;
if ( docEndSeekCounter == 100 )
{
docEndSeekCounter = 0;
seekDictionary["=docEnding"].push_back( SeekEntry(ending.docEndPosition, seekOffset ));
}
seekOffset += strlen( docEndString.c_str( ));
}
chunkEndLocation.push_back(chunkEnd);
close( file );
//seeker.CloseFile();
}
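// --- Editor's sketch, not part of the original commit ---
// save( ) writes each word's postings as an absolute first position followed
// by space-separated gaps, so "12 3 40 " decodes to 12, 15, 55. A reader
// reverses the delta encoding with a running total (assumes <sstream>):
static vector< size_t > decodePostingsLine ( const string &line )
   {
   vector< size_t > positions;
   size_t runningTotal = 0;
   stringstream lineStream( line );
   size_t gap;
   while ( lineStream >> gap )
      {
      runningTotal += gap;   // first token is absolute, the rest are deltas
      positions.push_back( runningTotal );
      }
   return positions;
   }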
void Indexer::saveChunkDictionary()
{
MMDiskHashTable dhtChunk = MMDiskHashTable( util::GetCurrentWorkingDir( ) + pathToIndex + "master.txt", 30, 168 );
for ( auto word : chunkDictionary )
{
string key = word.first;
if ( key.size( ) > 30 )
   {
   key.resize( 30 );
   }
string value = "";
for ( auto chunk : word.second.first )
string value = "";
for ( auto chunk : word.second.chunks )
{
value += to_string( chunk ) + " ";
}
value += "\t" + to_string(word.second.second);
dhtChunk.insert(key, value);
value += "\t" + to_string( word.second.frequency );
value += "\t" + to_string( word.second.lastLocation);
value += "\t" + to_string( word.second.docFrequency);
dhtChunk.insert( key, value );
}
dhtChunk.insert("=totalNumberIndexed", to_string(totalWordsIndexed));
}
dhtChunk.insert( "=totalNumberIndexed", to_string( currentlyIndexed ));
dhtChunk.insert("=totalDocsIndexed", to_string(numberDocsIndexed));
int currentChunk = 0;
for(auto location : chunkEndLocation) {
string key = "=chunk" + to_string(currentChunk);
dhtChunk.insert(key, to_string(location));
}
}
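// --- Editor's sketch, not part of the original commit ---
// Each master.txt value packs four tab-separated fields:
// "<chunk list>\t<frequency>\t<lastLocation>\t<docFrequency>". Combined with
// the "=totalDocsIndexed" entry, the new docFrequency field is enough to
// score words by inverse document frequency (assumes <cmath>):
static double inverseDocFrequency ( const string &masterValue, double totalDocs )
   {
   size_t lastTab = masterValue.rfind( '\t' );   // docFrequency is the final field
   double docFrequency = stod( masterValue.substr( lastTab + 1 ) );
   if ( docFrequency <= 0 )
      return 0;
   return log( totalDocs / docFrequency );       // idf = log(N / df)
   }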
void Indexer::saveWordSeek()
{
MMDiskHashTable wordSeek = MMDiskHashTable(
      util::GetCurrentWorkingDir( ) + pathToIndex + to_string( currentFile ) + "-wordseek.txt", 30, 168 );
for ( auto word : seekDictionary )
   {
string key = word.first;
if ( key == "=docEnding" )
   {
   continue; // doc endings are written as their own partitioned entries below
   }
if ( key.size( ) > 30 )
   {
   key.resize( 30 );
   }
string value = "";
for ( auto entry : word.second )
   {
   value += ("<" + to_string( entry.offset ) + ", " + to_string( entry.realLocation ) + "> ");
   }
wordSeek.insert( key, value );
}
string key = "=docEnding";
string value = "";
int currentEndingPartition = 0;
for(size_t i = 0; i < docEndingsSeek.size(); i++) {
string prospectiveDocEnding = "<" +
to_string(docEndingsSeek[i].first) +
", " + to_string(docEndingsSeek[i].second) + "> ";
if(value.size() + prospectiveDocEnding.size() <= 168) {
value += prospectiveDocEnding;
} else {
wordSeek.insert(key + to_string(currentEndingPartition), value);
currentEndingPartition++;
value = prospectiveDocEnding;
}
}
}
void Indexer::verbose_save ( )
{
map< string, vector< size_t > > maps( masterDictionary.begin( ), masterDictionary.end( ) );
for ( auto word : maps )
string key = "=docEnding";
string value = "";
int currentEndingPartition = 0;
for ( size_t i = 0; i < seekDictionary["=docEnding"].size( ); i++ )
{
cout << word.first << endl;
for ( auto location : word.second )
string prospectiveDocEnding = "<" +
to_string( seekDictionary["=docEnding"][ i ].offset ) +
", " + to_string( seekDictionary["=docEnding"][ i ].realLocation ) + "> ";
if ( value.size( ) + prospectiveDocEnding.size( ) <= 168 )
{
value += prospectiveDocEnding;
}
else
{
cout << location << " ";
wordSeek.insert( key + to_string( currentEndingPartition ), value );
currentEndingPartition++;
value = prospectiveDocEnding;
}
cout << endl;
}
currentFile++;
currentFile++;
}
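// --- Editor's sketch, not part of the original commit ---
// Each "-wordseek.txt" value is a list of "<offset, realLocation>" pairs,
// one per 100 postings: offset is a byte position inside the chunk file and
// realLocation is the global position already reached there. A reader can
// jump near the posting it wants instead of decoding the whole line
// (assumes <unistd.h>, which this file already needs for write/close):
static void seekToPostingBatch ( int chunkFileDescriptor, const SeekEntry &entry )
   {
   // land at the recorded byte offset; delta decoding then resumes with
   // entry.realLocation as the running total instead of starting from zero
   lseek( chunkFileDescriptor, ( off_t ) entry.offset, SEEK_SET );
   }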
void Indexer::reset()
{
masterDictionary.clear( );
docEndings.clear( );
seekDictionary.clear( );
currentBlockNumberWords = 0;
currentBlockNumberDocs = 0;
}
void Indexer::Kill()
{
*(this->alive) = false;
}
void Indexer::SaveAnchorText( unordered_map < string, DocIndex * > *anchorDict )
{
// TODO: create pointer to anchor
// pointerToAnchor->Pop();
// pass a dictionary of map<url string> -> vector<anchor word>
// for each url in the map:
//    look up the url string in the url -> docEnding map
//    for each anchor text in the url's map:
//       create an anchor text -> list of doc endings
// write to disk
cout << " -- SAVING ANCHOR TEXT --- " << endl;
for ( auto const &ent1 : *anchorDict )
{
auto const &outer_key = ent1.first;
cout << "url: " << outer_key << endl;
if ( urlToDocEndings.find( outer_key ) != urlToDocEndings.end( ))
{
size_t docEndForUrl = urlToDocEndings[ outer_key ];
cout << "Urls doc end : " << docEndForUrl << endl;
}
DocIndex *inner_map = ent1.second;
for ( auto const &ent2 : *inner_map )
{
auto const &inner_key = ent2.first;
auto const &inner_value = ent2.second;
//cout << "url: " << outer_key << endl;
//cout << "anchor text : " << inner_key << endl;
//for(auto offset :inner_value)
// cout << "offset " << offset << endl;
}
}
}
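// --- Editor's sketch, not part of the original commit ---
// One hypothetical way to finish the TODO above: invert the popped anchor
// dictionary from url -> (anchor word -> offsets) into anchor word -> list of
// doc endings, skipping urls that were never indexed. The names below are
// illustrative, not part of the codebase:
static unordered_map< string, vector< size_t > > invertAnchorText (
      const unordered_map< string, DocIndex * > &anchorDict,
      const unordered_map< string, size_t > &urlToDocEnds )
   {
   unordered_map< string, vector< size_t > > anchorWordToDocEnds;
   for ( auto const &urlEntry : anchorDict )
      {
      auto found = urlToDocEnds.find( urlEntry.first );
      if ( found == urlToDocEnds.end( ) )
         continue;   // anchor points at a page this indexer never saw
      for ( auto const &wordEntry : *urlEntry.second )
         anchorWordToDocEnds[ wordEntry.first ].push_back( found->second );
      }
   return anchorWordToDocEnds;
   }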
@@ -4,9 +4,10 @@
#include "../shared/ProducerConsumerQueue.h"
#include "../shared/ThreadClass.h"
#include "DocumentEnding.h"
#include "PostingsSeekTableEntry.h"
#include "SeekEntry.h"
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../util/util.h"
#include "WordInfo.h"
#include <unordered_map>
#include <map>
#include <vector>
@@ -34,35 +35,39 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >;
class Indexer : public ThreadClass
{
public:
Indexer ( ProducerConsumerQueue< DocIndex * > *doc_index_queue_in,
          ProducerConsumerQueue< unordered_map< string, DocIndex * > > *anchor_in );
void run ( );
void verbose_run ( );
void verbose_save ( );
void Kill ( );
private:
void save ( );
void saveWordSeek();
void saveChunkDictionary ( );
void SaveAnchorText( unordered_map<string , DocIndex*> * anchorDict );
void reset ( );
ProducerConsumerQueue< DocIndex * > *pointerToDictionaries;
ProducerConsumerQueue< unordered_map< string, DocIndex * > > *AnchorQueue;
// for the master.txt file - the chunks each word appears in, its last real location,
// total frequency, and document frequency
unordered_map< string, WordInfo > chunkDictionary;
vector<size_t> chunkEndLocation;
unordered_map< string, vector< size_t > > masterDictionary;
unordered_map< string, vector< SeekEntry > > seekDictionary;
vector< DocumentEnding > docEndings;
size_t totalWordsIndexed;
unordered_map< string, size_t> urlToDocEndings;
size_t currentFile;
size_t currentlyIndexed;
size_t currentBlockNumberWords;
size_t currentBlockNumberDocs;
size_t numberDocsIndexed;
atomic_bool* alive = new atomic_bool(true);
};
@@ -18,6 +18,7 @@ using DocIndex = const unordered_map< string, vector< unsigned long > >;
int main ( ) {
vector<ifstream *> files;
ProducerConsumerQueue< DocIndex * > *IndexerQueue = new ProducerConsumerQueue< DocIndex * >( );
ProducerConsumerQueue< unordered_map<string , DocIndex * > > *AnchorQueue = new ProducerConsumerQueue< unordered_map<string , DocIndex * > >( );
for (int i = 0; i < 60; i++) {
string fileName = util::GetCurrentWorkingDir() + "/indexer/tests/twitter/" + to_string(i) + ".json";
if (i < 10) {
@@ -68,9 +69,10 @@ int main ( ) {
}
}
}
Indexer indexer = Indexer(IndexerQueue, AnchorQueue);
indexer.StartThread( );
indexer.WaitForFinish();
return 0;
/*
string query;
cout << "What is your query?" << endl;
#include <iostream>
#include "../DataStructures/DiskHashTable/MMDiskHashTable.h"
#include "../util/util.h"
int main() {
string fileName = util::GetCurrentWorkingDir() + "/build/master.txt";
MMDiskHashTable master = MMDiskHashTable(fileName, 30, 168);
int currentChunk = 0;
string key = "=totalDocsIndexed";
string value = master.find(key);
cout << value << endl;
}
#pragma once
class SeekEntry
{
public:
SeekEntry ( )
{
offset = 0;
realLocation = 0;
}
SeekEntry (size_t offset_in, size_t realLocation_in) : offset(offset_in),
realLocation(realLocation_in)
{}
size_t offset;
size_t realLocation;
};
#pragma once
#include <cstddef>
#include <vector>
class WordInfo {
public:
    std::vector<int> chunks;   // chunk files this word appears in
    size_t frequency = 0;      // total occurrences of the word across the corpus
    size_t docFrequency = 0;   // number of documents containing the word
    size_t lastLocation = 0;   // last global location recorded for the word
};