Merge branch 'milestone2' of https://gitlab.eecs.umich.edu/vcday/eecs398-search into milestone2

89159931 · benbergk · 1ab1c97d · 129bba17 · 89159931 · 89159931
Commit 89159931 authored 6 years ago by benbergk
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,6 +120,7 @@ add_executable(ISRWord-tests
        util/util.cpp
        constraintSolver/ISRWord.cpp
        constraintSolver/tests/ISRWordTests.cpp
+        constraintSolver/ISREndDoc.cpp
        util/stringProcessing.cpp
        util/Stemmer.cpp )

--- a/ISRWord-tests
+++ b/ISRWord-tests
--- a/constraintSolver/ISREndDoc.cpp
+++ b/constraintSolver/ISREndDoc.cpp
@@ -3,3 +3,106 @@
 //
 #include "ISREndDoc.h"
+// Builds a reader positioned at chunk 0 with no chunk file opened or mapped.
+// memMap has to start out null: next() compares it against nullptr to decide
+// whether a chunk file still needs to be opened, and an uninitialized pointer
+// would make that comparison read an indeterminate value.
+ISREndDoc::ISREndDoc()
+    : memMap(nullptr), currentChunk(0), currentFile(-1) {
+}
+// Advances to the next document-ending record and returns it.
+//
+// Returns:
+//   - the parsed record (also stored as the current document) when a full line was read;
+//   - a default-constructed DocumentEnding when a '\0' byte is reached, which is treated as
+//     the end of the current chunk; the following call opens the next chunk;
+//   - a DocumentEnding whose url is "aaa" once chunk 8 is reached, or when the next chunk
+//     cannot be opened, mapped, or located through its seek file.
+// May throw whatever stoll throws if a numeric field is malformed.
+DocumentEnding ISREndDoc::next() {
+    // The end-of-index chunk (8) is tested before any file is touched so that a missing
+    // index8.txt is never opened, mapped or dereferenced.
+    if(currentChunk != 8 && memMap == nullptr) {
+        string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + ".txt";
+        currentFile = open(fileName.c_str(), O_RDONLY);
+        vector<size_t> contents = getSeekContents();
+        if(currentFile >= 0 && !contents.empty()) {
+            void* mapping = mmap(nullptr, util::FileSize(currentFile), PROT_READ, MAP_PRIVATE, currentFile, 0);
+            if(mapping != MAP_FAILED) {
+                // contents[0] is the first number the seek file lists after "=docEnding";
+                // it is used as the byte offset at which parsing starts.
+                memMap = static_cast<char*>(mapping) + contents[0];
+            }
+        }
+        // Opening succeeded but the chunk is unusable: release the descriptor again.
+        if(memMap == nullptr && currentFile >= 0) {
+            close(currentFile);
+            currentFile = -1;
+        }
+    }
+    if(currentChunk == 8 || memMap == nullptr) {
+        DocumentEnding a = DocumentEnding();
+        a.url = "aaa";
+        return a;
+    }
+    string currentOne;
+    // Queried once per call instead of once per byte scanned.
+    // NOTE(review): memMap has already been advanced past the start of the mapping, so this
+    // bound can lie beyond the mapped file; the scan relies on meeting '\n' or '\0' first.
+    // Storing the mapping base and length in the class would allow an exact bound.
+    char* const scanEnd = memMap + util::FileSize(currentFile);
+    for(char* map = memMap; map < scanEnd; map++) {
+        if(*map == '\0') {
+            // End of this chunk: the descriptor is no longer needed.
+            // NOTE(review): the mapping itself cannot be unmapped here because its base
+            // address is not retained.
+            close(currentFile);
+            currentFile = -1;
+            currentChunk++;
+            memMap = nullptr;
+            return DocumentEnding();
+        }
+        if(*map == '\n') {
+            // Remember where the next record starts for the following call.
+            memMap = map;
+            memMap++;
+            break;
+        }
+        switch(*map) {
+            case '[':
+                // A new record begins.
+                currentDoc = DocumentEnding();
+                break;
+            case ']':
+                // The text collected since the last separator is the word count.
+                currentDoc.docNumWords = stoll(currentOne);
+                currentOne = "";
+                break;
+            case ',':
+                // First comma closes the url, second closes the end position.
+                if(currentDoc.url == "") {
+                    currentDoc.url = currentOne;
+                    currentOne = "";
+                } else if(currentDoc.docEndPosition == 0) {
+                    currentDoc.docEndPosition = stoll(currentOne);
+                    currentOne = "";
+                }
+                break;
+            case ' ':
+                break;
+            default:
+                currentOne += *map;
+                break;
+        }
+    }
+    return currentDoc;
+}
+// Returns, by value, the record currently held in currentDoc (the one next() fills in).
+DocumentEnding ISREndDoc::getCurrentDoc() {
+    return this->currentDoc;
+}
+// Reads the seek file of the current chunk and returns the numbers listed after the
+// "=docEnding" key.
+//
+// The file is scanned as whitespace-separated tokens. Once the token "=docEnding" has been
+// seen, every following token is converted with stoll and collected, until a token starting
+// with a letter (the next key) or the end of the file is reached.
+// Returns an empty vector when the file cannot be opened or mapped, or the key is absent.
+// May throw whatever stoll throws if a collected token is not a number.
+vector<size_t> ISREndDoc::getSeekContents() {
+    vector<size_t> contents;
+    string fileName = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(currentChunk) + "-seek.txt";
+    int file = open(fileName.c_str(), O_RDONLY);
+    if(file < 0) {
+        return contents;
+    }
+    ssize_t fileSize = util::FileSize(file);
+    void* mapping = mmap(nullptr, fileSize, PROT_READ, MAP_PRIVATE, file, 0);
+    // A mapping stays valid after its descriptor is closed, so release the descriptor now.
+    close(file);
+    if(mapping == MAP_FAILED) {
+        return contents;
+    }
+    // Named seekMap so it does not shadow the memMap member.
+    const char* const seekMap = static_cast<const char*>(mapping);
+    const char* const seekEnd = seekMap + fileSize;
+    string word = "";
+    bool midWord = false;
+    bool midFind = false;
+    const char* map = seekMap;
+    for(; map < seekEnd; map++) {
+        // isalpha requires a value representable as unsigned char.
+        if(midFind && isalpha(static_cast<unsigned char>(*map))) {
+            break;
+        }
+        switch(*map) {
+            case '\n':
+            case '\r':
+            case '\t':
+            case ' ':
+                if (midFind && word != "") {
+                    contents.push_back(stoll(word));
+                    word = "";
+                } else if (midWord) {
+                    midWord = false;
+                    if(word == "=docEnding") {
+                        midFind = true;
+                    }
+                    word = "";
+                }
+                break;
+            default:
+                word += *map;
+                midWord = true;
+        }
+    }
+    // A final number that is not followed by whitespace would otherwise be lost.
+    if(map == seekEnd && midFind && word != "") {
+        contents.push_back(stoll(word));
+    }
+    munmap(mapping, fileSize);
+    return contents;
+}
\ No newline at end of file
--- a/constraintSolver/ISREndDoc.h
+++ b/constraintSolver/ISREndDoc.h
@@ -4,22 +4,37 @@
 #pragma once
-#include "ISR.h"
+#include <iostream>
+#include <vector>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "WordSeek.h"
+#include "../util/util.h"
+#include "../indexer/DocumentEnding.h"
 // Find occurrences of document ends.
-class ISREndDoc : ISR
+class ISREndDoc  // reads document-ending records from the per-chunk index files
 	{
 public:
-	Location GetCurrentLocation ( );
+	ISREndDoc();  // starts at chunk 0
+	DocumentEnding next();  // parses and returns the next record; a url of "aaa" signals the end
-	Location GetPreviousLocation ( );
+	DocumentEnding getCurrentDoc();  // returns currentDoc by value
 	unsigned GetDocumentLength ( );
 	unsigned GetTitleLength ( );
+	string getURL ( );  // NOTE(review): no definition is visible in this change - confirm one exists
-	unsigned GetUrlLength ( );
+private:
-	};
+	DocumentEnding currentDoc;  // record that next() fills in field by field
+	char* memMap;  // read position inside the mapped chunk file; next() opens a chunk when this is nullptr. NOTE(review): declared without an initializer
+	int currentChunk;  // number used to build the index<N>.txt and index<N>-seek.txt file names
+    int currentFile;  // descriptor returned by open() for the current chunk file
+    vector<size_t> getSeekContents();  // numbers listed after "=docEnding" in the current chunk's seek file
+};
--- a/constraintSolver/ISRWord.cpp
+++ b/constraintSolver/ISRWord.cpp
@@ -72,7 +72,7 @@ vector<size_t> ISRWord::getSeekContents(string fileName) {
 void ISRWord::getChunks() {
-    listOfChunks = getSeekContents("index-test-files/twitter/index-master.txt");
+    listOfChunks = getSeekContents(util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index-master.txt");
 //    int chunkFile = open("index-test-files/twitter/index-master.txt", O_RDONLY);
 //    ssize_t chunkFileSize = FileSize(chunkFile);
 //    char* chunkMemMap = (char*) mmap(nullptr, chunkFileSize, PROT_READ, MAP_PRIVATE, chunkFile, 0);
@@ -117,16 +117,12 @@ void ISRWord::getChunks() {
 Location ISRWord::first ( )
 	{
-	if ( listOfChunks.size( ) <= currentChunk )
-		{
-		exit( 0 );
-		}
 	string currentChunkSeekFileLocation =
-			util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
+			util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
 			"-seek.txt";
 	vector< size_t > location = getSeekContents( currentChunkSeekFileLocation );
 	string currentChunkFileLocation =
-			util::GetCurrentWorkingDir( ) + "/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
+			util::GetCurrentWorkingDir( ) + "/constraintSolver/index-test-files/twitter/index" + to_string( listOfChunks[ currentChunk ] ) +
 			".txt";
 	int currentChunkFile = open( currentChunkFileLocation.c_str( ), O_RDONLY );
 	ssize_t currentChunkFileSize = FileSize( currentChunkFile );
@@ -156,8 +152,14 @@ Location ISRWord::next ( )
 	if ( *currentMemMap == '\n' )
 		{
 		currentChunk++;
-		currentLocation = first( );
+        if(listOfChunks.size( ) <= currentChunk)
-		}
+            {
+            currentLocation = 9999999999999;
+            return currentLocation;
+            }
+            currentLocation = first( );
+        }
 	else
 		{
 		string delta = "";
@@ -172,6 +174,11 @@ Location ISRWord::next ( )
 	return currentLocation;
 	}
+// Reports the location stored in currentLocation; the ISR is not advanced.
+Location ISRWord::getCurrentLocation()
+    {
+    return this->currentLocation;
+    }
 //look through each chunk
 //check if absolute position at offset in chunk is less than chunk,
 //check seek lookup table to find if offset+absolute is bigger than target
@@ -184,7 +191,7 @@ Location ISRWord::seek( Location target ) {
            if(entry.realLocation < target) {
                best = entry;
            } else {
-                string currentChunkFileLocation = "index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
+                string currentChunkFileLocation = util::GetCurrentWorkingDir() + "/constraintSolver/index-test-files/twitter/index" + to_string(listOfChunks[currentChunk]) + ".txt";
                int currentChunkFile = open(currentChunkFileLocation.c_str(), O_RDONLY);
                ssize_t currentChunkFileSize = FileSize(currentChunkFile);
                currentMemMap = (char*) mmap(nullptr, currentChunkFileSize, PROT_READ, MAP_PRIVATE, currentChunkFile, 0);

--- a/constraintSolver/ISRWord.h
+++ b/constraintSolver/ISRWord.h
@@ -54,9 +54,9 @@ public:
 	//set member variables to all of the chunks that occur, update current chunk
 	void getChunks ( );
+	Location getCurrentLocation();
 private:
-	};
+};
--- a/constraintSolver/tests/ISRWordTests.cpp
+++ b/constraintSolver/tests/ISRWordTests.cpp
@@ -3,20 +3,36 @@
 //
 #include <iostream>
+#include <set>
+#include "../../indexer/DocumentEnding.h"
 #include "../ISRWord.h"
+#include "../ISREndDoc.h"
 using namespace std;
 int main ( )
-	{
+{
+    // Prints the url of every document that contains the query word "iphone".
-	char *w = new char[10];
-	strcpy( w, "hello" );
+    ISRWord queryWord("iphone");
-	ISRWord word = ISRWord( w );
+    ISREndDoc endDocs;
+    vector<size_t> locations;
+    set<string> urls;
+    // 9999999999999 is the value ISRWord::next() stores and returns once its chunks run out.
+    while(queryWord.getCurrentLocation() != 9999999999999) {
+        locations.push_back(queryWord.next());
+    }
+    // A url of "aaa" is how ISREndDoc::next() signals that there are no more documents.
+    while(endDocs.next().url != "aaa") {
+        // Fetch the record once per document rather than copying it several times per location.
+        const DocumentEnding doc = endDocs.getCurrentDoc();
+        for(const auto& locs : locations) {
+            // A location belongs to the document when it lies in [end - length, end).
+            if(locs < doc.docEndPosition &&
+               locs >= (doc.docEndPosition - doc.docNumWords)) {
+                urls.insert(doc.url);
+            }
+        }
-	while ( 1 )
+    }
-		{
+    for(const auto& urrl : urls) {
-		cout << word.next( ) << endl;
+        cout << urrl << endl;
-		}
+    }
-	return 0;
+    return 0;
-	}
+}
\ No newline at end of file
--- a/crawler-parser-indexer-test
+++ b/crawler-parser-indexer-test
--- a/main.cpp
+++ b/main.cpp
@@ -144,6 +144,12 @@ int main ( int argc, char *argv[] )
 	crawler.WaitOnAllSpiders( );
 	indexer.WaitForFinish( );
+	string aa;
+	cin >> aa;
+	if(aa == "q") {
+		return 0;
+	}
 	auto f = urlFrontier->Pop( );
 	int x = 0;

--- a/util/Tokenizer.cpp
+++ b/util/Tokenizer.cpp
@@ -39,7 +39,9 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
 		{
 		set< char > split = { '.', ':', '/', '\\', '_', '?', '-', '~', '#', '[', ']', '@', '!', '$', '&', '\'',
 		                      '(', ')', '*', '+', ',', ';', '=' };
+        string codedURL = "=";
+        codedURL += originalText;
+        (*docIndex)[codedURL].push_back(0);
 		return tokenize( splitStr( originalText, split, true ), offset, decorator );
 		}