// Indexer.cpp
// NOTE: this listing was captured from a repository blame view; page
// navigation text and commit-author captions are not part of the program.
    #include "Indexer.h"
    
    /**
     * Constructs an Indexer with its counters zeroed.
     *
     * Sets the index-file sequence number (`currentFile`) and the per-block
     * word and document tallies to zero.
     */
    Indexer::Indexer() {
        currentFile = 0;
        currentBlockNumberWords = 0;
        currentBlockNumberDocs = 0;
        // NOTE(review): run() accumulates into `currentlyIndexed`, which is not
        // assigned here — confirm it is initialised in Indexer.h.
    }
    
    void Indexer::run() {
        while(pointerToDictionaries.Size() != 0) {
    
            unordered_map<string, vector<int> >* dictionary = pointerToDictionaries.Pop();
            DocumentEnding docEnd = DocumentEnding();
    
    Nicholas Yang's avatar
    Nicholas Yang committed
            size_t indexedCount = 0;
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
            for(auto word : *dictionary) {
    
                if(word.first.at(0) == '=') {
    
                    docEnd.url = word.first.substr(1, word.first.length());
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
                indexedCount += word.second.size();
                currentBlockNumberWords += word.second.size();
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
                for(auto location : word.second) {
    
                    masterDictionary[word.first].push_back(currentlyIndexed + location);
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
            currentlyIndexed += indexedCount;
            docEnd.docEndPosition = currentlyIndexed;
            docEnd.docNumWords = indexedCount;
            docEndings.push_back(docEnd);
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
            if(currentBlockNumberWords >= 300000) {
                save();
                reset();
            }
    
    yangni's avatar
    yangni committed
        }
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
    zldunn's avatar
    zldunn committed
    void Indexer::verbose_run() {
        while(pointerToDictionaries.Size() != 0) {
            unordered_map<string, vector<int>> dictionary = *pointerToDictionaries.Pop();
            for(auto word : dictionary) {
    	        for(auto location : word.second) {
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    //                indexedCount++;
    
    zldunn's avatar
    zldunn committed
                    masterDictionary[word.first].push_back(location);
                    }
                }
            }
        }
    
    
    void Indexer::save() {
    
        map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
    
        string fileName = "index" + to_string(currentFile) + ".txt";
        int file = open(fileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
        // TODO: these should really be c strings
        string header = "===STATS===\n";
        string uniqueWords = "unique words: " + to_string(masterDictionary.size()) + "\n";
        string numberWords = "number words: " + to_string(currentBlockNumberWords) + "\n";
        string numberDocs = "number docs: " + to_string(currentBlockNumberDocs) + "\n";
    
        string footer = "===========\n";
    
        write(file, header.c_str(), strlen(header.c_str()));
        write(file, uniqueWords.c_str(), strlen(uniqueWords.c_str()));
        write(file, numberWords.c_str(), strlen(numberWords.c_str()));
        write(file, numberDocs.c_str(), strlen(numberDocs.c_str()));
        write(file, footer.c_str(), strlen(footer.c_str()));
    
    Nicholas Yang's avatar
    Nicholas Yang committed
    
    
        // REALLY GROSS HACK
    
    Nicholas Yang's avatar
    Nicholas Yang committed
        size_t seekOffset = strlen(header.c_str()) +
    
                         strlen(numberDocs.c_str()) +
                         strlen(numberWords.c_str()) +
                         strlen(uniqueWords.c_str()) +
                         strlen(footer.c_str());
    
    
        for(auto word : maps) {
    
    Nicholas Yang's avatar
    Nicholas Yang committed
            seeker[word.first] = seekOffset;
    
    //        string wordBreak = word.first + "\n";
    //        write(file, wordBreak.c_str(), strlen(wordBreak.c_str()));
    //        seekOffset += strlen(wordBreak.c_str());
    
    Nicholas Yang's avatar
    Nicholas Yang committed
            bool firstPost = true;
    
            size_t lastOne = 0;
    
            for(auto location : word.second) {
    
    Nicholas Yang's avatar
    Nicholas Yang committed
                if(firstPost) {
    
                    string locationSpace = to_string(location) + " ";
                    write(file, locationSpace.c_str(), strlen(locationSpace.c_str()));
                    seekOffset += strlen(locationSpace.c_str());
    
    Nicholas Yang's avatar
    Nicholas Yang committed
                    firstPost = false;
    
                } else {
                    size_t delta = location - lastOne;
                    string deltaSpace = to_string(delta) + " ";
                    write(file, deltaSpace.c_str(), strlen(deltaSpace.c_str()));
                    seekOffset += strlen(deltaSpace.c_str());
                }
                lastOne = location;
    
            write(file, "\n", 1);
    
            seekOffset += 1;
    
        string docEndingHeader = "===Document Endings===\n";
        write(file, docEndingHeader.c_str(), strlen(docEndingHeader.c_str()));
    
        for(auto ending : docEndings) {
            string docEndString = "[" +
                    ending.url + ", " +
                    to_string(ending.docEndPosition) + ", " +
                    to_string(ending.docNumWords) + "]\n";
            write(file, docEndString.c_str(), strlen(docEndString.c_str()));
        }
    
    
        // TODO: seek dictionary
        string seekFileName = "index" + to_string(currentFile) + "-seek.txt";
        int seekFile = open(seekFileName.c_str(), O_CREAT | O_WRONLY, S_IRWXU);
        for(auto word : seeker) {
            string line = word.first + " " + to_string(word.second) + "\n";
            write(seekFile, line.c_str(), strlen(line.c_str()));
        }
    
    
        close(file);
    
        currentFile++;
    
    zldunn's avatar
    zldunn committed
    
    void Indexer::verbose_save() {
    
        map<string, vector<size_t> > maps(masterDictionary.begin(), masterDictionary.end());
    
    zldunn's avatar
    zldunn committed
        for(auto word : maps) {
            cout << word.first << endl;
            for(auto location : word.second) {
                cout << location << " ";
                }
    	    cout << endl;
            }
        currentFile++;
        }
    
    
    void Indexer::reset() {
        masterDictionary.clear();
    
        docEndings.clear();
    
    
        currentBlockNumberWords = 0;
        currentBlockNumberDocs = 0;