diff --git a/Parser.cpp b/Parser.cpp
index 259a1f1e4c94e72fcb7c1a4c3a2cf35f4bd24e52..3aaf3e1158234672be5ca998a338ae5428a3efdd 100644
--- a/Parser.cpp
+++ b/Parser.cpp
@@ -2,5 +2,9 @@
 // Created by Veronica Day on 1/28/18.
 //
+// keep running count of offset, if stop word: don't increment and remove stopword
+// tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue
+//
+
 #include <string>
 #include <functional>
 #include <queue>
@@ -41,6 +45,11 @@
         if (!inFile)
             cerr << "Unable to open file datafile.txt";
 
-        parse(inFile);
-
+        // "Tokenizer tokenizer();" would declare a function (most vexing
+        // parse); default-construct without parentheses instead.
+        Tokenizer tokenizer;
+        parse(inFile, &tokenizer);
+        // NOTE(review): tokenizer is local — this only works if the enclosing
+        // function returns the map by value; a returned reference would dangle.
+        return tokenizer.get();
     }
@@ -60,14 +69,21 @@
      * @param inFile
      * @return
      */
-    string parse(ifstream inFile)
-    {
-        string word = "";
-        while (!inFile.eof())
-        {
-            inFile >> word;
-            // checks for url
-            check_url(word);
-            // checks for title tags
-            Tokenizer.execute(check_title(word));
+    // Stream is taken by reference: ifstream is non-copyable, so the
+    // original pass-by-value signature could not compile.
+    void parse(ifstream &inFile, Tokenizer *tokenizer)
+    {
+        string word;
+        // operator>> fails at EOF and ends the loop; testing eof() up
+        // front would process the final token twice
+        while (inFile >> word)
+        {
+            // a token consumed as a url is not indexed
+            if (!check_url(word))
+            {
+                // strip title tags, then index the word
+                tokenizer->execute(check_title(word));
+            }
+        }
     }
@@ -80,7 +96,10 @@
      * Checks for url in string word
      * @param word
      */
-    void check_url(string &word)
+    // Returns true when the word contained a url (pushed to URL_PQ),
+    // so the caller can skip indexing it.
+    bool check_url(string &word)
     {
-        if (char* pos = strstr("href", word))
+        // strstr(haystack, needle): search for "href" inside word
+        if (const char* pos = strstr(word.c_str(), "href"))
         {
@@ -92,9 +111,17 @@
             ++pos;
-            while (pos != "\"" && pos != "\'")
-            {
+            // compare the pointed-to character, not the pointer itself, and
+            // advance pos each pass or the loop never terminates
+            while (*pos != '\0' && *pos != '"' && *pos != '\'')
+            {
+                //filter out everything except http, https
                 url += *pos;
+                ++pos;
             }
 
             // send it back to the crawler
             URL_PQ.push(url);
+            return true;
+        }
+
+        return false;
     }
diff --git a/Tokenizer.cpp b/Tokenizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ab66e4b1303184b28e8b407be7b1747c5f2cff0
--- /dev/null
+++ b/Tokenizer.cpp
@@ -0,0 +1,48 @@
+//
+// Created by anvia on 1/31/2018.
+//
+
+#include <string>
+#include <stdio.h>
+#include <string.h>
+#include <unordered_map>
+#include <vector>
+#include "stringProcessing.h"
+using namespace std;
+
+// Builds the in-memory index for a single document: lowercased,
+// stop-word-filtered words mapped to the offsets at which they appear.
+class Tokenizer
+{
+public:
+    Tokenizer() {}
+
+    // Read-only view of the document index; valid for the lifetime of
+    // this Tokenizer.
+    const unordered_map<string, vector<int>> & get() const
+    {
+        // return the object itself: returning &doc_index (a pointer) does
+        // not compile against the reference return type
+        return doc_index;
+    }
+
+    // Split originalText on spaces, lowercase each word, drop stop words,
+    // and record the running offset of every kept word.
+    void execute(string originalText)
+    {
+        int offset = 0;
+        vector<string> splitText = splitStr(originalText, ' ');
+        for (size_t i = 0; i < splitText.size(); ++i)
+        {
+            // lowercase the individual word, not the whole vector
+            string lowerString = toLower(splitText[i]);
+            if (!isStopWord(lowerString))
+            {
+                doc_index[lowerString].push_back(offset);
+                ++offset;
+            }
+        }
+    }
+
+private:
+    unordered_map<string, vector<int>> doc_index;
+};
diff --git a/stringProcessing.h b/stringProcessing.h
new file mode 100644
index 0000000000000000000000000000000000000000..239b15bda1cf4ea41d599472c5ea04ccd93ffc12
--- /dev/null
+++ b/stringProcessing.h
@@ -0,0 +1,84 @@
+//
+// Created by anvia on 1/31/2018.
+//
+
+#ifndef EECS398_SEARCH_STRINGPROCESSING_H
+#define EECS398_SEARCH_STRINGPROCESSING_H
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using namespace std;
+
+//TODO
+//remove tag words
+
+// static: a plain header-scope definition would violate the one-definition
+// rule when this header is included from more than one translation unit
+static const set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
+                                      "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
+                                      "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
+
+// Split originalText at delim; empty tokens (leading/trailing/consecutive
+// delimiters) are dropped.
+inline vector<string> splitStr(string &originalText, char delim)
+{
+    vector<string> splitWords;
+    auto begin = originalText.begin();
+    auto end = originalText.end();
+    while (begin != end)
+    {
+        string word = "";
+        // compare the character *begin, not the iterator, against delim
+        while (begin != end && *begin != delim)
+        {
+            word += *begin;
+            ++begin;
+        }
+
+        // step over the delimiter or the outer loop never advances
+        if (begin != end)
+        {
+            ++begin;
+        }
+
+        if (!word.empty())
+        {
+            splitWords.push_back(word);
+        }
+    }
+
+    return splitWords;
+}
+
+// True when word is one of the indexing stop words.
+inline bool isStopWord(string &word)
+{
+    // set::find returns an iterator; compare against end(), not nullptr
+    return stopWords.find(word) != stopWords.end();
+}
+
+// Lowercased copy of word; non-alphabetic characters pass through unchanged.
+inline string toLower(string &word)
+{
+    string lowerWord = "";
+    for (auto iter = word.begin(); iter != word.end(); ++iter)
+    {
+        // only uppercase ASCII letters ('A'..'Z') shift down by 32; the
+        // original test was inverted and never advanced the iterator
+        if (*iter >= 'A' && *iter <= 'Z')
+        {
+            lowerWord += (*iter + 32);
+        }
+        else
+        {
+            lowerWord += *iter;
+        }
+    }
+
+    return lowerWord;
+}
+
+#endif //EECS398_SEARCH_STRINGPROCESSING_H