From 4fe3b4ee8ba5e26035becc94fbbb1b6008e498db Mon Sep 17 00:00:00 2001 From: aanvi <aanvi@umich.edu> Date: Tue, 13 Feb 2018 18:18:32 -0500 Subject: [PATCH] Adding functions --- Parser.cpp | 77 ++++++++++++++++++++++++++++++++++ stringProcessing.h | 100 +++++++++++++++++++++++++++++++++++++++++++++ test.cpp | 20 +++++++++ 3 files changed, 197 insertions(+) create mode 100644 test.cpp diff --git a/Parser.cpp b/Parser.cpp index 3aaf3e1..e4669fe 100644 --- a/Parser.cpp +++ b/Parser.cpp @@ -6,11 +6,13 @@ // tokenizer returns pointer to document dictionary, parser puts it on the indexer's queue // + #include <string> #include <functional> #include <queue> #include <iostream> #include <fstream> +#include <stringProcessing.h> using namespace std; @@ -88,6 +90,44 @@ private: } + /* + * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST + * Instead of bool, just directly adds on to url queue + */ + void add_urls(string word) + { + string http_tag = "<a href=http"; + + auto word_iter = word.begin(); + auto http_substr = http_tag.begin(); + string url_name = ""; + + //will add all instances of wanted URLs until it hits end of string + while (word_iter != nullptr) + { + // sets word_iterator to next instance of URL from where it currently is + word_iter = findStr(word_iter, http_substr); + //in the case there is no url in the word + if (word_iter != nullptr) + { + //sets iterator to start of url content + word_iter += 12; + url_name = "http"; + //end of url tag + auto tag_end = ("</a>", word); + while (word_iter != tag_end) + { + url_name += *word_iter; + ++word_iter; + } + //sets iterator to one past the closing tag </a> + word_iter += 4; + URL_PQ.push(url_name); + //resets substr so it can look for next instance + http_substr = http_tag.begin(); + } + } + } /** * Checks for url in string word * @param word @@ -121,6 +161,43 @@ private: * <title >AJF</title> * @param word */ + + string check_title_handle(string &word) + { + string titleTag = "<title>"; + string closeTitleTag = "</title>"; + auto wordBegin = word.begin(); + + string allTitles = ""; + while (wordBegin != nullptr) + { + wordBegin = findStr(wordBegin, titleTag); + + if (wordBegin == nullptr) + { + return allTitles; + } + + //increments until first letter past opening title tag + wordBegin += 7; + auto end_title = findStr(wordBegin, closeTitleTag); + while (wordBegin != end_title) + { + allTitles += *wordBegin; + ++wordBegin; + + if (wordBegin == nullptr) + { + return allTitles; + } + } + + //increments until first letter past closing title tag + wordBegin += 8; + } + + return allTitles; + } string check_title(string &word) { if (char* pos = strstr("<title>", word)) diff --git a/stringProcessing.h b/stringProcessing.h index 239b15b..bc830b6 100644 --- a/stringProcessing.h +++ b/stringProcessing.h @@ -8,18 +8,118 @@ #include <string> #include <unordered_map> #include <vector> +#include <iterator> using namespace std; //TODO //remove tag words +//assuming we have a buffer from File Handle +//returns pointer to subStr in original text +/*char* preFindStr(char* originalText, char* subStr) +{ + bool isFound = true; + char* temp = originalText; + while ( isFound ) + { + if ( *originalText == *subStr ) + { + + } + } +}*/ +/*char* postFindStr(char* originalText, char* subStr) +{ + bool isFound = true; + while ( originalText != nullptr ) + { + if ( *originalText != *subStr ) + { + ++originalText; + } + + if ( *originalText == *substr ) + if (originalText == nullptr || substr == nullptr) + { + isFound = false; + } + + else if ( *originalText != *subStr ) + { + isFound = false; + } + + else + { + isFound = true; + } + + ++subStr; + ++originalText; + } + + return originalText; + +}*/ + + +/* + * Takes in an iterator to the original text and a substring: specifically for a parser functionality + * Potentially make one that takes in two strings? Is this needed? + */ +string::iterator findStr(string::iterator originalText, string &subStr) +{ + + auto begin_sub = subStr.begin(); + auto begin_original = originalText; + + while ( begin_original != nullptr) + { + //keep looking for instance of a match + if ( *begin_original != *begin_sub ) + { + ++begin_original; + } + + else if ( *begin_original == *begin_sub ) + { + /* want to keep the original iterator where it is so it + can return the beginning of the matched word if found */ + auto temp = begin_original; + while ( *temp == *begin_sub ) + { + ++temp; + ++begin_sub; + //if it hits the end of the substring, it signifies an exact match + if ( begin_sub == nullptr) + { + //this is pointing at the beginning of the match + return begin_original; + } + + } + //need to reset because still has to search rest of the string for a match + begin_sub = subStr.begin(); + //sets the original text pointer to where the last search left off + begin_original = temp; + } + + else + { + //DO NOTHING + } + } + + return begin_original; +} set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how", "i", "in", "is", "it", "its", "many ","me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that", "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" }; vector<string> splitStr(string &originalText, char delim) { + vector<string> splitWords; auto begin = originalText.begin(); auto end = originalText.end(); diff --git a/test.cpp b/test.cpp new file mode 100644 index 0000000..155d22d --- /dev/null +++ b/test.cpp @@ -0,0 +1,20 @@ +// +// Created by anvia on 2/6/2018. +// + +#include <string> +#include <stringProcessing.h> +#include <iostream> +using namespace std; + +int main() +{ + string original = "It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout." + "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," + "making it look like readable English. "; + + string subStr = "readable"; + auto iter = findStr(subStr, original); + cout << *iter << endl; +} + -- GitLab