From 910c3ec512727e12f63b5873697e99b6ad5de609 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Wed, 21 Feb 2018 21:56:31 -0500 Subject: [PATCH] fixed style --- parser/Parser.h | 222 +++++++++++++------------ parser/Tokenizer.h | 53 +++--- parser/tests/StringProcessing_unit.cpp | 110 ++++++------ parser/tests/TokenizerTest_unit.cpp | 26 +-- util/stringProcessing.h | 200 +++++++++++----------- 5 files changed, 313 insertions(+), 298 deletions(-) diff --git a/parser/Parser.h b/parser/Parser.h index 11f07f9..52e4cbe 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -18,8 +18,8 @@ using namespace std; // Doc Id -std::priority_queue<int> DOCID_PQ; -std::priority_queue<string> URL_PQ; +std::priority_queue< int > DOCID_PQ; +std::priority_queue< string > URL_PQ; string PATH = "/doc"; //TEMP - remove once getting actual crawler input @@ -32,124 +32,134 @@ string PATH = "/doc"; // if find url; send to crawler // if find title send string to tokenizer class Parser -{ + { public: - struct raw_data { - string url; - string html_data; + struct raw_data + { + string url; + string html_data; - raw_data(string u, string h) : url(u), html_data(h){} - }; + raw_data ( string u, string h ) : url ( u ), html_data ( h ) + { } + }; - /** - * Parser - * @return - */ - // input: object with char* and URL string - // - const unordered_map<string, vector<int>> execute() - { - Tokenizer tokenizer; - //TEMP - until we get real input from crawler - raw_data data("url", "html"); - parse(data.html_data, &tokenizer); - return tokenizer.get(); - } + /** + * Parser + * @return + */ + // input: object with char* and URL string + // + const unordered_map< string, vector< int>> execute ( ) + { + Tokenizer tokenizer; + //TEMP - until we get real input from crawler + raw_data data ( "url", "html" ); + parse ( data.html_data, &tokenizer ); + return tokenizer.get ( ); + } private: - /** - * Parses file - * @param inFile - * @return - */ - - string parse(string &html_data, Tokenizer *tokenizer) - { - //figure out file handle syntax - pointer to file - string tokenizerInput = ""; - string currentTerm = ""; - for (int i = 0; i < html_data.size(); ++i) { - while (html_data[i] != ' ') { - currentTerm += html_data[i]; - } - - //one method that directly adds urls onto frontier instead of checking for them - add_urls(currentTerm); - check_title(currentTerm); - tokenizerInput += currentTerm; - } - - tokenizer->execute(tokenizerInput); - } - - /* - * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST - * Instead of bool, just directly adds on to url queue - */ - void add_urls(string &word) - { - string a_tag = "<a"; - string http_start = "href=http"; - string http_end_tag = ">"; - - auto word_iter = word.begin(); - string url = ""; - word_iter = findStr(word_iter, a_tag); - if (word_iter != nullptr) { - auto found_http = findStr(word_iter, http_start); - if (found_http != nullptr) { - url = "http"; - found_http += 9; - auto end_http = findStr(word_iter, http_end_tag); - while (found_http != end_http) { - url += *found_http; - ++found_http; - } - } - } - - else { - return; - } - - if (url != "") { - URL_PQ.push(url); - } - - - } - /** - * <title >AJF</title> - * @param word - */ - - bool check_title(string &word) - { - if (char* pos = strstr("<title>", word)) - { - pos += 6; - auto end_pos = strstr("</title>", word); - string title = ""; - while (pos != end_pos) - { - ++pos; - title += *pos; - - } - - return title; - } + /** + * Parses file + * @param inFile + * @return + */ + + string parse ( string & html_data, Tokenizer *tokenizer ) + { + //figure out file handle syntax - pointer to file + string tokenizerInput = ""; + string currentTerm = ""; + for ( int i = 0; i < html_data.size ( ); ++i ) + { + while ( html_data[ i ] != ' ' ) + { + currentTerm += html_data[ i ]; + } + + //one method that directly adds urls onto frontier instead of checking for them + add_urls ( currentTerm ); + check_title ( currentTerm ); + tokenizerInput += currentTerm; + } + + tokenizer->execute ( tokenizerInput ); + } + + /* + * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST + * Instead of bool, just directly adds on to url queue + */ + void add_urls ( string & word ) + { + string a_tag = "<a"; + string http_start = "href=http"; + string http_end_tag = ">"; + + auto word_iter = word.begin ( ); + string url = ""; + word_iter = findStr ( word_iter, a_tag ); + if ( word_iter != nullptr ) + { + auto found_http = findStr ( word_iter, http_start ); + if ( found_http != nullptr ) + { + url = "http"; + found_http += 9; + auto end_http = findStr ( word_iter, http_end_tag ); + while ( found_http != end_http ) + { + url += *found_http; + ++found_http; + } + } + } + + else + { + return; + } + + if ( url != "" ) + { + URL_PQ.push ( url ); + } + + + } + + /** + * <title >AJF</title> + * @param word + */ + + bool check_title ( string & word ) + { + if ( char *pos = strstr ( "<title>", word ) ) + { + pos += 6; + auto end_pos = strstr ( "</title>", word ); + string title = ""; + while ( pos != end_pos ) + { + ++pos; + title += *pos; + + } + + return title; + } // string begin_title = "<title>"; // auto word_begin = word.begin(); // auto word_iter = findStr(word_begin, begin_title); - } + } -}; + }; diff --git a/parser/Tokenizer.h b/parser/Tokenizer.h index 8e8c55d..a3443fe 100644 --- a/parser/Tokenizer.h +++ b/parser/Tokenizer.h @@ -6,37 +6,38 @@ #include <unordered_map> #include <vector> #include "../util/stringProcessing.h" + using namespace std; class Tokenizer -{ + { public: - Tokenizer() - { - doc_index = new unordered_map<string, vector<int>>; - } + Tokenizer ( ) + { + doc_index = new unordered_map< string, vector< int>>; + } - unordered_map<string, vector<int>> * get() const - { - return doc_index; - } + unordered_map< string, vector< int>> *get ( ) const + { + return doc_index; + } - void execute(string originalText) - { - int offset = 0; - vector<string> splitText = splitStr(originalText, ' '); - string lowerString = ""; - for (int i = 0; i < splitText.size(); ++i) - { - lowerString = toLower(splitText[i]); - if (!isStopWord(lowerString)) - { - (*doc_index)[lowerString].push_back(offset); - ++offset; - } - } - } + void execute ( string originalText ) + { + int offset = 0; + vector< string > splitText = splitStr ( originalText, ' ' ); + string lowerString = ""; + for ( int i = 0; i < splitText.size ( ); ++i ) + { + lowerString = toLower ( splitText[ i ] ); + if ( !isStopWord ( lowerString ) ) + { + ( *doc_index )[ lowerString ].push_back ( offset ); + ++offset; + } + } + } private: - unordered_map<string, vector<int>> *doc_index; -}; + unordered_map< string, vector< int>> *doc_index; + }; diff --git a/parser/tests/StringProcessing_unit.cpp b/parser/tests/StringProcessing_unit.cpp index 3f6b3e5..3643119 100644 --- a/parser/tests/StringProcessing_unit.cpp +++ b/parser/tests/StringProcessing_unit.cpp @@ -10,12 +10,15 @@ using namespace std; -void test_findStr(string original); -void test_splitStr(string original); -void test_toLower(); -void test_isStopWord(); +void test_findStr ( string original ); -int main() +void test_splitStr ( string original ); + +void test_toLower ( ); + +void test_isStopWord ( ); + +int main ( ) { cout << "Beginning testing for StringProcessing_unit" << endl << endl; @@ -24,85 +27,83 @@ int main() "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "making it look like readable English. "; - test_findStr(original); - test_splitStr(original); - test_toLower(); - test_isStopWord(); + test_findStr ( original ); + test_splitStr ( original ); + test_toLower ( ); + test_isStopWord ( ); cout << "\nTests passed for StringProcessing_unit :D" << endl; } -void test_findStr(string original) +void test_findStr ( string original ) { cout << "Testing findStr..." << endl; string find = "established"; - auto word = findStr(original.begin(), find); - assert(*word == 'e'); + auto word = findStr ( original.begin ( ), find ); + assert( *word == 'e' ); find = "Lorem Ipsum"; - auto word2 = findStr (original.begin(), find); - assert(*word2 == 'L'); + auto word2 = findStr ( original.begin ( ), find ); + assert( *word2 == 'L' ); string title = "<title> This is a test </title>"; find = "<title>"; - auto word3 = findStr (title.begin(), find); - assert(*word3 == '<'); - auto titleIt = title.begin(); - while (word3 != title.end() && titleIt != title.end()) + auto word3 = findStr ( title.begin ( ), find ); + assert( *word3 == '<' ); + auto titleIt = title.begin ( ); + while ( word3 != title.end ( ) && titleIt != title.end ( ) ) { - assert(*word3 == *titleIt); + assert( *word3 == *titleIt ); ++word3; ++titleIt; } find = "</title>"; - auto word4 = findStr (title.begin(), find); - assert(*word4 == '<' && *(word4 + 1) == '/'); + auto word4 = findStr ( title.begin ( ), find ); + assert( *word4 == '<' && *( word4 + 1 ) == '/' ); - auto word0 = findStr (original.begin(), find); - assert(*word0 == '\0'); + auto word0 = findStr ( original.begin ( ), find ); + assert( *word0 == '\0' ); find = "orange"; - auto word5 = findStr (original.begin(), find); - assert(*word5 == '\0'); + auto word5 = findStr ( original.begin ( ), find ); + assert( *word5 == '\0' ); find = "orange"; string test = "apple"; - auto word7 = findStr (test.begin(), find); - assert(*word7 == '\0'); + auto word7 = findStr ( test.begin ( ), find ); + assert( *word7 == '\0' ); find = "bird"; test = "bigbird"; - auto word6 = findStr (test.begin(), find); - assert(*word6 == 'b' && *(word6 + 1) == 'i' && *(word6 + 2) == 'r'); + auto word6 = findStr ( test.begin ( ), find ); + assert( *word6 == 'b' && *( word6 + 1 ) == 'i' && *( word6 + 2 ) == 'r' ); cout << "test_findStr passed" << endl; } -void test_splitStr(string original) +void test_splitStr ( string original ) { cout << "Testing splitStr..." << endl; - vector<string> vec = splitStr (original, ' '); - assert(vec.size() == 53); + vector< string > vec = splitStr ( original, ' ' ); + assert( vec.size ( ) == 53 ); string word = "hello\ngoodbye"; - vec = splitStr (word, '\n'); - assert(vec.size() == 2); - assert(vec[0] == "hello" && vec[1] == "goodbye"); + vec = splitStr ( word, '\n' ); + assert( vec.size ( ) == 2 ); + assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" ); cout << "test_splitStr passed" << endl; } - - -void test_toLower() +void test_toLower ( ) { cout << "Testing toLower..." << endl; @@ -112,25 +113,24 @@ void test_toLower() string word4 = ""; string word5 = " "; - string test = toLower (word); - string test2 = toLower (word2); - string test3 = toLower (word3); - string test4 = toLower (word4); - string test5 = toLower (word5); + string test = toLower ( word ); + string test2 = toLower ( word2 ); + string test3 = toLower ( word3 ); + string test4 = toLower ( word4 ); + string test5 = toLower ( word5 ); - assert (test == "hello"); + assert ( test == "hello" ); cout << test2 << endl; - assert (test2 == "hello"); - assert (test3 == "hello goodbye !"); - assert (test4 == ""); - assert (test5 == " "); + assert ( test2 == "hello" ); + assert ( test3 == "hello goodbye !" ); + assert ( test4 == "" ); + assert ( test5 == " " ); cout << "test_toLower passed" << endl; } - -void test_isStopWord() +void test_isStopWord ( ) { cout << "Testing isStopWord..." << endl; @@ -140,11 +140,11 @@ void test_isStopWord() string blank = ""; string blank2 = " "; - assert (isStopWord (is)); - assert (!isStopWord (hello)); - assert (isStopWord (none)); - assert (!isStopWord (blank)); - assert (!isStopWord (blank2)); + assert ( isStopWord ( is ) ); + assert ( !isStopWord ( hello ) ); + assert ( isStopWord ( none ) ); + assert ( !isStopWord ( blank ) ); + assert ( !isStopWord ( blank2 ) ); cout << "test_isStopWord passed" << endl; diff --git a/parser/tests/TokenizerTest_unit.cpp b/parser/tests/TokenizerTest_unit.cpp index eaf902d..bc189c5 100644 --- a/parser/tests/TokenizerTest_unit.cpp +++ b/parser/tests/TokenizerTest_unit.cpp @@ -10,10 +10,10 @@ using namespace std; -void test_execute(string original); +void test_execute ( string original ); -int main() +int main ( ) { cout << "Beginning testing for TokenizerTest_unit" << endl << endl; @@ -22,27 +22,27 @@ int main() "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "making it look like readable English. "; - test_execute(original); + test_execute ( original ); cout << "\nTests passed for TokenizerTest_unit :D" << endl; } -void test_execute(string original) +void test_execute ( string original ) { Tokenizer my_tokenizer; - my_tokenizer.execute(original); + my_tokenizer.execute ( original ); - auto dict = my_tokenizer.get(); + auto dict = my_tokenizer.get ( ); - for ( auto it = dict->begin(); it != dict->end(); it++ ) + for ( auto it = dict->begin ( ); it != dict->end ( ); it++ ) { - cout << it->first << ':'; - for (int i = 0; i < it->second.size(); ++i) - { - cout << it->second[i] << " "; - } - cout << std::endl ; + cout << it->first << ':'; + for ( int i = 0; i < it->second.size ( ); ++i ) + { + cout << it->second[ i ] << " "; + } + cout << std::endl; } } diff --git a/util/stringProcessing.h b/util/stringProcessing.h index 1b29a84..8c746f4 100644 --- a/util/stringProcessing.h +++ b/util/stringProcessing.h @@ -18,103 +18,107 @@ using namespace std; * Takes in an iterator to the original text and a substring: specifically for a parser functionality * Potentially make one that takes in two strings? Is this needed? */ -string::iterator findStr(string::iterator originalText, string &subStr) -{ - - auto begin_sub = subStr.begin(); - auto begin_original = originalText; - - while ( *begin_original != '\0') //*(forward++) != '\0' - { - //keep looking for instance of a match - if ( *begin_original != *begin_sub ) - { - ++begin_original; - } - - else if ( *begin_original == *begin_sub ) - { - /* want to keep the original iterator where it is so it - can return the beginning of the matched word if found */ - auto temp = begin_original; - while ( *temp == *begin_sub ) - { - ++temp; - ++begin_sub; - //if it hits the end of the substring, it signifies an exact match - if ( *begin_sub == '\0') - { - //this is pointing at the beginning of the match - return begin_original; - } - - } - //need to reset because still has to search rest of the string for a match - begin_sub = subStr.begin(); - //sets the original text pointer to where the last search left off - begin_original = temp; - } - - else - { - //DO NOTHING - } - } - - return begin_original; - -} -set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how", - "i", "in", "is", "it", "its", "many ","me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that", - "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" }; - -vector<string> splitStr(string &originalText, char delim) -{ - vector<string> splitWords; - auto begin = originalText.begin(); - - while (*begin != '\0') - { - string word = ""; - while (*begin != delim && *begin != '\0') - { - word += *begin; - ++begin; - } - - splitWords.push_back(word); - ++begin; - } - - return splitWords; - -} - -bool isStopWord(string &word) -{ - return (stopWords.find(word) != stopWords.end()); - -} - -string toLower(string &word) -{ - auto iter = word.begin(); - string lowerWord = ""; - while (*iter != '\0') - { - if (*iter >= 'A' && *iter <= 'Z') - { - lowerWord += (*iter + 32); - } - - else - { - lowerWord += *iter; - } - ++iter; - } - - return lowerWord; -} +string::iterator findStr ( string::iterator originalText, string & subStr ) + { + + auto begin_sub = subStr.begin ( ); + auto begin_original = originalText; + + while ( *begin_original != '\0' ) //*(forward++) != '\0' + { + //keep looking for instance of a match + if ( *begin_original != *begin_sub ) + { + ++begin_original; + } + + else if ( *begin_original == *begin_sub ) + { + /* want to keep the original iterator where it is so it + can return the beginning of the matched word if found */ + auto temp = begin_original; + while ( *temp == *begin_sub ) + { + ++temp; + ++begin_sub; + //if it hits the end of the substring, it signifies an exact match + if ( *begin_sub == '\0' ) + { + //this is pointing at the beginning of the match + return begin_original; + } + + } + //need to reset because still has to search rest of the string for a match + begin_sub = subStr.begin ( ); + //sets the original text pointer to where the last search left off + begin_original = temp; + } + + else + { + //DO NOTHING + } + } + + return begin_original; + + } + +set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", + "for", "have", "he", "her", "here", "him", "his", "how", + "i", "in", "is", "it", "its", "many ", "me", "my", "none", "of", "on", "or", "our", "she", + "some", "the", "their", "them", "there", "they", "that", + "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", + "you", "your" }; + +vector< string > splitStr ( string & originalText, char delim ) + { + vector< string > splitWords; + auto begin = originalText.begin ( ); + + while ( *begin != '\0' ) + { + string word = ""; + while ( *begin != delim && *begin != '\0' ) + { + word += *begin; + ++begin; + } + + splitWords.push_back ( word ); + ++begin; + } + + return splitWords; + + } + +bool isStopWord ( string & word ) + { + return ( stopWords.find ( word ) != stopWords.end ( ) ); + + } + +string toLower ( string & word ) + { + auto iter = word.begin ( ); + string lowerWord = ""; + while ( *iter != '\0' ) + { + if ( *iter >= 'A' && *iter <= 'Z' ) + { + lowerWord += ( *iter + 32 ); + } + + else + { + lowerWord += *iter; + } + ++iter; + } + + return lowerWord; + } #endif //EECS398_SEARCH_STRINGPROCESSING_H -- GitLab