From 910c3ec512727e12f63b5873697e99b6ad5de609 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Wed, 21 Feb 2018 21:56:31 -0500
Subject: [PATCH] fixed style

---
 parser/Parser.h                        | 222 +++++++++++++------------
 parser/Tokenizer.h                     |  53 +++---
 parser/tests/StringProcessing_unit.cpp | 110 ++++++------
 parser/tests/TokenizerTest_unit.cpp    |  26 +--
 util/stringProcessing.h                | 200 +++++++++++-----------
 5 files changed, 313 insertions(+), 298 deletions(-)

diff --git a/parser/Parser.h b/parser/Parser.h
index 11f07f9..52e4cbe 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -18,8 +18,8 @@
 using namespace std;
 
 // Doc Id
-std::priority_queue<int> DOCID_PQ;
-std::priority_queue<string> URL_PQ;
+std::priority_queue< int > DOCID_PQ;
+std::priority_queue< string > URL_PQ;
 string PATH = "/doc";
 
 //TEMP - remove once getting actual crawler input
@@ -32,124 +32,134 @@ string PATH = "/doc";
 // if find url; send to crawler
 // if find title send string to tokenizer
 class Parser
-{
+	{
 
 public:
 
-    struct raw_data {
-        string url;
-        string html_data;
+	struct raw_data
+		{
+		string url;
+		string html_data;
 
-        raw_data(string u, string h) : url(u), html_data(h){}
-    };
+		raw_data ( string u, string h ) : url ( u ), html_data ( h )
+			{ }
+		};
 
 
-    /**
-     * Parser
-     * @return
-     */
-    // input: object with char*  and URL string
-    //
-    const unordered_map<string, vector<int>> execute()
-    {
-        Tokenizer tokenizer;
-        //TEMP - until we get real input from crawler
-        raw_data data("url", "html");
-        parse(data.html_data, &tokenizer);
-        return tokenizer.get();
-    }
+	/**
+	 * Runs the parser on the raw crawler input and returns the tokenizer's index
+	 * @return map from each term to the offsets at which it appears
+	 */
+	// input: object with the raw html char data and its URL string
+	const unordered_map< string, vector< int > > execute ( )
+		{
+		Tokenizer tokenizer;
+		//TEMP - until we get real input from crawler
+		raw_data data ( "url", "html" );
+		parse ( data.html_data, &tokenizer );
+		// get ( ) returns a pointer to the index, so dereference to match the declared return type
+		return *tokenizer.get ( );
+		}
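+
+	// e.g. Parser p; auto index = p.execute ( ); index maps each term that survived the
+	// stop word filter to the offsets where it appeared (a sketch of the intended use)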
 
 
 private:
 
-    /**
-     * Parses file
-     * @param inFile
-     * @return
-     */
-
-    string parse(string &html_data, Tokenizer *tokenizer)
-    {
-        //figure out file handle syntax - pointer to file
-        string tokenizerInput = "";
-        string currentTerm = "";
-        for (int i = 0; i < html_data.size(); ++i) {
-            while (html_data[i] != ' ') {
-                currentTerm += html_data[i];
-            }
-
-            //one method that directly adds urls onto frontier instead of checking for them
-	        add_urls(currentTerm);
-            check_title(currentTerm);
-            tokenizerInput += currentTerm;
-        }
-
-        tokenizer->execute(tokenizerInput);
-    }
-
-    /*
-     * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
-     * Instead of bool, just directly adds on to url queue
-     */
-    void add_urls(string &word)
-    {
-        string a_tag = "<a";
-        string http_start = "href=http";
-        string http_end_tag = ">";
-
-        auto word_iter = word.begin();
-        string url = "";
-        word_iter = findStr(word_iter, a_tag);
-        if (word_iter != nullptr) {
-            auto found_http = findStr(word_iter, http_start);
-            if (found_http != nullptr) {
-	            url = "http";
-                found_http += 9;
-                auto end_http = findStr(word_iter, http_end_tag);
-                while (found_http != end_http) {
-                    url += *found_http;
-                    ++found_http;
-                }
-            }
-        }
-
-        else {
-            return;
-        }
-
-        if (url != "") {
-            URL_PQ.push(url);
-        }
-
-
-    }
-    /**
-     * <title >AJF</title>
-     * @param word
-     */
-
-    bool check_title(string &word)
-    {
-        if (char* pos = strstr("<title>", word))
-        {
-            pos += 6;
-            auto end_pos = strstr("</title>", word);
-            string title = "";
-            while (pos != end_pos)
-            {
-                ++pos;
-                title += *pos;
-
-            }
-
-            return title;
-        }
+	/**
+	 * Splits the raw html into space-separated terms, forwarding urls and titles as they are found
+	 * @param html_data
+	 * @param tokenizer
+	 * @return the text that was handed to the tokenizer
+	 */
+
+	string parse ( string & html_data, Tokenizer *tokenizer )
+		{
+		//figure out file handle syntax - pointer to file
+		string tokenizerInput = "";
+		string currentTerm = "";
+		for ( int i = 0; i < html_data.size ( ); ++i )
+			{
+			// collect characters until the next space or the end of the input
+			while ( i < html_data.size ( ) && html_data[ i ] != ' ' )
+				{
+				currentTerm += html_data[ i ];
+				++i;
+				}
+
+			//one method that directly adds urls onto frontier instead of checking for them
+			add_urls ( currentTerm );
+			check_title ( currentTerm );
+			tokenizerInput += currentTerm + ' ';
+			currentTerm = "";
+			}
+
+		tokenizer->execute ( tokenizerInput );
+		return tokenizerInput;
+		}
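+
+	// e.g. parse on "<title>Dogs</title> are great" (illustrative input) splits on spaces
+	// and hands "<title>Dogs</title> are great " to the tokenizer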
+
+	/*
+	 * Uses the findStr function in stringProcessing.h: STILL HAVE TO TEST
+	 * Instead of returning a bool, directly adds any url it finds onto the url queue
+	 */
+	void add_urls ( string & word )
+		{
+		string a_tag = "<a";
+		string http_start = "href=http";
+		string http_end_tag = ">";
+
+		auto word_iter = word.begin ( );
+		string url = "";
+		word_iter = findStr ( word_iter, a_tag );
+		// findStr returns an iterator to the terminating '\0' when there is no match
+		if ( *word_iter == '\0' )
+			{
+			return;
+			}
+
+		auto found_http = findStr ( word_iter, http_start );
+		if ( *found_http != '\0' )
+			{
+			url = "http";
+			// skip past "href=http" so only the remainder of the address is copied
+			found_http += 9;
+			auto end_http = findStr ( found_http, http_end_tag );
+			while ( found_http != end_http )
+				{
+				url += *found_http;
+				++found_http;
+				}
+			}
+
+		if ( url != "" )
+			{
+			URL_PQ.push ( url );
+			}
+		}
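+
+	// e.g. a term such as "<a href=https://eecs.umich.edu>" (illustrative URL) results in
+	// "https://eecs.umich.edu" being pushed onto URL_PQ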
+
+	/**
+	 * Checks a word for a title tag, e.g. <title>AJF</title>
+	 * @param word
+	 * @return true if a title tag was found
+	 */
+
+	bool check_title ( string & word )
+		{
+		// strstr expects the text to search first and the pattern second
+		if ( const char *pos = strstr ( word.c_str ( ), "<title>" ) )
+			{
+			// skip past "<title>" to the first character of the title text
+			pos += 7;
+			const char *end_pos = strstr ( word.c_str ( ), "</title>" );
+			string title = "";
+			while ( end_pos != nullptr && pos != end_pos )
+				{
+				title += *pos;
+				++pos;
+				}
+
+			return true;
+			}
+
+		return false;
 
 //        string begin_title = "<title>";
 //        auto word_begin = word.begin();
 //        auto word_iter = findStr(word_begin, begin_title);
 
-    }
+		}
 
-};
+	};
 
diff --git a/parser/Tokenizer.h b/parser/Tokenizer.h
index 8e8c55d..a3443fe 100644
--- a/parser/Tokenizer.h
+++ b/parser/Tokenizer.h
@@ -6,37 +6,38 @@
 #include <unordered_map>
 #include <vector>
 #include "../util/stringProcessing.h"
+
 using namespace std;
 
 class Tokenizer
-{
+	{
 public:
-    Tokenizer()
-	    {
-        doc_index = new unordered_map<string, vector<int>>;
-        }
+	Tokenizer ( )
+		{
+		doc_index = new unordered_map< string, vector< int > >;
+		}
 
-    unordered_map<string, vector<int>> * get() const
-    {
-        return doc_index;
-    }
+	unordered_map< string, vector< int > > *get ( ) const
+		{
+		return doc_index;
+		}
 
-    void execute(string originalText)
-    {
-        int offset = 0;
-        vector<string> splitText = splitStr(originalText, ' ');
-        string lowerString = "";
-        for (int i = 0; i < splitText.size(); ++i)
-        {
-            lowerString = toLower(splitText[i]);
-            if (!isStopWord(lowerString))
-            {
-	            (*doc_index)[lowerString].push_back(offset);
-                ++offset;
-            }
-        }
-    }
+	void execute ( string originalText )
+		{
+		int offset = 0;
+		vector< string > splitText = splitStr ( originalText, ' ' );
+		string lowerString = "";
+		for ( int i = 0; i < splitText.size ( ); ++i )
+			{
+			lowerString = toLower ( splitText[ i ] );
+			if ( !isStopWord ( lowerString ) )
+				{
+				( *doc_index )[ lowerString ].push_back ( offset );
+				++offset;
+				}
+			}
+		}
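+
+	// e.g. execute ( "The Dog" ) (illustrative input) lowercases both words, drops the
+	// stop word "the", and records { "dog" -> [ 0 ] } in doc_index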
 
 private:
-    unordered_map<string, vector<int>> *doc_index;
-};
+	unordered_map< string, vector< int > > *doc_index;
+	};
diff --git a/parser/tests/StringProcessing_unit.cpp b/parser/tests/StringProcessing_unit.cpp
index 3f6b3e5..3643119 100644
--- a/parser/tests/StringProcessing_unit.cpp
+++ b/parser/tests/StringProcessing_unit.cpp
@@ -10,12 +10,15 @@
 
 using namespace std;
 
-void test_findStr(string original);
-void test_splitStr(string original);
-void test_toLower();
-void test_isStopWord();
+void test_findStr ( string original );
 
-int main()
+void test_splitStr ( string original );
+
+void test_toLower ( );
+
+void test_isStopWord ( );
+
+int main ( )
 	{
 
 	cout << "Beginning testing for StringProcessing_unit" << endl << endl;
@@ -24,85 +27,83 @@ int main()
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";
 
-	test_findStr(original);
-	test_splitStr(original);
-	test_toLower();
-	test_isStopWord();
+	test_findStr ( original );
+	test_splitStr ( original );
+	test_toLower ( );
+	test_isStopWord ( );
 
 	cout << "\nTests passed for StringProcessing_unit :D" << endl;
 
 	}
 
-void test_findStr(string original)
+void test_findStr ( string original )
 	{
 	cout << "Testing findStr..." << endl;
 
 	string find = "established";
-	auto word = findStr(original.begin(), find);
-	assert(*word == 'e');
+	auto word = findStr ( original.begin ( ), find );
+	assert( *word == 'e' );
 
 	find = "Lorem Ipsum";
-	auto word2 = findStr (original.begin(), find);
-	assert(*word2 == 'L');
+	auto word2 = findStr ( original.begin ( ), find );
+	assert( *word2 == 'L' );
 
 	string title = "<title> This is a test </title>";
 	find = "<title>";
-	auto word3 = findStr (title.begin(), find);
-	assert(*word3 == '<');
-	auto titleIt = title.begin();
-	while (word3 != title.end() && titleIt != title.end())
+	auto word3 = findStr ( title.begin ( ), find );
+	assert( *word3 == '<' );
+	auto titleIt = title.begin ( );
+	while ( word3 != title.end ( ) && titleIt != title.end ( ) )
 		{
-		assert(*word3 == *titleIt);
+		assert( *word3 == *titleIt );
 		++word3;
 		++titleIt;
 		}
 
 	find = "</title>";
-	auto word4 = findStr (title.begin(), find);
-	assert(*word4 == '<' && *(word4 + 1) == '/');
+	auto word4 = findStr ( title.begin ( ), find );
+	assert( *word4 == '<' && *( word4 + 1 ) == '/' );
 
-	auto word0 = findStr (original.begin(), find);
-	assert(*word0 == '\0');
+	auto word0 = findStr ( original.begin ( ), find );
+	assert( *word0 == '\0' );
 
 	find = "orange";
-	auto word5 = findStr (original.begin(), find);
-	assert(*word5 == '\0');
+	auto word5 = findStr ( original.begin ( ), find );
+	assert( *word5 == '\0' );
 
 	find = "orange";
 	string test = "apple";
-	auto word7 = findStr (test.begin(), find);
-	assert(*word7 == '\0');
+	auto word7 = findStr ( test.begin ( ), find );
+	assert( *word7 == '\0' );
 
 	find = "bird";
 	test = "bigbird";
-	auto word6 = findStr (test.begin(), find);
-	assert(*word6 == 'b' && *(word6 + 1) == 'i' && *(word6 + 2) == 'r');
+	auto word6 = findStr ( test.begin ( ), find );
+	assert( *word6 == 'b' && *( word6 + 1 ) == 'i' && *( word6 + 2 ) == 'r' );
 
 	cout << "test_findStr passed" << endl;
 
 	}
 
 
-void test_splitStr(string original)
+void test_splitStr ( string original )
 	{
 	cout << "Testing splitStr..." << endl;
 
-	vector<string> vec = splitStr (original, ' ');
-	assert(vec.size() == 53);
+	vector< string > vec = splitStr ( original, ' ' );
+	assert( vec.size ( ) == 53 );
 
 	string word = "hello\ngoodbye";
-	vec = splitStr (word, '\n');
-	assert(vec.size() == 2);
-	assert(vec[0] == "hello" && vec[1] == "goodbye");
+	vec = splitStr ( word, '\n' );
+	assert( vec.size ( ) == 2 );
+	assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
 
 	cout << "test_splitStr passed" << endl;
 
 	}
 
 
-
-
-void test_toLower()
+void test_toLower ( )
 	{
 	cout << "Testing toLower..." << endl;
 
@@ -112,25 +113,24 @@ void test_toLower()
 	string word4 = "";
 	string word5 = " ";
 
-	string test = toLower (word);
-	string test2 = toLower (word2);
-	string test3 = toLower (word3);
-	string test4 = toLower (word4);
-	string test5 = toLower (word5);
+	string test = toLower ( word );
+	string test2 = toLower ( word2 );
+	string test3 = toLower ( word3 );
+	string test4 = toLower ( word4 );
+	string test5 = toLower ( word5 );
 
-	assert (test == "hello");
+	assert ( test == "hello" );
 	cout << test2 << endl;
-	assert (test2 == "hello");
-	assert (test3 == "hello goodbye !");
-	assert (test4 == "");
-	assert (test5 == " ");
+	assert ( test2 == "hello" );
+	assert ( test3 == "hello goodbye !" );
+	assert ( test4 == "" );
+	assert ( test5 == " " );
 
 	cout << "test_toLower passed" << endl;
 	}
 
 
-
-void test_isStopWord()
+void test_isStopWord ( )
 	{
 	cout << "Testing isStopWord..." << endl;
 
@@ -140,11 +140,11 @@ void test_isStopWord()
 	string blank = "";
 	string blank2 = " ";
 
-	assert (isStopWord (is));
-	assert (!isStopWord (hello));
-	assert (isStopWord (none));
-	assert (!isStopWord (blank));
-	assert (!isStopWord (blank2));
+	assert ( isStopWord ( is ) );
+	assert ( !isStopWord ( hello ) );
+	assert ( isStopWord ( none ) );
+	assert ( !isStopWord ( blank ) );
+	assert ( !isStopWord ( blank2 ) );
 
 	cout << "test_isStopWord passed" << endl;
 
diff --git a/parser/tests/TokenizerTest_unit.cpp b/parser/tests/TokenizerTest_unit.cpp
index eaf902d..bc189c5 100644
--- a/parser/tests/TokenizerTest_unit.cpp
+++ b/parser/tests/TokenizerTest_unit.cpp
@@ -10,10 +10,10 @@
 
 using namespace std;
 
-void test_execute(string original);
+void test_execute ( string original );
 
 
-int main()
+int main ( )
 	{
 
 	cout << "Beginning testing for TokenizerTest_unit" << endl << endl;
@@ -22,27 +22,27 @@ int main()
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";
 
-	test_execute(original);
+	test_execute ( original );
 
 	cout << "\nTests passed for TokenizerTest_unit :D" << endl;
 
 	}
 
-void test_execute(string original)
+void test_execute ( string original )
 	{
 	Tokenizer my_tokenizer;
-	my_tokenizer.execute(original);
+	my_tokenizer.execute ( original );
 
-	auto dict = my_tokenizer.get();
+	auto dict = my_tokenizer.get ( );
 
-	for ( auto it = dict->begin(); it != dict->end(); it++ )
+	for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
 		{
-		cout << it->first  << ':';
-		 for (int i = 0; i < it->second.size(); ++i)
-			 {
-			 cout << it->second[i] << " ";
-			 }
-		cout << std::endl ;
+		cout << it->first << ':';
+		for ( int i = 0; i < it->second.size ( ); ++i )
+			{
+			cout << it->second[ i ] << " ";
+			}
+		cout << std::endl;
 		}
 
 	}
diff --git a/util/stringProcessing.h b/util/stringProcessing.h
index 1b29a84..8c746f4 100644
--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -18,103 +18,107 @@ using namespace std;
  * Takes in an iterator to the original text and a substring: specifically for a parser functionality
  * Potentially make one that takes in two strings? Is this needed?
  */
-string::iterator findStr(string::iterator originalText, string &subStr)
-{
-
-    auto begin_sub = subStr.begin();
-    auto begin_original = originalText;
-
-    while ( *begin_original != '\0') //*(forward++) != '\0'
-    {
-        //keep looking for instance of a match
-        if ( *begin_original != *begin_sub )
-        {
-            ++begin_original;
-        }
-
-        else if ( *begin_original == *begin_sub )
-        {
-            /* want to keep the original iterator where it is so it
-               can return the beginning of the matched word if found */
-            auto temp = begin_original;
-            while ( *temp == *begin_sub )
-            {
-                ++temp;
-                ++begin_sub;
-                //if it hits the end of the substring, it signifies an exact match
-                if ( *begin_sub  == '\0')
-                {
-                    //this is pointing at the beginning of the match
-                    return begin_original;
-                }
-
-            }
-            //need to reset because still has to search rest of the string for a match
-            begin_sub = subStr.begin();
-            //sets the original text pointer to where the last search left off
-            begin_original = temp;
-        }
-
-        else
-        {
-            //DO NOTHING
-        }
-    }
-
-    return begin_original;
-
-}
-set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
-                         "i", "in", "is", "it", "its", "many ","me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
-                         "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
-
-vector<string> splitStr(string &originalText, char delim)
-{
-    vector<string> splitWords;
-    auto begin = originalText.begin();
-
-    while (*begin != '\0')
-    {
-        string word = "";
-        while (*begin != delim && *begin != '\0')
-        {
-            word += *begin;
-            ++begin;
-        }
-
-        splitWords.push_back(word);
-	    ++begin;
-    }
-
-    return splitWords;
-
-}
-
-bool isStopWord(string &word)
-{
-    return (stopWords.find(word) != stopWords.end());
-
-}
-
-string toLower(string &word)
-{
-    auto iter = word.begin();
-    string lowerWord = "";
-    while (*iter != '\0')
-    {
-        if (*iter >= 'A' && *iter <= 'Z')
-        {
-            lowerWord += (*iter + 32);
-        }
-
-        else
-        {
-            lowerWord += *iter;
-        }
-	    ++iter;
-    }
-
-    return lowerWord;
-}
+string::iterator findStr ( string::iterator originalText, string & subStr )
+	{
+
+	auto begin_sub = subStr.begin ( );
+	auto begin_original = originalText;
+
+	while ( *begin_original != '\0' ) //*(forward++) != '\0'
+		{
+		//keep looking for instance of a match
+		if ( *begin_original != *begin_sub )
+			{
+			++begin_original;
+			}
+
+		else if ( *begin_original == *begin_sub )
+			{
+			/* want to keep the original iterator where it is so it
+				can return the beginning of the matched word if found */
+			auto temp = begin_original;
+			while ( *temp == *begin_sub )
+				{
+				++temp;
+				++begin_sub;
+				//if it hits the end of the substring, it signifies an exact match
+				if ( *begin_sub == '\0' )
+					{
+					//this is pointing at the beginning of the match
+					return begin_original;
+					}
+
+				}
+			//need to reset because still has to search rest of the string for a match
+			begin_sub = subStr.begin ( );
+			//only advance one character so overlapping partial matches are not skipped
+			++begin_original;
+			}
+
+		else
+			{
+			//DO NOTHING
+			}
+		}
+
+	return begin_original;
+
+	}
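+
+// For reference: findStr ( text.begin ( ), sub ) returns an iterator to the first character
+// of the match ( e.g. the 'b' at index 3 when sub = "bird" and text = "bigbird" ) and an
+// iterator to the terminating '\0' when there is no match, as exercised in the unit tests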
+
+set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
+                            "for", "have", "he", "her", "here", "him", "his", "how",
+                            "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or", "our", "she",
+                            "some", "the", "their", "them", "there", "they", "that",
+                            "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
+                            "you", "your" };
+
+vector< string > splitStr ( string & originalText, char delim )
+	{
+	vector< string > splitWords;
+	auto begin = originalText.begin ( );
+
+	while ( *begin != '\0' )
+		{
+		string word = "";
+		while ( *begin != delim && *begin != '\0' )
+			{
+			word += *begin;
+			++begin;
+			}
+
+		splitWords.push_back ( word );
+		// only step over a delimiter; stepping past the terminating '\0' would run off the string
+		if ( *begin == delim )
+			{
+			++begin;
+			}
+		}
+
+	return splitWords;
+
+	}
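+
+// e.g. splitStr ( "hello\ngoodbye", '\n' ) yields { "hello", "goodbye" },
+// the case exercised in StringProcessing_unit.cpp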
+
+bool isStopWord ( string & word )
+	{
+	return ( stopWords.find ( word ) != stopWords.end ( ) );
+
+	}
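+
+// e.g. isStopWord ( "the" ) is true while isStopWord ( "search" ) is false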
+
+string toLower ( string & word )
+	{
+	auto iter = word.begin ( );
+	string lowerWord = "";
+	while ( *iter != '\0' )
+		{
+		if ( *iter >= 'A' && *iter <= 'Z' )
+			{
+			lowerWord += ( *iter + 32 );
+			}
+
+		else
+			{
+			lowerWord += *iter;
+			}
+		++iter;
+		}
+
+	return lowerWord;
+	}
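+
+// e.g. toLower ( "HeLLo GooDbYe !" ) yields "hello goodbye !"; only 'A'-'Z' are shifted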
 
 #endif //EECS398_SEARCH_STRINGPROCESSING_H
-- 
GitLab