fixed style

910c3ec5 · vcday · cbe1e290 · 910c3ec5 · 910c3ec5 · 910c3ec5
Commit 910c3ec5 authored 7 years ago by vcday
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -18,8 +18,8 @@
 using namespace std;
 // Doc Id
-std::priority_queue<int> DOCID_PQ;
+std::priority_queue< int > DOCID_PQ;
-std::priority_queue<string> URL_PQ;
+std::priority_queue< string > URL_PQ;
 string PATH = "/doc";
 //TEMP - remove once getting actual crawler input
@@ -32,124 +32,134 @@ string PATH = "/doc";
 // if find url; send to crawler
 // if find title send string to tokenizer
 class Parser
-{
+	{
 public:
-    struct raw_data {
+	struct raw_data
-        string url;
+		{
-        string html_data;
+		string url;
+		string html_data;
-        raw_data(string u, string h) : url(u), html_data(h){}
+		raw_data ( string u, string h ) : url ( u ), html_data ( h )
-    };
+			{ }
+		};
-    /**
+	/**
-     * Parser
+	 * Parser
-     * @return
+	 * @return
-     */
+	 */
-    // input: object with char*  and URL string
+	// input: object with char*  and URL string
-    //
+	//
-    const unordered_map<string, vector<int>> execute()
+	const unordered_map< string, vector< int>> execute ( )
-    {
+		{
-        Tokenizer tokenizer;
+		Tokenizer tokenizer;
-        //TEMP - until we get real input from crawler
+		//TEMP - until we get real input from crawler
-        raw_data data("url", "html");
+		raw_data data ( "url", "html" );
-        parse(data.html_data, &tokenizer);
+		parse ( data.html_data, &tokenizer );
-        return tokenizer.get();
+		return tokenizer.get ( );
-    }
+		}
 private:
-    /**
+	/**
-     * Parses file
+	 * Parses file
-     * @param inFile
+	 * @param inFile
-     * @return
+	 * @return
-     */
+	 */
-    string parse(string &html_data, Tokenizer *tokenizer)
+	string parse ( string & html_data, Tokenizer *tokenizer )
-    {
+		{
-        //figure out file handle syntax - pointer to file
+		//figure out file handle syntax - pointer to file
-        string tokenizerInput = "";
+		string tokenizerInput = "";
-        string currentTerm = "";
+		string currentTerm = "";
-        for (int i = 0; i < html_data.size(); ++i) {
+		for ( int i = 0; i < html_data.size ( ); ++i )
-            while (html_data[i] != ' ') {
+			{
-                currentTerm += html_data[i];
+			while ( html_data[ i ] != ' ' )
-            }
+				{
+				currentTerm += html_data[ i ];
-            //one method that directly adds urls onto frontier instead of checking for them
+				}
-	        add_urls(currentTerm);
-            check_title(currentTerm);
+			//one method that directly adds urls onto frontier instead of checking for them
-            tokenizerInput += currentTerm;
+			add_urls ( currentTerm );
-        }
+			check_title ( currentTerm );
+			tokenizerInput += currentTerm;
-        tokenizer->execute(tokenizerInput);
+			}
-    }
+		tokenizer->execute ( tokenizerInput );
-    /*
+		}
-     * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
-     * Instead of bool, just directly adds on to url queue
+	/*
-     */
+	 * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
-    void add_urls(string &word)
+	 * Instead of bool, just directly adds on to url queue
-    {
+	 */
-        string a_tag = "<a";
+	void add_urls ( string & word )
-        string http_start = "href=http";
+		{
-        string http_end_tag = ">";
+		string a_tag = "<a";
+		string http_start = "href=http";
-        auto word_iter = word.begin();
+		string http_end_tag = ">";
-        string url = "";
-        word_iter = findStr(word_iter, a_tag);
+		auto word_iter = word.begin ( );
-        if (word_iter != nullptr) {
+		string url = "";
-            auto found_http = findStr(word_iter, http_start);
+		word_iter = findStr ( word_iter, a_tag );
-            if (found_http != nullptr) {
+		if ( word_iter != nullptr )
-	            url = "http";
+			{
-                found_http += 9;
+			auto found_http = findStr ( word_iter, http_start );
-                auto end_http = findStr(word_iter, http_end_tag);
+			if ( found_http != nullptr )
-                while (found_http != end_http) {
+				{
-                    url += *found_http;
+				url = "http";
-                    ++found_http;
+				found_http += 9;
-                }
+				auto end_http = findStr ( word_iter, http_end_tag );
-            }
+				while ( found_http != end_http )
-        }
+					{
+					url += *found_http;
-        else {
+					++found_http;
-            return;
+					}
-        }
+				}
+			}
-        if (url != "") {
-            URL_PQ.push(url);
+		else
-        }
+			{
+			return;
+			}
-    }
-    /**
+		if ( url != "" )
-     * <title >AJF</title>
+			{
-     * @param word
+			URL_PQ.push ( url );
-     */
+			}
-    bool check_title(string &word)
-    {
+		}
-        if (char* pos = strstr("<title>", word))
-        {
+	/**
-            pos += 6;
+	 * <title >AJF</title>
-            auto end_pos = strstr("</title>", word);
+	 * @param word
-            string title = "";
+	 */
-            while (pos != end_pos)
-            {
+	bool check_title ( string & word )
-                ++pos;
+		{
-                title += *pos;
+		if ( char *pos = strstr ( "<title>", word ) )
+			{
-            }
+			pos += 6;
+			auto end_pos = strstr ( "</title>", word );
-            return title;
+			string title = "";
-        }
+			while ( pos != end_pos )
+				{
+				++pos;
+				title += *pos;
+				}
+			return title;
+			}
 //        string begin_title = "<title>";
 //        auto word_begin = word.begin();
 //        auto word_iter = findStr(word_begin, begin_title);
-    }
+		}
-};
+	};
--- a/parser/Tokenizer.h
+++ b/parser/Tokenizer.h
@@ -6,37 +6,38 @@
 #include <unordered_map>
 #include <vector>
 #include "../util/stringProcessing.h"
 using namespace std;
 class Tokenizer
-{
+	{
 public:
-    Tokenizer()
+	Tokenizer ( )
-	    {
+		{
-        doc_index = new unordered_map<string, vector<int>>;
+		doc_index = new unordered_map< string, vector< int>>;
-        }
+		}
-    unordered_map<string, vector<int>> * get() const
+	unordered_map< string, vector< int>> *get ( ) const
-    {
+		{
-        return doc_index;
+		return doc_index;
-    }
+		}
-    void execute(string originalText)
+	void execute ( string originalText )
-    {
+		{
-        int offset = 0;
+		int offset = 0;
-        vector<string> splitText = splitStr(originalText, ' ');
+		vector< string > splitText = splitStr ( originalText, ' ' );
-        string lowerString = "";
+		string lowerString = "";
-        for (int i = 0; i < splitText.size(); ++i)
+		for ( int i = 0; i < splitText.size ( ); ++i )
-        {
+			{
-            lowerString = toLower(splitText[i]);
+			lowerString = toLower ( splitText[ i ] );
-            if (!isStopWord(lowerString))
+			if ( !isStopWord ( lowerString ) )
-            {
+				{
-	            (*doc_index)[lowerString].push_back(offset);
+				( *doc_index )[ lowerString ].push_back ( offset );
-                ++offset;
+				++offset;
-            }
+				}
-        }
+			}
-    }
+		}
 private:
-    unordered_map<string, vector<int>> *doc_index;
+	unordered_map< string, vector< int>> *doc_index;
-};
+	};
--- a/parser/tests/StringProcessing_unit.cpp
+++ b/parser/tests/StringProcessing_unit.cpp
@@ -10,12 +10,15 @@
 using namespace std;
-void test_findStr(string original);
+void test_findStr ( string original );
-void test_splitStr(string original);
-void test_toLower();
-void test_isStopWord();
-int main()
+void test_splitStr ( string original );
+void test_toLower ( );
+void test_isStopWord ( );
+int main ( )
 	{
 	cout << "Beginning testing for StringProcessing_unit" << endl << endl;
@@ -24,85 +27,83 @@ int main()
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";
-	test_findStr(original);
+	test_findStr ( original );
-	test_splitStr(original);
+	test_splitStr ( original );
-	test_toLower();
+	test_toLower ( );
-	test_isStopWord();
+	test_isStopWord ( );
 	cout << "\nTests passed for StringProcessing_unit :D" << endl;
 	}
-void test_findStr(string original)
+void test_findStr ( string original )
 	{
 	cout << "Testing findStr..." << endl;
 	string find = "established";
-	auto word = findStr(original.begin(), find);
+	auto word = findStr ( original.begin ( ), find );
-	assert(*word == 'e');
+	assert( *word == 'e' );
 	find = "Lorem Ipsum";
-	auto word2 = findStr (original.begin(), find);
+	auto word2 = findStr ( original.begin ( ), find );
-	assert(*word2 == 'L');
+	assert( *word2 == 'L' );
 	string title = "<title> This is a test </title>";
 	find = "<title>";
-	auto word3 = findStr (title.begin(), find);
+	auto word3 = findStr ( title.begin ( ), find );
-	assert(*word3 == '<');
+	assert( *word3 == '<' );
-	auto titleIt = title.begin();
+	auto titleIt = title.begin ( );
-	while (word3 != title.end() && titleIt != title.end())
+	while ( word3 != title.end ( ) && titleIt != title.end ( ) )
 		{
-		assert(*word3 == *titleIt);
+		assert( *word3 == *titleIt );
 		++word3;
 		++titleIt;
 		}
 	find = "</title>";
-	auto word4 = findStr (title.begin(), find);
+	auto word4 = findStr ( title.begin ( ), find );
-	assert(*word4 == '<' && *(word4 + 1) == '/');
+	assert( *word4 == '<' && *( word4 + 1 ) == '/' );
-	auto word0 = findStr (original.begin(), find);
+	auto word0 = findStr ( original.begin ( ), find );
-	assert(*word0 == '\0');
+	assert( *word0 == '\0' );
 	find = "orange";
-	auto word5 = findStr (original.begin(), find);
+	auto word5 = findStr ( original.begin ( ), find );
-	assert(*word5 == '\0');
+	assert( *word5 == '\0' );
 	find = "orange";
 	string test = "apple";
-	auto word7 = findStr (test.begin(), find);
+	auto word7 = findStr ( test.begin ( ), find );
-	assert(*word7 == '\0');
+	assert( *word7 == '\0' );
 	find = "bird";
 	test = "bigbird";
-	auto word6 = findStr (test.begin(), find);
+	auto word6 = findStr ( test.begin ( ), find );
-	assert(*word6 == 'b' && *(word6 + 1) == 'i' && *(word6 + 2) == 'r');
+	assert( *word6 == 'b' && *( word6 + 1 ) == 'i' && *( word6 + 2 ) == 'r' );
 	cout << "test_findStr passed" << endl;
 	}
-void test_splitStr(string original)
+void test_splitStr ( string original )
 	{
 	cout << "Testing splitStr..." << endl;
-	vector<string> vec = splitStr (original, ' ');
+	vector< string > vec = splitStr ( original, ' ' );
-	assert(vec.size() == 53);
+	assert( vec.size ( ) == 53 );
 	string word = "hello\ngoodbye";
-	vec = splitStr (word, '\n');
+	vec = splitStr ( word, '\n' );
-	assert(vec.size() == 2);
+	assert( vec.size ( ) == 2 );
-	assert(vec[0] == "hello" && vec[1] == "goodbye");
+	assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
 	cout << "test_splitStr passed" << endl;
 	}
+void test_toLower ( )
-void test_toLower()
 	{
 	cout << "Testing toLower..." << endl;
@@ -112,25 +113,24 @@ void test_toLower()
 	string word4 = "";
 	string word5 = " ";
-	string test = toLower (word);
+	string test = toLower ( word );
-	string test2 = toLower (word2);
+	string test2 = toLower ( word2 );
-	string test3 = toLower (word3);
+	string test3 = toLower ( word3 );
-	string test4 = toLower (word4);
+	string test4 = toLower ( word4 );
-	string test5 = toLower (word5);
+	string test5 = toLower ( word5 );
-	assert (test == "hello");
+	assert ( test == "hello" );
 	cout << test2 << endl;
-	assert (test2 == "hello");
+	assert ( test2 == "hello" );
-	assert (test3 == "hello goodbye !");
+	assert ( test3 == "hello goodbye !" );
-	assert (test4 == "");
+	assert ( test4 == "" );
-	assert (test5 == " ");
+	assert ( test5 == " " );
 	cout << "test_toLower passed" << endl;
 	}
+void test_isStopWord ( )
-void test_isStopWord()
 	{
 	cout << "Testing isStopWord..." << endl;
@@ -140,11 +140,11 @@ void test_isStopWord()
 	string blank = "";
 	string blank2 = " ";
-	assert (isStopWord (is));
+	assert ( isStopWord ( is ) );
-	assert (!isStopWord (hello));
+	assert ( !isStopWord ( hello ) );
-	assert (isStopWord (none));
+	assert ( isStopWord ( none ) );
-	assert (!isStopWord (blank));
+	assert ( !isStopWord ( blank ) );
-	assert (!isStopWord (blank2));
+	assert ( !isStopWord ( blank2 ) );
 	cout << "test_isStopWord passed" << endl;

--- a/parser/tests/TokenizerTest_unit.cpp
+++ b/parser/tests/TokenizerTest_unit.cpp
@@ -10,10 +10,10 @@
 using namespace std;
-void test_execute(string original);
+void test_execute ( string original );
-int main()
+int main ( )
 	{
 	cout << "Beginning testing for TokenizerTest_unit" << endl << endl;
@@ -22,27 +22,27 @@ int main()
 			"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
 			"making it look like readable English. ";
-	test_execute(original);
+	test_execute ( original );
 	cout << "\nTests passed for TokenizerTest_unit :D" << endl;
 	}
-void test_execute(string original)
+void test_execute ( string original )
 	{
 	Tokenizer my_tokenizer;
-	my_tokenizer.execute(original);
+	my_tokenizer.execute ( original );
-	auto dict = my_tokenizer.get();
+	auto dict = my_tokenizer.get ( );
-	for ( auto it = dict->begin(); it != dict->end(); it++ )
+	for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
 		{
-		cout << it->first  << ':';
+		cout << it->first << ':';
-		 for (int i = 0; i < it->second.size(); ++i)
+		for ( int i = 0; i < it->second.size ( ); ++i )
-			 {
+			{
-			 cout << it->second[i] << " ";
+			cout << it->second[ i ] << " ";
-			 }
+			}
-		cout << std::endl ;
+		cout << std::endl;
 		}
 	}
--- a/util/stringProcessing.h
+++ b/util/stringProcessing.h
@@ -18,103 +18,107 @@ using namespace std;
 * Takes in an iterator to the original text and a substring: specifically for a parser functionality
 * Potentially make one that takes in two strings? Is this needed?
 */
-string::iterator findStr(string::iterator originalText, string &subStr)
+string::iterator findStr ( string::iterator originalText, string & subStr )
-{
+	{
-    auto begin_sub = subStr.begin();
+	auto begin_sub = subStr.begin ( );
-    auto begin_original = originalText;
+	auto begin_original = originalText;
-    while ( *begin_original != '\0') //*(forward++) != '\0'
+	while ( *begin_original != '\0' ) //*(forward++) != '\0'
-    {
+		{
-        //keep looking for instance of a match
+		//keep looking for instance of a match
-        if ( *begin_original != *begin_sub )
+		if ( *begin_original != *begin_sub )
-        {
+			{
-            ++begin_original;
+			++begin_original;
-        }
+			}
-        else if ( *begin_original == *begin_sub )
+		else if ( *begin_original == *begin_sub )
-        {
+			{
-            /* want to keep the original iterator where it is so it
+			/* want to keep the original iterator where it is so it
-               can return the beginning of the matched word if found */
+				can return the beginning of the matched word if found */
-            auto temp = begin_original;
+			auto temp = begin_original;
-            while ( *temp == *begin_sub )
+			while ( *temp == *begin_sub )
-            {
+				{
-                ++temp;
+				++temp;
-                ++begin_sub;
+				++begin_sub;
-                //if it hits the end of the substring, it signifies an exact match
+				//if it hits the end of the substring, it signifies an exact match
-                if ( *begin_sub  == '\0')
+				if ( *begin_sub == '\0' )
-                {
+					{
-                    //this is pointing at the beginning of the match
+					//this is pointing at the beginning of the match
-                    return begin_original;
+					return begin_original;
-                }
+					}
-            }
+				}
-            //need to reset because still has to search rest of the string for a match
+			//need to reset because still has to search rest of the string for a match
-            begin_sub = subStr.begin();
+			begin_sub = subStr.begin ( );
-            //sets the original text pointer to where the last search left off
+			//sets the original text pointer to where the last search left off
-            begin_original = temp;
+			begin_original = temp;
-        }
+			}
-        else
+		else
-        {
+			{
-            //DO NOTHING
+			//DO NOTHING
-        }
+			}
-    }
+		}
-    return begin_original;
+	return begin_original;
-}
+	}
-set<string> stopWords = {"a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
-                         "i", "in", "is", "it", "its", "many ","me", "my", "none", "of", "on", "or", "our", "she", "some", "the", "their", "them", "there", "they", "that",
+set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by", "few", "from",
-                         "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with", "you", "your" };
+                            "for", "have", "he", "her", "here", "him", "his", "how",
+                            "i", "in", "is", "it", "its", "many ", "me", "my", "none", "of", "on", "or", "our", "she",
-vector<string> splitStr(string &originalText, char delim)
+                            "some", "the", "their", "them", "there", "they", "that",
-{
+                            "this", "to", "us", "was", "what", "when", "where", "which", "who", "why", "will", "with",
-    vector<string> splitWords;
+                            "you", "your" };
-    auto begin = originalText.begin();
+vector< string > splitStr ( string & originalText, char delim )
-    while (*begin != '\0')
+	{
-    {
+	vector< string > splitWords;
-        string word = "";
+	auto begin = originalText.begin ( );
-        while (*begin != delim && *begin != '\0')
-        {
+	while ( *begin != '\0' )
-            word += *begin;
+		{
-            ++begin;
+		string word = "";
-        }
+		while ( *begin != delim && *begin != '\0' )
+			{
-        splitWords.push_back(word);
+			word += *begin;
-	    ++begin;
+			++begin;
-    }
+			}
-    return splitWords;
+		splitWords.push_back ( word );
+		++begin;
-}
+		}
-bool isStopWord(string &word)
+	return splitWords;
-{
-    return (stopWords.find(word) != stopWords.end());
+	}
-}
+bool isStopWord ( string & word )
+	{
-string toLower(string &word)
+	return ( stopWords.find ( word ) != stopWords.end ( ) );
-{
-    auto iter = word.begin();
+	}
-    string lowerWord = "";
-    while (*iter != '\0')
+string toLower ( string & word )
-    {
+	{
-        if (*iter >= 'A' && *iter <= 'Z')
+	auto iter = word.begin ( );
-        {
+	string lowerWord = "";
-            lowerWord += (*iter + 32);
+	while ( *iter != '\0' )
-        }
+		{
+		if ( *iter >= 'A' && *iter <= 'Z' )
-        else
+			{
-        {
+			lowerWord += ( *iter + 32 );
-            lowerWord += *iter;
+			}
-        }
-	    ++iter;
+		else
-    }
+			{
+			lowerWord += *iter;
-    return lowerWord;
+			}
-}
+		++iter;
+		}
+	return lowerWord;
+	}
 #endif //EECS398_SEARCH_STRINGPROCESSING_H