Skip to content
Snippets Groups Projects
Commit 910c3ec5 authored by vcday's avatar vcday
Browse files

fixed style

parent cbe1e290
No related branches found
No related tags found
No related merge requests found
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
using namespace std; using namespace std;
// Doc Id // Doc Id
std::priority_queue<int> DOCID_PQ; std::priority_queue< int > DOCID_PQ;
std::priority_queue<string> URL_PQ; std::priority_queue< string > URL_PQ;
string PATH = "/doc"; string PATH = "/doc";
//TEMP - remove once getting actual crawler input //TEMP - remove once getting actual crawler input
...@@ -32,124 +32,134 @@ string PATH = "/doc"; ...@@ -32,124 +32,134 @@ string PATH = "/doc";
// if find url; send to crawler // if find url; send to crawler
// if find title send string to tokenizer // if find title send string to tokenizer
class Parser class Parser
{ {
public: public:
struct raw_data { struct raw_data
string url; {
string html_data; string url;
string html_data;
raw_data(string u, string h) : url(u), html_data(h){} raw_data ( string u, string h ) : url ( u ), html_data ( h )
}; { }
};
/** /**
* Parser * Parser
* @return * @return
*/ */
// input: object with char* and URL string // input: object with char* and URL string
// //
const unordered_map<string, vector<int>> execute() const unordered_map< string, vector< int>> execute ( )
{ {
Tokenizer tokenizer; Tokenizer tokenizer;
//TEMP - until we get real input from crawler //TEMP - until we get real input from crawler
raw_data data("url", "html"); raw_data data ( "url", "html" );
parse(data.html_data, &tokenizer); parse ( data.html_data, &tokenizer );
return tokenizer.get(); return tokenizer.get ( );
} }
private: private:
/** /**
* Parses file * Parses file
* @param inFile * @param inFile
* @return * @return
*/ */
string parse(string &html_data, Tokenizer *tokenizer) string parse ( string & html_data, Tokenizer *tokenizer )
{ {
//figure out file handle syntax - pointer to file //figure out file handle syntax - pointer to file
string tokenizerInput = ""; string tokenizerInput = "";
string currentTerm = ""; string currentTerm = "";
for (int i = 0; i < html_data.size(); ++i) { for ( int i = 0; i < html_data.size ( ); ++i )
while (html_data[i] != ' ') { {
currentTerm += html_data[i]; while ( html_data[ i ] != ' ' )
} {
currentTerm += html_data[ i ];
//one method that directly adds urls onto frontier instead of checking for them }
add_urls(currentTerm);
check_title(currentTerm); //one method that directly adds urls onto frontier instead of checking for them
tokenizerInput += currentTerm; add_urls ( currentTerm );
} check_title ( currentTerm );
tokenizerInput += currentTerm;
tokenizer->execute(tokenizerInput); }
}
tokenizer->execute ( tokenizerInput );
/* }
* Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
* Instead of bool, just directly adds on to url queue /*
*/ * Uses findStr function in stringProcessing.h: STILL HAVE TO TEST
void add_urls(string &word) * Instead of bool, just directly adds on to url queue
{ */
string a_tag = "<a"; void add_urls ( string & word )
string http_start = "href=http"; {
string http_end_tag = ">"; string a_tag = "<a";
string http_start = "href=http";
auto word_iter = word.begin(); string http_end_tag = ">";
string url = "";
word_iter = findStr(word_iter, a_tag); auto word_iter = word.begin ( );
if (word_iter != nullptr) { string url = "";
auto found_http = findStr(word_iter, http_start); word_iter = findStr ( word_iter, a_tag );
if (found_http != nullptr) { if ( word_iter != nullptr )
url = "http"; {
found_http += 9; auto found_http = findStr ( word_iter, http_start );
auto end_http = findStr(word_iter, http_end_tag); if ( found_http != nullptr )
while (found_http != end_http) { {
url += *found_http; url = "http";
++found_http; found_http += 9;
} auto end_http = findStr ( word_iter, http_end_tag );
} while ( found_http != end_http )
} {
url += *found_http;
else { ++found_http;
return; }
} }
}
if (url != "") {
URL_PQ.push(url); else
} {
return;
}
}
/** if ( url != "" )
* <title >AJF</title> {
* @param word URL_PQ.push ( url );
*/ }
bool check_title(string &word)
{ }
if (char* pos = strstr("<title>", word))
{ /**
pos += 6; * <title >AJF</title>
auto end_pos = strstr("</title>", word); * @param word
string title = ""; */
while (pos != end_pos)
{ bool check_title ( string & word )
++pos; {
title += *pos; if ( char *pos = strstr ( "<title>", word ) )
{
} pos += 6;
auto end_pos = strstr ( "</title>", word );
return title; string title = "";
} while ( pos != end_pos )
{
++pos;
title += *pos;
}
return title;
}
// string begin_title = "<title>"; // string begin_title = "<title>";
// auto word_begin = word.begin(); // auto word_begin = word.begin();
// auto word_iter = findStr(word_begin, begin_title); // auto word_iter = findStr(word_begin, begin_title);
} }
}; };
...@@ -6,37 +6,38 @@ ...@@ -6,37 +6,38 @@
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "../util/stringProcessing.h" #include "../util/stringProcessing.h"
using namespace std; using namespace std;
class Tokenizer class Tokenizer
{ {
public: public:
Tokenizer() Tokenizer ( )
{ {
doc_index = new unordered_map<string, vector<int>>; doc_index = new unordered_map< string, vector< int>>;
} }
unordered_map<string, vector<int>> * get() const unordered_map< string, vector< int>> *get ( ) const
{ {
return doc_index; return doc_index;
} }
void execute(string originalText) void execute ( string originalText )
{ {
int offset = 0; int offset = 0;
vector<string> splitText = splitStr(originalText, ' '); vector< string > splitText = splitStr ( originalText, ' ' );
string lowerString = ""; string lowerString = "";
for (int i = 0; i < splitText.size(); ++i) for ( int i = 0; i < splitText.size ( ); ++i )
{ {
lowerString = toLower(splitText[i]); lowerString = toLower ( splitText[ i ] );
if (!isStopWord(lowerString)) if ( !isStopWord ( lowerString ) )
{ {
(*doc_index)[lowerString].push_back(offset); ( *doc_index )[ lowerString ].push_back ( offset );
++offset; ++offset;
} }
} }
} }
private: private:
unordered_map<string, vector<int>> *doc_index; unordered_map< string, vector< int>> *doc_index;
}; };
...@@ -10,12 +10,15 @@ ...@@ -10,12 +10,15 @@
using namespace std; using namespace std;
void test_findStr(string original); void test_findStr ( string original );
void test_splitStr(string original);
void test_toLower();
void test_isStopWord();
int main() void test_splitStr ( string original );
void test_toLower ( );
void test_isStopWord ( );
int main ( )
{ {
cout << "Beginning testing for StringProcessing_unit" << endl << endl; cout << "Beginning testing for StringProcessing_unit" << endl << endl;
...@@ -24,85 +27,83 @@ int main() ...@@ -24,85 +27,83 @@ int main()
"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
"making it look like readable English. "; "making it look like readable English. ";
test_findStr(original); test_findStr ( original );
test_splitStr(original); test_splitStr ( original );
test_toLower(); test_toLower ( );
test_isStopWord(); test_isStopWord ( );
cout << "\nTests passed for StringProcessing_unit :D" << endl; cout << "\nTests passed for StringProcessing_unit :D" << endl;
} }
void test_findStr(string original) void test_findStr ( string original )
{ {
cout << "Testing findStr..." << endl; cout << "Testing findStr..." << endl;
string find = "established"; string find = "established";
auto word = findStr(original.begin(), find); auto word = findStr ( original.begin ( ), find );
assert(*word == 'e'); assert( *word == 'e' );
find = "Lorem Ipsum"; find = "Lorem Ipsum";
auto word2 = findStr (original.begin(), find); auto word2 = findStr ( original.begin ( ), find );
assert(*word2 == 'L'); assert( *word2 == 'L' );
string title = "<title> This is a test </title>"; string title = "<title> This is a test </title>";
find = "<title>"; find = "<title>";
auto word3 = findStr (title.begin(), find); auto word3 = findStr ( title.begin ( ), find );
assert(*word3 == '<'); assert( *word3 == '<' );
auto titleIt = title.begin(); auto titleIt = title.begin ( );
while (word3 != title.end() && titleIt != title.end()) while ( word3 != title.end ( ) && titleIt != title.end ( ) )
{ {
assert(*word3 == *titleIt); assert( *word3 == *titleIt );
++word3; ++word3;
++titleIt; ++titleIt;
} }
find = "</title>"; find = "</title>";
auto word4 = findStr (title.begin(), find); auto word4 = findStr ( title.begin ( ), find );
assert(*word4 == '<' && *(word4 + 1) == '/'); assert( *word4 == '<' && *( word4 + 1 ) == '/' );
auto word0 = findStr (original.begin(), find); auto word0 = findStr ( original.begin ( ), find );
assert(*word0 == '\0'); assert( *word0 == '\0' );
find = "orange"; find = "orange";
auto word5 = findStr (original.begin(), find); auto word5 = findStr ( original.begin ( ), find );
assert(*word5 == '\0'); assert( *word5 == '\0' );
find = "orange"; find = "orange";
string test = "apple"; string test = "apple";
auto word7 = findStr (test.begin(), find); auto word7 = findStr ( test.begin ( ), find );
assert(*word7 == '\0'); assert( *word7 == '\0' );
find = "bird"; find = "bird";
test = "bigbird"; test = "bigbird";
auto word6 = findStr (test.begin(), find); auto word6 = findStr ( test.begin ( ), find );
assert(*word6 == 'b' && *(word6 + 1) == 'i' && *(word6 + 2) == 'r'); assert( *word6 == 'b' && *( word6 + 1 ) == 'i' && *( word6 + 2 ) == 'r' );
cout << "test_findStr passed" << endl; cout << "test_findStr passed" << endl;
} }
void test_splitStr(string original) void test_splitStr ( string original )
{ {
cout << "Testing splitStr..." << endl; cout << "Testing splitStr..." << endl;
vector<string> vec = splitStr (original, ' '); vector< string > vec = splitStr ( original, ' ' );
assert(vec.size() == 53); assert( vec.size ( ) == 53 );
string word = "hello\ngoodbye"; string word = "hello\ngoodbye";
vec = splitStr (word, '\n'); vec = splitStr ( word, '\n' );
assert(vec.size() == 2); assert( vec.size ( ) == 2 );
assert(vec[0] == "hello" && vec[1] == "goodbye"); assert( vec[ 0 ] == "hello" && vec[ 1 ] == "goodbye" );
cout << "test_splitStr passed" << endl; cout << "test_splitStr passed" << endl;
} }
void test_toLower ( )
void test_toLower()
{ {
cout << "Testing toLower..." << endl; cout << "Testing toLower..." << endl;
...@@ -112,25 +113,24 @@ void test_toLower() ...@@ -112,25 +113,24 @@ void test_toLower()
string word4 = ""; string word4 = "";
string word5 = " "; string word5 = " ";
string test = toLower (word); string test = toLower ( word );
string test2 = toLower (word2); string test2 = toLower ( word2 );
string test3 = toLower (word3); string test3 = toLower ( word3 );
string test4 = toLower (word4); string test4 = toLower ( word4 );
string test5 = toLower (word5); string test5 = toLower ( word5 );
assert (test == "hello"); assert ( test == "hello" );
cout << test2 << endl; cout << test2 << endl;
assert (test2 == "hello"); assert ( test2 == "hello" );
assert (test3 == "hello goodbye !"); assert ( test3 == "hello goodbye !" );
assert (test4 == ""); assert ( test4 == "" );
assert (test5 == " "); assert ( test5 == " " );
cout << "test_toLower passed" << endl; cout << "test_toLower passed" << endl;
} }
void test_isStopWord ( )
void test_isStopWord()
{ {
cout << "Testing isStopWord..." << endl; cout << "Testing isStopWord..." << endl;
...@@ -140,11 +140,11 @@ void test_isStopWord() ...@@ -140,11 +140,11 @@ void test_isStopWord()
string blank = ""; string blank = "";
string blank2 = " "; string blank2 = " ";
assert (isStopWord (is)); assert ( isStopWord ( is ) );
assert (!isStopWord (hello)); assert ( !isStopWord ( hello ) );
assert (isStopWord (none)); assert ( isStopWord ( none ) );
assert (!isStopWord (blank)); assert ( !isStopWord ( blank ) );
assert (!isStopWord (blank2)); assert ( !isStopWord ( blank2 ) );
cout << "test_isStopWord passed" << endl; cout << "test_isStopWord passed" << endl;
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
using namespace std; using namespace std;
void test_execute(string original); void test_execute ( string original );
int main() int main ( )
{ {
cout << "Beginning testing for TokenizerTest_unit" << endl << endl; cout << "Beginning testing for TokenizerTest_unit" << endl << endl;
...@@ -22,27 +22,27 @@ int main() ...@@ -22,27 +22,27 @@ int main()
"The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here'," "The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here',"
"making it look like readable English. "; "making it look like readable English. ";
test_execute(original); test_execute ( original );
cout << "\nTests passed for TokenizerTest_unit :D" << endl; cout << "\nTests passed for TokenizerTest_unit :D" << endl;
} }
void test_execute(string original) void test_execute ( string original )
{ {
Tokenizer my_tokenizer; Tokenizer my_tokenizer;
my_tokenizer.execute(original); my_tokenizer.execute ( original );
auto dict = my_tokenizer.get(); auto dict = my_tokenizer.get ( );
for ( auto it = dict->begin(); it != dict->end(); it++ ) for ( auto it = dict->begin ( ); it != dict->end ( ); it++ )
{ {
cout << it->first << ':'; cout << it->first << ':';
for (int i = 0; i < it->second.size(); ++i) for ( int i = 0; i < it->second.size ( ); ++i )
{ {
cout << it->second[i] << " "; cout << it->second[ i ] << " ";
} }
cout << std::endl ; cout << std::endl;
} }
} }
...@@ -18,103 +18,107 @@ using namespace std; ...@@ -18,103 +18,107 @@ using namespace std;
* Takes in an iterator to the original text and a substring: specifically for a parser functionality * Takes in an iterator to the original text and a substring: specifically for a parser functionality
* Potentially make one that takes in two strings? Is this needed? * Potentially make one that takes in two strings? Is this needed?
*/ */
/**
 * Finds the first occurrence of subStr in the text that starts at originalText.
 *
 * Only a start iterator is available, so the end of the text is detected via
 * the '\0' terminator of the std::string the iterator belongs to.
 *
 * Every position of the text is tried as a candidate start. The earlier
 * version resumed scanning where a failed partial match had ended and so
 * missed overlapping occurrences (e.g. "aab" inside "aaab").
 *
 * @param originalText iterator into a std::string; the search starts here
 * @param subStr       text to look for; an empty subStr is never found
 * @return iterator to the first character of the match, or an iterator
 *         positioned on the terminating '\0' when there is no match
 */
std::string::iterator findStr ( std::string::iterator originalText, const std::string & subStr )
    {
    auto candidate = originalText;

    while ( *candidate != '\0' )
        {
        // Compare subStr against the text beginning at this candidate position.
        auto textPos = candidate;
        auto subPos = subStr.begin ( );
        while ( subPos != subStr.end ( ) && *textPos != '\0' && *textPos == *subPos )
            {
            ++textPos;
            ++subPos;
            }

        // The whole substring was consumed: candidate marks the match start.
        if ( !subStr.empty ( ) && subPos == subStr.end ( ) )
            {
            return candidate;
            }

        // Advance by exactly one so overlapping matches are not skipped.
        ++candidate;
        }

    // No match: candidate now rests on the terminator.
    return candidate;
    }
// Lower-case words that are too common to be worth indexing.
std::set< std::string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "be", "been", "but", "by",
                                      "few", "from", "for", "have", "he", "her", "here", "him", "his", "how",
                                      "i", "in", "is", "it", "its", "many", "me", "my", "none", "of", "on", "or",
                                      "our", "she", "some", "the", "their", "them", "there", "they", "that",
                                      "this", "to", "us", "was", "what", "when", "where", "which", "who", "why",
                                      "will", "with", "you", "your" };

/**
 * Splits originalText at every occurrence of delim.
 *
 * Adjacent delimiters and a leading delimiter yield empty strings; a single
 * trailing delimiter does not add a trailing empty string. Processing stops at
 * an embedded '\0', as it always has.
 *
 * @param originalText text to split
 * @param delim        separator character (not included in the pieces)
 * @return the pieces in their original order; empty for an empty text
 */
std::vector< std::string > splitStr ( const std::string & originalText, char delim )
    {
    std::vector< std::string > splitWords;
    auto cursor = originalText.begin ( );
    const auto textEnd = originalText.end ( );

    while ( cursor != textEnd && *cursor != '\0' )
        {
        std::string word;
        while ( cursor != textEnd && *cursor != delim && *cursor != '\0' )
            {
            word += *cursor;
            ++cursor;
            }
        splitWords.push_back ( word );

        // Step over the delimiter only; never move past the end of the text.
        if ( cursor != textEnd && *cursor == delim )
            {
            ++cursor;
            }
        }

    return splitWords;
    }

/**
 * @param word term to check, expected to be lower-case already
 * @return true when word is one of the stopWords
 */
bool isStopWord ( const std::string & word )
    {
    return stopWords.find ( word ) != stopWords.end ( );
    }

/**
 * Returns a copy of word with the ASCII letters 'A'-'Z' converted to
 * lower case; every other character is copied unchanged. Processing stops at
 * an embedded '\0', as it always has.
 *
 * @param word text to convert
 * @return the lower-cased copy
 */
std::string toLower ( const std::string & word )
    {
    std::string lowerWord;
    lowerWord.reserve ( word.size ( ) );

    for ( const char current : word )
        {
        if ( current == '\0' )
            {
            break;
            }

        if ( current >= 'A' && current <= 'Z' )
            {
            lowerWord += static_cast< char >( current - 'A' + 'a' );
            }
        else
            {
            lowerWord += current;
            }
        }

    return lowerWord;
    }
#endif //EECS398_SEARCH_STRINGPROCESSING_H #endif //EECS398_SEARCH_STRINGPROCESSING_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment