Skip to content
Snippets Groups Projects
Commit b9acd359 authored by vcday's avatar vcday
Browse files

parse logic improved

parent a041c8ff
No related branches found
No related tags found
No related merge requests found
......@@ -54,39 +54,46 @@ private:
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void parse ( string html, Tokenizer *tokenizer )
{
string tokenizerInput = "";
string currentTerm = "";
int index = 0;
while (index != html.size())
auto htmlIt = html.begin();
int offset = 0;
while (htmlIt != html.end())
{
currentTerm = "";
while ( html.at( index ) != '\n' )
{
currentTerm += html[ index ];
++index;
}
++index;
string url = extract_url ( currentTerm );
if (url != "")
// if open bracket
if ( *htmlIt == '<' )
{
urlFrontier->Push (url);
auto begCloseTag = findNext ("</", htmlIt);
auto endCloseTag = findNext ( ">", begCloseTag);
string line (htmlIt, endCloseTag + 1);
htmlIt = endCloseTag + 2;
// check if line is url
string url = extract_url ( line );
if (url != "")
{
urlFrontier->Push ( url );
}
// check if line is title
else
{
string title = extract_title ( line );
if (title != "")
{
tokenizer->execute ( title, offset );
}
}
//TODO fix offset?
offset = htmlIt - html.begin();
}
else
{
string title = extract_title ( currentTerm );
if (title != "")
{
tokenizerInput += title;
}
++htmlIt;
}
}
tokenizer->execute ( tokenizerInput );
}
......@@ -98,16 +105,15 @@ private:
string extract_url ( string word )
{
string url = "";
if ( *findStr ( word, "<a" ) != '\0' )
if ( *findStr ( "<a", word ) != '\0' )
{
auto foundHttp = findStr ( word, "href=http" );
auto foundHref = findStr ( "href", word );
auto foundHttp = findNext ( "http", foundHref );
if ( *foundHttp != '\0' )
{
url = "http";
foundHttp += 9;
while ( *foundHttp != *findStr ( word, "\">" ) )
url = "";
auto closeTag = findNext ( ">", word.begin ( ) );
while ( *foundHttp != *closeTag )
{
url += *foundHttp;
++foundHttp;
......
......@@ -11,11 +11,23 @@
using namespace std;
void testSimple ( );
void testComplex ( );
/**
 * Test driver: prints a banner, runs every parser test in order, then
 * prints the success line. The tests report failure through assert, so
 * the success line is only reached when none of them aborted.
 */
int main ( )
{
    cout << "Testing Parser ... " << endl;
    cout << endl;

    // Run the suites in a fixed order: simple case first, then the larger document.
    void ( *const suites[ ] ) ( ) = { testSimple, testComplex };
    for ( auto suite : suites )
    {
        suite ( );
    }

    cout << "Parser Tests Passed! :D" << endl;
    return 0;
}
void testSimple ( )
{
ProducerConsumerQueue < string > * urlFrontierTest;
Document document ( "<title>This Cat Title Cat</title>\n" );
Document document ( "<title>This Cat Title Cat</title>" );
Parser parser ( urlFrontierTest );
auto dictionary = parser.execute ( &document );
......@@ -28,7 +40,44 @@ int main ( )
assert ( dictionary->at ( "cat" )[ 0 ] == 0 && dictionary->at ( "cat" )[ 1 ] == 2 );
assert ( dictionary->at ( "title" )[ 0 ] == 1 );
cout << "Parser Tests Passed! :D" << endl;
delete dictionary;
}
/**
 * Parser test over a larger document: a title line, an anchor line, and
 * the contents of ../tests/cats.html appended after them.
 *
 * Checks that the resulting dictionary holds exactly the three expected
 * terms ("cat", "story", "joe") and that the stop words "the" and "of"
 * were not indexed.
 */
void testComplex ( )
{
    // The queue must be a live object: the document below contains a link, and the
    // parser is handed this pointer as its URL frontier. Passing an uninitialized
    // pointer (as before) is undefined behaviour as soon as it is read or used.
    // NOTE(review): assumes ProducerConsumerQueue is default-constructible — confirm.
    ProducerConsumerQueue < string > urlFrontierTest;

    ifstream file ( "../tests/cats.html" );
    // Fail here, with a clear cause, rather than later on a dictionary-size mismatch.
    assert ( file.is_open ( ) );

    string docString = "<title>Joe the Cat</title>\n";
    docString += "<a href=\"https://www.w3schools.com/html/\">Visit our HTML tutorial</a>\n";

    // NOTE(review): getline drops the line terminator, so the file's lines are
    // concatenated without any separator — confirm this is intended.
    string temp;
    while ( std::getline ( file, temp ) )
    {
        docString += temp;
    }

    Document document ( docString );
    Parser parser ( &urlFrontierTest );
    auto dictionary = parser.execute ( &document );

    // cout << dictionary->size () << endl;
    // for (auto p : *dictionary)
    // cout << p.first << endl;

    assert ( dictionary != nullptr );
    assert ( dictionary->size ( ) == 3 );

    // Terms that must have been indexed.
    assert ( dictionary->find ( "cat" ) != dictionary->end ( ) );
    assert ( dictionary->find ( "story" ) != dictionary->end ( ) );
    assert ( dictionary->find ( "joe" ) != dictionary->end ( ) );

    // Stop words must have been filtered out.
    assert ( dictionary->find ( "the" ) == dictionary->end ( ) );
    assert ( dictionary->find ( "of" ) == dictionary->end ( ) );

    // assert ( dictionary->at ( "cat" )[ 0 ] == 1 );
    // assert ( dictionary->at ( "story" )[ 0 ] == 0 );
    // cout << urlFrontierTest.Size () << endl;
    // cout << urlFrontierTest.Pop () << endl;

    // The test deletes the dictionary returned by execute, as the original did.
    delete dictionary;
}
\ No newline at end of file
......@@ -14,17 +14,16 @@ class Tokenizer
public:
Tokenizer ( )
{
doc_index = new unordered_map< string, vector< int>>;
docIndex = new unordered_map< string, vector< int>>;
}
unordered_map< string, vector< int>> *get ( ) const
{
return doc_index;
return docIndex;
}
void execute ( string originalText )
void execute ( string originalText, int offset )
{
int offset = 0;
vector< string > splitText = splitStr ( originalText, ' ' );
string lowerString = "";
for ( int i = 0; i < splitText.size ( ); ++i )
......@@ -32,12 +31,12 @@ public:
lowerString = toLower ( splitText[ i ] );
if ( !isStopWord ( lowerString ) )
{
( *doc_index )[ lowerString ].push_back ( offset );
( *docIndex )[ lowerString ].push_back ( offset );
++offset;
}
}
}
private:
unordered_map< string, vector< int>> *doc_index;
unordered_map< string, vector< int>> *docIndex;
};
......@@ -24,9 +24,10 @@ set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "
"you", "your" };
/**
* Finds the needle in the haystack
* returns position of first match
* @param haystack
* @param needle
* @return
* @return string::iterator
*/
string::iterator findStr (string needle, string haystack )
{
......@@ -75,6 +76,55 @@ string::iterator findStr (string needle, string haystack )
}
/**
 * Finds the next occurrence of the needle at or after a position in a string.
 *
 * The haystack is scanned up to the first '\0' character, i.e. the terminator
 * that follows the last character of the string the iterator belongs to. A
 * haystack containing an embedded '\0' is treated as ending there.
 *
 * @param needle          text to search for; an empty needle never matches
 * @param haystackPointer position in the haystack at which to start searching
 * @return string::iterator to the first character of the match, or to the
 *         terminating '\0' when there is no match (callers test
 *         `*result != '\0'` to detect success)
 */
string::iterator findNext (string needle, string::iterator haystackPointer )
{
    const string::size_type needleLength = needle.size ( );

    auto candidate = haystackPointer;
    while ( *candidate != '\0' )
    {
        // Compare the whole needle against the haystack starting at candidate.
        // probe is a separate cursor so candidate keeps marking the match start.
        string::size_type matched = 0;
        auto probe = candidate;
        while ( matched < needleLength && *probe != '\0' && *probe == needle[ matched ] )
        {
            ++probe;
            ++matched;
        }

        // Every needle character matched: candidate is the start of the match.
        if ( needleLength != 0 && matched == needleLength )
        {
            return candidate;
        }

        // Advance by exactly one character, even after a partial match. Jumping
        // past the compared characters would skip overlapping occurrences
        // (e.g. "aab" inside "aaab").
        ++candidate;
    }

    // No match: candidate now points at the terminating '\0'.
    return candidate;
}
/**
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment