Skip to content
Snippets Groups Projects
Commit dc615c1e authored by vcday's avatar vcday
Browse files

better url and string parsing

parent a903b672
No related branches found
No related tags found
No related merge requests found
...@@ -55,8 +55,13 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ...@@ -55,8 +55,13 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
// if open bracket // if open bracket
if ( html[ htmlIt ] == '<' ) if ( html[ htmlIt ] == '<' )
{ {
if ( isInvalidTag( html, htmlIt ) )
if ( html[ htmlIt + 1 ] == 'p' && ( ( html[htmlIt + 2]) == '>' || ( html[ htmlIt + 2 ] == ' ') ) ) {
begCloseTag = findNext( ">", htmlIt, html );
htmlIt = begCloseTag;
continue;
}
else if ( html[ htmlIt + 1 ] == 'p' && ( ( html[htmlIt + 2]) == '>' || ( html[ htmlIt + 2 ] == ' ') ) )
{ {
begCloseTag = findNext( "</p>", htmlIt, html ); begCloseTag = findNext( "</p>", htmlIt, html );
isParagraph = true; isParagraph = true;
...@@ -86,7 +91,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ...@@ -86,7 +91,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
} }
// if html line is url, parses accordingly and pushes to frontier // if html line is url, parses accordingly and pushes to frontier
else if ( url != "" ) else if ( url != "" && url != "#" )
{ {
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false ); pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false );
} }
...@@ -122,6 +127,79 @@ string Parser::extractAnchorText ( string html ) ...@@ -122,6 +127,79 @@ string Parser::extractAnchorText ( string html )
return ""; return "";
} }
/**
 * Decides whether the tag opening at html[ htmlIt ] should be skipped by the
 * parser because it never has a matching closing tag.
 *
 * Returns true when the character right after the '<' is '!' (comment,
 * <!DOCTYPE>) or '%', or when the text after the '<' starts with one of the
 * void-element prefixes listed below. Returns false otherwise, including when
 * there are not enough characters left in html to hold the prefix.
 *
 * @param html   text being parsed
 * @param htmlIt index of the '<' that opens the tag
 * @return true if the tag has no closing tag and should be ignored
 */
bool Parser::isInvalidTag( string html, unsigned long htmlIt )
    {
    // need at least one character after the '<' to classify the tag
    if ( htmlIt >= html.size( ) || htmlIt + 1 >= html.size( ) )
        {
        return false;
        }

    // comment or <!DocType> or <%
    // (either character qualifies; requiring both at once could never match)
    const char afterBracket = html[ htmlIt + 1 ];
    if ( afterBracket == '!' || afterBracket == '%' )
        {
        return true;
        }

    // Leading characters of tags that have no closing tag. Only the prefix is
    // compared, so any tag name starting with these characters matches too.
    static const string voidTagPrefixes[ ] =
        {
        "br>",    // br
        "col",    // col
        "img",    // img
        "inp",    // input
        "key",    // keygen
        "lin",    // link
        "sou",    // source
        "wbr",    // wbr
        "meta",   // meta
        "area",   // area
        "base",   // base
        "comm",   // command
        "trac",   // track
        "param"   // param
        };

    for ( const string &prefix : voidTagPrefixes )
        {
        // compare( ) clips at the end of html, so a prefix that does not fit
        // entirely inside the string compares unequal instead of reading
        // out of bounds
        if ( html.compare( htmlIt + 1, prefix.size( ), prefix ) == 0 )
            {
            return true;
            }
        }

    return false;
    }
/** /**
* Returns a url, or "" if none * Returns a url, or "" if none
* @param word * @param word
...@@ -130,54 +208,23 @@ string Parser::extractAnchorText ( string html ) ...@@ -130,54 +208,23 @@ string Parser::extractAnchorText ( string html )
string Parser::extractUrl ( string html ) string Parser::extractUrl ( string html )
{ {
string url = ""; string url = "";
if ( findStr( "<a", html ) != html.size( ) ) unsigned long aTag = findStr( "<a", html );
if ( aTag != html.size( ) )
{ {
unsigned long foundHref = findStr( "href", html ); unsigned long foundHref = findStr( "href=", html );
unsigned long foundHttp = findNext( "http", foundHref, html ); unsigned long begQuote = findNext( "\"", foundHref, html );
if ( foundHttp < html.size( ) )
{
url = "";
unsigned long closeTag = findNext( ">", foundHref, html );
// unsigned long closeQuote = findNext( "\"", foundHref, html );
unsigned long closeSpace = findNext( " ", foundHref, html );
unsigned long closeUrl = 0;
// ends in "
// if ( closeQuote < html.size( ) && closeTag < html.size( ) && closeQuote < closeTag && closeQuote < closeSpace )
// {
// closeUrl = closeQuote;
// }
// end == ' >'
if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
{
if ( html[ closeSpace - 1 ] == '\"' )
{
closeSpace -= 1;
}
closeUrl = closeSpace;
}
// end == '>'
else if ( closeTag < html.size( ) )
{
if ( html[ closeTag - 1 ] == '\"' )
{
closeTag -= 1;
}
closeUrl = closeTag;
}
while ( foundHttp != closeUrl && foundHttp < html.size() && html[ foundHttp ] != '\n' ) if ( begQuote < html.size( ) )
{
++begQuote;
unsigned long endQuote = findNext( "\"", begQuote + 1, html );
while ( begQuote != endQuote && endQuote < html.size( ) && begQuote < html.size( ) )
{ {
url.push_back( html[ foundHttp ] ); url += html[ begQuote ];
++foundHttp; ++begQuote;
} }
} }
} }
while ( !url.empty( ) && ( url.back( ) == '\"' || url.back( ) == ';' ) )
{
url.pop_back( );
}
return url; return url;
} }
...@@ -205,14 +252,29 @@ string Parser::extractTitle ( string html ) ...@@ -205,14 +252,29 @@ string Parser::extractTitle ( string html )
} }
/** /**
* Will return true if local url * Will return new url if local
* *
* @param url * @param url
* @param currentUrl
* @return * @return
*/ */
bool Parser::isLocal ( string url ) string Parser::isLocal ( string url, ParsedUrl currentUrl )
{ {
return ( url[ 0 ] == '/' ); if ( url[ 0 ] != '/' )
{
return url;
}
if ( currentUrl.getCompleteUrl( ).back( ) == '/' )
{
string temp = currentUrl.getCompleteUrl( );
temp.pop_back();
url = temp + url;
}
else
{
url = currentUrl.getCompleteUrl( ) + url;
}
return url;
} }
/** /**
...@@ -245,6 +307,11 @@ bool Parser::isValid ( string url ) ...@@ -245,6 +307,11 @@ bool Parser::isValid ( string url )
{ {
return false; return false;
} }
// #
if ( url[ 0 ] == '#' )
{
return false;
}
return true; return true;
} }
...@@ -258,10 +325,7 @@ bool Parser::isValid ( string url ) ...@@ -258,10 +325,7 @@ bool Parser::isValid ( string url )
*/ */
void Parser::pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug ) void Parser::pushToUrlQueue ( string url, ParsedUrl currentUrl, string anchorText, bool debug )
{ {
if ( isLocal( url ) ) url = isLocal( url, currentUrl );
{
url = currentUrl.getCompleteUrl( ) + url;
}
if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) ) if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) )
{ {
ParsedUrl pUrl = ParsedUrl( url ); ParsedUrl pUrl = ParsedUrl( url );
...@@ -327,7 +391,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon ...@@ -327,7 +391,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon
else if ( url != "" ) else if ( url != "" )
{ {
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false ); pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), true );
} }
// check if line is title // check if line is title
// check if line is title // check if line is title
......
...@@ -53,6 +53,14 @@ private: ...@@ -53,6 +53,14 @@ private:
*/ */
string extractAnchorText ( string html ); string extractAnchorText ( string html );
/**
* Returns true if no closing tag & should ignore
* @param html
* @param htmlIt
* @return
*/
bool isInvalidTag( string html, unsigned long htmlIt );
/** /**
* Returns a url, or "" if none * Returns a url, or "" if none
* @param html * @param html
...@@ -72,9 +80,10 @@ private: ...@@ -72,9 +80,10 @@ private:
* Will return true if local url * Will return true if local url
* *
* @param url * @param url
* @param currentUrl
* @return * @return
*/ */
bool isLocal ( string url ); string isLocal ( string url, ParsedUrl currentUrl );
/** /**
* Returns true is url is valid * Returns true is url is valid
......
...@@ -108,21 +108,22 @@ void testHttp( ) ...@@ -108,21 +108,22 @@ void testHttp( )
auto dictionary = parser.execute( &reader ); auto dictionary = parser.execute( &reader );
printDictionary( *dictionary ); printDictionary( *dictionary );
urlFrontierTest.Pop( ); assert( urlFrontierTest.Size( ) == 12 );
assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://trove.com/" ); assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://trove.com/" );
assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://arcinnovations.xyz/" ); assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "http://arcinnovations.xyz/" );
assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://gwydion.co/" ); assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://gwydion.co/" );
assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" ); assert( urlFrontierTest.Pop( ).getCompleteUrl( ) == "https://madeatmichigan.umich.edu/ventures/venture/gwydion/" );
assert ( dictionary != nullptr ); assert ( dictionary != nullptr );
assert ( dictionary->size( ) == 67 ); assert ( dictionary->size( ) == 372 );
assert ( dictionary->at( "=veronicacday.com/" ).size( ) == 1 && dictionary->at( "=veronicacday.com/" )[ 0 ] == 0 ); assert ( dictionary->at( "=veronicacday.com/" ).size( ) == 1 && dictionary->at( "=veronicacday.com/" )[ 0 ] == 0 );
assert ( dictionary->at( "%serena" ).size( ) == 2 && dictionary->at( "%serena" )[ 1 ] == 24 ); assert ( dictionary->at( "%serena" ).size( ) == 2 && dictionary->at( "%serena" )[ 1 ] == 80 );
assert ( dictionary->at( "#veronica" ).size( ) == 1 && dictionary->at( "#veronica" )[ 0 ] == 2 ); assert ( dictionary->at( "#veronica" ).size( ) == 1 && dictionary->at( "#veronica" )[ 0 ] == 2 );
assert ( dictionary->at( "#dai" ).size( ) == 1 && dictionary->at( "#dai" )[ 0 ] == 3 ); assert ( dictionary->at( "#dai" ).size( ) == 1 && dictionary->at( "#dai" )[ 0 ] == 3 );
assert ( dictionary->at( "%educ" ).size( ) == 1 && dictionary->at( "%educ" )[ 0 ] == 13 ); assert ( dictionary->at( "%educ" ).size( ) == 1 && dictionary->at( "%educ" )[ 0 ] == 40 );
assert ( dictionary->at( "%surgeri" ).size( ) == 1 && dictionary->at( "%surgeri" )[ 0 ] == 72 ); assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 );
assert ( dictionary->at( "%busi" ).size( ) == 6 );
......
"<p>Paragraph body text text text BODY hello <title>Specific TITLE? title</title> more body body text words</p>" <p>Paragraph body text text text BODY hello <title>Specific TITLE? title</title> more body body text words</p>
\ No newline at end of file \ No newline at end of file
...@@ -41,7 +41,7 @@ unsigned long findStr ( string needle, string haystack ) ...@@ -41,7 +41,7 @@ unsigned long findStr ( string needle, string haystack )
++temp; ++temp;
++needleIt; ++needleIt;
//if it hits the end of the needleing, it signifies an exact match //if it hits the end of the needleing, it signifies an exact match
if ( needleIt == needle.size( ) - 1 ) if ( needleIt == needle.size( ) )
{ {
//this is pointing at the beginning of the match //this is pointing at the beginning of the match
return haystackIt; return haystackIt;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment