Skip to content
Snippets Groups Projects
Commit 02e3c897 authored by vcday's avatar vcday
Browse files

crawler-parser test consistent

parent 0bbd6113
No related branches found
No related tags found
No related merge requests found
......@@ -63,11 +63,10 @@ void Spider::FuncToRun ( )
size_t docID = hash( currentUrl.CompleteUrl );
string localPath = util::GetCurrentWorkingDir( );
// don't include debug in file path
auto debug = findPrev( "cmake-build-debug", localPath.begin( ) + localPath.size( ) - 1,
localPath.begin( ) );
if ( *debug != '\0' )
unsigned long debug = findPrev( "cmake-build-debug", localPath.size( ) - 1, localPath );
if ( debug < localPath.size( ) )
{
localPath = subStr( localPath.begin( ), debug - 1 );
localPath = subStr( localPath, 0, debug);
}
string pathToDisk = localPath + "/crawlerOutput/" + to_string( docID ) + ".txt";
......
......@@ -39,18 +39,18 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string url = host + "/" + path;
string urlCurrent = host + "/" + path;
offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );
offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
while ( htmlIt < html.size( ) )
{
// if open bracket
if ( html[ htmlIt ]== '<' )
if ( html[ htmlIt ] == '<' )
{
unsigned long begCloseTag = findNext( "</", htmlIt, html );
unsigned long endCloseTag = findNext( ">", begCloseTag, html );
string line = subStr( html, htmlIt, endCloseTag );
string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
htmlIt = endCloseTag + 2;
// check if line is url
......@@ -63,7 +63,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
if ( isValid( url ) )
if ( isValid( url ) && url != urlCurrent )
{
// TODO ParsedUrl with anchor text
ParsedUrl pUrl = ParsedUrl( url );
......@@ -106,13 +106,30 @@ string Parser::extract_url ( string html )
{
url = "";
unsigned long closeTag = findNext( ">", foundHref, html );
if ( closeTag < html.size( ) && html[ closeTag - 1 ] == '\"' )
unsigned long closeSpace = findNext( " ", foundHref, html );
unsigned long closeUrl = 0;
// end == ' >'
if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
{
closeTag -= 1;
if ( html[ closeSpace - 1 ] == '\"' )
{
closeSpace -= 1;
}
closeUrl = closeSpace;
}
while ( html[ foundHttp ] != html[ closeTag ] )
// end == '>'
else if ( closeTag < html.size( ) )
{
if ( html[ closeTag - 1 ] == '\"' )
{
closeTag -= 1;
}
closeUrl = closeTag;
}
while ( foundHttp != closeUrl && html[ foundHttp ] != '\n')
{
url += html[ foundHttp ];
url.push_back( html[ foundHttp ] );
++foundHttp;
}
}
......@@ -173,9 +190,9 @@ bool Parser::isValid ( string url )
return true;
}
// png || jpg || css || gif || pdf || wav || mp3 || mp4
if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif"
|| lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" )
// png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico
if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif"
|| lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" || lastFour == ".ico" )
{
return false;
}
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment