From 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Mon, 19 Mar 2018 17:13:46 -0400 Subject: [PATCH] crawler-parser test consistent --- crawler/spider.cpp | 7 +++---- parser/Parser.cpp | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/crawler/spider.cpp b/crawler/spider.cpp index 5099918..84043df 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -63,11 +63,10 @@ void Spider::FuncToRun ( ) size_t docID = hash( currentUrl.CompleteUrl ); string localPath = util::GetCurrentWorkingDir( ); // don't include debug in file path - auto debug = findPrev( "cmake-build-debug", localPath.begin( ) + localPath.size( ) - 1, - localPath.begin( ) ); - if ( *debug != '\0' ) + unsigned long debug = findPrev( "cmake-build-debug", localPath.size( ) - 1, localPath ); + if ( debug < localPath.size( ) ) { - localPath = subStr( localPath.begin( ), debug - 1 ); + localPath = subStr( localPath, 0, debug); } string pathToDisk = localPath + "/crawlerOutput/" + to_string( docID ) + ".txt"; diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 78b122b..5f5d955 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -39,18 +39,18 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) host.assign( currentUrl.Host ); string path = ""; path.assign( currentUrl.Path ); - string url = host + "/" + path; + string urlCurrent = host + "/" + path; - offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL ); + offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL ); while ( htmlIt < html.size( ) ) { // if open bracket - if ( html[ htmlIt ]== '<' ) + if ( html[ htmlIt ] == '<' ) { unsigned long begCloseTag = findNext( "</", htmlIt, html ); unsigned long endCloseTag = findNext( ">", begCloseTag, html ); - string line = subStr( html, htmlIt, endCloseTag ); + string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt ); htmlIt = endCloseTag + 2; // check if line is url @@ -63,7 +63,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) completeUrl.assign( currentUrl.CompleteUrl ); url = completeUrl + url; } - if ( isValid( url ) ) + if ( isValid( url ) && url != urlCurrent ) { // TODO ParsedUrl with anchor text ParsedUrl pUrl = ParsedUrl( url ); @@ -106,13 +106,30 @@ string Parser::extract_url ( string html ) { url = ""; unsigned long closeTag = findNext( ">", foundHref, html ); - if ( closeTag < html.size( ) && html[ closeTag - 1 ] == '\"' ) + unsigned long closeSpace = findNext( " ", foundHref, html ); + unsigned long closeUrl = 0; + // end == ' >' + if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag ) { - closeTag -= 1; + if ( html[ closeSpace - 1 ] == '\"' ) + { + closeSpace -= 1; + } + closeUrl = closeSpace; } - while ( html[ foundHttp ] != html[ closeTag ] ) + // end == '>' + else if ( closeTag < html.size( ) ) + { + if ( html[ closeTag - 1 ] == '\"' ) + { + closeTag -= 1; + } + closeUrl = closeTag; + } + + while ( foundHttp != closeUrl && html[ foundHttp ] != '\n') { - url += html[ foundHttp ]; + url.push_back( html[ foundHttp ] ); ++foundHttp; } } @@ -173,9 +190,9 @@ bool Parser::isValid ( string url ) return true; } - // png || jpg || css || gif || pdf || wav || mp3 || mp4 - if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif" - || lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" ) + // png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico + if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif" + || lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4" || lastFour == ".ico" ) { return false; } -- GitLab