From e18149f517544c551d13fbf905f05abede67d524 Mon Sep 17 00:00:00 2001 From: vcday <vcday@umich.edu> Date: Sun, 18 Mar 2018 21:05:24 -0400 Subject: [PATCH] validate url b4 push --- parser/Parser.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++++-- parser/Parser.h | 7 +++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/parser/Parser.cpp b/parser/Parser.cpp index 56d9646..3a70210 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -58,14 +58,18 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) string url = extract_url( line ); if ( url != "" ) { - if ( isLocal ( url ) ) + if ( isLocal( url ) ) { string completeUrl = ""; completeUrl.assign( currentUrl.CompleteUrl ); url = completeUrl + url; } - urlFrontier->Push( url ); - cout << url << endl; + if ( isValid( url ) ) + { + // TODO ParsedUrl with anchor text + urlFrontier->Push( url ); + cout << url << endl; + } } // check if line is title else @@ -148,4 +152,66 @@ string Parser::extract_title ( string & word ) bool Parser::isLocal ( string url ) { return ( *url.begin( ) == '/' ); + } + +/** + * Returns false if the link is an invalid type + * + * @param url + * @return + */ +bool Parser::isValid ( string url ) + { + auto begPtr = url.begin( ); + auto endPtr = begPtr + url.size( ) - 1; + unsigned long size = url.size( ); + + auto html = findPrev( ".html", endPtr, begPtr + size - 6 ); + + if ( *html != '\0' ) + { + return true; + } + + // png + if ( *findPrev( ".png", endPtr, begPtr + size - 5 ) != '\0' ) + { + return false; + } + //jpg + if ( *findPrev( ".jpg", endPtr, begPtr + size - 5 ) ) + { + return false; + } + //jpeg + if ( *findPrev( ".jpeg", endPtr, begPtr + size - 6 ) ) + { + return false; + } + //css + if ( *findPrev( ".css", endPtr, begPtr + size - 5 ) ) + { + return false; + } + //gif + if ( *findPrev( ".gif", endPtr, begPtr + size - 5 ) ) + { + return false; + } + //pdf + if ( *findPrev( ".pdf", endPtr, begPtr + size - 5 ) ) + { + return false; + } 
+ //wav + if ( *findPrev( ".wav", endPtr, begPtr + size - 5 ) ) + { + return false; + } + //mp3 + if ( *findPrev( ".mp3", endPtr, begPtr + size - 5 ) ) + { + return false; + } + return true; } \ No newline at end of file diff --git a/parser/Parser.h b/parser/Parser.h index 7916a9b..aa7740f 100644 --- a/parser/Parser.h +++ b/parser/Parser.h @@ -70,5 +70,12 @@ private: */ bool isLocal ( string url ); + /** + * Returns true if url is valid + * + * @param url + * @return + */ + bool isValid ( string url ); }; -- GitLab