From e18149f517544c551d13fbf905f05abede67d524 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Sun, 18 Mar 2018 21:05:24 -0400
Subject: [PATCH] Validate url before push

---
 parser/Parser.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++++--
 parser/Parser.h   |  7 +++++
 2 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 56d9646..3a70210 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -58,14 +58,18 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 			string url = extract_url( line );
 			if ( url != "" )
 				{
-				if ( isLocal ( url ) )
+				if ( isLocal( url ) )
 					{
 					string completeUrl = "";
 					completeUrl.assign( currentUrl.CompleteUrl );
 					url = completeUrl + url;
 					}
-				urlFrontier->Push( url );
-				cout << url << endl;
+				if ( isValid( url ) )
+					{
+					// TODO ParsedUrl with anchor text
+					urlFrontier->Push( url );
+					cout << url << endl;
+					}
 				}
 				// check if line is title
 			else
@@ -148,4 +152,66 @@ string Parser::extract_title ( string & word )
 bool Parser::isLocal ( string url )
 	{
 	return ( *url.begin( ) == '/' );
+	}
+
+/**
+ * Decides whether a link found while parsing may be pushed to the frontier.
+ *
+ * A url is rejected when it is empty or when it ends in one of the blocked
+ * file extensions listed in the body ( images, stylesheets, documents and
+ * audio ). Every other url, including one ending in ".html", is accepted.
+ *
+ * The extension is matched exactly and case-sensitively against the end of
+ * the string, so "a.PNG" and "a.png?x=1" are not recognised as images.
+ *
+ * Urls shorter than an extension are handled safely: they simply do not
+ * match, and no position before the start of the string is ever formed.
+ *
+ * @param url link text to classify
+ * @return true if the url may be crawled, false if it is empty or ends in a
+ *         blocked extension
+ */
+bool Parser::isValid ( string url )
+	{
+	// file types the crawler should never enqueue; extend this list to block
+	// further extensions
+	static const char *const blockedExtensions[ ] =
+		{
+		".png",
+		".jpg",
+		".jpeg",
+		".css",
+		".gif",
+		".pdf",
+		".wav",
+		".mp3"
+		};
+
+	// an empty string is not a link, so there is nothing to crawl
+	if ( url.empty( ) )
+		{
+		return false;
+		}
+
+	for ( const char *extension : blockedExtensions )
+		{
+		const string suffix( extension );
+
+		// a url shorter than the extension cannot end with it; testing the
+		// length first keeps the offset computed below inside the string
+		if ( url.size( ) < suffix.size( ) )
+			{
+			continue;
+			}
+
+		// compare only the tail of the url against the extension
+		const string::size_type offset = url.size( ) - suffix.size( );
+		if ( url.compare( offset, suffix.size( ), suffix ) == 0 )
+			{
+			return false;
+			}
+		}
+
+	// no blocked extension matched, so the link is worth visiting
+	return true;
 	}
\ No newline at end of file
diff --git a/parser/Parser.h b/parser/Parser.h
index 7916a9b..aa7740f 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -70,5 +70,12 @@ private:
 	 */
 	bool isLocal ( string url );
 
+	/**
+	 * Returns true if url is an accepted link type
+	 *
+	 * @param url link that is checked before it is pushed to the url frontier
+	 * @return false for png, jpg, jpeg, css, gif, pdf, wav and mp3 links
+	 */
+	bool isValid ( string url );
 	};
 
-- 
GitLab