From e809b071996c864e0938401bc6f09ed5dbaf8e79 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Tue, 20 Mar 2018 14:32:45 -0400
Subject: [PATCH] added framework for anchor text parsing

---
 parser/Parser.cpp   | 77 ++++++++++++++++++++++++++++++---------------
 parser/Parser.h     | 23 ++++++++++++--
 shared/Document.cpp |  2 +-
 shared/url.h        | 58 ++++++++++++++++++++++++++++------
 4 files changed, 121 insertions(+), 39 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 5f5d955..25e6066 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -33,16 +33,19 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	unsigned long htmlIt = 0;
 	unsigned long offsetTitle = 0;
 	unsigned long offsetURL = 0;
+	unsigned long offsetAnchor = 0;
 
 	// tokenize url
-	string host = "";
-	host.assign( currentUrl.Host );
-	string path = "";
-	path.assign( currentUrl.Path );
-	string urlCurrent = host + "/" + path;
+	offsetURL = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offsetURL, Tokenizer::URL );
 
-	offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
+	// tokenize anchor
+	string anchorText = currentUrl.getAnchorText( );
+	if ( anchorText != "" )
+		{
+		offsetAnchor = tokenizer->execute( anchorText, offsetAnchor, Tokenizer::ANCHOR );
+		}
 
+	// find titles
 	while ( htmlIt < html.size( ) )
 		{
 		// if open bracket
@@ -54,27 +57,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 			htmlIt = endCloseTag + 2;
 
 			// check if line is url
-			string url = extract_url( line );
+			string url = extractUrl( line );
 			if ( url != "" )
 				{
-				if ( isLocal( url ) )
-					{
-					string completeUrl = "";
-					completeUrl.assign( currentUrl.CompleteUrl );
-					url = completeUrl + url;
-					}
-				if ( isValid( url ) && url != urlCurrent )
-					{
-					// TODO ParsedUrl with anchor text
-					ParsedUrl pUrl = ParsedUrl( url );
-					urlFrontier->Push( pUrl );
-					cout << url << endl;
-					}
+
+				pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
 				}
-				// check if line is title
+			// check if line is title
 			else
 				{
-				string title = extract_title( line );
+				string title = extractTitle( line );
 				if ( title != "" )
 					{
 					offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
@@ -86,8 +78,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 			++htmlIt;
 			}
 		}
+	}
 
-
+/**
+ * Returns anchor text if found (stub: no extraction is implemented yet)
+ * @param html markup to search; parse( ) passes the tag line it extracted a url from
+ * @return always "" in this version
+ */
+string Parser::extractAnchorText( string html )
+	{
+	return "";  // placeholder result until real extraction is written
 	}
 
 /**
@@ -95,7 +95,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
  * @param word
  * @return
  */
-string Parser::extract_url ( string html )
+string Parser::extractUrl ( string html )
 	{
 	string url = "";
 	if ( findStr( "<a", html ) != html.size( ) )
@@ -143,7 +143,7 @@ string Parser::extract_url ( string html )
  * @param word
  * @return
  */
-string Parser::extract_title ( string html )
+string Parser::extractTitle ( string html )
 	{
 	string title = "";
 	char end = '<';
@@ -203,3 +203,30 @@ bool Parser::isValid ( string url )
 		}
 	return true;
 	}
+
+/**
+ * Builds a ParsedUrl for a link found on the current page and pushes it to urlFrontier
+ *
+ * @param url link text from the page; when isLocal( url ) the current complete url is prepended
+ * @param currentUrl page being parsed; a link equal to its complete url is not pushed
+ * @param anchorText stored on the pushed ParsedUrl via setAnchorText; may be ""
+ * @param debug --> will print the url and its anchor text to std::cout
+ */
+void Parser::pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug )
+	{
+	if ( isLocal( url ) )
+		{
+		url = currentUrl.getCompleteUrl( ) + url;  // plain concatenation, no separator is inserted
+		}
+	if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) )  // drop invalid links and self-links
+		{
+		ParsedUrl pUrl = ParsedUrl( url );
+		pUrl.setAnchorText( anchorText );
+		urlFrontier->Push( pUrl );
+		if ( debug )
+			{
+			cout << url << endl;
+			cout << anchorText << endl;
+			}
+		}
+	}
diff --git a/parser/Parser.h b/parser/Parser.h
index 1721240..b38a91e 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -47,13 +47,19 @@ private:
 	 */
 	void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
 
-
+	/**
+	 * Returns anchor text if found
+	 * @param html markup to search for anchor text
+	 * @return the anchor text, or "" if none
+	 */
+	string extractAnchorText( string html );
+	
 	/**
 	 * Returns a url, or "" if none
 	 * @param html
 	 * @return
 	 */
-	string extract_url ( string html );
+	string extractUrl ( string html );
 
 
 	/**
@@ -61,7 +67,7 @@ private:
 	 * @param html
 	 * @return
 	 */
-	string extract_title ( string html );
+	string extractTitle ( string html );
 
 	/**
 	 * Will return true if local url
@@ -78,5 +84,16 @@ private:
 	 * @return
 	 */
 	bool isValid ( string url );
+
+	/**
+	 * Sends to Url Frontier
+	 *
+	 * @param url link found on the page being parsed
+	 * @param currentUrl url of the page being parsed
+	 * @param anchorText anchor text to attach to the pushed url; may be ""
+	 * @param debug --> will print urls to std::cout
+	 */
+	void pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug );
+
 	};
 
diff --git a/shared/Document.cpp b/shared/Document.cpp
index 2a2f3e3..8efef78 100644
--- a/shared/Document.cpp
+++ b/shared/Document.cpp
@@ -84,7 +84,7 @@ void  Document::PrintDocMap ( string url, int location )
 			{
 			char *buffer = new char[bytes];
 			ssize_t bytesRead;
-			if ( bytesRead = read ( file, buffer, bytes ) )
+			if ( ( bytesRead = read ( file, buffer, bytes ) ) )
 				write ( 1, buffer, bytesRead );
 			else
 				{
diff --git a/shared/url.h b/shared/url.h
index 91c5502..0ea482d 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -20,11 +20,6 @@ using namespace std;
 #define MIL   ".mil"
 #define INT   ".int"
 
-
-
-
-
-
 class ParsedUrl
 	{
 public:
@@ -32,7 +27,8 @@ public:
 			*Service,
 			*Host,
 			*Domain,
-			*Path;
+			*Path,
+			*AnchorText;
 	double Score;
 
 	ParsedUrl( string input_url )
@@ -40,6 +36,11 @@ public:
 		// Assumes url points to static text but
 		// does not check.
 
+		// initialize anchor text to the empty string
+		char *null = new char[2];
+		strcpy(null, string("").c_str());
+		AnchorText = null;
+
 		char *url = new char[input_url.length() + 1];
 		strcpy(url, input_url.c_str());
 
@@ -86,10 +87,6 @@ public:
 
 				}
 
-
-
-
-
 			// Whatever remains is the Path. // need to remove fragments
 
 			Path = p;
@@ -140,6 +137,47 @@ public:
 			Score += .5;
 		}
 
+	// Returns a std::string copy of the C string that CompleteUrl points to.
+	std::string getCompleteUrl( )
+		{
+		std::string copy;
+		return copy.assign( CompleteUrl );
+		}
+
+	// Returns a std::string copy of the C string that Host points to.
+	std::string getHost( )
+		{
+		std::string copy;
+		return copy.assign( Host );
+		}
+
+	// Returns a std::string copy of the C string that Path points to.
+	std::string getPath( )
+		{
+		std::string copy;
+		return copy.assign( Path );
+		}
+
+	// Returns a std::string copy of the C string that AnchorText points to.
+	std::string getAnchorText( )
+		{
+		std::string copy;
+		return copy.assign( AnchorText );
+		}
+
+	// Stores a NUL-terminated heap copy of anchorText in AnchorText.
+	void setAnchorText( std::string anchorText )
+		{
+		// size( ) + 1 leaves room for the '\0' terminator that strcpy writes.
+		char *anchorCharStar = new char[ anchorText.size( ) + 1 ];
+		// Store the characters with strcpy; `pointer += ch` would only advance the
+		// pointer and leave AnchorText aimed at memory that was never written.
+		strcpy( anchorCharStar, anchorText.c_str( ) );
+		// NOTE(review): the buffer AnchorText held before is not freed here -- confirm ownership.
+		this->AnchorText = anchorCharStar;
+		}
+
+
 	~ParsedUrl( )
 		{
 		pathBuffer = 0;
-- 
GitLab