Commit 15992b70 authored by vcday

add anchor text parsing

parent dc615c1e
Branch: url-parsing
@@ -34,7 +34,6 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
ParsedUrl currentUrl = reader->getUrl( );
// tokenize anchor
// TODO ParsedUrl with anchor text
string anchorText = currentUrl.getAnchorText( );
if ( anchorText != "" )
{
@@ -93,7 +92,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
// if html line is url, parses accordingly and pushes to frontier
else if ( url != "" && url != "#" )
{
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false );
pushToUrlQueue( url, currentUrl, extractAnchorText( line ), false );
}
// check if line is header; classifies as body text
else if ( header != "")
@@ -124,7 +123,29 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
*/
string Parser::extractAnchorText ( string html )
{
return "";
string anchor = "";
unsigned long aTag = findStr( "<a", html );
if ( aTag != html.size( ) )
{
unsigned long begAnchor = findNext( ">", aTag, html );
unsigned long endAnchor = findNext( "</a>", aTag, html );
if ( begAnchor > endAnchor)
{
return anchor;
}
if ( begAnchor < html.size( ) && endAnchor < html.size())
{
++begAnchor;
while ( begAnchor != endAnchor && begAnchor < html.size( ) )
{
anchor += html[ begAnchor ];
++begAnchor;
}
}
}
return anchor;
}
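For reference, here is a minimal standalone sketch of the same extraction, using std::string::find in place of the project's findStr/findNext helpers (whose exact signatures are not shown in this diff); the new extractAnchorText above should behave the same way on a simple single-line anchor:

#include <cassert>
#include <string>
using std::string;

// Sketch only: return the text between the first "<a ...>" and the
// following "</a>", or "" when no complete anchor is present.
string extractAnchorTextSketch ( const string & html )
   {
   size_t aTag = html.find( "<a" );
   if ( aTag == string::npos )
      return "";
   size_t begAnchor = html.find( ">", aTag );
   size_t endAnchor = html.find( "</a>", aTag );
   if ( begAnchor == string::npos || endAnchor == string::npos || begAnchor > endAnchor )
      return "";
   return html.substr( begAnchor + 1, endAnchor - begAnchor - 1 );
   }

int main ( )
   {
   assert( extractAnchorTextSketch( "<a href=\"http://umich.edu\">Go Blue</a>" ) == "Go Blue" );
   assert( extractAnchorTextSketch( "no anchor on this line" ) == "" );
   return 0;
   }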
/**
@@ -391,7 +412,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon
else if ( url != "" )
{
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), true );
pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
}
// check if line is title
// check if line is title
......
@@ -20,6 +20,8 @@ void testHttp( );
void testURL( );
void testBody ( );
void testExtractBody ( );
void testAnchorText ( );
void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary );
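printDictionary is only declared in this excerpt; its definition is not part of the diff. A plausible debugging helper matching the declared signature might look like the following (illustrative only, not necessarily the author's implementation):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// Illustrative only: print each token followed by its offsets, e.g. "@anchor: 0".
void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary )
   {
   for ( const auto & entry : dictionary )
      {
      cout << entry.first << ":";
      for ( unsigned long offset : entry.second )
         cout << " " << offset;
      cout << endl;
      }
   }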
@@ -32,6 +34,7 @@ int main ( )
testURL( );
testBody ( );
testExtractBody ( );
testAnchorText ( );
cout << "Parser Tests Passed! :D" << endl;
}
@@ -125,8 +128,6 @@ void testHttp( )
assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 );
assert ( dictionary->at( "%busi" ).size( ) == 6 );
delete dictionary;
dictionary = nullptr;
@@ -261,4 +262,41 @@ void testExtractBody ( )
dictionary = nullptr;
cout << "Extract Body Test Passed!" << endl;
}
void testAnchorText ( )
{
cout << "Testing Anchor Text: " << endl;
ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
Parser parser( &urlFrontierTest );
ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" );
fake_url.setAnchorText( "anchor text example Click Here!");
string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html";
LocalReader reader( filepath );
reader.setUrl( fake_url );
auto success = reader.request( );
if ( !success )
{
cerr << "Couldn't open file\n";
exit( 1 );
}
auto dictionary = parser.execute( &reader );
printDictionary( *dictionary );
assert ( dictionary != nullptr );
assert ( dictionary->at( "@anchor" )[ 0 ] == 0 );
assert ( dictionary->at( "@text" )[ 0 ] == 1 );
assert ( dictionary->at( "@exampl" )[ 0 ] == 2 );
assert ( dictionary->find( "@click" ) == dictionary->end( ) );
assert ( dictionary->find( "@here" ) == dictionary->end( ) );
assert ( dictionary->find( "click" ) == dictionary->end( ) );
assert ( dictionary->find( "here" ) == dictionary->end( ) );
delete dictionary;
dictionary = nullptr;
cout << "Extract Anchor Test Passed!" << endl;
}
\ No newline at end of file
@@ -44,7 +44,7 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
return tokenize( splitStr( originalText, split, true ), offset, decorator );
}
// split by spaces
// split by spaces
else
{
return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
@@ -65,7 +65,10 @@ unsigned long Tokenizer::tokenize ( vector< string > splitText, unsigned long of
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
// remove "click here" etc
if ( decorator == Tokenizer::ANCHOR && anchorsToRemove.find( processedString ) != anchorsToRemove.end( ) )
continue;
if ( !isStopWord( processedString ) )
{
......
@@ -58,4 +58,9 @@ private:
*/
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
/**
* Anchor text that should not be included in index
*/
set< string > anchorsToRemove = {"click", "here", "here!"};
};
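Putting the Tokenizer changes together, their effect on the test's anchor string can be approximated in isolation. Below is a self-contained sketch of the anchor branch: case fold each word, drop the anchorsToRemove noise words, and prefix the survivors with the '@' decorator. Stop-word removal and stemming, which the real Tokenizer also applies and which turn "example" into the "exampl" seen in testAnchorText, are omitted here:

#include <cassert>
#include <cctype>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Approximation of the anchor path through Tokenizer::tokenize: case fold,
// skip "click here"-style noise words, and mark survivors with '@'.
vector< string > tokenizeAnchorSketch ( const string & anchorText )
   {
   static const set< string > anchorsToRemove = { "click", "here", "here!" };
   vector< string > tokens;
   istringstream words( anchorText );
   string word;
   while ( words >> word )
      {
      for ( char & c : word )
         c = static_cast< char >( tolower( c ) );
      if ( anchorsToRemove.find( word ) != anchorsToRemove.end( ) )
         continue;
      tokens.push_back( "@" + word );
      }
   return tokens;
   }

int main ( )
   {
   // Mirrors testAnchorText: "Click" and "Here!" are dropped, the rest kept in
   // order (the real Tokenizer would additionally stem "example" to "exampl").
   auto tokens = tokenizeAnchorSketch( "anchor text example Click Here!" );
   assert( tokens.size( ) == 3 );
   assert( tokens[ 0 ] == "@anchor" && tokens[ 1 ] == "@text" && tokens[ 2 ] == "@example" );
   return 0;
   }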