Skip to content
Snippets Groups Projects
Commit 15992b70 authored by vcday's avatar vcday
Browse files

add anchor text parsing

parent dc615c1e
No related branches found
No related tags found
No related merge requests found
...@@ -34,7 +34,6 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ...@@ -34,7 +34,6 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
ParsedUrl currentUrl = reader->getUrl( ); ParsedUrl currentUrl = reader->getUrl( );
// tokenize anchor // tokenize anchor
// TODO ParsedUrl with anchor text
string anchorText = currentUrl.getAnchorText( ); string anchorText = currentUrl.getAnchorText( );
if ( anchorText != "" ) if ( anchorText != "" )
{ {
...@@ -93,7 +92,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ...@@ -93,7 +92,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
// if html line is url, parses accordingly and pushes to frontier // if html line is url, parses accordingly and pushes to frontier
else if ( url != "" && url != "#" ) else if ( url != "" && url != "#" )
{ {
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false ); pushToUrlQueue( url, currentUrl, extractAnchorText( line ), false );
} }
// check if line is header; classifies as body text // check if line is header; classifies as body text
else if ( header != "") else if ( header != "")
...@@ -124,7 +123,29 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ...@@ -124,7 +123,29 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
*/ */
string Parser::extractAnchorText ( string html ) string Parser::extractAnchorText ( string html )
{ {
return ""; string anchor = "";
unsigned long aTag = findStr( "<a", html );
if ( aTag != html.size( ) )
{
unsigned long begAnchor = findNext( ">", aTag, html );
unsigned long endAnchor = findNext( "</a>", aTag, html );
if ( begAnchor > endAnchor)
{
return anchor;
}
if ( begAnchor < html.size( ) && endAnchor < html.size())
{
++begAnchor;
while ( begAnchor != endAnchor && begAnchor < html.size( ) )
{
anchor += html[ begAnchor ];
++begAnchor;
}
}
}
return anchor;
} }
/** /**
...@@ -391,7 +412,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon ...@@ -391,7 +412,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon
else if ( url != "" ) else if ( url != "" )
{ {
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), true ); pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
} }
// check if line is title // check if line is title
// check if line is title // check if line is title
......
...@@ -20,6 +20,8 @@ void testHttp( ); ...@@ -20,6 +20,8 @@ void testHttp( );
void testURL( ); void testURL( );
void testBody ( ); void testBody ( );
void testExtractBody ( ); void testExtractBody ( );
void testAnchorText ( );
void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary ); void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary );
...@@ -32,6 +34,7 @@ int main ( ) ...@@ -32,6 +34,7 @@ int main ( )
testURL( ); testURL( );
testBody ( ); testBody ( );
testExtractBody ( ); testExtractBody ( );
testAnchorText ( );
cout << "Parser Tests Passed! :D" << endl; cout << "Parser Tests Passed! :D" << endl;
} }
...@@ -125,8 +128,6 @@ void testHttp( ) ...@@ -125,8 +128,6 @@ void testHttp( )
assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 ); assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 );
assert ( dictionary->at( "%busi" ).size( ) == 6 ); assert ( dictionary->at( "%busi" ).size( ) == 6 );
delete dictionary; delete dictionary;
dictionary = nullptr; dictionary = nullptr;
...@@ -261,4 +262,41 @@ void testExtractBody ( ) ...@@ -261,4 +262,41 @@ void testExtractBody ( )
dictionary = nullptr; dictionary = nullptr;
cout << "Extract Body Test Passed!" << endl; cout << "Extract Body Test Passed!" << endl;
}
/**
 * Checks that anchor text attached to a url is tokenized into the dictionary.
 *
 * The url carries the anchor text "anchor text example Click Here!". The test
 * expects the '@'-prefixed tokens "@anchor", "@text" and "@exampl" at offsets
 * 0, 1 and 2, and expects that neither "click" nor "here" appears, with or
 * without the '@' prefix.
 *
 * Exits with status 1 if the html fixture cannot be opened.
 */
void testAnchorText ( )
{
    cout << "Testing Anchor Text: " << endl;

    ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
    Parser parser( &urlFrontierTest );

    ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" );
    fake_url.setAnchorText( "anchor text example Click Here!");

    string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html";
    LocalReader reader( filepath );
    reader.setUrl( fake_url );

    auto success = reader.request( );
    if ( !success )
    {
        cerr << "Couldn't open file\n";
        exit( 1 );
    }

    auto dictionary = parser.execute( &reader );
    // Verify the pointer before it is dereferenced for printing.
    assert ( dictionary != nullptr );
    printDictionary( *dictionary );

    // Descriptive anchor words are indexed in order of appearance.
    assert ( dictionary->at( "@anchor" )[ 0 ] == 0 );
    assert ( dictionary->at( "@text" )[ 0 ] == 1 );
    assert ( dictionary->at( "@exampl" )[ 0 ] == 2 );

    // "Click Here!" must not be indexed in any form.
    assert ( dictionary->find( "@click" ) == dictionary->end( ) );
    assert ( dictionary->find( "@here" ) == dictionary->end( ) );
    assert ( dictionary->find( "click" ) == dictionary->end( ) );
    assert ( dictionary->find( "here" ) == dictionary->end( ) );

    delete dictionary;
    dictionary = nullptr;

    cout << "Extract Anchor Test Passed!" << endl;
}
\ No newline at end of file
...@@ -44,7 +44,7 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch ...@@ -44,7 +44,7 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
return tokenize( splitStr( originalText, split, true ), offset, decorator ); return tokenize( splitStr( originalText, split, true ), offset, decorator );
} }
// split by spaces // split by spaces
else else
{ {
return tokenize( splitStr( originalText, ' ', true ), offset, decorator ); return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
...@@ -65,7 +65,10 @@ unsigned long Tokenizer::tokenize ( vector< string > splitText, unsigned long of ...@@ -65,7 +65,10 @@ unsigned long Tokenizer::tokenize ( vector< string > splitText, unsigned long of
{ {
// case fold // case fold
processedString = toLower( splitText[ i ] ); processedString = toLower( splitText[ i ] );
//strip all characters
// remove "click here" etc
if ( decorator == Tokenizer::ANCHOR && anchorsToRemove.find( processedString ) != anchorsToRemove.end( ) )
continue;
if ( !isStopWord( processedString ) ) if ( !isStopWord( processedString ) )
{ {
......
...@@ -58,4 +58,9 @@ private: ...@@ -58,4 +58,9 @@ private:
*/ */
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator ); unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
/**
 * Anchor text that should not be included in index.
 *
 * tokenize( ) compares each lower-cased token against this set and, when the
 * decorator is Tokenizer::ANCHOR, skips any token found here. Entries must
 * therefore be lower case; "here!" is listed separately because the
 * comparison is an exact match on the token as split.
 */
set< string > anchorsToRemove = {"click", "here", "here!"};
}; };
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment