Commit 15992b70 authored by vcday

add anchor text parsing

parent dc615c1e
Branch: url-parsing
@@ -34,7 +34,6 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
ParsedUrl currentUrl = reader->getUrl( );
// tokenize anchor
// TODO ParsedUrl with anchor text
string anchorText = currentUrl.getAnchorText( );
if ( anchorText != "" )
{
@@ -93,7 +92,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
// if html line is url, parses accordingly and pushes to frontier
else if ( url != "" && url != "#" )
{
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false );
pushToUrlQueue( url, currentUrl, extractAnchorText( line ), false );
}
// check if line is header; classifies as body text
else if ( header != "")
@@ -124,7 +123,29 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer )
*/
string Parser::extractAnchorText ( string html )
{
return "";
string anchor = "";
unsigned long aTag = findStr( "<a", html );
if ( aTag != html.size( ) )
{
unsigned long begAnchor = findNext( ">", aTag, html );
unsigned long endAnchor = findNext( "</a>", aTag, html );
if ( begAnchor > endAnchor)
{
return anchor;
}
if ( begAnchor < html.size( ) && endAnchor < html.size())
{
++begAnchor;
while ( begAnchor != endAnchor && begAnchor < html.size( ) )
{
anchor += html[ begAnchor ];
++begAnchor;
}
}
}
return anchor;
}
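For reference, here is a minimal standalone sketch of the same extraction, using std::string::find in place of the project's findStr/findNext helpers (whose exact signatures are not shown in this diff); the new extractAnchorText above should behave the same way on a simple single-line anchor:

#include <cassert>
#include <string>
using std::string;

// Sketch only: return the text between the first "<a ...>" and the
// following "</a>", or "" when no complete anchor is present.
string extractAnchorTextSketch ( const string & html )
   {
   size_t aTag = html.find( "<a" );
   if ( aTag == string::npos )
      return "";
   size_t begAnchor = html.find( ">", aTag );
   size_t endAnchor = html.find( "</a>", aTag );
   if ( begAnchor == string::npos || endAnchor == string::npos || begAnchor > endAnchor )
      return "";
   return html.substr( begAnchor + 1, endAnchor - begAnchor - 1 );
   }

int main ( )
   {
   assert( extractAnchorTextSketch( "<a href=\"http://umich.edu\">Go Blue</a>" ) == "Go Blue" );
   assert( extractAnchorTextSketch( "no anchor on this line" ) == "" );
   return 0;
   }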
/**
@@ -391,7 +412,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon
else if ( url != "" )
{
pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), true );
pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
}
// check if line is title
// check if line is title
......
@@ -20,6 +20,8 @@ void testHttp( );
void testURL( );
void testBody ( );
void testExtractBody ( );
void testAnchorText ( );
void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary );
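printDictionary is only declared in this excerpt; its definition is not part of the diff. A plausible debugging helper matching the declared signature might look like the following (illustrative only, not necessarily the author's implementation):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// Illustrative only: print each token followed by its offsets, e.g. "@anchor: 0".
void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary )
   {
   for ( const auto & entry : dictionary )
      {
      cout << entry.first << ":";
      for ( unsigned long offset : entry.second )
         cout << " " << offset;
      cout << endl;
      }
   }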
@@ -32,6 +34,7 @@ int main ( )
testURL( );
testBody ( );
testExtractBody ( );
testAnchorText ( );
cout << "Parser Tests Passed! :D" << endl;
}
@@ -125,8 +128,6 @@ void testHttp( )
assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 );
assert ( dictionary->at( "%busi" ).size( ) == 6 );
delete dictionary;
dictionary = nullptr;
@@ -261,4 +262,41 @@ void testExtractBody ( )
dictionary = nullptr;
cout << "Extract Body Test Passed!" << endl;
}
void testAnchorText ( )
{
cout << "Testing Anchor Text: " << endl;
ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
Parser parser( &urlFrontierTest );
ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" );
fake_url.setAnchorText( "anchor text example Click Here!");
string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html";
LocalReader reader( filepath );
reader.setUrl( fake_url );
auto success = reader.request( );
if ( !success )
{
cerr << "Couldn't open file\n";
exit( 1 );
}
auto dictionary = parser.execute( &reader );
printDictionary( *dictionary );
assert ( dictionary != nullptr );
assert ( dictionary->at( "@anchor" )[ 0 ] == 0 );
assert ( dictionary->at( "@text" )[ 0 ] == 1 );
assert ( dictionary->at( "@exampl" )[ 0 ] == 2 );
assert ( dictionary->find( "@click" ) == dictionary->end( ) );
assert ( dictionary->find( "@here" ) == dictionary->end( ) );
assert ( dictionary->find( "click" ) == dictionary->end( ) );
assert ( dictionary->find( "here" ) == dictionary->end( ) );
delete dictionary;
dictionary = nullptr;
cout << "Extract Anchor Test Passed!" << endl;
}
\ No newline at end of file
@@ -44,7 +44,7 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch
return tokenize( splitStr( originalText, split, true ), offset, decorator );
}
// split by spaces
// split by spaces
else
{
return tokenize( splitStr( originalText, ' ', true ), offset, decorator );
@@ -65,7 +65,10 @@ unsigned long Tokenizer::tokenize ( vector< string > splitText, unsigned long of
{
// case fold
processedString = toLower( splitText[ i ] );
//strip all characters
// remove "click here" etc
if ( decorator == Tokenizer::ANCHOR && anchorsToRemove.find( processedString ) != anchorsToRemove.end( ) )
continue;
if ( !isStopWord( processedString ) )
{
......
@@ -58,4 +58,9 @@ private:
*/
unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
/**
* Anchor text that should not be included in index
*/
set< string > anchorsToRemove = {"click", "here", "here!"};
};
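Putting the Tokenizer changes together, their effect on the test's anchor string can be approximated in isolation. Below is a self-contained sketch of the anchor branch: case fold each word, drop the anchorsToRemove noise words, and prefix the survivors with the '@' decorator. Stop-word removal and stemming, which the real Tokenizer also applies and which turn "example" into the "exampl" seen in testAnchorText, are omitted here:

#include <cassert>
#include <cctype>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

// Approximation of the anchor path through Tokenizer::tokenize: case fold,
// skip "click here"-style noise words, and mark survivors with '@'.
vector< string > tokenizeAnchorSketch ( const string & anchorText )
   {
   static const set< string > anchorsToRemove = { "click", "here", "here!" };
   vector< string > tokens;
   istringstream words( anchorText );
   string word;
   while ( words >> word )
      {
      for ( char & c : word )
         c = static_cast< char >( tolower( c ) );
      if ( anchorsToRemove.find( word ) != anchorsToRemove.end( ) )
         continue;
      tokens.push_back( "@" + word );
      }
   return tokens;
   }

int main ( )
   {
   // Mirrors testAnchorText: "Click" and "Here!" are dropped, the rest kept in
   // order (the real Tokenizer would additionally stem "example" to "exampl").
   auto tokens = tokenizeAnchorSketch( "anchor text example Click Here!" );
   assert( tokens.size( ) == 3 );
   assert( tokens[ 0 ] == "@anchor" && tokens[ 1 ] == "@text" && tokens[ 2 ] == "@example" );
   return 0;
   }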