// Patch summary (commentary only; everything from the first "diff --git" on is the unmodified patch text).
//
// NOTE(review): the hunk bodies below are collapsed onto a few physical lines, so the original
// line breaks, blank lines and indentation of the payload are not visible here. The patch
// presumably has to be regenerated from the repository before it can be applied - confirm.
//
// parser/Parser.cpp
//   * Parser::parse: drops the "TODO ParsedUrl with anchor text" comment and passes the current
//     html `line` to extractAnchorText( ) instead of the empty string when pushing a url.
//   * Parser::extractAnchorText: replaces the stub that returned "" with an implementation that
//     locates "<a" via findStr, then ">" and "</a>" via findNext (both searched from the "<a"
//     position). It returns "" when ">" lies after "</a>"; otherwise, when both positions are
//     below html.size( ), it copies the characters strictly between ">" and "</a>".
//     NOTE(review): the comparisons against html.size( ) suggest findStr / findNext return
//     html.size( ) for "not found" - confirm against their definitions.
//     NOTE(review): "<a" is a bare prefix; if findStr is a plain substring search it would also
//     match tags such as "<abbr" - confirm. Markup nested inside the anchor is copied verbatim.
//   * Parser::extractAll: same call-site change as in parse ( "" -> line ).
//
// parser/tests/parserTest.cpp
//   * Declares, calls from main, and defines testAnchorText( ). The test sets the anchor text
//     "anchor text example Click Here!" on a ParsedUrl, runs the parser over
//     /tests/testParserBody.html, and asserts that "@anchor", "@text", "@exampl" have first
//     offsets 0, 1, 2 and that "@click", "@here", "click", "here" are absent.
//     NOTE(review): printDictionary( *dictionary ) dereferences the pointer before
//     assert ( dictionary != nullptr ) runs; the null check should presumably come first.
//   * testHttp: removes two blank lines before `delete dictionary;`.
//
// util/Tokenizer.cpp
//   * Tokenizer::tokenize: after case folding, skips the token when decorator ==
//     Tokenizer::ANCHOR and the lowercased token is found in anchorsToRemove. Also removes the
//     "//strip all characters" comment.
//     NOTE(review): the lookup happens on the lowercased token before any further processing
//     visible in this hunk, which is presumably why "here!" is listed separately from "here".
//
// util/Tokenizer.h
//   * Adds the private member set< string > anchorsToRemove = {"click", "here", "here!"}.
diff --git a/parser/Parser.cpp b/parser/Parser.cpp index fe02f6ebe1b85736a110d9e6851307259cc6daef..bd0ea22765165fdc0094bdd4c1c73202088b5917 100644 --- a/parser/Parser.cpp +++ b/parser/Parser.cpp @@ -34,7 +34,6 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) ParsedUrl currentUrl = reader->getUrl( ); // tokenize anchor - // TODO ParsedUrl with anchor text string anchorText = currentUrl.getAnchorText( ); if ( anchorText != "" ) { @@ -93,7 +92,7 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) // if html line is url, parses accordingly and pushes to frontier else if ( url != "" && url != "#" ) { - pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), false ); + pushToUrlQueue( url, currentUrl, extractAnchorText( line ), false ); } // check if line is header; classifies as body text else if ( header != "") @@ -124,7 +123,29 @@ void Parser::parse ( StreamReader *reader, Tokenizer *tokenizer ) */ string Parser::extractAnchorText ( string html ) { - return ""; + string anchor = ""; + unsigned long aTag = findStr( "<a", html ); + if ( aTag != html.size( ) ) + { + unsigned long begAnchor = findNext( ">", aTag, html ); + unsigned long endAnchor = findNext( "</a>", aTag, html ); + + if ( begAnchor > endAnchor) + { + return anchor; + } + + if ( begAnchor < html.size( ) && endAnchor < html.size()) + { + ++begAnchor; + while ( begAnchor != endAnchor && begAnchor < html.size( ) ) + { + anchor += html[ begAnchor ]; + ++begAnchor; + } + } + } + return anchor; } /** @@ -391,7 +412,7 @@ void Parser::extractAll ( string line, unsigned long & offsetTitle, unsigned lon else if ( url != "" ) { - pushToUrlQueue( url, currentUrl, extractAnchorText( "" ), true ); + pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true ); } // check if line is title // check if line is title diff --git a/parser/tests/parserTest.cpp b/parser/tests/parserTest.cpp index e52193a3a6bd5e1dcfe7b4adb6096563a846619f..0f836deb7fe9019409f054cc22ecfe59df933c50 
100644 --- a/parser/tests/parserTest.cpp +++ b/parser/tests/parserTest.cpp @@ -20,6 +20,8 @@ void testHttp( ); void testURL( ); void testBody ( ); void testExtractBody ( ); +void testAnchorText ( ); + void printDictionary ( unordered_map< string, vector< unsigned long > > dictionary ); @@ -32,6 +34,7 @@ int main ( ) testURL( ); testBody ( ); testExtractBody ( ); + testAnchorText ( ); cout << "Parser Tests Passed! :D" << endl; } @@ -125,8 +128,6 @@ void testHttp( ) assert ( dictionary->at( "%surgeri" ).size( ) == 2 && dictionary->at( "%surgeri" )[ 0 ] == 511 ); assert ( dictionary->at( "%busi" ).size( ) == 6 ); - - delete dictionary; dictionary = nullptr; @@ -261,4 +262,41 @@ void testExtractBody ( ) dictionary = nullptr; cout << "Extract Body Test Passed!" << endl; + } + +void testAnchorText ( ) + { + cout << "Testing Anchor Text: " << endl; + ProducerConsumerQueue< ParsedUrl > urlFrontierTest; + Parser parser( &urlFrontierTest ); + ParsedUrl fake_url = ParsedUrl( "http://www.testingBody.edu" ); + fake_url.setAnchorText( "anchor text example Click Here!"); + + string filepath = util::GetCurrentWorkingDir( ) + "/tests/testParserBody.html"; + + LocalReader reader( filepath ); + reader.setUrl( fake_url ); + auto success = reader.request( ); + if ( !success ) + { + cerr << "Couldn't open file\n"; + exit( 1 ); + } + + auto dictionary = parser.execute( &reader ); + printDictionary( *dictionary ); + + assert ( dictionary != nullptr ); + assert ( dictionary->at( "@anchor" )[ 0 ] == 0 ); + assert ( dictionary->at( "@text" )[ 0 ] == 1 ); + assert ( dictionary->at( "@exampl" )[ 0 ] == 2 ); + assert ( dictionary->find( "@click" ) == dictionary->end( ) ); + assert ( dictionary->find( "@here" ) == dictionary->end( ) ); + assert ( dictionary->find( "click" ) == dictionary->end( ) ); + assert ( dictionary->find( "here" ) == dictionary->end( ) ); + + delete dictionary; + dictionary = nullptr; + + cout << "Extract Anchor Test Passed!" 
<< endl; } \ No newline at end of file diff --git a/util/Tokenizer.cpp b/util/Tokenizer.cpp index f0f06f9f286092de884e86c5db576757ec0a44fd..fe6ba02f2390e9eadb92f65bbfe5c11788c73b6e 100644 --- a/util/Tokenizer.cpp +++ b/util/Tokenizer.cpp @@ -44,7 +44,7 @@ unsigned long Tokenizer::execute ( string originalText, unsigned long offset, ch return tokenize( splitStr( originalText, split, true ), offset, decorator ); } - // split by spaces + // split by spaces else { return tokenize( splitStr( originalText, ' ', true ), offset, decorator ); @@ -65,7 +65,10 @@ unsigned long Tokenizer::tokenize ( vector< string > splitText, unsigned long of { // case fold processedString = toLower( splitText[ i ] ); - //strip all characters + + // remove "click here" etc + if ( decorator == Tokenizer::ANCHOR && anchorsToRemove.find( processedString ) != anchorsToRemove.end( ) ) + continue; if ( !isStopWord( processedString ) ) { diff --git a/util/Tokenizer.h b/util/Tokenizer.h index 78c0ec9497d87e2dc730a6d351c6f1f3998063d6..4fb8658ee11ae0f8f431e36d4f53731b9b8e2429 100644 --- a/util/Tokenizer.h +++ b/util/Tokenizer.h @@ -58,4 +58,9 @@ private: */ unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator ); + /** + * Anchor text that should not be included in index + */ + set< string > anchorsToRemove = {"click", "here", "here!"}; + };