From e809b071996c864e0938401bc6f09ed5dbaf8e79 Mon Sep 17 00:00:00 2001
From: vcday <vcday@umich.edu>
Date: Tue, 20 Mar 2018 14:32:45 -0400
Subject: [PATCH] added framework for anchor text parsing

---
 parser/Parser.cpp   | 77 ++++++++++++++++++++++++++++++---------------
 parser/Parser.h     | 23 ++++++++++++--
 shared/Document.cpp |  2 +-
 shared/url.h        | 58 ++++++++++++++++++++++++++++------
 4 files changed, 121 insertions(+), 39 deletions(-)

diff --git a/parser/Parser.cpp b/parser/Parser.cpp
index 5f5d955..25e6066 100644
--- a/parser/Parser.cpp
+++ b/parser/Parser.cpp
@@ -33,16 +33,19 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 	unsigned long htmlIt = 0;
 	unsigned long offsetTitle = 0;
 	unsigned long offsetURL = 0;
+	unsigned long offsetAnchor = 0;
 
 	// tokenize url
-	string host = "";
-	host.assign( currentUrl.Host );
-	string path = "";
-	path.assign( currentUrl.Path );
-	string urlCurrent = host + "/" + path;
+	offsetURL = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offsetURL, Tokenizer::URL );
 
-	offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
+	// tokenize anchor
+	string anchorText = currentUrl.getAnchorText( );
+	if ( anchorText != "" )
+		{
+		offsetAnchor = tokenizer->execute( anchorText, offsetAnchor, Tokenizer::ANCHOR );
+		}
 
+	// find urls and titles
 	while ( htmlIt < html.size( ) )
 		{
 		// if open bracket
@@ -54,27 +57,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 			htmlIt = endCloseTag + 2;
 
 			// check if line is url
-			string url = extract_url( line );
+			string url = extractUrl( line );
 			if ( url != "" )
 				{
-				if ( isLocal( url ) )
-					{
-					string completeUrl = "";
-					completeUrl.assign( currentUrl.CompleteUrl );
-					url = completeUrl + url;
-					}
-				if ( isValid( url ) && url != urlCurrent )
-					{
-					// TODO ParsedUrl with anchor text
-					ParsedUrl pUrl = ParsedUrl( url );
-					urlFrontier->Push( pUrl );
-					cout << url << endl;
-					}
+
+				pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
 				}
-				// check if line is title
+			// check if line is title
 			else
 				{
-				string title = extract_title( line );
+				string title = extractTitle( line );
 				if ( title != "" )
 					{
 					offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
@@ -86,8 +78,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
 			++htmlIt;
 			}
 		}
+	}
 
-
+/**
+ * Returns anchor text if found
+ * @param html
+ * @return
+ */
+string Parser::extractAnchorText( string html )
+	{
+	return "";
 	}
 
 /**
@@ -95,7 +95,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
  * @param word
  * @return
  */
-string Parser::extract_url ( string html )
+string Parser::extractUrl ( string html )
 	{
 	string url = "";
 	if ( findStr( "<a", html ) != html.size( ) )
@@ -143,7 +143,7 @@ string Parser::extract_url ( string html )
  * @param word
  * @return
  */
-string Parser::extract_title ( string html )
+string Parser::extractTitle ( string html )
 	{
 	string title = "";
 	char end = '<';
@@ -203,3 +203,30 @@ bool Parser::isValid ( string url )
 		}
 	return true;
 	}
+
+/**
+ * Sends to Url Frontier
+ *
+ * @param url
+ * @param currentUrl
+ * @param anchorText --> will be "" (empty string) if the link has no anchor text
+ * @param debug --> will print urls to std::cout
+ */
+void Parser::pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug )
+	{
+	if ( isLocal( url ) )
+		{
+		url = currentUrl.getCompleteUrl( ) + url;
+		}
+	if ( isValid( url ) && url != currentUrl.getCompleteUrl( ) )
+		{
+		ParsedUrl pUrl = ParsedUrl( url );
+		pUrl.setAnchorText( anchorText );
+		urlFrontier->Push( pUrl );
+		if ( debug )
+			{
+			cout << url << endl;
+			cout << anchorText << endl;
+			}
+		}
+	}
diff --git a/parser/Parser.h b/parser/Parser.h
index 1721240..b38a91e 100644
--- a/parser/Parser.h
+++ b/parser/Parser.h
@@ -47,13 +47,19 @@ private:
 	 */
 	void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
 
-
+	/**
+	 * Returns anchor text if found
+	 * @param html
+	 * @return
+	 */
+	string extractAnchorText( string html );
+
 	/**
 	 * Returns a url, or "" if none
 	 * @param html
 	 * @return
 	 */
-	string extract_url ( string html );
+	string extractUrl ( string html );
 
 
 	/**
@@ -61,7 +67,7 @@ private:
 	 * @param html
 	 * @return
 	 */
-	string extract_title ( string html );
+	string extractTitle ( string html );
 
 	/**
 	 * Will return true if local url
@@ -78,5 +84,16 @@ private:
 	 * @return
 	 */
 	bool isValid ( string url );
+
+	/**
+	 * Sends to Url Frontier
+	 *
+	 * @param url
+	 * @param currentUrl
+	 * @param anchorText
+	 * @param debug --> will print urls to std::cout
+	 */
+	void pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug );
+
 	};
 
diff --git a/shared/Document.cpp b/shared/Document.cpp
index 2a2f3e3..8efef78 100644
--- a/shared/Document.cpp
+++ b/shared/Document.cpp
@@ -84,7 +84,7 @@ void Document::PrintDocMap ( string url, int location )
 		{
 		char *buffer = new char[bytes];
 		ssize_t bytesRead;
-		if ( bytesRead = read ( file, buffer, bytes ) )
+		if ( ( bytesRead = read ( file, buffer, bytes ) ) > 0 )
 			write ( 1, buffer, bytesRead );
 		else
 			{
diff --git a/shared/url.h b/shared/url.h
index 91c5502..0ea482d 100644
--- a/shared/url.h
+++ b/shared/url.h
@@ -20,11 +20,6 @@ using namespace std;
 #define MIL ".mil"
 #define INT ".int"
 
-
-
-
-
-
 class ParsedUrl
 	{
 public:
@@ -32,7 +27,8 @@ public:
 			*Service,
 			*Host,
 			*Domain,
-			*Path;
+			*Path,
+			*AnchorText;
 	double Score;
 
 	ParsedUrl( string input_url )
@@ -40,6 +36,11 @@ public:
 		// Assumes url points to static text but
 		// does not check.
 
+		// initialize anchor text to the empty string
+		char *null = new char[2];
+		strcpy(null, string("").c_str());
+		AnchorText = null;
+
 		char *url = new char[input_url.length() + 1];
 		strcpy(url, input_url.c_str());
 
@@ -86,10 +87,6 @@ public:
 			}
 
 
-
-
-
-
 		// Whatever remains is the Path.
 		// need to remove fragments
 		Path = p;
@@ -140,6 +137,47 @@ public:
 			Score += .5;
 		}
 
+	std::string getCompleteUrl( )
+		{
+		std::string completeUrl = "";
+		completeUrl.assign( this->CompleteUrl );
+		return completeUrl;
+		}
+
+	std::string getHost( )
+		{
+		std::string host = "";
+		host.assign( this->Host );
+		return host;
+		}
+
+	std::string getPath( )
+		{
+		std::string path = "";
+		path.assign( this->Path );
+		return path;
+		}
+
+	std::string getAnchorText( )
+		{
+		std::string anchorText = "";
+		anchorText.assign( this->AnchorText );
+		return anchorText;
+		}
+
+	void setAnchorText( std::string anchorText )
+		{
+		// Allocate size + 1 so the terminating NUL fits, then copy the
+		// characters into the buffer ( the pointer itself must not move ).
+		char *anchorCharStar = new char[ anchorText.size( ) + 1 ];
+		strcpy( anchorCharStar, anchorText.c_str( ) );
+
+		// NOTE(review): the previous buffer is not freed here; ParsedUrl is
+		// passed by value elsewhere, so confirm ownership before adding delete[ ].
+		this->AnchorText = anchorCharStar;
+		}
+
+
 	~ParsedUrl( )
 		{
 		pathBuffer = 0;
-- 
GitLab
