Skip to content
Snippets Groups Projects
Commit e809b071 authored by vcday's avatar vcday
Browse files

added framework for anchor text parsing

parent 02e3c897
Branches crawler-parser
No related tags found
No related merge requests found
...@@ -33,16 +33,19 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -33,16 +33,19 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
unsigned long htmlIt = 0; unsigned long htmlIt = 0;
unsigned long offsetTitle = 0; unsigned long offsetTitle = 0;
unsigned long offsetURL = 0; unsigned long offsetURL = 0;
unsigned long offsetAnchor = 0;
// tokenize url // tokenize url
string host = ""; offsetURL = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offsetURL, Tokenizer::URL );
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string urlCurrent = host + "/" + path;
offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL ); // tokenize anchor
string anchorText = currentUrl.getAnchorText( );
if ( anchorText != "" )
{
offsetAnchor = tokenizer->execute( anchorText, offsetAnchor, Tokenizer::ANCHOR );
}
// find titles
while ( htmlIt < html.size( ) ) while ( htmlIt < html.size( ) )
{ {
// if open bracket // if open bracket
...@@ -54,27 +57,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -54,27 +57,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
htmlIt = endCloseTag + 2; htmlIt = endCloseTag + 2;
// check if line is url // check if line is url
string url = extract_url( line ); string url = extractUrl( line );
if ( url != "" ) if ( url != "" )
{ {
if ( isLocal( url ) )
{ pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
string completeUrl = "";
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
if ( isValid( url ) && url != urlCurrent )
{
// TODO ParsedUrl with anchor text
ParsedUrl pUrl = ParsedUrl( url );
urlFrontier->Push( pUrl );
cout << url << endl;
}
} }
// check if line is title // check if line is title
else else
{ {
string title = extract_title( line ); string title = extractTitle( line );
if ( title != "" ) if ( title != "" )
{ {
offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE ); offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
...@@ -86,8 +78,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -86,8 +78,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
++htmlIt; ++htmlIt;
} }
} }
}
/**
 * Returns the anchor text found in the given html.
 * Currently a placeholder: the html is not inspected and the result is always "".
 * @param html markup that may contain anchor text (unused for now)
 * @return the anchor text; at present always the empty string
 */
string Parser::extractAnchorText( string html )
{
return "";
} }
/** /**
...@@ -95,7 +95,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ) ...@@ -95,7 +95,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
* @param word * @param word
* @return * @return
*/ */
string Parser::extract_url ( string html ) string Parser::extractUrl ( string html )
{ {
string url = ""; string url = "";
if ( findStr( "<a", html ) != html.size( ) ) if ( findStr( "<a", html ) != html.size( ) )
...@@ -143,7 +143,7 @@ string Parser::extract_url ( string html ) ...@@ -143,7 +143,7 @@ string Parser::extract_url ( string html )
* @param word * @param word
* @return * @return
*/ */
string Parser::extract_title ( string html ) string Parser::extractTitle ( string html )
{ {
string title = ""; string title = "";
char end = '<'; char end = '<';
...@@ -203,3 +203,30 @@ bool Parser::isValid ( string url ) ...@@ -203,3 +203,30 @@ bool Parser::isValid ( string url )
} }
return true; return true;
} }
/**
 * Hands a url over to the url frontier.
 *
 * A local url is first prefixed with the complete url of currentUrl. The url is
 * then pushed only when it is valid and is not the complete url of currentUrl itself.
 *
 * @param url url to enqueue
 * @param currentUrl url the link is resolved against and compared with
 * @param anchorText anchor text attached to the pushed url; may be empty
 * @param debug when true, the pushed url and its anchor text are printed to std::cout
 */
void Parser::pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug )
{
    // local urls get the complete url of the current page in front of them
    if ( isLocal( url ) )
        url = currentUrl.getCompleteUrl( ) + url;

    // skip invalid urls and links that point back at the current page
    if ( !isValid( url ) || url == currentUrl.getCompleteUrl( ) )
        return;

    ParsedUrl pUrl = ParsedUrl( url );
    pUrl.setAnchorText( anchorText );
    urlFrontier->Push( pUrl );

    if ( !debug )
        return;

    cout << url << endl;
    cout << anchorText << endl;
}
...@@ -47,13 +47,19 @@ private: ...@@ -47,13 +47,19 @@ private:
*/ */
void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer ); void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
/**
 * Returns the anchor text found in the given html.
 * NOTE(review): the definition shown for this method is a stub that always
 * returns "" — confirm before relying on a non-empty result.
 * @param html markup to search for anchor text
 * @return the anchor text, or "" when none is produced
 */
string extractAnchorText( string html );
/** /**
* Returns a url, or "" if none * Returns a url, or "" if none
* @param html * @param html
* @return * @return
*/ */
string extract_url ( string html ); string extractUrl ( string html );
/** /**
...@@ -61,7 +67,7 @@ private: ...@@ -61,7 +67,7 @@ private:
* @param html * @param html
* @return * @return
*/ */
string extract_title ( string html ); string extractTitle ( string html );
/** /**
* Will return true if local url * Will return true if local url
...@@ -78,5 +84,16 @@ private: ...@@ -78,5 +84,16 @@ private:
* @return * @return
*/ */
bool isValid ( string url ); bool isValid ( string url );
/**
 * Sends a url to the Url Frontier.
 *
 * A local url is first prefixed with the complete url of currentUrl. The url is
 * pushed only if it is valid and differs from the complete url of currentUrl.
 *
 * @param url url to enqueue
 * @param currentUrl url the link is resolved against and compared with
 * @param anchorText anchor text attached to the pushed url; may be empty
 * @param debug --> when true, prints the pushed url and its anchor text to std::cout
 */
void pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug );
}; };
...@@ -84,7 +84,7 @@ void Document::PrintDocMap ( string url, int location ) ...@@ -84,7 +84,7 @@ void Document::PrintDocMap ( string url, int location )
{ {
char *buffer = new char[bytes]; char *buffer = new char[bytes];
ssize_t bytesRead; ssize_t bytesRead;
if ( bytesRead = read ( file, buffer, bytes ) ) if ( ( bytesRead = read ( file, buffer, bytes ) ) )
write ( 1, buffer, bytesRead ); write ( 1, buffer, bytesRead );
else else
{ {
......
...@@ -20,11 +20,6 @@ using namespace std; ...@@ -20,11 +20,6 @@ using namespace std;
#define MIL ".mil" #define MIL ".mil"
#define INT ".int" #define INT ".int"
class ParsedUrl class ParsedUrl
{ {
public: public:
...@@ -32,7 +27,8 @@ public: ...@@ -32,7 +27,8 @@ public:
*Service, *Service,
*Host, *Host,
*Domain, *Domain,
*Path; *Path,
*AnchorText;
double Score; double Score;
ParsedUrl( string input_url ) ParsedUrl( string input_url )
...@@ -40,6 +36,11 @@ public: ...@@ -40,6 +36,11 @@ public:
// Assumes url points to static text but // Assumes url points to static text but
// does not check. // does not check.
// initialize anchor text to an empty string
char *null = new char[2];
strcpy(null, string("").c_str());
AnchorText = null;
char *url = new char[input_url.length() + 1]; char *url = new char[input_url.length() + 1];
strcpy(url, input_url.c_str()); strcpy(url, input_url.c_str());
...@@ -86,10 +87,6 @@ public: ...@@ -86,10 +87,6 @@ public:
} }
// Whatever remains is the Path. // need to remove fragments // Whatever remains is the Path. // need to remove fragments
Path = p; Path = p;
...@@ -140,6 +137,47 @@ public: ...@@ -140,6 +137,47 @@ public:
Score += .5; Score += .5;
} }
/**
 * Copies the CompleteUrl member into a std::string.
 * @return the complete url as an independent string
 */
std::string getCompleteUrl( )
{
    return std::string( this->CompleteUrl );
}
/**
 * Copies the Host member into a std::string.
 * @return the host as an independent string
 */
std::string getHost( )
{
    return std::string( this->Host );
}
/**
 * Copies the Path member into a std::string.
 * @return the path as an independent string
 */
std::string getPath( )
{
    return std::string( this->Path );
}
/**
 * Copies the AnchorText member into a std::string.
 * @return the anchor text as an independent string
 */
std::string getAnchorText( )
{
    return std::string( this->AnchorText );
}
/**
 * Replaces the anchor text with a NUL-terminated copy of the given string.
 *
 * The copy lives in a freshly allocated buffer. The buffer AnchorText pointed to
 * before the call is deliberately not freed here: ParsedUrl objects are passed
 * around by value, so another copy may still point at the old buffer.
 * NOTE(review): ownership of these buffers is not established by the visible
 * code — confirm who is meant to release them.
 *
 * @param anchorText text to store; an empty string stores ""
 */
void setAnchorText( std::string anchorText )
{
    const std::string::size_type length = anchorText.size( );

    // one extra byte for the terminator that getAnchorText( ) depends on
    char *anchorCharStar = new char[ length + 1 ];

    // write the characters into the buffer; the pointer itself must not move
    anchorText.copy( anchorCharStar, length );
    anchorCharStar[ length ] = '\0';

    this->AnchorText = anchorCharStar;
}
~ParsedUrl( ) ~ParsedUrl( )
{ {
pathBuffer = 0; pathBuffer = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment