Skip to content
Snippets Groups Projects
Commit e809b071 authored by vcday's avatar vcday
Browse files

added framework for anchor text parsing

parent 02e3c897
Branches crawler-parser
No related tags found
No related merge requests found
......@@ -33,16 +33,19 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
unsigned long htmlIt = 0;
unsigned long offsetTitle = 0;
unsigned long offsetURL = 0;
unsigned long offsetAnchor = 0;
// tokenize url
string host = "";
host.assign( currentUrl.Host );
string path = "";
path.assign( currentUrl.Path );
string urlCurrent = host + "/" + path;
offsetURL = tokenizer->execute( currentUrl.getHost( ) + "/" + currentUrl.getPath( ), offsetURL, Tokenizer::URL );
offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
// tokenize anchor
string anchorText = currentUrl.getAnchorText( );
if ( anchorText != "" )
{
offsetAnchor = tokenizer->execute( anchorText, offsetAnchor, Tokenizer::ANCHOR );
}
// find titles
while ( htmlIt < html.size( ) )
{
// if open bracket
......@@ -54,27 +57,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
htmlIt = endCloseTag + 2;
// check if line is url
string url = extract_url( line );
string url = extractUrl( line );
if ( url != "" )
{
if ( isLocal( url ) )
{
string completeUrl = "";
completeUrl.assign( currentUrl.CompleteUrl );
url = completeUrl + url;
}
if ( isValid( url ) && url != urlCurrent )
{
// TODO ParsedUrl with anchor text
ParsedUrl pUrl = ParsedUrl( url );
urlFrontier->Push( pUrl );
cout << url << endl;
}
pushToUrlQueue( url, currentUrl, extractAnchorText( line ), true );
}
// check if line is title
// check if line is title
else
{
string title = extract_title( line );
string title = extractTitle( line );
if ( title != "" )
{
offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
......@@ -86,8 +78,16 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
++htmlIt;
}
}
}
/**
 * Anchor text extraction hook.
 *
 * Placeholder: the html argument is not examined yet and the result is
 * always an empty string.
 *
 * @param html html text handed over by the caller
 * @return an empty string
 */
string Parser::extractAnchorText( string html )
{
    // TODO: pull the anchor text out of html once parsing is implemented.
    string anchorText;
    return anchorText;
}
/**
......@@ -95,7 +95,7 @@ void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
* @param word
* @return
*/
string Parser::extract_url ( string html )
string Parser::extractUrl ( string html )
{
string url = "";
if ( findStr( "<a", html ) != html.size( ) )
......@@ -143,7 +143,7 @@ string Parser::extract_url ( string html )
* @param word
* @return
*/
string Parser::extract_title ( string html )
string Parser::extractTitle ( string html )
{
string title = "";
char end = '<';
......@@ -203,3 +203,30 @@ bool Parser::isValid ( string url )
}
return true;
}
/**
 * Resolves a link found on the current page and hands it to the url frontier.
 *
 * A url that isLocal( ) reports as local is prefixed with the complete url of
 * the current page. The resulting url is pushed only when isValid( ) accepts
 * it and it differs from the current page's complete url.
 *
 * @param url         link taken from the page
 * @param currentUrl  url of the page being parsed
 * @param anchorText  anchor text stored on the pushed ParsedUrl
 * @param debug       when true, prints the pushed url and its anchor text to std::cout
 */
void Parser::pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug )
{
    if ( isLocal( url ) )
        url.insert( 0, currentUrl.getCompleteUrl( ) );

    // Nothing is queued for a rejected url or for a link back to the current page.
    if ( !isValid( url ) )
        return;
    if ( url == currentUrl.getCompleteUrl( ) )
        return;

    ParsedUrl target( url );
    target.setAnchorText( anchorText );
    urlFrontier->Push( target );

    if ( !debug )
        return;
    cout << url << endl << anchorText << endl;
}
......@@ -47,13 +47,19 @@ private:
*/
void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
/**
* Returns the anchor text found in a piece of html.
* Parser::parse passes the tag line it is currently examining.
* @param html html text to take the anchor text from
* @return the anchor text; the definition added with this declaration is a
*         placeholder that always returns an empty string
*/
string extractAnchorText( string html );
/**
* Returns a url, or "" if none
* @param html
* @return
*/
string extract_url ( string html );
string extractUrl ( string html );
/**
......@@ -61,7 +67,7 @@ private:
* @param html
* @return
*/
string extract_title ( string html );
string extractTitle ( string html );
/**
* Will return true if local url
......@@ -78,5 +84,16 @@ private:
* @return
*/
bool isValid ( string url );
/**
* Sends a url to the Url Frontier.
* A local url is first prefixed with the complete url of currentUrl; the url is
* pushed only if it is valid and differs from the complete url of currentUrl.
*
* @param url link to push
* @param currentUrl url of the page the link was found on
* @param anchorText anchor text set on the pushed ParsedUrl
* @param debug --> will print the pushed url and its anchor text to std::cout
*/
void pushToUrlQueue( string url, ParsedUrl currentUrl, string anchorText, bool debug );
};
......@@ -84,7 +84,7 @@ void Document::PrintDocMap ( string url, int location )
{
char *buffer = new char[bytes];
ssize_t bytesRead;
if ( bytesRead = read ( file, buffer, bytes ) )
if ( ( bytesRead = read ( file, buffer, bytes ) ) )
write ( 1, buffer, bytesRead );
else
{
......
......@@ -20,11 +20,6 @@ using namespace std;
#define MIL ".mil"
#define INT ".int"
class ParsedUrl
{
public:
......@@ -32,7 +27,8 @@ public:
*Service,
*Host,
*Domain,
*Path;
*Path,
*AnchorText;
double Score;
ParsedUrl( string input_url )
......@@ -40,6 +36,11 @@ public:
// Assumes url points to static text but
// does not check.
// initialize anchor text to an empty string
char *null = new char[2];
strcpy(null, string("").c_str());
AnchorText = null;
char *url = new char[input_url.length() + 1];
strcpy(url, input_url.c_str());
......@@ -86,10 +87,6 @@ public:
}
// Whatever remains is the Path. // need to remove fragments
Path = p;
......@@ -140,6 +137,47 @@ public:
Score += .5;
}
// Returns the CompleteUrl member as a std::string copy.
std::string getCompleteUrl( )
{
    return std::string( this->CompleteUrl );
}
// Returns the Host member as a std::string copy.
std::string getHost( )
{
    return std::string( this->Host );
}
// Returns the Path member as a std::string copy.
std::string getPath( )
{
    return std::string( this->Path );
}
// Returns the AnchorText member as a std::string copy.
std::string getAnchorText( )
{
    return std::string( this->AnchorText );
}
/**
 * Stores a copy of anchorText as this url's AnchorText.
 *
 * A fresh buffer of anchorText.size( ) + 1 bytes is allocated and filled with
 * the characters of anchorText followed by a terminating NUL, so that
 * getAnchorText( ) can read it back as a C string.
 *
 * @param anchorText text to store; may be empty
 */
void setAnchorText( std::string anchorText )
{
    // One extra byte for the terminating NUL.
    char *anchorCharStar = new char[ anchorText.size( ) + 1 ];

    // Copy the characters into the buffer. Adding a char to the pointer would
    // only move the pointer and leave the buffer unwritten.
    strcpy( anchorCharStar, anchorText.c_str( ) );

    // NOTE(review): the buffer AnchorText pointed to before this call is not
    // released here. Whether it can be freed safely depends on how copies of
    // ParsedUrl share the pointer -- confirm before adding a delete[].
    this->AnchorText = anchorCharStar;
}
~ParsedUrl( )
{
pathBuffer = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment