Skip to content
Snippets Groups Projects
Commit 9d465238 authored by benbergk's avatar benbergk
Browse files

changed ParsedUrl to be strings

parent 3d5f1a5e
No related branches found
No related tags found
No related merge requests found
...@@ -6,7 +6,7 @@ std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" ); ...@@ -6,7 +6,7 @@ std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" );
bool HttpReader::request ( ) bool HttpReader::request ( )
{ {
try try
{ {
sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP );
...@@ -14,11 +14,11 @@ bool HttpReader::request ( ) ...@@ -14,11 +14,11 @@ bool HttpReader::request ( )
// Get the host address. // Get the host address.
struct hostent *host = gethostbyname( url.Host ); struct hostent *host = gethostbyname( url.getHost().c_str() );
if ( host == nullptr ) if ( host == nullptr )
throw HTTPConnectionError; throw HTTPConnectionError;
if( strcmp(url.Service, "http") != 0) if(url.getService() != "http")
throw HTTPConnectionError; throw HTTPConnectionError;
assert( host ); assert( host );
...@@ -32,7 +32,7 @@ bool HttpReader::request ( ) ...@@ -32,7 +32,7 @@ bool HttpReader::request ( )
// Connect to the host. // Connect to the host.
int connectResult = connect( sock, ( struct sockaddr * ) &address, int connectResult = connect( sock, ( struct sockaddr * ) &address,
sizeof( address ) ); sizeof( address ) );
assert( connectResult == 0 ); assert( connectResult == 0 );
// Send a GET message for the desired page. // Send a GET message for the desired page.
...@@ -40,9 +40,9 @@ bool HttpReader::request ( ) ...@@ -40,9 +40,9 @@ bool HttpReader::request ( )
cout << "Socket Reader is pulling from the web" << endl; cout << "Socket Reader is pulling from the web" << endl;
string getMessage = "GET "; string getMessage = "GET ";
getMessage += url.CompleteUrl; getMessage += url.getCompleteUrl();
getMessage += " HTTP/1.1\r\nHost: "; getMessage += " HTTP/1.1\r\nHost: ";
getMessage += url.Host; getMessage += url.getHost();
getMessage += "\r\nConnection: close\r\n\r\n"; getMessage += "\r\nConnection: close\r\n\r\n";
cout << getMessage << endl; cout << getMessage << endl;
...@@ -51,12 +51,12 @@ bool HttpReader::request ( ) ...@@ -51,12 +51,12 @@ bool HttpReader::request ( )
bool isSuccess = checkStatus( ); bool isSuccess = checkStatus( );
return isSuccess; return isSuccess;
} }
catch ( std::exception & e ) catch ( std::exception & e )
{ {
cerr << "Error trying to connect to Host" << endl; cerr << "Error trying to connect to Host" << endl;
return false; return false;
} }
} }
bool HttpReader::fillBuffer ( char *buf, size_t buf_size ) bool HttpReader::fillBuffer ( char *buf, size_t buf_size )
...@@ -72,9 +72,9 @@ string HttpReader::PageToString ( ) ...@@ -72,9 +72,9 @@ string HttpReader::PageToString ( )
int bytes = 0; int bytes = 0;
while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 ) while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 )
{ {
temp += string( buf, bytes ); temp += string( buf, bytes );
} }
return temp; return temp;
} }
...@@ -97,10 +97,10 @@ bool HttpReader::checkStatus ( ) ...@@ -97,10 +97,10 @@ bool HttpReader::checkStatus ( )
else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0) else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
return true; return true;
else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
{ {
cerr << "URL REDIRECTION" << endl; cerr << "URL REDIRECTION" << endl;
return false; return false;
} }
cerr << "Bad Request of TYPE:: " << buff << endl; cerr << "Bad Request of TYPE:: " << buff << endl;
return false; return false;
} }
......
...@@ -6,13 +6,13 @@ std::runtime_error HTTPSconnectionError( "Error connecting HTTPS to url" ); ...@@ -6,13 +6,13 @@ std::runtime_error HTTPSconnectionError( "Error connecting HTTPS to url" );
bool HttpsReader::request ( ) bool HttpsReader::request ( )
{ {
try try
{ {
struct hostent *host = gethostbyname( url.Host ); struct hostent *host = gethostbyname( url.getHost().c_str() );
if ( host == nullptr ) if ( host == nullptr )
throw HTTPSconnectionError; throw HTTPSconnectionError;
if( strcmp(url.Service, "https") != 0) if( url.getService() != "https")
throw HTTPSconnectionError; throw HTTPSconnectionError;
assert( host ); assert( host );
...@@ -30,7 +30,7 @@ bool HttpsReader::request ( ) ...@@ -30,7 +30,7 @@ bool HttpsReader::request ( )
// Connect the socket to the host address. // Connect the socket to the host address.
int connectResult = connect( sock, ( struct sockaddr * ) &address, int connectResult = connect( sock, ( struct sockaddr * ) &address,
sizeof( address ) ); sizeof( address ) );
assert( connectResult == 0 ); assert( connectResult == 0 );
// Build an SSL layer and set it to read/write // Build an SSL layer and set it to read/write
...@@ -54,9 +54,9 @@ bool HttpsReader::request ( ) ...@@ -54,9 +54,9 @@ bool HttpsReader::request ( )
// Send a GET message for the desired page through the SSL. // Send a GET message for the desired page through the SSL.
string getMessage = "GET "; string getMessage = "GET ";
getMessage += url.CompleteUrl; getMessage += url.getCompleteUrl();
getMessage += " HTTP/1.1\r\nHost: "; getMessage += " HTTP/1.1\r\nHost: ";
getMessage += url.Host; getMessage += url.getHost();
getMessage += "\r\nConnection: close\r\n\r\n"; getMessage += "\r\nConnection: close\r\n\r\n";
cout << getMessage << endl; cout << getMessage << endl;
...@@ -64,12 +64,12 @@ bool HttpsReader::request ( ) ...@@ -64,12 +64,12 @@ bool HttpsReader::request ( )
bool isSuccess = checkStatus( ); bool isSuccess = checkStatus( );
return isSuccess; return isSuccess;
} }
catch ( std::exception & e ) catch ( std::exception & e )
{ {
cerr << "Error trying to connect to Host" << endl; cerr << "Error trying to connect to Host" << endl;
return false; return false;
} }
} }
bool HttpsReader::fillBuffer ( char *buf, size_t buf_size ) bool HttpsReader::fillBuffer ( char *buf, size_t buf_size )
...@@ -85,9 +85,9 @@ string HttpsReader::PageToString ( ) ...@@ -85,9 +85,9 @@ string HttpsReader::PageToString ( )
int bytes = 0; int bytes = 0;
while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 ) while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 )
{ {
temp += string( buf, bytes ); temp += string( buf, bytes );
} }
return temp; return temp;
} }
...@@ -102,13 +102,13 @@ bool HttpsReader::checkStatus ( ) ...@@ -102,13 +102,13 @@ bool HttpsReader::checkStatus ( )
if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 ) if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 )
return true; return true;
else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0) else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0)
return true; return true;
else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0)
{ {
cerr << "URL REDIRECTION" << endl; cerr << "URL REDIRECTION" << endl;
return false; return false;
} }
cerr << "Bad Request of TYPE:: " << buff << endl; cerr << "Bad Request of TYPE:: " << buff << endl;
return false; return false;
......
...@@ -20,24 +20,25 @@ StreamReader *SR_factory ( ParsedUrl url, string mode ) ...@@ -20,24 +20,25 @@ StreamReader *SR_factory ( ParsedUrl url, string mode )
StreamReader *newReader = nullptr; StreamReader *newReader = nullptr;
if ( mode == "local" ) if ( mode == "local" )
{ {
newReader = new LocalReader( string(url.CompleteUrl, strlen(url.CompleteUrl) ) ); newReader = new LocalReader( url.getCompleteUrl() );
} }
else if ( mode == "web" ) else if ( mode == "web" )
{
if ( url.getService() == "http" )
{ {
if ( !strcmp( url.Service, "http" ) )
{
newReader = new HttpReader( url ); newReader = new HttpReader( url );
} }
else if ( !strcmp( url.Service, "https" ) ) else if ( url.getService() == "https" )
{ {
newReader = new HttpsReader( url ); newReader = new HttpsReader( url );
} }
else else
{ {
cerr << "Error reading service type\n"; cerr << "Error reading service type\n";
} cerr << "Service Type: " << url.getService() << "\n";
} }
}
return newReader; return newReader;
} }
...@@ -45,14 +46,14 @@ StreamReader *SR_factory ( ParsedUrl url, string mode ) ...@@ -45,14 +46,14 @@ StreamReader *SR_factory ( ParsedUrl url, string mode )
void printDocIndex ( DocIndex *dict ) void printDocIndex ( DocIndex *dict )
{ {
for ( auto it = dict->begin( ); it != dict->end( ); it++ ) for ( auto it = dict->begin( ); it != dict->end( ); it++ )
{ {
cout << it->first << " : "; cout << it->first << " : ";
for ( int i = 0; i < it->second.size( ); ++i ) for ( int i = 0; i < it->second.size( ); ++i )
{ {
cout << it->second[ i ] << " "; cout << it->second[ i ] << " ";
}
cout << std::endl;
} }
cout << std::endl;
}
cout << std::endl; cout << std::endl;
} }
...@@ -80,18 +81,18 @@ void Spider::run ( ) ...@@ -80,18 +81,18 @@ void Spider::run ( )
int cond = 0; int cond = 0;
while ( cond < 250 ) while ( cond < 250 )
{ {
ParsedUrl currentUrl = getUrl( ); ParsedUrl currentUrl = getUrl( );
size_t docID = hash( currentUrl.CompleteUrl ); size_t docID = hash( currentUrl.getCompleteUrl().c_str() );
if ( shouldURLbeCrawled( docID ) ) if ( shouldURLbeCrawled( docID ) )
{ {
StreamReader *reader = SR_factory( currentUrl, this->mode ); StreamReader *reader = SR_factory( currentUrl, this->mode );
if(reader) if(reader)
{ {
bool success = reader->request( ); bool success = reader->request( );
if ( success ) if ( success )
{ {
cout << "Parsing " << currentUrl.CompleteUrl; cout << "Parsing " << currentUrl.getCompleteUrl();
DocIndex *dict = parser.execute( reader ); DocIndex *dict = parser.execute( reader );
IndexerQueue->Push( dict ); IndexerQueue->Push( dict );
...@@ -100,16 +101,16 @@ void Spider::run ( ) ...@@ -100,16 +101,16 @@ void Spider::run ( )
//delete dict; //delete dict;
cond++; cond++;
}
} }
}
delete reader; delete reader;
}
} }
} }
}
......
...@@ -19,42 +19,50 @@ using namespace std; ...@@ -19,42 +19,50 @@ using namespace std;
class ParsedUrl class ParsedUrl
{ {
public: private:
char *CompleteUrl, string CompleteUrl,
*Service, Service,
*Host, Host,
*Domain, Domain,
*Path, Path,
*AnchorText; AnchorText;
double Score; double Score;
public:
ParsedUrl ( string input_url ) ParsedUrl ( string input_url )
{ {
// Assumes url points to static text but // Assumes url points to static text but
// does not check. // does not check.
char *temp_CompleteUrl,
*temp_Service,
*temp_Host,
*temp_Domain,
*temp_Path,
*temp_AnchorText,
*temp_pathBuffer;
//intialize anchor text to "null" //intialize anchor text to "null"
char *null = new char[2]; char *null = new char[2];
strcpy( null, string( "" ).c_str( ) ); strcpy( null, string( "" ).c_str( ) );
AnchorText = null; temp_AnchorText = null;
char *url = new char[input_url.length( ) + 1]; char *url = new char[input_url.length( ) + 1];
strcpy( url, input_url.c_str( ) ); strcpy( url, input_url.c_str( ) );
CompleteUrl = url; temp_CompleteUrl = url;
pathBuffer = new char[strlen( url ) + 1]; temp_pathBuffer = new char[strlen( url ) + 1];
char *f, *t; char *f, *t;
for ( t = pathBuffer, f = url; ( *t++ = *f++ ); ); for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); );
Service = pathBuffer; temp_Service = temp_pathBuffer;
const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; const char Colon = ':', Slash = '/', HashTag = '#', Period = '.';
char *p; char *p;
for ( p = pathBuffer; *p && *p != Colon; p++ ); for ( p = temp_pathBuffer; *p && *p != Colon; p++ );
if ( *p ) if ( *p )
{ {
// Mark the end of the Service. // Mark the end of the Service.
*p++ = 0; *p++ = 0;
...@@ -63,7 +71,7 @@ public: ...@@ -63,7 +71,7 @@ public:
if ( *p == Slash ) if ( *p == Slash )
p++; p++;
Host = p; temp_Host = p;
for ( ; *p && *p != Slash; p++ ); for ( ; *p && *p != Slash; p++ );
...@@ -73,24 +81,24 @@ public: ...@@ -73,24 +81,24 @@ public:
//char * domainBuffer = new char[ 20 ]; //char * domainBuffer = new char[ 20 ];
//get the domain: //get the domain:
char *i = Host; char *i = temp_Host;
Domain = null; temp_Domain = null;
if(i) if(i)
{ {
for ( ; *i; i++ ) for ( ; *i; i++ )
{ {
if ( *i == Period ) if ( *i == Period )
Domain = i; temp_Domain = i;
}
} }
}
// Whatever remains is the Path. // need to remove fragments // Whatever remains is the Path. // need to remove fragments
Path = p; temp_Path = p;
for ( ; *p && *p != HashTag; p++ ); for ( ; *p && *p != HashTag; p++ );
if ( *p ) if ( *p )
...@@ -98,9 +106,18 @@ public: ...@@ -98,9 +106,18 @@ public:
*p++ = 0; *p++ = 0;
} }
else else
Host = Path = p; temp_Host = temp_Path = p;
CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl));
Service = string(temp_Service, strlen(temp_Service));
Host = string(temp_Host, strlen(temp_Host));
Domain = string(temp_Domain, strlen(temp_Domain));
Path = string(temp_Path, strlen(temp_Path));
AnchorText = string(temp_AnchorText, strlen(temp_AnchorText));
pathBuffer = temp_pathBuffer;
setScore( ); setScore( );
} }
...@@ -119,72 +136,67 @@ public: ...@@ -119,72 +136,67 @@ public:
void setScore() void setScore()
{ {
double lengthOfUrl = strlen(CompleteUrl); double lengthOfUrl = CompleteUrl.length();
Score += 4 * 1/ log( lengthOfUrl ); Score += 4 * 1/ log( lengthOfUrl );
if(lengthOfUrl > 4) if(lengthOfUrl > 4)
{ {
if(this->Domain ) if(this->Domain.length() )
{
if ( strcmp ( Domain , ORG ) )
Score += 5;
else if ( strcmp ( Domain , EDU ) )
Score += 4;
else if ( strcmp ( Domain , GOV ) )
Score += 3;
else if ( strcmp ( Domain , COM ) )
Score += 2;
else if ( strcmp ( Domain , NET ) )
Score += 1;
else if ( strcmp ( Domain , INT ) )
Score += 1;
else if ( strcmp ( Domain , MIL ) )
Score += .5;
}
{
if ( strcmp ( Domain.c_str() , ORG ) )
Score += 5;
else if ( strcmp ( Domain.c_str() , EDU ) )
Score += 4;
else if ( strcmp ( Domain.c_str() , GOV ) )
Score += 3;
else if ( strcmp ( Domain.c_str() , COM ) )
Score += 2;
else if ( strcmp ( Domain.c_str() , NET ) )
Score += 1;
else if ( strcmp ( Domain.c_str() , INT ) )
Score += 1;
else if ( strcmp ( Domain.c_str() , MIL ) )
Score += .5;
} }
}
}
std::string getDomain ( )
{
return Domain;
}
std::string getService ( )
{
return Service;
} }
std::string getCompleteUrl ( ) std::string getCompleteUrl ( )
{ {
std::string completeUrl = ""; return CompleteUrl;
completeUrl.assign( this->CompleteUrl );
return completeUrl;
} }
std::string getHost ( ) std::string getHost ( )
{ {
std::string host = ""; return Host;
host.assign( this->Host );
return host;
} }
std::string getPath ( ) std::string getPath ( )
{ {
std::string path = ""; return Path;
path.assign( this->Path );
return path;
} }
std::string getAnchorText ( ) std::string getAnchorText ( )
{ {
std::string anchorText = ""; return AnchorText;
anchorText.assign( this->AnchorText );
return anchorText;
} }
void setAnchorText ( std::string anchorText ) void setAnchorText ( std::string anchorText )
{ {
char *anchorCharStar = new char[anchorText.size( )]; AnchorText = anchorText;
for ( int i = 0; i < anchorText.size( ); ++i )
{
anchorCharStar += anchorText[ i ];
}
anchorCharStar += '\0';
this->AnchorText = anchorCharStar;
} }
......
0% Loading Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment