Commit 4e2d4d5e authored by jsclose

Created a checkStatus function for the web readers so that we don't pull from a site that is bad

parent b06fce3b

@@ -67,6 +67,24 @@ ParsedUrl HttpReader::getUrl()
	}

+bool HttpReader::checkStatus()
+	{
+	// Read just enough of the response to see the status-line prefix,
+	// e.g. "HTTP/1.1 200".
+	char buff[ 12 ];
+	int bytes = recv( sock, buff, 12, 0 );
+	if ( bytes < 12 )
+		return false;
+	if ( strncmp( buff, "HTTP/1.1 200", 12 ) == 0 )
+		return true;
+	else if ( strncmp( buff, "HTTP/1.1 302", 12 ) == 0 )
+		{
+		cerr << "URL REDIRECTION" << endl;
+		return false;
+		}
+	// Anything else (4xx, 5xx, malformed) is treated as a bad page.
+	return false;
+	}
+
void HttpReader::closeReader()
	{
	close( sock );
...

@@ -12,6 +12,7 @@ public:
	HttpReader( ParsedUrl url_in ) : url( url_in ) { }
	void request();
	bool fillBuffer(char * buf, size_t buf_size);
+	bool checkStatus();
	string PageToString();
	ParsedUrl getUrl();
	void closeReader();
...

@@ -73,6 +73,28 @@ string HttpsReader::PageToString()
	return temp;
	}

+bool HttpsReader::checkStatus()
+	{
+	// Read just the status-line prefix, e.g. "HTTP/1.1 200", from the TLS stream.
+	char buff[ 12 ];
+	int bytes = SSL_read( ssl, buff, 12 );
+	if ( bytes < 12 )
+		return false;
+	if ( strncmp( buff, "HTTP/1.1 200", 12 ) == 0 )
+		return true;
+	else if ( strncmp( buff, "HTTP/1.1 302", 12 ) == 0 )
+		{
+		cerr << "URL REDIRECTION" << endl;
+		return false;
+		}
+	// Anything else (4xx, 5xx, malformed) is treated as a bad page.
+	return false;
+	}
+
ParsedUrl HttpsReader::getUrl()
	{
	return url;
...

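Both checkStatus implementations key off the exact prefixes "HTTP/1.1 200" and "HTTP/1.1 302". A more general approach would extract the numeric code from the status line so any 2xx or 3xx response can be classified. The helper below is only an illustrative sketch, not part of this commit; parseStatusCode is a hypothetical name, and it assumes the first bytes of the response are already in buff.

// Illustrative sketch only (not in this commit): pull the 3-digit status code
// out of a buffer holding the start of an HTTP response, e.g.
// "HTTP/1.1 302 Found\r\n...".  Returns -1 if no code can be found.
#include <cstring>
#include <cstdlib>

static int parseStatusCode( const char *buff, size_t len )
	{
	// The code follows the first space: "HTTP/1.1 NNN ..."
	const char *space = ( const char * ) memchr( buff, ' ', len );
	if ( space == nullptr || ( size_t ) ( space - buff ) + 4 > len )
		return -1;
	char code[ 4 ] = { space[ 1 ], space[ 2 ], space[ 3 ], '\0' };
	return atoi( code );
	}

With that in hand, checkStatus could return parseStatusCode( buff, bytes ) == 200 and log any 3xx code as a redirection, instead of matching fixed strings.
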
@@ -17,7 +17,7 @@ public:
	string PageToString();
	ParsedUrl getUrl();
	void closeReader();
+	bool checkStatus();

private:
	ParsedUrl url;
...

@@ -34,6 +34,10 @@ ParsedUrl LocalReader::getUrl()
	}

+bool LocalReader::checkStatus()
+	{
+	// Local files are read straight from disk, so there is no HTTP status to check.
+	return true;
+	}
+
void LocalReader::closeReader()
	{
	//FIXME
...

@@ -14,6 +14,7 @@ public:
	void request();
	bool fillBuffer(char * buf, size_t buf_size);
+	bool checkStatus();
	string PageToString();
	ParsedUrl getUrl();
	void closeReader();
...

@@ -25,6 +25,7 @@ public:
	StreamReader() {};
	virtual void request() = 0;
	virtual bool fillBuffer(char * buf, size_t buf_size) = 0;
+	virtual bool checkStatus() = 0;
	virtual string PageToString() = 0;
	virtual ParsedUrl getUrl() =0;
	virtual void closeReader() = 0;
...

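Because checkStatus is now part of the abstract StreamReader interface, a caller can gate everything after request() on it without knowing which concrete reader it holds. Here is a minimal sketch of that flow, mirroring what Parser::parse does below; fetch is a hypothetical helper, and the project's existing headers and using declarations are assumed.

// Sketch: drive any reader through the shared interface and skip bad pages.
void fetch( StreamReader *reader )
	{
	reader->request( );
	if ( reader->checkStatus( ) )
		{
		string page = reader->PageToString( );
		// ... hand the page off to the parser / tokenizer ...
		}
	reader->closeReader( );
	}
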
@@ -50,7 +50,7 @@ int main( int argc, char *argv[] )
	string mode = "web";
-	int numberOfSpiders = 3;
+	int numberOfSpiders = 1;
	opterr = true;
	int choice;
@@ -128,7 +128,7 @@ crawler.SpawnSpiders(numberOfSpiders , docMapLookUp, duplicateUrlMap);
	crawler.WaitOnAllSpiders();
	auto f = urlFrontier->Pop();
	int x = 0;
	delete urlFrontier;
	}
\ No newline at end of file

@@ -32,68 +32,70 @@ const unordered_map< string, vector< unsigned long > > *Parser::execute ( Stream
void Parser::parse ( StreamReader* reader, Tokenizer *tokenizer )
{
	reader->request();
-	string html = reader->PageToString();
-	ParsedUrl currentUrl = reader->getUrl();
+	bool success = reader->checkStatus();
+	if(success) {
+		string html = reader->PageToString();
+		ParsedUrl currentUrl = reader->getUrl();
		auto htmlIt = html.begin( );
		unsigned long offsetTitle = 0;
		unsigned long offsetURL = 0;

		// tokenize url
		string host = "";
		host.assign( currentUrl.Host );
		string path = "";
		path.assign( currentUrl.Path );
		string url = host + "/" + path;
		offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );

		while ( htmlIt != html.end( ) )
		{
			// if open bracket
			if ( *htmlIt == '<' )
			{
				auto begCloseTag = findNext( "</", htmlIt );
				auto endCloseTag = findNext( ">", begCloseTag );
				string line( htmlIt, endCloseTag + 1 );
				htmlIt = endCloseTag + 2;

				// check if line is url
				string url = extract_url( line );
				if ( url != "" )
				{
					if ( isLocal( url ) )
					{
						string completeUrl = "";
						completeUrl.assign( currentUrl.CompleteUrl );
						url = completeUrl + url;
					}
					if ( isValid( url ) )
					{
						// TODO ParsedUrl with anchor text
						ParsedUrl pUrl = ParsedUrl( url );
						urlFrontier->Push( pUrl );
						cout << url << endl;
					}
				}
				// check if line is title
				else
				{
					string title = extract_title( line );
					if ( title != "" )
					{
						offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
					}
				}
			}
			else
			{
				++htmlIt;
			}
		}
+	}
}

/**
...