Skip to content
Snippets Groups Projects
Commit 4e2d4d5e authored by jsclose's avatar jsclose
Browse files

Created a checkStatus function for the web readers so that we don't pull from a site whose HTTP response status is bad

parent b06fce3b
No related branches found
No related tags found
No related merge requests found
No preview for this file type
......@@ -67,6 +67,24 @@ ParsedUrl HttpReader::getUrl()
}
/**
 * Reads the first bytes of the HTTP response from `sock` and verifies the
 * status line. Returns true only for "HTTP/1.1 200"; a 302 redirect is
 * reported on stderr and rejected; any other status (or a connection
 * failure) returns false.
 *
 * NOTE(review): the status bytes are consumed from the socket, so this must
 * be called exactly once, before PageToString() — confirm callers do so.
 */
bool HttpReader::checkStatus()
{
	char buff[ 12 ];
	size_t received = 0;
	// recv() may deliver fewer bytes than requested; loop until the full
	// 12-byte status prefix has arrived. The original code compared the
	// buffer after a single recv() without checking how much was read.
	while ( received < sizeof( buff ) )
		{
		ssize_t bytes = recv( sock, buff + received, sizeof( buff ) - received, 0 );
		if ( bytes <= 0 )
			return false;   // connection closed or error — treat as bad
		received += static_cast< size_t >( bytes );
		}
	// Compare the full 12-character prefix. The previous length of 11
	// silently ignored the last digit of the status code.
	if ( strncmp( buff, "HTTP/1.1 200", 12 ) == 0 )
		return true;
	if ( strncmp( buff, "HTTP/1.1 302", 12 ) == 0 )
		cerr << "URL REDIRECTION" << endl;
	// Explicit return for every other status — the original fell off the
	// end of the function here, which is undefined behaviour.
	return false;
}
void HttpReader::closeReader()
{
close( sock );
......
......@@ -12,6 +12,7 @@ public:
// Binds the reader to the URL it will fetch; no I/O happens until request().
HttpReader( ParsedUrl url_in ) : url( url_in ) { }
// Opens the connection and sends the request (defined in the .cpp).
void request();
// Fills `buf` with up to `buf_size` bytes of response body; returns
// whether more data remains — presumably; verify against the definition.
bool fillBuffer(char * buf, size_t buf_size);
// Consumes and validates the HTTP status line; true only for 200.
bool checkStatus();
// Reads the remainder of the response into a string.
string PageToString();
// Accessor for the URL this reader was constructed with.
ParsedUrl getUrl();
// Closes the underlying socket.
void closeReader();
......
......@@ -73,6 +73,28 @@ string HttpsReader::PageToString()
return temp;
}
/**
 * Reads the first bytes of the HTTPS response from `ssl` and verifies the
 * status line. Returns true only for "HTTP/1.1 200"; a 302 redirect is
 * reported on stderr and rejected; any other status (or a TLS read
 * failure) returns false.
 *
 * NOTE(review): the status bytes are consumed from the TLS stream, so this
 * must be called exactly once, before PageToString() — confirm callers do.
 */
bool HttpsReader::checkStatus()
{
	char buff[ 12 ];
	int received = 0;
	// SSL_read() may deliver fewer bytes than requested; loop until the
	// full 12-byte status prefix has arrived. The original code compared
	// the buffer after a single read without checking how much was read.
	while ( received < ( int ) sizeof( buff ) )
		{
		int bytes = SSL_read( ssl, buff + received, sizeof( buff ) - received );
		if ( bytes <= 0 )
			return false;   // connection closed or TLS error — treat as bad
		received += bytes;
		}
	// Compare the full 12-character prefix. The previous length of 11
	// silently ignored the last digit of the status code.
	if ( strncmp( buff, "HTTP/1.1 200", 12 ) == 0 )
		return true;
	if ( strncmp( buff, "HTTP/1.1 302", 12 ) == 0 )
		cerr << "URL REDIRECTION" << endl;
	return false;
}
ParsedUrl HttpsReader::getUrl()
{
return url;
......
......@@ -17,7 +17,7 @@ public:
// Reads the remainder of the response into a string.
string PageToString();
// Accessor for the URL this reader was constructed with.
ParsedUrl getUrl();
// Tears down the connection.
void closeReader();
// Consumes and validates the HTTP status line; true only for 200.
bool checkStatus();
private:
// URL this reader is bound to.
ParsedUrl url;
......
......@@ -34,6 +34,10 @@ ParsedUrl LocalReader::getUrl()
}
// A local file has no transport status line to verify, so a LocalReader
// always reports success; this exists to satisfy the StreamReader interface.
bool LocalReader::checkStatus()
{
return true;
}
void LocalReader::closeReader()
{
//FIXME
......
......@@ -14,6 +14,7 @@ public:
// Opens the local source (defined in the .cpp).
void request();
// Fills `buf` with up to `buf_size` bytes of content; returns whether
// more data remains — presumably; verify against the definition.
bool fillBuffer(char * buf, size_t buf_size);
// Always true for local files — no transport status to check.
bool checkStatus();
// Reads the whole source into a string.
string PageToString();
// Accessor for the URL/path this reader was constructed with.
ParsedUrl getUrl();
// Releases the underlying handle.
void closeReader();
......
......@@ -25,6 +25,7 @@ public:
// Abstract interface shared by HttpReader, HttpsReader and LocalReader.
StreamReader() {};
// Opens the source and issues the request.
virtual void request() = 0;
// Fills `buf` with up to `buf_size` bytes; returns whether more remains.
virtual bool fillBuffer(char * buf, size_t buf_size) = 0;
// Validates the response status before the body is consumed; readers
// that have no status (local files) simply return true.
virtual bool checkStatus() = 0;
// Reads the remainder of the source into a string.
virtual string PageToString() = 0;
// Accessor for the URL this reader targets.
virtual ParsedUrl getUrl() =0;
// Releases the underlying resource (socket, TLS session, or file).
virtual void closeReader() = 0;
......
......@@ -50,7 +50,7 @@ int main( int argc, char *argv[] )
string mode = "web";
int numberOfSpiders = 3;
int numberOfSpiders = 1;
opterr = true;
int choice;
......@@ -128,7 +128,7 @@ crawler.SpawnSpiders(numberOfSpiders , docMapLookUp, duplicateUrlMap);
crawler.WaitOnAllSpiders();
auto f = urlFrontier->Pop();
auto f = urlFrontier->Pop();
int x = 0;
delete urlFrontier;
}
\ No newline at end of file
......@@ -32,68 +32,70 @@ const unordered_map< string, vector< unsigned long > > *Parser::execute ( Stream
/**
 * Drives one fetch-and-parse cycle: issues the request on `reader`,
 * verifies the response status, and — only on success — tokenizes the URL,
 * scans the HTML for anchors (pushed onto the shared urlFrontier) and for
 * the title (handed to `tokenizer`).
 *
 * NOTE(review): this span in SOURCE is a diff with the pre- and post-commit
 * bodies interleaved; this is the reconstructed post-commit version, where
 * the whole parse is guarded by reader->checkStatus().
 *
 * @param reader    transport-agnostic source of the page (HTTP/HTTPS/local)
 * @param tokenizer receives URL and title tokens with running offsets
 */
void Parser::parse ( StreamReader* reader, Tokenizer *tokenizer )
	{
	reader->request( );
	// Skip pages whose status line is not a clean 200 — don't pull from a
	// site that is bad (the point of this commit).
	bool success = reader->checkStatus( );
	if ( success )
		{
		string html = reader->PageToString( );
		ParsedUrl currentUrl = reader->getUrl( );

		auto htmlIt = html.begin( );
		unsigned long offsetTitle = 0;
		unsigned long offsetURL = 0;

		// tokenize url
		string host = "";
		host.assign( currentUrl.Host );
		string path = "";
		path.assign( currentUrl.Path );
		string url = host + "/" + path;

		offsetURL = tokenizer->execute( url, offsetURL, Tokenizer::URL );

		while ( htmlIt != html.end( ) )
			{
			// if open bracket
			if ( *htmlIt == '<' )
				{
				auto begCloseTag = findNext( "</", htmlIt );
				auto endCloseTag = findNext( ">", begCloseTag );
				string line( htmlIt, endCloseTag + 1 );
				htmlIt = endCloseTag + 2;

				// check if line is url
				string url = extract_url( line );
				if ( url != "" )
					{
					// relative links get the page's base URL prepended
					if ( isLocal( url ) )
						{
						string completeUrl = "";
						completeUrl.assign( currentUrl.CompleteUrl );
						url = completeUrl + url;
						}
					if ( isValid( url ) )
						{
						// TODO ParsedUrl with anchor text
						ParsedUrl pUrl = ParsedUrl( url );
						urlFrontier->Push( pUrl );
						cout << url << endl;
						}
					}
				// check if line is title
				else
					{
					string title = extract_title( line );
					if ( title != "" )
						{
						offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
						}
					}
				}
			else
				{
				++htmlIt;
				}
			}
		}
	}
/**
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment