Skip to content
Snippets Groups Projects
Commit c42194a6 authored by jsclose's avatar jsclose
Browse files

working on testing url + robots.txt in spider.h

parent 81d48785
No related branches found
No related tags found
No related merge requests found
......@@ -23,9 +23,6 @@ class StreamReader
public:
StreamReader() {};
virtual void fillBuffer() = 0;
protected:
char *buffer;
};
\ No newline at end of file
......@@ -96,6 +96,7 @@ bool Spider::shouldURLbeCrawled( string url )
//search for url in doc cache
auto locationOnDisk = this->docMapLookup->find( url );
//bool protectedByRobots = checkRobots( url );
//if it doesn't find anything for that url key
if ( locationOnDisk == this->docMapLookup->end( ))
{
......@@ -109,7 +110,47 @@ bool Spider::shouldURLbeCrawled( string url )
return false;
}
/*
//check if path in url is in the robots txt
bool Spider::checkRobots(string url_in)
{
ParsedUrl url = ParsedUrl(url_in);
string pathToRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host));
int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
//File does not exist yet
if(robotsFileD == -1)
{
robotsFileD = getRobots(url);
}
//char* robotsTXT = util::getFileMap(robotsFileD);
return 1;
}
//Makes a request to fetch a new robots.txt file, returns the file descriptor (-1 on failure)
int Spider::getRobots(ParsedUrl url )
{
string pathToDiskRobots = util::GetCurrentWorkingDir() + "/robots/" + string(url.Host, strlen(url.Host));
string pathToWebRobots = "http://" + string(url.Host, strlen(url.Host)) + "/robots.txt";
//string(url.Service, strlen(url.Service))+
SocketReader *reader = new SocketReader(pathToWebRobots);
reader->fillBuffer();
int fd = util::writeToNewFileToLocation( reader->buffer, pathToDiskRobots);
if( fd == -1)
{
cerr << "Error getting Robots.txt file " << endl;
}
return fd;
return 1;
};
*/
/*
returns true if fileMap was created, otherwise false
Modifies the filemap to be a char* of the file of the url passed
......
......@@ -36,9 +36,12 @@ public:
bool writeDocToDisk(string url);
bool shouldURLbeCrawled( string URL );
int getRobots(ParsedUrl url );
bool checkRobots(string url);
private:
int locationOnDisk;
......
No preview for this file type
......@@ -7,96 +7,9 @@
#include <string>
#include <iostream>
#include "../util/util.h"
//#include "../crawler/StreamReader.h"
//#include "../crawler/SocketReader.h"
using namespace std;
//
//class Url
// {
//
//public:
// std::string url;
//
// Url( string url_in ) : url( url_in )
// { };
//
//
// //Removes/ parses url
// void clean();
//
// //parses domain from url
// string getDomain()
// {
// string domain;
// string protocol = getProtocol();
// int domainStart = protocol.size() + 3;
// if(url [ domainStart ] == 'w' )
// domainStart += 4;//starts with www.
//
//
//
// for( int domainEnd = domainStart ; domainEnd < url.size() ; domainEnd++ )
// {
// if ( url[ domainEnd ] == '/' || url[ domainEnd ] == ':')
// return domain;
// else
// domain.push_back( url[ domainEnd ] );
// }
// }
//
// //return .gov, .com, .edu
// string getDomainType()
// {
// string domain = getDomain();
// string type = "";
//
// auto i = domain.end();
// --i;
// while(*i != '.')
// {
// type.push_back(( *i ));
// --i;
// }
// reverse(type.begin(), type.end() );
//
//
// return type;
//
// }
//
// /*
// * HTTP, HTTPS, MAILTO etc
// */
// string getProtocol()
// {
// string protocol;
// for ( int i = 0; i < url.size( ); i++ )
// {
// if( url [ i ] != ':' )
// protocol.push_back( url [ i ] );
// else
// return protocol;
// }
//
//
// };
//
// /*
// * Returns URL such that
// * http://www.example.com:80/path/to/myfile.html#SomewhereInTheDocument
// * becomes http://www.example.com:80/path/to/myfile.html
// */
// void removeAnchor( ){
// int i = 0;
// string cleaned;
// while( url[ i ] != '#')
// cleaned.push_back( url [ i++ ] );
//
// url = cleaned;
// }
// };
......@@ -110,6 +23,7 @@ public:
char *CompleteUrl,
*Service,
*Host,
*Domain,
*Path;
ParsedUrl( string input_url )
......@@ -129,7 +43,7 @@ public:
Service = pathBuffer;
const char Colon = ':', Slash = '/';
const char Colon = ':', Slash = '/', HashTag = '#', Period = '.';
char *p;
for ( p = pathBuffer; *p && *p != Colon; p++ )
;
......@@ -153,51 +67,43 @@ public:
// Mark the end of the Host.
*p++ = 0;
// Whatever remains is the Path.
//char * domainBuffer = new char[ 20 ];
//get the domain:
for(int i = strlen(Host); Host[i] != Period; i--){
}
// Whatever remains is the Path, up to any '#' fragment (the fragment is cut off below).
Path = p;
for ( ; *p && *p != HashTag; p++ )
;
if ( *p )
// Mark the end of the Path, remove fragments.
*p++ = 0;
}
else
Host = Path = p;
}
/*
//check if path in url is in the robots txt
void checkRobots()
void printUrl()
{
string pathToRobots = util::GetCurrentWorkingDir() + '/' + Service;
int robotsFileD = util::getFileDescriptor(pathToRobots , "R");
//File does not exist yet
if(robotsFileD == -1)
{
robotsFileD = getRobots();
}
char* robotsTXT = util::getFileMap(robotsFileD);
cout << "Complete URL: " << CompleteUrl << endl;
cout << "Service: " << Service << endl;
cout << "Host: " << Host << endl;
cout << "Path: " << Path << endl;
}
//Makes a request to fetch a new robots.txt file, returns the file descriptor (-1 on failure)
int getRobots( )
{
StreamReader reader;
string pathToRobots = util::GetCurrentWorkingDir() + '/' + Service;
reader = new SocketReader(CompleteUrl+ '/' + 'robots.txt');
reader->fillBuffer();
int fd = util::writeToNewFileToLocation( reader->buffer, pathToRobots);
if( fd == -1)
{
cerr << "Error getting Robots.txt file " << endl;
}
return fd;
return 1;
};
*/
~ParsedUrl( )
{
delete [ ] pathBuffer;
......
......@@ -12,25 +12,20 @@ using namespace std;
int main(int argc, const char * argv[])
{
Url test1 = Url("https://developer.mozilla.org/en-US/docs/Learn" ) ;
string protocol = test1.getProtocol();
assert( protocol == "https");
string domain = test1.getDomain();
assert( domain == "developer.mozilla.org");
string domainType = test1.getDomainType();
assert( domainType == "org");
Url test2 = Url("http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
test2.removeAnchor();
assert( test2.url == "http://www.example.com:80/path/to/myfile.html?key1=value1&key2=value2" );
protocol = test2.getProtocol();
assert( protocol == "http");
domain = test2.getDomain();
assert( domain == "example.com");
domainType = test2.getDomainType();
assert( domainType == "com");
ParsedUrl test1 = ParsedUrl("https://developer.mozilla.org/en-US/docs/Learn" ) ;
//string protocol = test1.getProtocol();
test1.printUrl();
//assert( strcmp(test1.Service, "https") == 1);
//assert( strcmp(test1.Host, "developer.mozilla.org") == 1);
ParsedUrl test2 = ParsedUrl("http://www.example.com/path/to/myfile.html?key1=value1&key2=value2#SomewhereInTheDocument");
test2.printUrl();
assert( strcmp(test2.Service, "http"));
assert( strcmp(test2.Host, "example.com"));
std::cout << "URL TEST PASSED" << std::endl;
......
url_test 0 → 100755
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment