diff --git a/crawler/Readers/HttpReader.cpp b/crawler/Readers/HttpReader.cpp index 200c1177d2e70e52748a2c5cb821b7117ef75a33..0424d16003a1dd5fecfb6aac00eb64c1dc58cbdc 100644 --- a/crawler/Readers/HttpReader.cpp +++ b/crawler/Readers/HttpReader.cpp @@ -6,7 +6,7 @@ std::runtime_error HTTPConnectionError( "Error connecting HTTP to url" ); bool HttpReader::request ( ) { try - { + { sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP ); @@ -14,11 +14,11 @@ bool HttpReader::request ( ) // Get the host address. - struct hostent *host = gethostbyname( url.Host ); + struct hostent *host = gethostbyname( url.getHost().c_str() ); if ( host == nullptr ) throw HTTPConnectionError; - if( strcmp(url.Service, "http") != 0) + if(url.getService() != "http") throw HTTPConnectionError; assert( host ); @@ -32,7 +32,7 @@ bool HttpReader::request ( ) // Connect to the host. int connectResult = connect( sock, ( struct sockaddr * ) &address, - sizeof( address ) ); + sizeof( address ) ); assert( connectResult == 0 ); // Send a GET message for the desired page. @@ -40,9 +40,9 @@ bool HttpReader::request ( ) cout << "Socket Reader is pulling from the web" << endl; string getMessage = "GET "; - getMessage += url.CompleteUrl; + getMessage += url.getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.Host; + getMessage += url.getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -51,12 +51,12 @@ bool HttpReader::request ( ) bool isSuccess = checkStatus( ); return isSuccess; - } + } catch ( std::exception & e ) - { + { cerr << "Error trying to connect to Host" << endl; return false; - } + } } bool HttpReader::fillBuffer ( char *buf, size_t buf_size ) @@ -72,9 +72,9 @@ string HttpReader::PageToString ( ) int bytes = 0; while ( ( bytes = recv( sock, buf, 10240, 0 ) ) > 0 ) - { + { temp += string( buf, bytes ); - } + } return temp; } @@ -97,10 +97,10 @@ bool HttpReader::checkStatus ( ) else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0) return true; else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) - { + { cerr << "URL REDIRECTION" << endl; return false; - } + } cerr << "Bad Request of TYPE:: " << buff << endl; return false; } diff --git a/crawler/Readers/HttpsReader.cpp b/crawler/Readers/HttpsReader.cpp index 78d2b5ed8a921ae9718977923e570aa342a84807..31b9528573207b3ec29733342c24c4cef5668eb7 100644 --- a/crawler/Readers/HttpsReader.cpp +++ b/crawler/Readers/HttpsReader.cpp @@ -6,13 +6,13 @@ std::runtime_error HTTPSconnectionError( "Error connecting HTTPS to url" ); bool HttpsReader::request ( ) { try - { - struct hostent *host = gethostbyname( url.Host ); + { + struct hostent *host = gethostbyname( url.getHost().c_str() ); if ( host == nullptr ) throw HTTPSconnectionError; - if( strcmp(url.Service, "https") != 0) + if( url.getService() != "https") throw HTTPSconnectionError; assert( host ); @@ -30,7 +30,7 @@ bool HttpsReader::request ( ) // Connect the socket to the host address. int connectResult = connect( sock, ( struct sockaddr * ) &address, - sizeof( address ) ); + sizeof( address ) ); assert( connectResult == 0 ); // Build an SSL layer and set it to read/write @@ -54,9 +54,9 @@ bool HttpsReader::request ( ) // Send a GET message for the desired page through the SSL. string getMessage = "GET "; - getMessage += url.CompleteUrl; + getMessage += url.getCompleteUrl(); getMessage += " HTTP/1.1\r\nHost: "; - getMessage += url.Host; + getMessage += url.getHost(); getMessage += "\r\nConnection: close\r\n\r\n"; cout << getMessage << endl; @@ -64,12 +64,12 @@ bool HttpsReader::request ( ) bool isSuccess = checkStatus( ); return isSuccess; - } + } catch ( std::exception & e ) - { + { cerr << "Error trying to connect to Host" << endl; return false; - } + } } bool HttpsReader::fillBuffer ( char *buf, size_t buf_size ) @@ -85,9 +85,9 @@ string HttpsReader::PageToString ( ) int bytes = 0; while ( ( bytes = SSL_read( ssl, buf, 10240 ) ) > 0 ) - { + { temp += string( buf, bytes ); - } + } return temp; } @@ -102,13 +102,13 @@ bool HttpsReader::checkStatus ( ) if ( strncmp( buff, "HTTP/1.1 200", 11 ) == 0 ) return true; - else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0) - return true; - else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) - { - cerr << "URL REDIRECTION" << endl; - return false; - } + else if(strncmp(buff, "HTTP/1.1 400", 11 ) == 0) + return true; + else if(strncmp(buff, "HTTP/1.1 302", 11 ) == 0) + { + cerr << "URL REDIRECTION" << endl; + return false; + } cerr << "Bad Request of TYPE:: " << buff << endl; return false; diff --git a/crawler/spider.cpp b/crawler/spider.cpp index c50c65839cc93e238f8d9b4c1480e6c89908cf9b..8cbda419b69df25a569315c1fef9acf5fa0f7a31 100644 --- a/crawler/spider.cpp +++ b/crawler/spider.cpp @@ -20,24 +20,25 @@ StreamReader *SR_factory ( ParsedUrl url, string mode ) StreamReader *newReader = nullptr; if ( mode == "local" ) - { - newReader = new LocalReader( string(url.CompleteUrl, strlen(url.CompleteUrl) ) ); - } + { + newReader = new LocalReader( url.getCompleteUrl() ); + } else if ( mode == "web" ) + { + if ( url.getService() == "http" ) { - if ( !strcmp( url.Service, "http" ) ) - { newReader = new HttpReader( url ); - } - else if ( !strcmp( url.Service, "https" ) ) - { + } + else if ( url.getService() == "https" ) + { newReader = new HttpsReader( url ); - } + } else - { + { cerr << "Error reading service type\n"; - } + cerr << "Service Type: " << url.getService() << "\n"; } + } return newReader; } @@ -45,14 +46,14 @@ StreamReader *SR_factory ( ParsedUrl url, string mode ) void printDocIndex ( DocIndex *dict ) { for ( auto it = dict->begin( ); it != dict->end( ); it++ ) - { + { cout << it->first << " : "; for ( int i = 0; i < it->second.size( ); ++i ) - { + { cout << it->second[ i ] << " "; - } - cout << std::endl; } + cout << std::endl; + } cout << std::endl; } @@ -80,18 +81,18 @@ void Spider::run ( ) int cond = 0; while ( cond < 250 ) - { + { ParsedUrl currentUrl = getUrl( ); - size_t docID = hash( currentUrl.CompleteUrl ); + size_t docID = hash( currentUrl.getCompleteUrl().c_str() ); if ( shouldURLbeCrawled( docID ) ) - { + { StreamReader *reader = SR_factory( currentUrl, this->mode ); if(reader) - { + { bool success = reader->request( ); if ( success ) - { - cout << "Parsing " << currentUrl.CompleteUrl; + { + cout << "Parsing " << currentUrl.getCompleteUrl(); DocIndex *dict = parser.execute( reader ); IndexerQueue->Push( dict ); @@ -100,16 +101,16 @@ void Spider::run ( ) //delete dict; cond++; - } } + } delete reader; - } } } + } diff --git a/shared/url.h b/shared/url.h index 08d8c41f69486c4a02ccf09c3377680f1ee5fc5c..20aad823ce37d04fb3c53f5867484954594a7eed 100644 --- a/shared/url.h +++ b/shared/url.h @@ -19,42 +19,50 @@ using namespace std; class ParsedUrl { -public: - char *CompleteUrl, - *Service, - *Host, - *Domain, - *Path, - *AnchorText; +private: + string CompleteUrl, + Service, + Host, + Domain, + Path, + AnchorText; double Score; +public: ParsedUrl ( string input_url ) { // Assumes url points to static text but // does not check. + char *temp_CompleteUrl, + *temp_Service, + *temp_Host, + *temp_Domain, + *temp_Path, + *temp_AnchorText, + *temp_pathBuffer; //intialize anchor text to "null" char *null = new char[2]; strcpy( null, string( "" ).c_str( ) ); - AnchorText = null; + temp_AnchorText = null; char *url = new char[input_url.length( ) + 1]; strcpy( url, input_url.c_str( ) ); - CompleteUrl = url; + temp_CompleteUrl = url; - pathBuffer = new char[strlen( url ) + 1]; + temp_pathBuffer = new char[strlen( url ) + 1]; char *f, *t; - for ( t = pathBuffer, f = url; ( *t++ = *f++ ); ); + for ( t = temp_pathBuffer, f = url; ( *t++ = *f++ ); ); - Service = pathBuffer; + temp_Service = temp_pathBuffer; const char Colon = ':', Slash = '/', HashTag = '#', Period = '.'; char *p; - for ( p = pathBuffer; *p && *p != Colon; p++ ); + for ( p = temp_pathBuffer; *p && *p != Colon; p++ ); if ( *p ) - { + { // Mark the end of the Service. *p++ = 0; @@ -63,7 +71,7 @@ public: if ( *p == Slash ) p++; - Host = p; + temp_Host = p; for ( ; *p && *p != Slash; p++ ); @@ -73,24 +81,24 @@ public: //char * domainBuffer = new char[ 20 ]; //get the domain: - char *i = Host; - Domain = null; + char *i = temp_Host; + temp_Domain = null; if(i) - { + { for ( ; *i; i++ ) - { + { if ( *i == Period ) - Domain = i; - - } + temp_Domain = i; } + } + // Whatever remains is the Path. // need to remove fragments - Path = p; + temp_Path = p; for ( ; *p && *p != HashTag; p++ ); if ( *p ) @@ -98,9 +106,18 @@ public: *p++ = 0; - } + } else - Host = Path = p; + temp_Host = temp_Path = p; + + + CompleteUrl = string(temp_CompleteUrl, strlen(temp_CompleteUrl)); + Service = string(temp_Service, strlen(temp_Service)); + Host = string(temp_Host, strlen(temp_Host)); + Domain = string(temp_Domain, strlen(temp_Domain)); + Path = string(temp_Path, strlen(temp_Path)); + AnchorText = string(temp_AnchorText, strlen(temp_AnchorText)); + pathBuffer = temp_pathBuffer; setScore( ); } @@ -119,72 +136,67 @@ public: void setScore() { - double lengthOfUrl = strlen(CompleteUrl); + double lengthOfUrl = CompleteUrl.length(); Score += 4 * 1/ log( lengthOfUrl ); if(lengthOfUrl > 4) - { + { - if(this->Domain ) - - { - if ( strcmp ( Domain , ORG ) ) - Score += 5; - else if ( strcmp ( Domain , EDU ) ) - Score += 4; - else if ( strcmp ( Domain , GOV ) ) - Score += 3; - else if ( strcmp ( Domain , COM ) ) - Score += 2; - else if ( strcmp ( Domain , NET ) ) - Score += 1; - else if ( strcmp ( Domain , INT ) ) - Score += 1; - else if ( strcmp ( Domain , MIL ) ) - Score += .5; - } + if(this->Domain.length() ) + { + if ( strcmp ( Domain.c_str() , ORG ) ) + Score += 5; + else if ( strcmp ( Domain.c_str() , EDU ) ) + Score += 4; + else if ( strcmp ( Domain.c_str() , GOV ) ) + Score += 3; + else if ( strcmp ( Domain.c_str() , COM ) ) + Score += 2; + else if ( strcmp ( Domain.c_str() , NET ) ) + Score += 1; + else if ( strcmp ( Domain.c_str() , INT ) ) + Score += 1; + else if ( strcmp ( Domain.c_str() , MIL ) ) + Score += .5; } + + } + } + + std::string getDomain ( ) + { + return Domain; + } + + std::string getService ( ) + { + return Service; } std::string getCompleteUrl ( ) { - std::string completeUrl = ""; - completeUrl.assign( this->CompleteUrl ); - return completeUrl; + return CompleteUrl; } std::string getHost ( ) { - std::string host = ""; - host.assign( this->Host ); - return host; + return Host; } std::string getPath ( ) { - std::string path = ""; - path.assign( this->Path ); - return path; + return Path; } std::string getAnchorText ( ) { - std::string anchorText = ""; - anchorText.assign( this->AnchorText ); - return anchorText; + return AnchorText; } void setAnchorText ( std::string anchorText ) { - char *anchorCharStar = new char[anchorText.size( )]; - - for ( int i = 0; i < anchorText.size( ); ++i ) - { - anchorCharStar += anchorText[ i ]; - } - anchorCharStar += '\0'; - this->AnchorText = anchorCharStar; + AnchorText = anchorText; }