// url.h

    #pragma once

    #include <time.h>

    #include <exception>
    #include <iostream>
    #include <string>

    #include "../util/util.h"
    #include "../util/stringProcessing.h"
    //#include "../crawler/SocketReader.h"
    
    // NOTE(review): a using-directive at header scope is injected into every
    // translation unit that includes this header.
    using namespace std;

    // Domain suffixes, each spelled with its leading '.'. ParsedUrl stores its
    // Domain in the same form (the host from its last '.' onwards) and
    // ParsedUrl::setScore compares the Domain against several of these.
    #define GOV   ".gov"
    #define COM   ".com"
    #define EDU   ".edu"
    #define ORG   ".org"
    #define NET   ".net"
    #define MIL   ".mil"
    #define INT   ".int"
    
    // ParsedUrl breaks a URL string into its service (scheme), host, domain
    // suffix and path, and gives http/https URLs a heuristic score.
    //
    // Parsing rules, applied to "service://host/path?query#fragment":
    //   * Service - everything before the first ':' (the whole input when
    //               there is no ':').
    //   * Host    - what follows the ':' and up to two '/' characters, ending
    //               at the next '/'.
    //   * Domain  - the Host from its last '.' onwards, e.g. ".com"; empty
    //               when the Host contains no '.'.
    //   * Path    - what follows the '/' that ends the Host, cut at the first
    //               '#'. The query string is kept; there is no leading '/'.
    //
    // CompleteUrl, Service and Host are always filled in. Domain, Path and
    // Score are only filled in when Service is "http" or "https".
    //
    // All state lives in std::string / double members, so the compiler
    // generated copy, move and destruction are correct.
    class ParsedUrl
    	{

    private:
    	std::string CompleteUrl,
    			Service,
    			Host,
    			Domain,
    			Path,
    			AnchorText;

    	// Starts at zero so that setScore( ) accumulates onto a defined value.
    	double Score = 0.0;

    	// Splits url into the member strings following the rules in the class
    	// comment and, for http/https URLs, calls setScore( ).
    	void parse ( const std::string &url )
    		{
    		const std::string::size_type npos = std::string::npos;

    		CompleteUrl = url;

    		// Index of the first Path character, or npos when no '/' ends the Host.
    		std::string::size_type pathBegin = npos;

    		const std::string::size_type colon = url.find( ':' );
    		if ( colon == npos )
    			{
    			// No service separator: the whole input is the Service and
    			// Host / Path stay empty.
    			Service = url;
    			}
    		else
    			{
    			Service = url.substr( 0, colon );

    			// Step over at most two '/' after the ':' (the "//" of "http://").
    			std::string::size_type hostBegin = colon + 1;
    			for ( int skipped = 0;
    					skipped < 2 && hostBegin < url.size( ) && url[ hostBegin ] == '/';
    					++skipped )
    				++hostBegin;

    			const std::string::size_type hostEnd = url.find( '/', hostBegin );
    			if ( hostEnd == npos )
    				Host = url.substr( hostBegin );
    			else
    				{
    				Host = url.substr( hostBegin, hostEnd - hostBegin );
    				pathBegin = hostEnd + 1;
    				}
    			}

    		// Only web URLs get a Domain, a Path and a Score.
    		if ( Service != "http" && Service != "https" )
    			return;

    		const std::string::size_type lastDot = Host.rfind( '.' );
    		if ( lastDot != npos )
    			Domain = Host.substr( lastDot );

    		if ( pathBegin != npos )
    			{
    			// The fragment starts at the first '#', whether or not a
    			// "?query" precedes it, so search for '#' directly.
    			const std::string::size_type fragment = url.find( '#', pathBegin );
    			if ( fragment == npos )
    				Path = url.substr( pathBegin );
    			else
    				Path = url.substr( pathBegin, fragment - pathBegin );
    			}

    		setScore( );
    		}

    public:

    	// Parses input_url into its components. Standard-library exceptions
    	// raised while parsing are caught and reported on cerr; the object then
    	// keeps whichever fields had been assigned before the failure.
    	ParsedUrl ( std::string input_url )
    		{
    		try
    			{
    			parse( input_url );
    			}
    		catch ( const std::exception & )
    			{
    			std::cerr << "Error constructing a ParsedUrl from string url " << std::endl;
    			}
    		}

    	// Writes every component and the score to cout, one per line.
    	void printUrl ( ) const
    		{
    		std::cout << "Complete URL: " << CompleteUrl << '\n'
    				<< "Service: " << Service << '\n'
    				<< "Host: " << Host << '\n'
    				<< "Domain: " << Domain << '\n'
    				<< "Path: " << Path << '\n'
    				<< "Score: " << Score << std::endl;
    		}

    	// Adds this URL's heuristic score to Score: 1 / length of the complete
    	// URL, plus a bonus chosen by Domain when the URL is longer than four
    	// characters. It accumulates rather than overwrites; the constructor
    	// calls it once for http/https URLs. Does nothing for an empty URL so
    	// that the reciprocal is never taken of zero.
    	void setScore ( )
    		{
    		if ( CompleteUrl.empty( ) )
    			return;

    		const double lengthOfUrl = static_cast< double >( CompleteUrl.length( ) );
    		Score += 1 / lengthOfUrl;

    		if ( lengthOfUrl > 4 && !Domain.empty( ) )
    			{
    			// NOTE(review): the EDU, GOV, COM and NET bonuses are as found in
    			// the previous revision. The conditions guarding the first .5
    			// and the 4 and 5 bonuses were not recoverable; they are assigned
    			// to ORG, MIL and INT here - confirm against the intended ranking.
    			if ( Domain == ORG )
    				Score += .5;
    			else if ( Domain == EDU )
    				Score += .5;
    			else if ( Domain == GOV )
    				Score += .75;
    			else if ( Domain == COM )
    				Score += .5;
    			else if ( Domain == NET )
    				Score += 3;
    			else if ( Domain == MIL )
    				Score += 4;
    			else if ( Domain == INT )
    				Score += 5;
    			}
    		}

    	// Host suffix starting at its last '.', e.g. ".com"; empty if unset.
    	std::string getDomain ( ) const
    		{
    		return Domain;
    		}

    	// Text before the first ':' of the URL, e.g. "http".
    	std::string getService ( ) const
    		{
    		return Service;
    		}

    	// The URL exactly as it was passed to the constructor.
    	std::string getCompleteUrl ( ) const
    		{
    		return CompleteUrl;
    		}

    	// Host portion of the URL (between "service://" and the next '/').
    	std::string getHost ( ) const
    		{
    		return Host;
    		}

    	// Path and query, without a leading '/' and without any "#fragment".
    	std::string getPath ( ) const
    		{
    		return Path;
    		}

    	// Score accumulated so far by setScore( ).
    	double getScore ( ) const
    		{
    		return Score;
    		}

    	// Placeholder: currently has no effect on the score.
    	void updateScore ( double /* time */ )
    		{
    		}

    	// Anchor text associated with this URL; empty until setAnchorText( ).
    	std::string getAnchorText ( ) const
    		{
    		return AnchorText;
    		}

    	// Replaces the anchor text associated with this URL.
    	void setAnchorText ( std::string anchorText )
    		{
    		AnchorText = anchorText;
    		}
    	};