// url.h

#pragma once

#include <time.h>

#include <exception>
#include <iostream>
#include <string>
#include <utility>

#include "../util/util.h"
//#include "../crawler/SocketReader.h"
// NOTE(review): a using-directive at header scope is visible to every file
// that includes this header; left in place because includers may depend on it.
using namespace std;
// Top-level-domain suffix strings. Each literal includes its leading period.
#define GOV   ".gov"
#define COM   ".com"
#define EDU   ".edu"
#define ORG   ".org"
#define NET   ".net"
#define MIL   ".mil"
#define INT   ".int"
/**
 * Breaks a URL string into service, host, domain and path components and
 * keeps a numeric score for it.
 *
 * For "http://www.example.com/a/b.html#top" the components are:
 *   Service "http"
 *   Host    "www.example.com"
 *   Domain  ".com"      (text from the last period of the host, period included)
 *   Path    "a/b.html"  (no leading slash, fragment removed)
 *
 * All state lives in std::string members, so the implicitly generated copy,
 * move and destructor are correct; no manual buffer management is needed.
 */
class ParsedUrl
	{
private:
	std::string CompleteUrl;
	std::string Service;
	std::string Host;
	std::string Domain;
	std::string Path;
	std::string AnchorText;
	// Starts at zero so that setScore( ) / updateScore( ) accumulate from a known value.
	double Score = 0.0;

public:
	/**
	 * Parses input_url and computes its initial score.
	 *
	 * @param input_url  URL text. The text is not validated.
	 *
	 * Standard-library exceptions raised while parsing are reported on cerr
	 * and swallowed; fields not yet filled in at that point keep their
	 * empty / zero defaults.
	 */
	ParsedUrl ( std::string input_url )
		{
		try
			{
			CompleteUrl = std::move( input_url );
			parseComponents( );
			setScore( );
			}
		catch ( const std::exception & )
			{
			std::cerr << "Error constructing a ParsedUrl from string url "<< std::endl;
			}
		}

	/** Writes every component and the score to cout, one per line. */
	void printUrl ( ) const
		{
		std::cout << "Complete URL: " << CompleteUrl << std::endl;
		std::cout << "Service: " << Service << std::endl;
		std::cout << "Host: " << Host << std::endl;
		std::cout << "Domain: " << Domain << std::endl;
		std::cout << "Path: " << Path << std::endl;
		std::cout << "Score: " << Score << std::endl;
		}

	/**
	 * Adds the URL-derived terms to Score:
	 *   - 1 / (length of the complete URL), skipped for an empty URL so the
	 *     score never becomes infinite;
	 *   - when the URL is longer than 4 characters and a domain was found,
	 *     a bonus chosen by an exact, case-sensitive match of Domain:
	 *     .org +0.5, .edu +1, .gov +1, .com +2, .net +3, .int +4, .mil +5.
	 *
	 * The terms are added to the current Score, so every call adds them again.
	 */
	void setScore ( )
		{
		const double lengthOfUrl = static_cast< double >( CompleteUrl.length( ) );

		if ( lengthOfUrl > 0 )
			Score += 1 / lengthOfUrl;

		if ( lengthOfUrl <= 4 || Domain.empty( ) )
			return;

		// The suffix literals carry the same values as the GOV/COM/EDU/ORG/NET/MIL/INT
		// macros of this header. Equality is tested directly: using strcmp( )'s
		// result as a truth value selects the branch when the strings DIFFER.
		struct DomainBonus
			{
			const char *suffix;
			double bonus;
			};
		static const DomainBonus bonuses[ ] =
			{
			{ ".org", .5 },
			{ ".edu", 1 },
			{ ".gov", 1 },
			{ ".com", 2 },
			{ ".net", 3 },
			{ ".int", 4 },
			{ ".mil", 5 },
			};

		for ( const DomainBonus &entry : bonuses )
			if ( Domain == entry.suffix )
				{
				Score += entry.bonus;
				break;
				}
		}

	/** @return the host text from its last period onward (e.g. ".com"), or "" if the host has no period. */
	std::string getDomain ( ) const
		{
		return Domain;
		}

	/** @return the text before the first ':' of the URL; the whole URL when it contains no ':'. */
	std::string getService ( ) const
		{
		return Service;
		}

	/** @return the URL exactly as passed to the constructor. */
	std::string getCompleteUrl ( ) const
		{
		return CompleteUrl;
		}

	/** @return the text between the service separator and the next '/'. */
	std::string getHost ( ) const
		{
		return Host;
		}

	/** @return the text after the host's '/', up to but excluding any '#'. */
	std::string getPath ( ) const
		{
		return Path;
		}

	/** @return the current score. */
	double getScore ( ) const
		{
		return Score;
		}

	/** Adds 3 * time to the score. */
	void updateScore( double time )
		{
		Score +=  3 * time;
		}

	/** @return the anchor text; empty until setAnchorText( ) is called. */
	std::string getAnchorText ( ) const
		{
		return AnchorText;
		}

	/** Replaces the stored anchor text. */
	void setAnchorText ( std::string anchorText )
		{
		AnchorText = std::move( anchorText );
		}

private:
	/**
	 * Fills Service, Host, Domain and Path from CompleteUrl.
	 *
	 * Without a ':' the whole URL becomes the Service and the other
	 * components stay empty.
	 */
	void parseComponents ( )
		{
		const std::string &url = CompleteUrl;
		const std::string::size_type npos = std::string::npos;

		const std::string::size_type colon = url.find( ':' );
		if ( colon == npos )
			{
			Service = url;
			return;
			}
		Service = url.substr( 0, colon );

		// Step over at most two slashes following the colon ("//" of "http://").
		std::string::size_type hostBegin = colon + 1;
		for ( int skipped = 0; skipped < 2 && hostBegin < url.size( ) && url[ hostBegin ] == '/'; ++skipped )
			++hostBegin;

		// The host runs to the next slash; that slash is not part of the path.
		std::string::size_type hostEnd = url.find( '/', hostBegin );
		std::string::size_type pathBegin;
		if ( hostEnd == npos )
			{
			hostEnd = url.size( );
			pathBegin = url.size( );
			}
		else
			pathBegin = hostEnd + 1;
		Host = url.substr( hostBegin, hostEnd - hostBegin );

		// Domain is everything from the host's last period, period included.
		const std::string::size_type lastPeriod = Host.rfind( '.' );
		if ( lastPeriod != npos )
			Domain = Host.substr( lastPeriod );

		// Whatever remains is the path, with any "#fragment" removed.
		const std::string::size_type fragment = url.find( '#', pathBegin );
		if ( fragment == npos )
			Path = url.substr( pathBegin );
		else
			Path = url.substr( pathBegin, fragment - pathBegin );
		}
	};