Skip to content
Snippets Groups Projects
Commit fc704f06 authored by jsclose's avatar jsclose
Browse files

url->anchortext map working; added additional features for checking whether a url is valid

parent 01c790b6
Branches
No related tags found
No related merge requests found
No preview for this file type
......@@ -20,7 +20,15 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
//Looks to see if the complete url already exists, if so return
if ( this->duplicateUrlMap->find( url->getCompleteUrl() ) != this->duplicateUrlMap->end( ) )
{
//update the anchor text
pthread_mutex_lock( &m );
(*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()]++;
pthread_mutex_unlock( &m );
//add the new
return ;
}
else
{
......@@ -41,11 +49,21 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
}
else
{
pthread_mutex_lock( &m );
this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time
pthread_mutex_unlock( &m );
}
//add url to the duplicate url map
this->duplicateUrlMap->insert( url->getCompleteUrl( ) );
pthread_mutex_lock( &m );
(*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()] = 1;
pthread_mutex_unlock( &m );
return;
}
}
......@@ -54,23 +72,26 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
/**
 * Records a url with checkUrl and appends it to the frontier queue.
 *
 * Urls that are null or whose isValid flag is false are ignored.
 *
 * Each step runs exactly once per call: the url is recorded once, the mutex
 * is taken once, and the url is enqueued once. Taking the same mutex twice
 * on one thread before unlocking would deadlock (or be undefined behaviour)
 * on a default pthread mutex, and enqueueing twice would hand the same
 * pointer to two consumers.
 *
 * NOTE(review): checkUrl returns void, so this function cannot tell whether
 * the url was already recorded and enqueues it either way. If already-seen
 * urls are meant to be skipped, checkUrl needs to report that — confirm.
 *
 * @param url  url to enqueue; the pointer itself is stored in the queue.
 */
void UrlFrontier::Push( ParsedUrl * url )
{
if ( url == nullptr || !url->isValid )
{
return;
}

//record the url (and its anchor text) in the frontier's bookkeeping maps
checkUrl( url );

//TODO: set the value of the last time the domain was seen to score
//url.setTime(difference);
//url.setScore();

pthread_mutex_lock( &m );
queue.push( url );
//the queue just went from empty to non-empty, so wake waiting consumers
if ( queue.size( ) == 1 )
{
pthread_cond_broadcast( &consumer_cv );
}
pthread_mutex_unlock( &m );
}
......
......@@ -11,6 +11,9 @@
using namespace std;

// Occurrence count for each anchor text string.
using anchorToCountMap = unordered_map< string, int >;
// Url string -> anchor-text counts recorded for that url.
using urlMap = unordered_map< string, anchorToCountMap >;
class ComparisonClass {
public:
bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) {
......@@ -41,7 +44,7 @@ class UrlFrontier
private:
set< string > *duplicateUrlMap = new set< string>( );
urlMap *duplicateUrlMap = new urlMap;
unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( );
};
......
......@@ -32,6 +32,7 @@ private:
public:
ParsedUrl() {}
ParsedUrl ( string input_url )
......@@ -129,6 +130,9 @@ public:
setScore( );
}
else
isValid = false;
}
......@@ -237,6 +241,7 @@ public:
delete[] pathBuffer;
}
bool isValid = true;
private:
char *pathBuffer;
};
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment