Skip to content
Snippets Groups Projects
Commit fc704f06 authored by jsclose's avatar jsclose
Browse files

url->anchortext map working, added additional features for checking valid url

parent 01c790b6
No related branches found
No related tags found
No related merge requests found
No preview for this file type
......@@ -20,7 +20,15 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
//Looks to see if the complete url already exists, if so return
if ( this->duplicateUrlMap->find( url->getCompleteUrl() ) != this->duplicateUrlMap->end( ) )
{
//update the anchor text
pthread_mutex_lock( &m );
(*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()]++;
pthread_mutex_unlock( &m );
//add the new
return ;
}
else
{
......@@ -41,11 +49,21 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
}
else
{
pthread_mutex_lock( &m );
this->domainMap->insert( std::make_pair( url->getHost( ), now )); //otherwise add to the map the current time
pthread_mutex_unlock( &m );
}
//add url to the duplicate url map
this->duplicateUrlMap->insert( url->getCompleteUrl( ) );
pthread_mutex_lock( &m );
(*duplicateUrlMap)[url->getCompleteUrl()][url->getAnchorText()] = 1;
pthread_mutex_unlock( &m );
return;
}
}
......@@ -54,23 +72,26 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
/**
 * Enqueues a URL on the frontier and wakes waiting consumers.
 *
 * URLs that are null or whose isValid flag is false are ignored. For a valid
 * URL, checkUrl() is called once, then the URL is pushed onto the queue while
 * holding m. consumer_cv is broadcast when the push leaves exactly one element
 * in the queue.
 *
 * @param url URL to enqueue; the pointer itself is stored in the queue.
 */
void UrlFrontier::Push( ParsedUrl * url )
{
    // Nothing to do for a missing or invalid URL.
    if ( url == nullptr || !url->isValid )
    {
        return;
    }

    // checkUrl() locks and unlocks m internally, so it must run before m is
    // taken below.
    // NOTE(review): checkUrl() returns void, so a URL it treats as already
    // seen is still enqueued here -- confirm whether that is intended.
    checkUrl( url );

    //set the value of the last time the domain was seen to score
    //url.setTime(difference);
    //url.setScore();

    // Lock exactly once: relocking m from the same thread before unlocking
    // would block forever on a default (non-recursive) mutex.
    pthread_mutex_lock( &m );

    queue.push( url );

    // The queue held nothing before this push, so consumers may be waiting.
    if ( queue.size( ) == 1 )
    {
        pthread_cond_broadcast( &consumer_cv );
    }

    pthread_mutex_unlock( &m );
}
......
......@@ -11,6 +11,9 @@
using namespace std;

// Number of times each anchor text string has been counted.
using anchorToCountMap = unordered_map< string, int >;

// Outer key is a string (the frontier indexes it by complete URL); the mapped
// value holds the per-anchor-text counts for that key.
using urlMap = unordered_map< string, anchorToCountMap >;
class ComparisonClass {
public:
bool operator() (ParsedUrl *lhs , ParsedUrl *rhs) {
......@@ -41,7 +44,7 @@ class UrlFrontier
private:
set< string > *duplicateUrlMap = new set< string>( );
urlMap *duplicateUrlMap = new urlMap;
unordered_map< string , time_t > *domainMap = new unordered_map< string, time_t >( );
};
......
......@@ -32,6 +32,7 @@ private:
public:
ParsedUrl() {}
ParsedUrl ( string input_url )
......@@ -129,6 +130,9 @@ public:
setScore( );
}
else
isValid = false;
}
......@@ -237,6 +241,7 @@ public:
delete[] pathBuffer;
}
bool isValid = true;
private:
char *pathBuffer;
};
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment