Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eecs398-search
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
vcday
eecs398-search
Commits
fc704f06
Commit
fc704f06
authored
6 years ago
by
jsclose
Browse files
Options
Downloads
Patches
Plain Diff
url->anchortext map working; added additional features for checking valid URLs
parent
01c790b6
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
crawler-parser-indexer-test
+0
-0
0 additions, 0 deletions
crawler-parser-indexer-test
crawler/UrlFrontier.cpp
+33
-12
33 additions, 12 deletions
crawler/UrlFrontier.cpp
crawler/UrlFrontier.h
+4
-1
4 additions, 1 deletion
crawler/UrlFrontier.h
shared/url.h
+5
-0
5 additions, 0 deletions
shared/url.h
with
42 additions
and
13 deletions
crawler-parser-indexer-test
+
0
−
0
View file @
fc704f06
No preview for this file type
This diff is collapsed.
Click to expand it.
crawler/UrlFrontier.cpp
+
33
−
12
View file @
fc704f06
...
...
@@ -20,7 +20,15 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
//Looks to see if the complete url already exists, if so return
if
(
this
->
duplicateUrlMap
->
find
(
url
->
getCompleteUrl
()
)
!=
this
->
duplicateUrlMap
->
end
(
)
)
{
//update the anchor text
pthread_mutex_lock
(
&
m
);
(
*
duplicateUrlMap
)[
url
->
getCompleteUrl
()][
url
->
getAnchorText
()]
++
;
pthread_mutex_unlock
(
&
m
);
//add the new
return
;
}
else
{
...
...
@@ -41,11 +49,21 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
}
else
{
pthread_mutex_lock
(
&
m
);
this
->
domainMap
->
insert
(
std
::
make_pair
(
url
->
getHost
(
),
now
));
//otherwise add to the map the current time
pthread_mutex_unlock
(
&
m
);
}
//add url to the duplicate url map
this
->
duplicateUrlMap
->
insert
(
url
->
getCompleteUrl
(
)
);
pthread_mutex_lock
(
&
m
);
(
*
duplicateUrlMap
)[
url
->
getCompleteUrl
()][
url
->
getAnchorText
()]
=
1
;
pthread_mutex_unlock
(
&
m
);
return
;
}
}
...
...
@@ -54,23 +72,26 @@ void UrlFrontier::checkUrl(ParsedUrl* url)
void
UrlFrontier
::
Push
(
ParsedUrl
*
url
)
{
//if the url has been seen? if so, dont add it
if
(
url
->
isValid
)
{
checkUrl
(
url
);
checkUrl
(
url
);
//set the value of the last time the domain was seen to score
//url.setTime(difference);
//url.setScore();
pthread_mutex_lock
(
&
m
);
//set the value of the last time the domain was seen to score
//url.setTime(difference);
//url.setScore();
pthread_mutex_lock
(
&
m
);
queue
.
push
(
url
);
queue
.
push
(
url
);
if
(
queue
.
size
(
)
==
1
)
{
pthread_cond_broadcast
(
&
consumer_cv
);
}
if
(
queue
.
size
(
)
==
1
)
{
pthread_cond_broadcast
(
&
consumer_cv
);
}
pthread_mutex_unlock
(
&
m
);
pthread_mutex_unlock
(
&
m
);
}
}
...
...
This diff is collapsed.
Click to expand it.
crawler/UrlFrontier.h
+
4
−
1
View file @
fc704f06
...
...
@@ -11,6 +11,9 @@
using
namespace
std
;
typedef
unordered_map
<
string
,
int
>
anchorToCountMap
;
typedef
unordered_map
<
string
,
anchorToCountMap
>
urlMap
;
class
ComparisonClass
{
public:
bool
operator
()
(
ParsedUrl
*
lhs
,
ParsedUrl
*
rhs
)
{
...
...
@@ -41,7 +44,7 @@ class UrlFrontier
private:
set
<
string
>
*
duplicateUrlMap
=
new
set
<
string
>
(
)
;
urlMap
*
duplicateUrlMap
=
new
urlMap
;
unordered_map
<
string
,
time_t
>
*
domainMap
=
new
unordered_map
<
string
,
time_t
>
(
);
};
...
...
This diff is collapsed.
Click to expand it.
shared/url.h
+
5
−
0
View file @
fc704f06
...
...
@@ -32,6 +32,7 @@ private:
public:
ParsedUrl
()
{}
ParsedUrl
(
string
input_url
)
...
...
@@ -129,6 +130,9 @@ public:
setScore
(
);
}
else
isValid
=
false
;
}
...
...
@@ -237,6 +241,7 @@ public:
delete
[]
pathBuffer
;
}
bool
isValid
=
true
;
private
:
char
*
pathBuffer
;
};
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment