Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eecs398-search
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
vcday
eecs398-search
Commits
0fe06957
Commit
0fe06957
authored
7 years ago
by
benbergk
Browse files
Options
Downloads
Patches
Plain Diff
formatting changes and added lock around file
parent
2f874e38
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
crawler/spider.cpp
+24
-85
24 additions, 85 deletions
crawler/spider.cpp
crawler/spider.h
+1
-2
1 addition, 2 deletions
crawler/spider.h
shared/documentMap.h
+98
-3
98 additions, 3 deletions
shared/documentMap.h
with
123 additions
and
90 deletions
crawler/spider.cpp
+
24
−
85
View file @
0fe06957
...
...
@@ -16,11 +16,7 @@
#include
"LocalReader.h"
#include
"SocketReader.h"
namespace filepath
{
// Location of the document map file, as a suffix appended to the current
// working directory by the code that opens it.
// The pointer itself is const so the path cannot be re-pointed at runtime.
// NOTE(review): "../shared/documentMap.h", included directly below, defines
// filepath::DOC_MAP as well -- only one of the two definitions can stay.
const char *const DOC_MAP = "/docMap.txt";
}
#include
"../shared/documentMap.h"
string
Spider
::
getUrl
()
...
...
@@ -41,11 +37,10 @@ void Spider::FuncToRun()
string
currentUrl
=
getUrl
(
);
char
*
fileMap
;
bool
toCrawl
=
shouldURLbeCrawled
(
currentUrl
);
if
(
toCrawl
)
//url has not been seen
if
(
shouldURLbeCrawled
(
currentUrl
))
{
if
(
cond
)
bool
success
=
writeDocToDisk
(
currentUrl
);
if
(
success
&&
cond
)
{
...
...
@@ -75,95 +70,39 @@ Takes a URL. Hashes it. Checks if the url is in the docMapLookup. If it is, chec
*/
/*
 * Appends a record for `url` to the document map and remembers where it went.
 *
 * Builds a Document for the url and asks it to write itself to the doc map.
 * When that reports -1 nothing is cached and false is returned. Otherwise the
 * (url, position) pair is inserted into docMapLookup, the whole lookup table
 * is echoed to stdout as "url => position" lines, and true is returned.
 */
bool Spider::writeDocToDisk( string url )
{
    Document doc( url );
    const int offsetInDocMap = doc.WriteToDocMap( );

    // -1 is WriteToDocMap's failure value: leave the lookup table untouched.
    if ( offsetInDocMap == -1 )
        return false;

    this->docMapLookup->insert( std::pair< string, int >( url, offsetInDocMap ) );

    // Debug dump of every cached entry.
    for ( const auto &entry : *( this->docMapLookup ) )
        std::cout << entry.first << " => " << entry.second << '\n';

    return true;
}
/*
 * Decides whether `url` still needs to be crawled.
 *
 * Returns true when `url` has no entry in docMapLookup. Returns false when an
 * entry exists; in that case the stored record is echoed through
 * Document::PrintDocMap first (test output only).
 *
 * This function only inspects the lookup table. Recording a new url is the
 * job of writeDocToDisk, which the caller invokes once this returns true;
 * appending here as well would store every new url twice. Reading the record
 * back is delegated to Document::PrintDocMap instead of a second inline copy
 * of the same open/seek/read sequence, which also avoids reporting I/O errors
 * by returning errno from a bool function (any non-zero errno reads as true).
 */
bool Spider::shouldURLbeCrawled( string url )
{
    //search for url in doc cache
    auto locationOnDisk = this->docMapLookup->find( url );

    //if it doesnt find anything for that url key
    if ( locationOnDisk == this->docMapLookup->end( ) )
        return true;

    //Just for testing
    Document::PrintDocMap( url, locationOnDisk->second );
    return false;
}
...
...
This diff is collapsed.
Click to expand it.
crawler/spider.h
+
1
−
2
View file @
0fe06957
...
...
@@ -37,8 +37,7 @@ public:
//Where to write to disk? What type of data are we reading in?
int
writeFileToDisk
(
char
*
fileContents
,
string
locationOnDisk
);
void
markURLSeen
(
string
URL
);
bool
writeDocToDisk
(
string
url
);
bool
shouldURLbeCrawled
(
string
URL
);
...
...
This diff is collapsed.
Click to expand it.
shared/documentMap.h
+
98
−
3
View file @
0fe06957
...
...
@@ -7,18 +7,113 @@
#include
"url.h"
#include
<string>
#include
<vector>
#include
<pthread.h>
using
namespace
std
;
namespace filepath
{
// Location of the document map file, as a suffix appended to the current
// working directory by the code that opens it.
// Declared `const char *const`: a const object at namespace scope has internal
// linkage, so this header can be included from more than one translation unit
// without duplicate-definition link errors, and the path cannot be re-pointed.
const char *const DOC_MAP = "/docMap.txt";
}

// Serializes access to the document map file; Document's methods lock it
// around their file operations.
// NOTE(review): this is a definition with external linkage inside a header, so
// a second translation unit including this file would define it twice. It must
// stay a single shared object (marking it `static` would give every translation
// unit its own lock), so the fix is either `inline` (C++17) or an `extern`
// declaration here plus one definition in a .cpp file -- confirm which the
// build supports.
pthread_mutex_t docMap_mutex = PTHREAD_MUTEX_INITIALIZER;
class
Document
{
p
ublic
:
Url
url
;
p
rivate
:
Parsed
Url
url
;
long
docID
;
bool
lastCrawlStatus
;
int
lastCrawlDate
;
int
lastCrawlPageCount
;
//add more info fields here
public:
Document
(
string
url_in
)
:
url
(
ParsedUrl
(
url_in
))
{}
int
WriteToDocMap
()
{
pthread_mutex_lock
(
&
docMap_mutex
);
//for now just write url
string
loc
=
util
::
GetCurrentWorkingDir
()
+
filepath
::
DOC_MAP
;
int
file
=
util
::
getFileDescriptor
(
loc
.
c_str
(),
"W"
);
off_t
resultPosition
=
0
;
//check if its available
if
(
file
==
-
1
)
{
cerr
<<
"Error opening docMap"
<<
endl
;
close
(
file
);
pthread_mutex_unlock
(
&
docMap_mutex
);
return
-
1
;
}
else
{
//get the current size of the docMap
size_t
seekPosition
=
util
::
FileSize
(
file
);
//seek to the end of the file
resultPosition
=
lseek
(
file
,
seekPosition
,
SEEK_SET
);
if
(
resultPosition
==
-
1
)
{
cerr
<<
"Could not seek to "
<<
seekPosition
<<
", error = "
<<
errno
;
close
(
file
);
pthread_mutex_unlock
(
&
docMap_mutex
);
return
-
1
;
}
cout
<<
"Current docMap position on disk"
<<
endl
;
cout
<<
resultPosition
<<
endl
;
size_t
success
=
write
(
file
,
"Hello World!
\n
"
,
14
);
if
(
success
==
-
1
)
{
cerr
<<
"Error writing document object to document map"
<<
endl
;
close
(
file
);
pthread_mutex_unlock
(
&
docMap_mutex
);
return
-
1
;
}
}
close
(
file
);
pthread_mutex_unlock
(
&
docMap_mutex
);
return
resultPosition
;
}
static
void
PrintDocMap
(
string
url
,
int
location
)
{
pthread_mutex_lock
(
&
docMap_mutex
);
std
::
cout
<<
url
<<
" is "
<<
location
;
string
loc
=
util
::
GetCurrentWorkingDir
()
+
filepath
::
DOC_MAP
;
int
file
=
util
::
getFileDescriptor
(
loc
.
c_str
(),
"R"
);
Document
()
{};
//check if its available
if
(
file
)
{
off_t
resultPosition
=
lseek
(
file
,
(
size_t
)
location
,
SEEK_SET
);
int
bytes
=
14
;
if
(
bytes
>
0
)
{
char
*
buffer
=
new
char
[
bytes
];
ssize_t
bytesRead
;
if
(
bytesRead
=
read
(
file
,
buffer
,
bytes
))
write
(
1
,
buffer
,
bytesRead
);
else
{
cerr
<<
"Could not read "
<<
bytes
<<
" bytes at position "
<<
resultPosition
<<
", error = "
<<
errno
;
pthread_mutex_unlock
(
&
docMap_mutex
);
return
;
}
}
}
pthread_mutex_unlock
(
&
docMap_mutex
);
return
;
}
};
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment