Commit 22d29b60, authored 7 years ago by aanvi

Added body parsing

Parent: c1880624
No related branches found. No related tags found.
1 merge request: !2 WIP: Crawler parser 2 merge into duplicate url-crawler

Showing 4 changed files with 514 additions and 406 deletions:
- parser/Parser.cpp (+303, -223)
- parser/Parser.h (+69, -82)
- parser/tests/parserTest.cpp (+101, -5)
- util/Tokenizer.h (+41, -96)
parser/Parser.cpp (+303, -223)
...
@@ -6,9 +6,9 @@
 * @param urlFrontierIn
 */
Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
	{
	urlFrontier = urlFrontierIn;
	}
/**
...
@@ -16,215 +16,160 @@ Parser::Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn )
 * @return
 */
const unordered_map< string, vector< unsigned long > > *Parser::execute ( Document *document )
	{
	Tokenizer tokenizer;
	parse( document->DocToString( ), document->getUrl( ), &tokenizer );
	return tokenizer.get( );
	}
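For orientation, here is a minimal sketch of how this entry point is driven, mirroring the pattern used in parser/tests/parserTest.cpp later in this commit; the URL and HTML literals are illustrative:

	// Hypothetical driver: build a Document, run the Parser, inspect the index.
	ProducerConsumerQueue< ParsedUrl > urlFrontier;          // frontier the parser pushes discovered links into
	Parser parser( &urlFrontier );
	ParsedUrl url = ParsedUrl( "http://www.example.com" );   // illustrative URL
	char docString[ 1024 ];
	strcpy( docString, "<title>Hello</title><p>world</p>" );
	Document document( url, docString );
	auto dictionary = parser.execute( &document );           // word -> offset postings
	delete dictionary;                                       // the tests delete the returned index when done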
/**
 * Parses file
 * @param inFile
 * @return
 */
<<<<<<< HEAD
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
// TODO different counts: frequency, total num unique words, etc
// TODO flag different types of words - determine if we want to do this in key of dict or value (in wordData struct)
/*
 * Anchor text = #
 * Title = *
 * Url = @
 * Body = %
 */
void Parser::parse ( string html, Tokenizer *tokenizer )
	{
	//maybe add some code to read in stream and add chars to string as they come in
	auto htmlIt = html.begin( );
	int offset = 0;
	while ( htmlIt != html.end( ) )
=======
void Parser::parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer )
	{
	// tokenize url
	string host = "";
	host.assign( currentUrl.Host );
	string path = "";
	path.assign( currentUrl.Path );
	string urlCurrent = host + "/" + path;
	unsigned long htmlIt = 0;
	unsigned long offsetTitle = 0;
	unsigned long offsetBody = 0;
	unsigned long offsetURL = 0;
	offsetURL = tokenizer->execute( urlCurrent, offsetURL, Tokenizer::URL );
	while ( htmlIt < html.size( ) )
		{
		unsigned long begCloseTag = 0;
		bool isParagraph = false;
		unsigned long savePosition = htmlIt;
		// if open bracket
		if ( html[ htmlIt ] == '<' )
			{
			if ( html[ htmlIt + 1 ] == 'p' && ( ( html[ htmlIt + 2 ] ) == '>' || ( html[ htmlIt + 2 ] == ' ' ) ) )
				{
				begCloseTag = findNext( "</p>", htmlIt, html );
				isParagraph = true;
				}
			else
				{
				begCloseTag = findNext( "</", htmlIt, html );
				}
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
<<<<<<< HEAD
			// TODO have to put a conditional that ensures the opening and closing tags are the same type
			auto begCloseTag = findNext( "</", htmlIt );
			auto endCloseTag = findNext( ">", begCloseTag );
			string line ( htmlIt, endCloseTag + 1 );
=======
			unsigned long endCloseTag = findNext( ">", begCloseTag, html );
			string line = subStr( html, htmlIt, endCloseTag + 1 - htmlIt );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
			htmlIt = endCloseTag + 2;
			//check if line is a script
			if ( isScript( line ) )
				{
				// DO NOTHING
				}
			// check if line is url
<<<<<<< HEAD
			else if ( url = extract_url( line ) != "" )
				{
				//where is urlFrontier defined?
				urlFrontier->push( url );
				}
			// check if line is title
			else if ( title = extract_title( line ) != "" )
				{
				tokenizer->execute( title, offset );
				}
			else if ( body = extract_body( line ) != "" )
				{
				tokenizer->execute( body, offset );
				}
=======
			string url = extract_url( line );
			if ( url != "" )
				{
				if ( isLocal( url ) )
					{
					string completeUrl = "";
					completeUrl.assign( currentUrl.CompleteUrl );
					url = completeUrl + url;
					}
				if ( isValid( url ) && url != urlCurrent )
					{
					// TODO ParsedUrl with anchor text
					ParsedUrl pUrl = ParsedUrl( url );
					urlFrontier->Push( pUrl );
					cout << url << endl;
					}
				}
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
			string title = extract_title( line );
			string header = extract_header( line );
			//checking if html line is script
			if ( isTag( line, "script" ) )
				{
				//DO NOTHING
				}
			//checking for p tag
			else if ( isParagraph )
				{
				string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
				offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
				}
			//if html line is url, parses accordingly and pushes to frontier
			else if ( url != "" )
				{
				if ( isLocal( url ) )
					{
					string completeUrl = "";
					completeUrl.assign( currentUrl.CompleteUrl );
					url = completeUrl + url;
					}
				if ( isValid( url ) && url != urlCurrent )
					{
					// TODO ParsedUrl with anchor text
					ParsedUrl pUrl = ParsedUrl( url );
					// urlFrontier->Push( pUrl );
					cout << url << endl;
					}
				}
			//check if line is header; classifies as body text
			else if ( header != "" )
				{
				offsetBody = tokenizer->execute( header, offsetBody, Tokenizer::BODY );
				}
			// check if line is title
			else if ( title != "" )
				{
<<<<<<< HEAD
				//DO NOTHING
=======
				offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
				}
			}
		else
			{
			++htmlIt;
			}
		}
	}
/*
 * Returns true if script tag, false if not
 */
bool Parser::isScript ( string &word )
	{
	if ( *findStr( "<script", word ) != '\0' )
		{
		return true;
		}
	return false;
	}
/*
 * Returns body text if p tags, empty string if not
 * If there's no closing tag, stops at the first opening tag or when it hits end of file
 */
string Parser::extract_body ( string &word, int &offset )
	{
	string body = "";
	auto foundBody = findStr( "<p", word );
	if ( *foundBody != '\0' )
		{
		// skip past the opening <p ...> tag
		while ( *foundBody != '\0' && *foundBody != '>' )
			{
			++foundBody;
			}
		if ( *foundBody == '>' )
			{
			++foundBody;
			}
		// copy body text until the next tag opens, counting words by spaces
		while ( *foundBody != '\0' && *foundBody != '<' )
			{
			body += *foundBody;
			if ( *foundBody == ' ' )
				{
				offset += 1;
				}
			++foundBody;
			}
		}
	return body;
	}
/**
 * Returns a url, or "" if none
 * @param word
 * @return
 */
<<<<<<< HEAD
string Parser::extract_url ( string &word )
=======
string Parser::extract_url ( string html )
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb
	{
	string url = "";
	if ( findStr( "<a", html ) != html.size( ) )
		{
		unsigned long foundHref = findStr( "href", html );
		unsigned long foundHttp = findNext( "http", foundHref, html );
		if ( foundHttp < html.size( ) )
			{
			url = "";
			unsigned long closeTag = findNext( ">", foundHref, html );
			unsigned long closeSpace = findNext( " ", foundHref, html );
			unsigned long closeUrl = 0;
			// end == ' >'
			if ( closeSpace < html.size( ) && closeTag < html.size( ) && closeSpace < closeTag )
				{
				if ( html[ closeSpace - 1 ] == '\"' )
					{
					closeSpace -= 1;
					}
				closeUrl = closeSpace;
				}
			// end == '>'
			else if ( closeTag < html.size( ) )
				{
				if ( html[ closeTag - 1 ] == '\"' )
					{
					closeTag -= 1;
					}
				closeUrl = closeTag;
				}
			while ( foundHttp != closeUrl && html[ foundHttp ] != '\n' )
				{
				url.push_back( html[ foundHttp ] );
				++foundHttp;
				}
			}
		}
	return url;
	}
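extract_url leans on the project's string helpers from util/stringProcessing.h, whose definitions are not part of this diff. From the call sites, the index-based overloads appear to return the match's start position, or the haystack's size when there is no match (the char-pointer overload used by isScript above is analogous). The following stand-ins are an inference from usage, not the project's actual implementation:

	// Hypothetical stand-ins inferred from call sites in this diff:
	// findStr( needle, haystack )          -> index of first match, else haystack.size( )
	// findNext( needle, start, haystack )  -> index of first match at/after start, else haystack.size( )
	unsigned long findStr ( string needle, string haystack )
		{
		size_t pos = haystack.find( needle );
		return pos == string::npos ? haystack.size( ) : pos;
		}

	unsigned long findNext ( string needle, unsigned long start, string haystack )
		{
		size_t pos = haystack.find( needle, start );
		return pos == string::npos ? haystack.size( ) : pos;
		}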
/**
 * Returns a title, or "" if none
...
@@ -232,21 +177,21 @@ string Parser::extract_url ( string html )
 * @return
 */
string Parser::extract_title ( string html )
	{
	string title = "";
	char end = '<';
	auto pos = findStr( "<title>", html );
	if ( pos < html.size( ) )
		{
		pos += 7;
		while ( html[ pos ] != end )
			{
			title += html[ pos ];
			++pos;
			}
		}
	return title;
	}
/**
 * Will return true if local url
...
@@ -255,9 +200,9 @@ string Parser::extract_title ( string html )
 * @return
 */
bool Parser::isLocal ( string url )
	{
	return ( url[ 0 ] == '/' );
	}
/**
 * Returns false if the link is an invalid type
...
@@ -266,28 +211,163 @@ bool Parser::isLocal ( string url )
 * @return
 */
bool Parser::isValid ( string url )
	{
	unsigned long size = url.size( );
	string lastFive = lastN( url, 5 );
	string lastFour = lastN( url, 4 );
	// .html
	if ( lastFive == ".html" )
		{
		return true;
		}
	// png || jpg || css || gif || pdf || wav || mp3 || mp4 || ico
	if ( lastFour == ".png" || lastFour == ".jpg" || lastFour == ".css" || lastFour == ".gif"
			|| lastFour == ".pdf" || lastFour == ".wav" || lastFour == ".mp3" || lastFour == ".mp4"
			|| lastFour == ".ico" )
		{
		return false;
		}
	//jpeg
	if ( lastFive == ".jpeg" )
		{
		return false;
		}
	return true;
	}
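The helper lastN is also not shown in this diff; it evidently returns the last n characters of its argument. Assuming that, and ignoring access control for illustration (isValid is private in parser/Parser.h), the intended behavior reads like:

	// Illustrative expectations only, not tests from this commit:
	assert( isValid( "http://a.com/page.html" ) );     // ".html" pages are kept
	assert( !isValid( "http://a.com/logo.png" ) );     // image/media assets are skipped
	assert( !isValid( "http://a.com/photo.jpeg" ) );   // ".jpeg" needs the five-char check
	assert( isValid( "http://a.com/about" ) );         // unknown endings default to true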
//TODO delete?? may not need
void Parser::remove_tag ( string &html, unsigned long &htmlIt, unsigned long savePosition, string tag )
	{
	unsigned long openTag = findStr( "<" + tag + ">", html );
	unsigned long closeTag = findNext( "</" + tag + ">", openTag, html );
	//TODO write erase functions??
	// erase the closing tag first so openTag's index stays valid;
	// "</tag>" is tag.length( ) + 3 chars, "<tag>" is tag.length( ) + 2
	html.erase( closeTag, tag.length( ) + 3 );
	html.erase( openTag, tag.length( ) + 2 );
	htmlIt = savePosition;
	}
void Parser::extract_all ( string line, unsigned long &offsetTitle, unsigned long &offsetBody, bool isParagraph,
		Tokenizer *tokenizer, ParsedUrl &currentUrl, string &urlCurrent )
	{
	// check if line is url
	string title = extract_title( line );
	string url = extract_url( line );
	//checking if html line is script
	if ( isTag( line, "script" ) )
		{
		//DO NOTHING
		}
	//TODO delete this conditional if keeping whats in main right now
	else if ( isParagraph )
		{
		string body = extract_body( line, offsetTitle, offsetBody, isParagraph, tokenizer, currentUrl, urlCurrent );
		offsetBody = tokenizer->execute( body, offsetBody, Tokenizer::BODY );
		}
	else if ( url != "" )
		{
		if ( isLocal( url ) )
			{
			string completeUrl = "";
			completeUrl.assign( currentUrl.CompleteUrl );
			url = completeUrl + url;
			}
		if ( isValid( url ) && url != urlCurrent )
			{
			// TODO ParsedUrl with anchor text
			ParsedUrl pUrl = ParsedUrl( url );
			// urlFrontier->Push( pUrl );
			cout << url << endl;
			}
		}
	// check if line is title
	else if ( title != "" )
		{
		offsetTitle = tokenizer->execute( title, offsetTitle, Tokenizer::TITLE );
		}
	else
		{
		//DO NOTHING
		}
	}

/**
 * Returns true if tag is in html, false if not
 * @param html
 * @return
 */
bool Parser::isTag ( string html, string tag )
	{
	string findTag = "<" + tag;
	if ( findStr( findTag, html ) != html.size( ) )
		{
		return true;
		}
	return false;
	}
string Parser::extract_body ( string html, unsigned long &offsetTitle, unsigned long &offsetBody, bool isParagraph,
		Tokenizer *tokenizer, ParsedUrl &currentUrl, string &urlCurrent )
	{
	string body = "";
	unsigned long startParTag = findNext( "<p>", 0, html );
	unsigned long closeParTag = findNext( "</p>", startParTag, html );
	unsigned long nextCloseTag = findNext( "</", startParTag, html );
	startParTag += 3;
	while ( nextCloseTag != startParTag )
		{
		if ( closeParTag == nextCloseTag )
			{
			while ( startParTag != closeParTag )
				{
				body += html[ startParTag ];
				++startParTag;
				if ( startParTag >= html.size( ) )
					{
					return body;
					}
				}
			}
		else
			{
			unsigned long newHtmlStart = findNext( "<", startParTag, html );
			char a = html[ newHtmlStart ];
			unsigned long closeNewHtml = findNext( ">", newHtmlStart, html );
			char b = html[ closeNewHtml ];
			unsigned long newHtmlTagLength = closeNewHtml - newHtmlStart;
			while ( startParTag != newHtmlStart )
				{
				body += html[ startParTag ];
				++startParTag;
				}
			string newHtml = subStr( html, newHtmlStart, nextCloseTag - newHtmlStart + newHtmlTagLength + 2 );
			extract_all( newHtml, offsetTitle, offsetBody, false, tokenizer, currentUrl, urlCurrent );
			startParTag = nextCloseTag + newHtmlTagLength + 2;
			nextCloseTag = findNext( "</", startParTag, html );
			}
		}
	return body;
	}
string Parser::extract_header ( string html )
	{
	string header = "";
	unsigned long startHeader = findStr( "<h", html );
	// the digit sits two past '<' ( "<h1>" ), matching extract_title's start-index use of findStr
	if ( startHeader != html.size( )
			&& ( html[ startHeader + 2 ] >= '1' && html[ startHeader + 2 ] <= '6' ) )
		{
		unsigned long endHeader = findNext( "</h", startHeader, html );
		// skip past the opening "<hN>" tag
		startHeader += 4;
		while ( startHeader != endHeader )
			{
			header += html[ startHeader ];
			++startHeader;
			}
		}
	return header;
	}
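A quick worked case under the start-index semantics inferred for findStr/findNext above (extract_header is private; the calls are shown for intuition only):

	// extract_header( "<h1>Joe the cat</h1>" ) -> "Joe the cat"
	//   findStr( "<h", ... ) = 0, html[ 2 ] == '1' passes the digit check,
	//   startHeader += 4 skips "<h1>", and the loop copies up to the "</h" position.
	// extract_header( "<p>text</p>" ) -> ""  (no "<h" match)

The '1'..'6' digit check is what keeps tags like "<html>", "<head>", and "<hr>" from being mistaken for headers; the fixed +4 skip assumes a bare "<hN>" tag with no attributes.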
parser/Parser.h (+69, -82)
...
@@ -19,92 +19,79 @@ using namespace std;
 * Returns a pointer to a dictionary that contains the tokenized input
 */
class Parser
	{
public:
<<<<<<< HEAD
	Parser ( ProducerConsumerQueue< string > *urlFrontierIn )
		{
		urlFrontier = urlFrontierIn;
		}
=======
	/**
	 * Parser Cstor
	 * @param urlFrontierIn
	 */
	Parser ( ProducerConsumerQueue< ParsedUrl > *urlFrontierIn );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

	/**
	 * Executes the Parser
	 * @return
	 */
<<<<<<< HEAD
	// TODO need to change vector type to word data, change where struct is declared
	const unordered_map< string, vector< Tokenizer::wordData >> *execute ( Document *document )
		{
		Tokenizer tokenizer;
		parse( document->DocToString( ), &tokenizer );
		return tokenizer.get( );
		}
=======
	const unordered_map< string, vector< unsigned long > > *execute ( Document *document );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

private:
	ProducerConsumerQueue< ParsedUrl > *urlFrontier;

	/**
	 * Parses file
	 * @param inFile
	 * @return
	 */
<<<<<<< HEAD
	void parse ( string html, Tokenizer *tokenizer );
=======
	void parse ( string html, ParsedUrl currentUrl, Tokenizer *tokenizer );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

	/**
	 * Returns a url, or "" if none
	 * @param html
	 * @return
	 */
	string extract_url ( string html );

	/**
	 * Returns a title, or "" if none
	 * @param html
	 * @return
	 */
	string extract_title ( string html );

<<<<<<< HEAD
	bool isScript ( string &word );

	string extract_body ( string &word );
=======
	/**
	 * Will return true if local url
	 *
	 * @param url
	 * @return
	 */
	bool isLocal ( string url );

	/**
	 * Returns true if url is valid
	 *
	 * @param url
	 * @return
	 */
	bool isValid ( string url );
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

	bool isTag ( string html, string tag );

	string extract_body ( string html, unsigned long &offsetTitle, unsigned long &offsetBody, bool isParagraph,
			Tokenizer *tokenizer, ParsedUrl &currentUrl, string &urlCurrent );

	void extract_all ( string line, unsigned long &offsetTitle, unsigned long &offsetBody, bool isParagraph,
			Tokenizer *tokenizer, ParsedUrl &currentUrl, string &urlCurrent );

	//TODO delete?? may not need
	void remove_tag ( string &html, unsigned long &htmlIt, unsigned long savePosition, string tag );

	string extract_header ( string html );
	};
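The header leans on ProducerConsumerQueue< ParsedUrl >, whose definition is outside this diff. Only Push appears in the committed code; a consumer-side Pop is assumed for completeness. A minimal thread-safe sketch under those assumptions (requires <queue>, <mutex>, <condition_variable>):

	// Sketch only; NOT the project's ProducerConsumerQueue.
	template< typename T >
	class ProducerConsumerQueueSketch
		{
	public:
		void Push ( T item )
			{
			lock_guard< mutex > lock( m );
			q.push( item );
			cv.notify_one( );           // wake one waiting consumer
			}
		T Pop ( )                       // assumed interface; not shown in this diff
			{
			unique_lock< mutex > lock( m );
			cv.wait( lock, [ this ] { return !q.empty( ); } );
			T item = q.front( );
			q.pop( );
			return item;
			}
	private:
		queue< T > q;
		mutex m;
		condition_variable cv;
		};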
parser/tests/parserTest.cpp (+101, -5)
#include <string>
#include <cstring>
#include <cassert>
#include <iostream>
#include "../Parser.h"
...
@@ -14,6 +15,10 @@ void testComplex ( );
void testURL ( );
void testExtractBody ( );
void testBody ( );

int main ( )
	{
	cout << "Testing Parser ... " << endl << endl;
...
@@ -26,11 +31,13 @@ int main ( )
	cout << "Testing Complex: " << endl;
	testComplex( );
	cout << "Complex Test Passed!" << endl;
	cout << "Testing BODY: " << endl;
	testExtractBody( );
	testBody( );
	cout << "Parser Tests Passed! :D" << endl;
	}

void testSimple ( )
	{
	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
...
@@ -143,4 +150,93 @@ void testURL ( )
	delete dictionary;
	dictionary = nullptr;
	}

void testBody ( )
	{
	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
	ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
	char docString[ 1024 ];
	strcpy( docString, "<!DOCTYPE html>\n"
			"<html>\n"
			"<head>\n"
			"<!-- HTML Codes by Quackit.com -->\n"
			"<title>\n"
			"Story of Cat</title>\n"
			"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n"
			"<meta name=\"keywords\" content=\"cat story\">\n"
			"<meta name=\"description\" content=\"This is the tale of a cat names joe\">\n"
			"<style>\n"
			"body {background-color:#ffffff;background-repeat:no-repeat;background-position:top left;background-attachment:fixed;}\n"
			"h1{font-family:Arial, sans-serif;color:#000000;background-color:#ffffff;}\n"
			"p {font-family:Georgia, serif;font-size:14px;font-style:normal;font-weight:normal;color:#000000;background-color:#ffffff;}\n"
			"</style>\n"
			"</head>\n"
			"<body>\n"
			"<h1>Joe the cat</h1>\n"
			"<p>On Saturday, joe the cat went to the store. He climbed up a mountain? It was weird. The store was called Food Store</p>\n"
			"</body>\n"
			"</html>" );
	Document document( url, docString );
	Parser parser( &urlFrontierTest );
	auto dictionary = parser.execute( &document );
	cout << dictionary->size( ) << endl;
	//assert( dictionary->size( ) == 4);
	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
		{
		cout << it->first << ':';
		for ( int i = 0; i < it->second.size( ); ++i )
			{
			cout << it->second[ i ] << " ";
			}
		cout << std::endl;
		}
	}

void testExtractBody ( )
	{
	ProducerConsumerQueue< ParsedUrl > urlFrontierTest;
	ParsedUrl url = ParsedUrl( "http://www.testurl.com" );
	char docString[ 1024 ];
	strcpy( docString, "<title>Paragraph body text hello</title>" );
	Document document( url, docString );
	Parser parser( &urlFrontierTest );
	auto dictionary = parser.execute( &document );
	cout << dictionary->size( ) << endl;
	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
		{
		cout << it->first << ':';
		for ( int i = 0; i < it->second.size( ); ++i )
			{
			cout << it->second[ i ] << " ";
			}
		cout << std::endl;
		}
	cout << endl << endl;
	assert( dictionary->size( ) == 6 );

	char docString2[ 1024 ];
	strcpy( docString2, "<p>Paragraph body text hello <title>Specific title</title> more body words</p>" );
	Document document2( url, docString2 );
	Parser parser2( &urlFrontierTest );
	dictionary = parser.execute( &document2 );
	cout << "Dictionary 2 size " << dictionary->size( ) << endl;
	for ( auto it = dictionary->begin( ); it != dictionary->end( ); it++ )
		{
		cout << it->first << ':';
		for ( int i = 0; i < it->second.size( ); ++i )
			{
			cout << it->second[ i ] << " ";
			}
		cout << std::endl;
		}
	assert( dictionary->size( ) == 10 );
	assert( dictionary->at( "#specif" )[ 0 ] == 0 );
	assert( dictionary->at( "%paragraph" )[ 0 ] == 0 );
	assert( dictionary->at( "%bodi" )[ 1 ] == 5 );
	}
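The three final asserts encode the decorator scheme from util/Tokenizer.h; a reading of them, assuming the tokenizer case-folds, strips, stems, and then prefixes the decorator character (the stems shown are taken from the asserts themselves):

	// "#specif"    : '#' TITLE decorator + "Specific" case-folded and stemmed;
	//                offset 0 within the title text
	// "%paragraph" : '%' BODY decorator + "Paragraph"; first body word, offset 0
	// "%bodi"      : '%' BODY decorator + "body"; its second posting sits at offset 5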
util/Tokenizer.h (+41, -96)
#pragma once

#include <string>
...
@@ -6,111 +5,57 @@
#include <vector>
#include "stringProcessing.h"
#include "Stemmer.h"
#include "../parser/Parser.h"

using namespace std;

class Tokenizer
	{
public:
	struct wordData
		{
		int frequency = 0;
		int offset;
		};

	Tokenizer ( )
		{
		docIndex = new unordered_map< string, vector< wordData >>;
		}

	unordered_map< string, vector< wordData >> *get ( ) const
		{
		return docIndex;
		}

	//add type of word parameter, ie paragraph, url etc
	void execute ( string &originalText, int offset )
		{
		vector< string > splitText = splitStr( originalText, ' ' );
		string processedString = "";
		int vectorLength = 0;
		for ( int i = 0; i < splitText.size( ); ++i )
			{
			// case fold
			processedString = toLower( splitText[ i ] );
			//strip all characters
			processedString = stripStr( processedString );
			if ( !isStopWord( processedString ) )
				{
				// stem word
				processedString = stem.execute( processedString );
				wordData currentWord;
				currentWord.offset = offset;
				( *docIndex )[ processedString ].push_back( currentWord );
				vectorLength = ( *docIndex )[ processedString ].size( );
				//incrementing frequency value of the current word
				( *docIndex )[ processedString ][ vectorLength - 1 ].frequency += 1;
				++offset;
				}
			}
		}
	// decorators
	static const char TITLE = '#';
	static const char ANCHOR = '@';
	static const char URL = '$';
	static const char BODY = '%';

	/**
	 * Tokenizer Cstor
	 */
	Tokenizer ( );

	/**
	 * Returns pointer to the docIndex dictionary
	 *
	 * @return pointer to unordered_map< string, vector< int>>
	 */
<<<<<<< HEAD
	unordered_map< string, vector< wordData >> *get ( ) const;
=======
	unordered_map< string, vector< unsigned long > > *get ( ) const;
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

	/**
	 * Executes the Tokenizer
	 * Sends tokens to dictionary
	 *
	 * @param originalText
	 * @param offset
	 * @param decorator
	 */
	unsigned long execute ( string originalText, unsigned long offset, char decorator = '\0' );

private:
<<<<<<< HEAD
	unordered_map< string, vector< wordData >> *docIndex;

	Stemmer stem;
=======
	unordered_map< string, vector< unsigned long > > *docIndex;

	Stemmer stem;
>>>>>>> 02e3c89768ec57f7ea0c16a6fdf7e3d17c3d07bb

	/**
	 * Tokenizes text (titles, body text)
	 *
	 * @param originalText
	 * @param offset
	 * @param decorator
	 */
	unsigned long tokenize ( vector< string > splitText, unsigned long offset, char decorator );
	};
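To make the decorator scheme concrete, here is an illustrative use of the new interface; the exact keys depend on the project's stop-word list and stemmer, so the ones shown are indicative rather than exact:

	Tokenizer tokenizer;
	unsigned long offsetTitle = 0;
	unsigned long offsetBody = 0;
	// execute returns the updated offset, so positions continue across calls
	offsetTitle = tokenizer.execute( "Story of Cat", offsetTitle, Tokenizer::TITLE );
	offsetBody = tokenizer.execute( "the cat went to the store", offsetBody, Tokenizer::BODY );
	// docIndex keys now look roughly like "#stori", "#cat", "%cat", "%store":
	// '#' marks title tokens and '%' marks body tokens, so the same word can be
	// weighted differently depending on where it appeared in the page.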