Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eecs398-search
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
vcday
eecs398-search
Commits
b9acd359
Commit
b9acd359
authored
7 years ago
by
vcday
Browse files
Options
Downloads
Patches
Plain Diff
parse logic improved
parent
a041c8ff
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
parser/Parser.h
+37
-31
37 additions, 31 deletions
parser/Parser.h
parser/tests/parserTest.cpp
+51
-2
51 additions, 2 deletions
parser/tests/parserTest.cpp
util/Tokenizer.h
+5
-6
5 additions, 6 deletions
util/Tokenizer.h
util/stringProcessing.h
+51
-1
51 additions, 1 deletion
util/stringProcessing.h
with
144 additions
and
40 deletions
parser/Parser.h
+
37
−
31
View file @
b9acd359
...
...
@@ -54,39 +54,46 @@ private:
* @param inFile
* @return
*/
//TODO instead of grabbing each line, look to see if beginning of
// TODO title/url/anchortext, etc. Then continue until close tag and add to tokenizer after end of tag found
void
parse
(
string
html
,
Tokenizer
*
tokenizer
)
{
string
tokenizerInput
=
""
;
string
currentTerm
=
""
;
int
index
=
0
;
while
(
index
!=
html
.
size
())
auto
htmlIt
=
html
.
begin
();
int
offset
=
0
;
while
(
htmlIt
!=
html
.
end
())
{
currentTerm
=
""
;
while
(
html
.
at
(
index
)
!=
'\n'
)
{
currentTerm
+=
html
[
index
];
++
index
;
}
++
index
;
string
url
=
extract_url
(
currentTerm
);
if
(
url
!=
""
)
// if open bracket
if
(
*
htmlIt
==
'<'
)
{
urlFrontier
->
Push
(
url
);
auto
begCloseTag
=
findNext
(
"</"
,
htmlIt
);
auto
endCloseTag
=
findNext
(
">"
,
begCloseTag
);
string
line
(
htmlIt
,
endCloseTag
+
1
);
htmlIt
=
endCloseTag
+
2
;
// check if line is url
string
url
=
extract_url
(
line
);
if
(
url
!=
""
)
{
urlFrontier
->
Push
(
url
);
}
// check if line is title
else
{
string
title
=
extract_title
(
line
);
if
(
title
!=
""
)
{
tokenizer
->
execute
(
title
,
offset
);
}
}
//TODO fix offset?
offset
=
htmlIt
-
html
.
begin
();
}
else
{
string
title
=
extract_title
(
currentTerm
);
if
(
title
!=
""
)
{
tokenizerInput
+=
title
;
}
++
htmlIt
;
}
}
tokenizer
->
execute
(
tokenizerInput
);
}
...
...
@@ -98,16 +105,15 @@ private:
string
extract_url
(
string
word
)
{
string
url
=
""
;
if
(
*
findStr
(
word
,
"<a"
)
!=
'\0'
)
if
(
*
findStr
(
"<a"
,
word
)
!=
'\0'
)
{
auto
foundHttp
=
findStr
(
word
,
"href=http"
);
auto
foundHref
=
findStr
(
"href"
,
word
);
auto
foundHttp
=
findNext
(
"http"
,
foundHref
);
if
(
*
foundHttp
!=
'\0'
)
{
url
=
"http"
;
foundHttp
+=
9
;
while
(
*
foundHttp
!=
*
findStr
(
word
,
"
\"
>"
)
)
url
=
""
;
auto
closeTag
=
findNext
(
">"
,
word
.
begin
(
)
);
while
(
*
foundHttp
!=
*
closeTag
)
{
url
+=
*
foundHttp
;
++
foundHttp
;
...
...
This diff is collapsed.
Click to expand it.
parser/tests/parserTest.cpp
+
51
−
2
View file @
b9acd359
...
...
@@ -11,11 +11,23 @@
using
namespace
std
;
void
testSimple
(
);
void
testComplex
(
);
/**
 * Test driver: prints a banner, runs each parser test in turn, and prints the
 * success message once both calls have returned.
 */
int main ( )
    {
    std::cout << "Testing Parser ... " << std::endl << std::endl;

    testSimple( );
    testComplex( );

    std::cout << "Parser Tests Passed! :D" << std::endl;
    return 0;
    }
void
testSimple
(
)
{
ProducerConsumerQueue
<
string
>
*
urlFrontierTest
;
Document
document
(
"<title>This Cat Title Cat</title>
\n
"
);
Document
document
(
"<title>This Cat Title Cat</title>"
);
Parser
parser
(
urlFrontierTest
);
auto
dictionary
=
parser
.
execute
(
&
document
);
...
...
@@ -28,7 +40,44 @@ int main ( )
assert
(
dictionary
->
at
(
"cat"
)[
0
]
==
0
&&
dictionary
->
at
(
"cat"
)[
1
]
==
2
);
assert
(
dictionary
->
at
(
"title"
)[
0
]
==
1
);
cout
<<
"Parser Tests Passed! :D"
<<
endl
;
delete
dictionary
;
}
void
testComplex
(
)
{
ProducerConsumerQueue
<
string
>
*
urlFrontierTest
;
ifstream
file
(
"../tests/cats.html"
);
string
temp
;
string
docString
=
"<title>Joe the Cat</title>
\n
"
;
docString
+=
"<a href=
\"
https://www.w3schools.com/html/
\"
>Visit our HTML tutorial</a>
\n
"
;
while
(
std
::
getline
(
file
,
temp
))
{
docString
+=
temp
;
}
Document
document
(
docString
);
Parser
parser
(
urlFrontierTest
);
auto
dictionary
=
parser
.
execute
(
&
document
);
// cout << dictionary->size () << endl;
// for (auto p : *dictionary)
// cout << p.first << endl;
assert
(
dictionary
!=
nullptr
);
assert
(
dictionary
->
size
()
==
3
);
assert
(
dictionary
->
find
(
"cat"
)
!=
dictionary
->
end
()
);
assert
(
dictionary
->
find
(
"story"
)
!=
dictionary
->
end
()
);
assert
(
dictionary
->
find
(
"joe"
)
!=
dictionary
->
end
()
);
assert
(
dictionary
->
find
(
"the"
)
==
dictionary
->
end
()
);
assert
(
dictionary
->
find
(
"of"
)
==
dictionary
->
end
()
);
// assert ( dictionary->at ( "cat" )[ 0 ] == 1 );
// assert ( dictionary->at ( "story" )[ 0 ] == 0 );
// cout << urlFrontierTest->Size () << endl;
// cout << urlFrontierTest->Pop () << endl;
delete
dictionary
;
}
\ No newline at end of file
This diff is collapsed.
Click to expand it.
util/Tokenizer.h
+
5
−
6
View file @
b9acd359
...
...
@@ -14,17 +14,16 @@ class Tokenizer
public:
Tokenizer
(
)
{
doc
_i
ndex
=
new
unordered_map
<
string
,
vector
<
int
>>
;
doc
I
ndex
=
new
unordered_map
<
string
,
vector
<
int
>>
;
}
unordered_map
<
string
,
vector
<
int
>>
*
get
(
)
const
{
return
doc
_i
ndex
;
return
doc
I
ndex
;
}
void
execute
(
string
originalText
)
void
execute
(
string
originalText
,
int
offset
)
{
int
offset
=
0
;
vector
<
string
>
splitText
=
splitStr
(
originalText
,
' '
);
string
lowerString
=
""
;
for
(
int
i
=
0
;
i
<
splitText
.
size
(
);
++
i
)
...
...
@@ -32,12 +31,12 @@ public:
lowerString
=
toLower
(
splitText
[
i
]
);
if
(
!
isStopWord
(
lowerString
)
)
{
(
*
doc
_i
ndex
)[
lowerString
].
push_back
(
offset
);
(
*
doc
I
ndex
)[
lowerString
].
push_back
(
offset
);
++
offset
;
}
}
}
private
:
unordered_map
<
string
,
vector
<
int
>>
*
doc
_i
ndex
;
unordered_map
<
string
,
vector
<
int
>>
*
doc
I
ndex
;
};
This diff is collapsed.
Click to expand it.
util/stringProcessing.h
+
51
−
1
View file @
b9acd359
...
...
@@ -24,9 +24,10 @@ set< string > stopWords = { "a", "all", "an", "and", "any", "are", "as", "at", "
"you"
,
"your"
};
/**
* Finds the needle in the haystack
* returns position of first match
* @param haystack
* @param needle
* @return
* @return
string::iterator
*/
string
::
iterator
findStr
(
string
needle
,
string
haystack
)
{
...
...
@@ -75,6 +76,55 @@ string::iterator findStr (string needle, string haystack )
}
/**
 * Finds the first occurrence of the needle at or after the given position.
 *
 * No end iterator is supplied, so the scan stops at the first '\0' character:
 * haystackPointer must refer into a buffer that is terminated by '\0'
 * (NOTE(review): callers are expected to pass iterators into a std::string - confirm).
 *
 * Every haystack position is tried as a candidate start, so matches that
 * overlap a failed partial match (e.g. "aab" in "aaab") are still found.
 *
 * @param needle text to search for; an empty needle never matches
 * @param haystackPointer position in the haystack at which the search starts
 * @return string::iterator to the first character of the match, or to the
 *         terminating '\0' when there is no match
 */
std::string::iterator findNext ( std::string needle, std::string::iterator haystackPointer )
    {
    auto cursor = haystackPointer;

    // An empty needle is reported as "not found": run to the terminator.
    if ( needle.empty( ) )
        {
        while ( *cursor != '\0' )
            ++cursor;
        return cursor;
        }

    for ( ; *cursor != '\0'; ++cursor )
        {
        // Cheap rejection: only probe positions whose first character matches.
        if ( *cursor != needle.front( ) )
            continue;

        // Compare forward from the candidate without moving cursor, so a failed
        // partial match resumes at the very next haystack position.
        auto probe = cursor;
        std::string::size_type matched = 0;
        while ( matched < needle.size( ) && *probe != '\0' && *probe == needle[ matched ] )
            {
            ++probe;
            ++matched;
            }

        // Every needle character matched: cursor is the start of the match.
        if ( matched == needle.size( ) )
            return cursor;
        }

    // cursor now rests on the terminator, which signals "no match".
    return cursor;
    }
/**
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment