Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eecs398-search
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
vcday
eecs398-search
Commits
eb55e2f3
Commit
eb55e2f3
authored
7 years ago
by
vcday
Browse files
Options
Downloads
Patches
Plain Diff
parser changes
parent
fbbf2bf0
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Parser.h
+155
-0
155 additions, 0 deletions
Parser.h
Tokenizer.h
+0
-0
0 additions, 0 deletions
Tokenizer.h
with
155 additions
and
0 deletions
Parser.
cpp
→
Parser.
h
+
155
−
0
View file @
eb55e2f3
...
...
@@ -12,7 +12,8 @@
#include
<queue>
#include
<iostream>
#include
<fstream>
#include
<stringProcessing.h>
#include
"Tokenizer.h"
#include
"stringProcessing.h"
using
namespace
std
;
...
...
@@ -49,15 +50,13 @@ public:
*/
// input: object with char* and URL string
//
string
execute
()
const
unordered_map
<
string
,
vector
<
int
>>
execute
()
{
Tokenizer
tokenizer
()
;
Tokenizer
tokenizer
;
//TEMP - until we get real input from crawler
raw_data
data
(
"url"
,
"html"
);
parse
(
data
.
html_data
,
&
tokenizer
);
return
tokenzier
.
get
();
return
tokenizer
.
get
();
}
...
...
@@ -72,22 +71,17 @@ private:
string
parse
(
string
&
html_data
,
Tokenizer
*
tokenizer
)
{
//figure out file handle syntax - pointer to file
tokenizerInput
=
""
;
currentTerm
=
""
;
string
tokenizerInput
=
""
;
string
currentTerm
=
""
;
for
(
int
i
=
0
;
i
<
html_data
.
size
();
++
i
)
{
while
(
html_data
[
i
]
!=
' '
)
{
currentTerm
+=
html_data
[
i
];
}
//one method that directly adds urls onto frontier instead of checking for them
if
(
!
check_title
(
currentTerm
))
{
add_urls
(
current_term
);
}
else
{
tokenizerInput
+=
currentTerm
;
//can also pass titles individually through tokenizer instead of concatenating
}
add_urls
(
currentTerm
);
check_title
(
currentTerm
);
tokenizerInput
+=
currentTerm
;
}
tokenizer
->
execute
(
tokenizerInput
);
...
...
@@ -104,12 +98,12 @@ private:
string
http_end_tag
=
">"
;
auto
word_iter
=
word
.
begin
();
url
=
""
;
string
url
=
""
;
word_iter
=
findStr
(
word_iter
,
a_tag
);
if
(
word_iter
)
{
if
(
word_iter
!=
nullptr
)
{
auto
found_http
=
findStr
(
word_iter
,
http_start
);
if
(
found_http
)
{
url
=
"http"
;
if
(
found_http
!=
nullptr
)
{
url
=
"http"
;
found_http
+=
9
;
auto
end_http
=
findStr
(
word_iter
,
http_end_tag
);
while
(
found_http
!=
end_http
)
{
...
...
@@ -129,36 +123,6 @@ private:
}
/**
* Checks for url in string word
* @param word
*/
bool
check_url
(
string
&
word
)
{
//need to add string processing function where you check in a specified range of positions
if
(
char
*
pos
=
strstr
(
"href"
,
word
))
{
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
++
pos
;
// take everything until next quote
string
url
=
""
;
++
pos
;
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
{
//filter out everything except http, https
url
+=
*
pos
;
}
// send it back to the crawler
URL_PQ
.
push
(
url
);
return
true
;
}
return
false
;
}
/**
* <title >AJF</title>
* @param word
...
...
@@ -166,10 +130,10 @@ private:
bool
check_title
(
string
&
word
)
{
/*
if (char* pos = strstr("<title>", word))
if
(
char
*
pos
=
strstr
(
"<title>"
,
word
))
{
pos
+=
6
;
end_pos = strstr("</title>", word);
auto
end_pos
=
strstr
(
"</title>"
,
word
);
string
title
=
""
;
while
(
pos
!=
end_pos
)
{
...
...
@@ -179,105 +143,13 @@ private:
}
return
title
;
}
*/
}
begin_title
=
"<title>"
;
auto
word_begin
=
word
.
begin
();
auto
word_iter
=
findStr
();
//
string
begin_title = "<title>";
//
auto word_begin = word.begin();
//
auto word_iter = findStr(
word_begin, begin_title
);
}
<<<<<<<
HEAD
/**
* Checks for url in string word
* @param word
*/
bool
check_url
(
string
&
word
)
{
if
(
char
*
pos
=
strstr
(
"href"
,
word
))
{
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
++
pos
;
// take everything until next quote
string
url
=
""
;
++
pos
;
while
(
pos
!=
"
\"
"
&&
pos
!=
"
\'
"
)
{
//filter out everything except http, https
url
+=
*
pos
;
}
// send it back to the crawler
URL_PQ
.
push
(
url
);
return
true
;
}
return
false
;
}
/**
 * <title >AJF</title>
 * Collects the text of every complete <title>...</title> pair in word
 * and concatenates the contents into one string.
 * @param word document text to scan
 * @return concatenated contents of all complete title tags ( "" if none )
 *
 * NOTE(review): the original compared std::string iterators against
 * nullptr ( ill-formed — findStr's sentinel only makes sense for char* )
 * and could walk past the end of word on an unterminated title.
 * Rewritten with std::string::find so every scan is bounded; an
 * unterminated <title> is skipped rather than read past the end.
 */
std::string check_title_handle ( std::string & word )
   {
   const std::string titleTag = "<title>";
   const std::string closeTitleTag = "</title>";
   std::string allTitles = "";
   size_t searchFrom = 0;
   while ( true )
      {
      size_t openTag = word.find( titleTag, searchFrom );
      if ( openTag == std::string::npos )
         return allTitles;
      //increments until first letter past opening title tag
      openTag += titleTag.size( );
      size_t closeTag = word.find( closeTitleTag, openTag );
      // unterminated title: stop rather than running off the end
      if ( closeTag == std::string::npos )
         return allTitles;
      allTitles += word.substr( openTag, closeTag - openTag );
      //increments until first letter past closing title tag
      searchFrom = closeTag + closeTitleTag.size( );
      }
   }
/**
 * Returns the contents of the first complete <title>...</title> pair
 * in word.
 * @param word document text to scan
 * @return the title text, or "" when no complete title tag is present
 *
 * NOTE(review): the original passed strstr arguments in reversed order
 * ( and passed a std::string where char* is required ), used an
 * undeclared end_pos, advanced past "<title>" by 6 instead of 7, shifted
 * every copied character by one ( ++pos before the append, picking up
 * the '<' of the closing tag ), and fell off the end with no return
 * when nothing matched ( UB ).  Rewritten with std::string::find.
 */
std::string check_title ( std::string & word )
   {
   size_t openTag = word.find( "<title>" );
   if ( openTag == std::string::npos )
      return "";
   openTag += 7; // strlen( "<title>" )
   size_t closeTag = word.find( "</title>", openTag );
   if ( closeTag == std::string::npos )
      return "";
   std::string title = word.substr( openTag, closeTag - openTag );
   return title;
   }
//TODO
};
This diff is collapsed.
Click to expand it.
Tokenizer.
cpp
→
Tokenizer.
h
+
0
−
0
View file @
eb55e2f3
File moved
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment