Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
rheas
wiki-gender-analysis-v1
Commits
0298834a
Commit
0298834a
authored
Jan 26, 2019
by
rheas
Browse files
conform to pylint
parent
257a5eb6
Changes
1
Hide whitespace changes
Inline
Side-by-side
wikiscrape.py
View file @
0298834a
"""Scrapes wikipedia for people born in specified year"""
import
os
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
import
codecs
from
selenium
import
webdriver
from
selenium.webdriver.common.by
import
By
from
selenium.webdriver.support.ui
import
WebDriverWait
from
selenium.webdriver.support
import
expected_conditions
as
EC
from
selenium.common.exceptions
import
TimeoutException
from
selenium.common.exceptions
import
NoSuchElementException
import
codecs
import
click
@click.command()
@click.argument('year')
def main(year):
    """Save the page source of every person born in the specified year.

    Opens the Wikipedia category page "<year>_births" in Chrome, visits every
    person linked from each listing page, and writes that person's HTML as
    UTF-8 into a directory named after ``year`` under the current working
    directory. Listing pages are followed through the 'next page' link until
    no such link is found.

    Args:
        year: Birth year as text; it is inserted into the category URL and
            used as the output directory name.
    """
    # Defaults to the original hard-coded location; CHROMEDRIVER_PATH lets the
    # script run on machines where the driver lives elsewhere.
    driver_path = os.environ.get('CHROMEDRIVER_PATH',
                                 '/Users/rhea/Desktop/chromedriver')
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.get("https://en.wikipedia.org/w/index.php?title=Category:"
                    + year + "_births")

        # Wait 20 seconds for page to load
        timeout = 20
        try:
            WebDriverWait(browser, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "mw-category")))
        except TimeoutException:
            print("Timed out waiting for page to load")
            # Nothing can be scraped from a page that never loaded; the
            # finally clause below shuts the browser down.
            return

        # make directory titled $year to store html
        os.makedirs(year, exist_ok=True)
        path = os.path.join(os.path.abspath("."), year)

        while True:
            people_list = browser.find_elements(
                By.XPATH, "//div[@class='mw-category']//div//ul//li")
            # Capture the names up front: the elements go stale as soon as the
            # browser navigates away from the listing page.
            people = [x.text for x in people_list]

            for person in people:
                try:
                    # Look the link up again on every iteration because the
                    # listing page is reloaded after each history.go(-1).
                    browser.find_element(By.LINK_TEXT, person).click()
                except NoSuchElementException:
                    print(person + "'s page not found")
                    continue

                save_path = os.path.join(
                    path, person.replace(" ", "").replace("/", "-"))
                # The context manager closes the file even if the write fails.
                with codecs.open(save_path, "w", "utf-8") as file_object:
                    file_object.write(browser.page_source)
                browser.execute_script("window.history.go(-1)")

            # A category that fits on one page has no 'next page' link at all,
            # so the lookup is guarded on every page, including the first.
            try:
                next_link = browser.find_element(By.LINK_TEXT, 'next page')
            except NoSuchElementException:
                print("last page")
                break
            next_link.click()
    finally:
        # Always release the Chrome process, on success and on error alike.
        browser.quit()
# Legacy module-level scraper: saves the HTML of every person listed in the
# Wikipedia "<year>_births" category into a directory named after the year.
#option = webdriver.chrome.options.Options()
#option.add_argument(" — incognito")
year = "1863"
browser = webdriver.Chrome(executable_path='/Users/rhea/Desktop/chromedriver')
try:
    browser.get("https://en.wikipedia.org/wiki/Category:" + year + "_births")

    # Wait 20 seconds for page to load
    timeout = 20
    try:
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "mw-category")))
        nextExists = True
    except TimeoutException:
        print("Timed out waiting for page to load")
        # Skip the scraping loop entirely instead of driving a browser that
        # failed to load the page; the finally clause below quits it.
        nextExists = False

    if nextExists:
        os.makedirs(year, exist_ok=True)
    path = os.path.join(os.path.abspath("."), year) + "/"

    pageOne = True
    while nextExists:
        # The first listing page is already open; later pages are reached via
        # the 'next page' link found at the end of the previous iteration.
        if not pageOne:
            nextLink.click()
        pageOne = False

        # find_elements returns a list of selenium element objects.
        people_list = browser.find_elements(
            By.XPATH, "//div[@class='mw-category']//div[2]//ul[1]//li")
        # Capture the names now: the elements go stale after navigation.
        people = [x.text for x in people_list]

        for person in people:
            try:
                # The lookup belongs inside the try: it is the call that
                # raises NoSuchElementException when the link is missing.
                personLink = browser.find_element(By.LINK_TEXT, person)
                personLink.click()
            except NoSuchElementException:
                print(person + "'s page not found")
                continue

            savePath = path + person.replace(" ", "").replace("/", "-")
            html = browser.page_source
            # The context manager closes the file even if the write fails.
            with codecs.open(savePath, "w", "utf-8") as fileObject:
                fileObject.write(html)
            browser.execute_script("window.history.go(-1)")

        try:
            nextLink = browser.find_element(By.LINK_TEXT, 'next page')
            nextExists = True
        except NoSuchElementException:
            print("last page")
            nextExists = False
finally:
    # Always release the Chrome process, on success and on error alike.
    browser.quit()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    # main is wrapped by click.command/click.argument('year'), so click fills
    # in `year` from the command line and the zero-argument call is intended.
    # pylint: disable=no-value-for-parameter
    main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment