Commit 0298834a authored by rheas's avatar rheas
Browse files

conform to pylint

parent 257a5eb6
"""Scrapes wikipedia for people born in specified year"""
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import codecs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import codecs
import click
@click.command()
@click.argument('year')
def main(year):
    """Saves source files of every person in specified year

    Opens the Wikipedia category page "<year>_births", walks every page of
    the category listing, visits each listed person and writes that page's
    HTML into a directory named after the year (created in the current
    working directory if it does not exist yet).

    YEAR is taken as a string and inserted into the category URL and the
    output directory name unchanged.
    """
    browser = webdriver.Chrome(executable_path='/Users/rhea/Desktop/chromedriver')
    try:
        browser.get("https://en.wikipedia.org/w/index.php?title=Category:" + year + "_births")
        # Wait 20 seconds for page to load
        timeout = 20
        try:
            WebDriverWait(browser, timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "mw-category")))
        except TimeoutException:
            print("Timed out waiting for page to load")
            # Nothing can be scraped without the listing; the ``finally``
            # clause below closes the browser.
            return

        # Make a directory titled $year to store the html.
        if not os.path.isdir(year):
            os.mkdir(year)
        path = os.path.join(os.path.abspath("."), year)

        while True:
            people_list = browser.find_elements(
                By.XPATH, "//div[@class='mw-category']//div//ul//li")
            # Collect the link texts first; each link is looked up again by
            # its text right before it is clicked.
            people = [x.text for x in people_list]
            for person in people:
                try:
                    # The lookup is what raises NoSuchElementException, so it
                    # has to live inside the ``try``.
                    browser.find_element(By.LINK_TEXT, person).click()
                except NoSuchElementException:
                    print(person + "'s page not found")
                    continue
                save_path = os.path.join(
                    path, person.replace(" ", "").replace("/", "-"))
                # ``with`` closes (and flushes) the file for every person.
                with codecs.open(save_path, "w", "utf-8") as file_object:
                    file_object.write(browser.page_source)
                # Return to the category listing.
                browser.execute_script("window.history.go(-1)")

            # Looking for the link only after a page has been scraped means a
            # category with a single page is still saved.
            try:
                next_link = browser.find_element(By.LINK_TEXT, 'next page')
            except NoSuchElementException:
                print("last page")
                break
            next_link.click()
    finally:
        # Always release the browser session, including on errors.
        browser.quit()
# Module-level scrape of the Wikipedia category "1863_births": every person
# listed in the category is visited and the page HTML is written to ./1863/.
#
# Disabled sketch for running Chrome in incognito mode:
# option = webdriver.chrome.options.Options()
# option.add_argument("--incognito")
year = "1863"
browser = webdriver.Chrome(executable_path='/Users/rhea/Desktop/chromedriver')
try:
    browser.get("https://en.wikipedia.org/wiki/Category:" + year + "_births")
    # Wait 20 seconds for page to load
    timeout = 20
    try:
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "mw-category")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        # Stop here instead of driving a browser that is about to be closed;
        # the ``finally`` clause below quits it.
        raise SystemExit(1) from None

    # Make a directory named after the year to hold the saved HTML.
    if not os.path.isdir(year):
        os.mkdir(year)
    path = os.path.abspath(".") + "/" + year + "/"

    pageOne = True
    nextExists = True
    while nextExists:
        # The first listing page is already loaded; later passes follow the
        # "next page" link found at the end of the previous pass.
        if not pageOne:
            nextLink.click()
        pageOne = False
        # find_elements returns a list of selenium element objects.
        people_list = browser.find_elements(
            By.XPATH, "//div[@class='mw-category']//div[2]//ul[1]//li")
        # Collect the link texts first; each link is looked up again by its
        # text right before it is clicked.
        people = [x.text for x in people_list]
        for person in people:
            try:
                # The lookup is what raises NoSuchElementException, so it has
                # to live inside the ``try``.
                personLink = browser.find_element(By.LINK_TEXT, person)
                personLink.click()
            except NoSuchElementException:
                print(person + "'s page not found")
                continue
            savePath = path + person.replace(" ", "").replace("/", "-")
            html = browser.page_source
            # ``with`` closes (and flushes) the file for every person.
            with codecs.open(savePath, "w", "utf-8") as fileObject:
                fileObject.write(html)
            # Return to the category listing.
            browser.execute_script("window.history.go(-1)")
        try:
            nextLink = browser.find_element(By.LINK_TEXT, 'next page')
            nextExists = True
        except NoSuchElementException:
            print("last page")
            nextExists = False
finally:
    # Always release the browser session, including on errors.
    browser.quit()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    # main is wrapped by @click.command() with a 'year' argument, so it is
    # called without arguments here; the suppression silences pylint's
    # complaint about the apparently missing parameter.
    # pylint: disable=no-value-for-parameter
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment