Commit 177c14bd authored by rheas's avatar rheas
Browse files

wikiscrape - saves all html source for all those born in any specified year

parent 930ce6fc
selenium
codecs
\ No newline at end of file
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import codecs
# Scrape Wikipedia's "Category:<year>_births" listing: open every person
# linked from each category page and save that article's HTML source to
# ./<year>/<name>, following the "next page" link until there is none.
year = "1863"
browser = webdriver.Chrome(executable_path='/Users/rhea/Desktop/chromedriver')
browser.get("https://en.wikipedia.org/wiki/Category:" + year + "_births")

# Wait up to 20 seconds for the category listing to be present.
timeout = 20
try:
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "mw-category"))
    )
except TimeoutException:
    print("Timed out waiting for page to load")
    browser.quit()
    # The session no longer exists after quit(); stop here instead of
    # crashing on the first element lookup below.
    raise SystemExit(1)

try:
    if not os.path.isdir(year):
        os.mkdir(year)
    path = os.path.abspath(".") + "/" + year + "/"

    while True:
        people_list = browser.find_elements(
            By.XPATH, "//div[@class='mw-category']//div[2]//ul[1]//li"
        )
        # Copy the names out first: the element handles go stale as soon as
        # we navigate away from the category page.
        people = [x.text for x in people_list]

        for person in people:
            # Only the lookup is expected to fail (list entry whose text is
            # not a link on the page), so only the lookup is guarded.
            try:
                personLink = browser.find_element(By.LINK_TEXT, person)
            except NoSuchElementException:
                print(person + "'s page not found")
                continue

            personLink.click()
            savePath = path + person.replace(" ", "").replace("/", "-")
            # Context manager closes the file even if the write fails.
            with codecs.open(savePath, "w", "utf-8") as fileObject:
                fileObject.write(browser.page_source)
            # back() waits for the navigation, unlike a scripted
            # window.history.go(-1), so the next lookup sees the category page.
            browser.back()

        try:
            nextLink = browser.find_element(By.LINK_TEXT, 'next page')
        except NoSuchElementException:
            print("last page")
            break
        nextLink.click()
finally:
    # Always release the browser and its chromedriver process.
    browser.quit()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment