Commit 28867729 authored by Cayetano Wagner's avatar Cayetano Wagner
Browse files

html to json parsing

parent e14a5c25
from bs4 import BeautifulSoup
from time import sleep
from copy import deepcopy
import unicodedata
import json
# Convert the HTML story collection into a JSON list of
# {'title': ..., 'text': ...} records.

# Read in the HTML source.
# NOTE(review): the input is still opened with the platform default encoding,
# as before; confirm the file's real encoding and pass it explicitly.
with open('stories/html/TurtleDickTalesfromtheCrypt.html', 'r') as f:
    html_doc = f.read()
soup = BeautifulSoup(html_doc, 'html.parser')


def _clean_text(tag):
    """Return the text content of ``tag`` after Unicode NFKD normalization.

    NFKD normalization is required to clear &nbsp; characters: the
    no-break space (U+00A0) is replaced by a plain space.
    """
    return unicodedata.normalize('NFKD', tag.get_text())


stories = []
story = {'title': None, 'text': ''}

# The HTML document has the structure
# h1 (Tales from the Crypt)
# ...
# h3 (Author name)
# h2 (Story title)
# ...(Story contents contained in 'p' tags)
# h2
# ...
# h3
# h2
# ...
# h2
# ...

# Iterate over the direct children of <body>: every h2 starts a new story and
# every following p contributes to that story's text.
for child in soup.body.children:
    if child.name == 'h2':
        if story['title'] is not None:
            # Flush the story collected so far. No copy is needed because
            # ``story`` is rebound to a brand-new dict right below.
            stories.append(story)
        # Start a new story; the title is normalized the same way for the
        # first story and for every later one.
        story = {'title': _clean_text(child), 'text': ''}
    elif child.name == 'p' and story['title'] is not None:
        # Paragraphs before the first title are ignored. Each paragraph is
        # prefixed with a single space (including the first one).
        story['text'] += ' ' + _clean_text(child)

# Flush the final story: no h2 follows it, so the loop never appends it.
if story['title'] is not None:
    stories.append(story)

# Save to JSON.
print(stories)
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding to be portable across platforms.
with open('stories/json/stories.json', 'w', encoding='utf-8') as f:
    json.dump(stories, f, ensure_ascii=False)
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment