Commit f59b4d29 authored by Cayetano Wagner
Browse files

naive bigram generator

parent 28867729
import json
import re
import itertools
import random
from collections import defaultdict
# unpack
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's default encoding.
# The rest of the script iterates `stories` and reads story['text'] from each
# item.
with open('stories/json/stories.json', 'r', encoding='utf-8') as f:
    stories = json.load(f)
# preprocess story text
# contraction hard cases, applied in the order listed
# 'd can mean "would" "did" or "had" so it is ignored
_CONTRACTIONS = (
    (re.compile(r"'m "), ' am '),     # I'm -> I am
    (re.compile(r"'s "), ' is '),     # it's -> it is
    (re.compile(r"'ve "), ' have '),  # they've -> they have
    (re.compile(r"n't"), ' not '),    # hadn't -> had not
    (re.compile(r"'re "), ' are '),   # they're -> they are
    (re.compile(r"'ll "), ' will '),  # I'll -> I will
)
# . , / \ ? ! $ % & * ( ) + = - < > " ; : together with any whitespace
# around them. `&` was listed in the original comment but missing from the
# pattern; it is included here.
_PUNCTUATION = re.compile(r'\s*([.,/\\?!$%&*()+=\-<>":;])\s*')


def _normalize_text(raw):
    """Return `raw` lowercased, with contractions expanded and punctuation padded.

    Each punctuation character matched by `_PUNCTUATION` ends up surrounded
    by single spaces, so a later `str.split()` yields it as its own token.
    """
    text = raw.lower()
    for pattern, expansion in _CONTRACTIONS:
        text = pattern.sub(expansion, text)
    return _PUNCTUATION.sub(r' \1 ', text)


# Rewrite every story's text in place.
for story in stories:
    story['text'] = _normalize_text(story['text'])
# set up token probabilities helpers
class Token:
    """Successor tally for one word of the bigram model.

    `counts` is a `defaultdict(int)`, so a key that has never been
    incremented reads as 0. The script uses it to record how often each
    following word was seen.
    """

    def __init__(self):
        self.counts = defaultdict(int)

    def __str__(self):
        # A token is displayed as its underlying counts mapping.
        return str(self.counts)
def sliding_window(iterable, n=2):
    """Return an iterator over overlapping length-`n` tuples of `iterable`.

    With the default `n=2` this yields consecutive pairs:
    (x0, x1), (x1, x2), ... The result is empty when `iterable` has fewer
    than `n` items, because `zip` stops at the shortest input.
    """
    # based from https://napsterinblue.github.io/notes/python/internals/itertools_sliding_window/
    copies = itertools.tee(iterable, n)
    for offset, copy in enumerate(copies):
        # Discard the first `offset` items of this copy so that copy k starts
        # at position k; the empty islice consumes them without yielding.
        next(itertools.islice(copy, offset, offset), None)
    return zip(*copies)
# calculate token probabilities and average story length
num_stories = 0
num_sentences = 0
# Maps a word to the Token holding the counts of the words that followed it.
# Only words with at least one successor get an entry, so a word seen solely
# as the last token of a story is absent.
tokens = {}
for story in stories:
    num_stories += 1
    for token, next_token in sliding_window(story['text'].split()):
        if token not in tokens:
            tokens[token] = Token()
        # increments word count for next_token
        tokens[token].counts[next_token] += 1
        # track number of sentences (only '.' counts as a sentence end)
        if next_token == '.':
            num_sentences += 1
print('number of unique words', len(tokens))
print('number of training stories', num_stories)
# An empty corpus would otherwise raise ZeroDivisionError; report 0.0 instead.
print('average number of sentences in a story',
      num_sentences / num_stories if num_stories else 0.0)
# write story
# Random walk over the bigram counts in `tokens`, starting from a fixed seed
# word and stopping after the average number of sentences per training story.
seed_word = 'when'
curr_word = seed_word
words = [curr_word]
# An empty corpus gives a target of 0 sentences instead of a ZeroDivisionError.
max_sentence = num_sentences / num_stories if num_stories else 0
written_sentences = 0
while written_sentences < max_sentence:
    # write sentence
    while True:
        # `tokens` has no entry for a word that never had a successor (or for
        # a seed word missing from the corpus); treat that as a dead end
        # rather than raising KeyError.
        followers = tokens[curr_word].counts if curr_word in tokens else None
        if followers:
            # Sample the next word in proportion to how often it followed
            # the current one.
            next_word = random.choices(list(followers), weights=list(followers.values()))[0]
        elif curr_word == '.':
            # Nothing ever followed a full stop: start the next sentence from
            # the seed word again.
            next_word = seed_word
        else:
            # Dead-end word: close the sentence here.
            next_word = '.'
        words.append(next_word)
        curr_word = next_word
        if next_word == '.':
            written_sentences += 1
            break
# Join once at the end instead of growing a string word by word.
text = ' '.join(words)
print(text)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment