The following example demonstrates text generation with a Markov chain model built from trigrams. The following steps are performed in the code below:
Use the Natural Language Tool Kit (NLTK) and Brown corpus (a big collection of English text).
Build unigram, bigram, and trigram statistics.
Filter out stopwords and punctuation from bigrams and trigrams.
Build a trigram language model: given two words (w1, w2), it learns how often different third words w3 follow.
Use that model to generate random text starting from the prefix ("he", "said").
#
# import necessary libraries
#
import string # for punctuation characters
import random # for random choices in text generation
import nltk # for NLP tools
from nltk import FreqDist # frequency distribution (counts of items)
from nltk.corpus import brown # for Brown corpus
from collections import defaultdict, Counter # convenient dicts for building frequency tables.
from nltk.util import ngrams # generate bigrams and trigrams easily
#
# download necessary NLTK packages and corpus
#
nltk.download('punkt')      # tokenizer models (downloaded up front; Brown sentences are already tokenized)
nltk.download('stopwords')  # stopword lists used to build removal_list below
nltk.download('brown')      # the Brown corpus itself
#
# define stopwords and punctuation
#
stop_words = set(nltk.corpus.stopwords.words('english')) # common English words to filter out
# NOTE(review): the next line mutates the shared `string` module attribute for
# the whole process (it affects every other user of string.punctuation); the
# n-gram loop below relies on this extended set, so keep them in sync if this
# is ever turned into a local variable.
string.punctuation += '"\'-—' # include extra quote/dash symbols
removal_list = list(stop_words) + list(string.punctuation) + ['lt', 'rt'] # everything to remove if it appears
# in an n-gram: stopwords, single punctuation characters, and 'lt'/'rt'
# (presumably leftover markup/retweet tokens — TODO confirm)
#
# load sentences from the Brown corpus
#
sents = brown.sents() # list of sentences, each already tokenized into words
#
# Generate unigrams, bigrams, and trigrams from the corpus
#
unigram = []  # every (lowercased) word in the corpus, in order
bigram = []   # all padded 2-grams
trigram = []  # all padded 3-grams
for sentence in sents:
    # Lowercase each word and drop tokens made up entirely of punctuation.
    # The previous test (`word not in string.punctuation`) was a substring
    # check that only caught single-character tokens, so Brown's multi-char
    # punctuation tokens such as `` and '' leaked into the n-grams.
    sentence = [word.lower() for word in sentence
                if not all(ch in string.punctuation for ch in word)]
    unigram.extend(sentence)
    # pad_left/pad_right insert None at the sentence boundaries so the first
    # and last words still appear in full-width n-grams
    bigram.extend(ngrams(sentence, 2, pad_left=True, pad_right=True))
    trigram.extend(ngrams(sentence, 3, pad_left=True, pad_right=True))
#
# remove stopwords from n-grams
#
def remove_stopwords(ngrams, n, removal=None):
    """Return only the n-grams whose every word is outside the removal set.

    Any n-gram containing a stopword, punctuation token, or other unwanted
    word is dropped.  None padding tokens (from pad_left/pad_right) are not
    in the removal set, so n-grams at sentence boundaries survive the filter.

    Parameters:
        ngrams  -- iterable of word tuples (any length, not just 2 or 3)
        n       -- kept for backward compatibility; unused, because the
                   filter now works for tuples of any length
        removal -- optional collection of words to drop; defaults to the
                   module-level removal_list
    """
    # a set makes per-word membership O(1); the original list scan was O(k)
    removal = set(removal_list if removal is None else removal)
    # generalizes the original duplicated n == 2 / n == 3 branches, which
    # silently returned None for any other n
    return [gram for gram in ngrams if all(w not in removal for w in gram)]
#
# filter the n-grams, count them, and build the trigram model
#
# drop every bigram/trigram that contains a stopword or punctuation token
bigram = remove_stopwords(bigram, 2)
trigram = remove_stopwords(trigram, 3)
#
# frequency tables: how often each (filtered) bigram / trigram occurs
#
freq_bi = FreqDist(bigram)
freq_tri = FreqDist(trigram)
#
# trigram language model: d[(w1, w2)] is a Counter of candidate third words,
# weighted by how often each followed the (w1, w2) prefix in the corpus
#
d = defaultdict(Counter)
for (w1, w2, w3), count in freq_tri.items():
    # padded trigrams carry None at sentence boundaries; skip those
    if w1 is not None and w2 is not None and w3 is not None:
        d[(w1, w2)][w3] += count
#
# function to generate text
#
def generate_text(prefix, n=20, model=None, vocabulary=None):
    """Yield up to *n* words of random text continuing *prefix*.

    Parameters:
        prefix     -- tuple of words the next word is conditioned on,
                      e.g. ("he", "said")
        n          -- number of words to generate (default 20)
        model      -- mapping from word-tuple prefixes to Counters of next
                      words; defaults to the module-level trigram model d
        vocabulary -- sequence of words used to restart when the model has
                      no continuation; defaults to the module-level unigram
    """
    model = d if model is None else model
    vocabulary = unigram if vocabulary is None else vocabulary
    for _ in range(n):
        # .elements() repeats each candidate according to its count, so a
        # uniform random.choice over it is frequency-weighted sampling
        suffix_candidates = list(model.get(prefix, Counter()).elements())
        if not suffix_candidates:
            # dead end: no observed continuation for this prefix; restart
            # from two random corpus words (only the first is yielded,
            # matching the original behavior)
            new_prefix = random.choice(vocabulary), random.choice(vocabulary)
            yield new_prefix[0]
            prefix = new_prefix
        else:
            suffix = random.choice(suffix_candidates)  # frequency-weighted pick
            yield suffix
            prefix = (*prefix[1:], suffix)  # slide the two-word window forward
#
# generate text
#
# seed the generator with the phrase "he said" and print the result
start_prefix = ("he", "said")
words = list(generate_text(start_prefix))  # 20 words by default
# print either the generated words as one space-joined line, or a fallback
print(" ".join(words) if words else "No text generated.")