I'm trying to find a way to make a transition matrix using unigrams, bigrams, and trigrams for a given text using python and numpy. Each row's probabilities should equal to one. I did this first with bigrams and it worked fine:
distinct_words = list(word_dict.keys())
dwc = len(distinct_words)
matrix = np.zeros((dwc, dwc), dtype=np.float)
for i in range(len(distinct_words)):
word = distinct_words[i]
first_word_idx = i
total = 0
for bigram, count in ngrams.items():
word_1, word_2 = bigram.split(" ")
if word_1 == word:
total += count
for bigram, count in ngrams.items():
word_1, word_2 = bigram.split(" ")
if word_1 == word:
second_word_idx = index_dict[word_2]
matrix[first_word_idx,second_word_idx] = count / total
But now I want to add unigrams and trigrams and weight their probabilities (trigrams * .6, bigrams * .2, unigrams *.2). I don't think my python is very succinct, which is one problem, but also I don't know how to use multiple n-grams (and weights, although honestly weights are secondary) so that I can still get all of the probabilities from any given row to add up to one.
distinct_words = list(word_dict.keys())
dwc = len(distinct_words)
matrix = np.zeros((dwc, dwc), dtype=np.float)
for i in range(len(distinct_words)):
word = distinct_words[i]
first_word_index = i
bi_total = 0
tri_total=0
tri_prob = 0
bi_prob = 0
uni_prob = word_dict[word] / len(distinct_words)
if i < len(distinct_words)-1:
for trigram, count in trigrams.items():
word_1, word_2, word_3 = trigram.split()
if word_1 + word_2 == word + distinct_words[i+1]:
tri_total += count
for trigram, count in trigrams.items():
word_1, word_2, word_3 = trigram.split()
if word_1 + word_2 == word + distinct_words[i+1]:
second_word_index = index_dict[word_2]
tri_prob = count/bigrams[word_1 + " " + word_2]
for bigram, count in bigrams.items():
word_1, word_2 = bigram.split(" ")
if word_1 == word:
bi_total += count
for bigram, count in bigrams.items():
word_1, word_2 = bigram.split(" ")
if word_1 == word:
second_word_index = index_dict[word_2]
bi_prob = count / bi_total
matrix[first_word_index,second_word_index] = (tri_prob * .4) + (bi_prob * .2) + (word_dict[word]/len(word_dict) *.2)
I'm reading off of this lecture for how to set up my probability matrix and it seems to make sense, but I'm not sure where I'm going wrong.
If it helps, my n_grams are coming from this- it just produces a dictionary of the n_gram as a string and its count.
def get_ngram(words, n):
word_dict = {}
for i, word in enumerate(words):
if i > (n-2):
n_gram = []
for num in range(n):
index = i - num
n_gram.append(words[index])
if len(n_gram) > 1:
formatted_gram = ""
for word in reversed(n_gram):
formatted_gram += word + " "
else:
formatted_gram = n_gram[0]
stripped = formatted_gram.strip() if formatted_gram else n_gram[0]
if stripped in word_dict:
word_dict[stripped] += 1
else:
word_dict[stripped] = 1
return word_dict
from Multiple ngrams in transition matrix, probability not adding to 1
No comments:
Post a Comment