# Viterbi in Log Space
#
# Converting the Viterbi algorithm to log space fixes the underflow that
# produced `KeyError: None`, and avoids building dense arrays for the
# starting, transition, and emission statistics.

import math
import numpy as np
from numpy.random import shuffle, seed, choice
from tqdm import tqdm
# FIX: defaultdict is used by every cell below but was never imported,
# so Restart & Run All raised NameError: name 'defaultdict' is not defined.
from collections import defaultdict

# ---------------------------------------------------------------------------
# Starting probabilities
# ---------------------------------------------------------------------------
# Calculate statistics from part of the tagged sentences.
# First, create a dictionary of starting frequencies for each tag.
start_p = defaultdict(int)

# etc. to create start_p

# Convert to log probabilities.  Tags never seen in starting position
# default to log(0) = -inf instead of raising KeyError.
start_p_log = defaultdict(lambda: float('-inf'))
for tag in start_p:
    start_p_log[tag] = np.log(start_p[tag])

start_p_log

# ---------------------------------------------------------------------------
# Transition probabilities
# ---------------------------------------------------------------------------
# Nested dictionary of transition frequencies: trans_p[t1][t2] counts how
# often tag t2 follows tag t1.
trans_p = defaultdict(lambda: defaultdict(int))

# etc to create trans_p

# Unseen transitions default to log(0) = -inf.
trans_p_log = defaultdict(lambda: defaultdict(lambda: float('-inf')))
for t1 in trans_p:
    for t2 in trans_p[t1]:
        trans_p_log[t1][t2] = np.log(trans_p[t1][t2])

# ---------------------------------------------------------------------------
# Emission probabilities
# ---------------------------------------------------------------------------
# Nested dictionary of emission frequencies: given a tag, how many times
# does each word occur with that tag?
emit_p = defaultdict(lambda: defaultdict(int))

# etc to create emit_p

# Unseen (tag, word) pairs default to log(0) = -inf.
emit_p_log = defaultdict(lambda: defaultdict(lambda: float('-inf')))
for t in emit_p:
    for w in emit_p[t]:
        emit_p_log[t][w] = np.log(emit_p[t][w])
# Adapted from the Wikipedia article on the Viterbi algorithm.
# obs_sequence is the sequence of words "observed"; returns the most
# likely sequence of states (POS tags).
#
# start_p, trans_p, emit_p are dictionaries giving probabilities.
# Set logspace=False if these are normal probabilities (not in log space).

def viterbi(obs_sequence, obs, states, start_p, trans_p, emit_p, logspace=True):
    """Most-likely state sequence for obs_sequence under an HMM.

    Parameters
    ----------
    obs_sequence : sequence of observations (words)
    obs : vocabulary of observations (unused here; kept for call compatibility)
    states : indexable collection of states (POS tags); must be non-empty
    start_p : start_p[st] -> starting probability of state st
    trans_p : trans_p[t1][t2] -> probability of t2 following t1
    emit_p : emit_p[st][word] -> probability of word emitted from st
    logspace : if True the three tables hold log probabilities (combined by
        addition); if False they hold plain probabilities (multiplication)

    Returns
    -------
    (opt, max_prob) : the best state path (list, same length as
        obs_sequence) and its score.
    """
    # Choose the combining operation once instead of duplicating every
    # formula in an if/else (the original repeated this branch three times).
    if logspace:
        combine = lambda a, b: a + b
    else:
        combine = lambda a, b: a * b

    # V[t][st] = {"prob": best score of any path ending in st at time t,
    #             "prev": state at t-1 on that best path (None at t=0)}
    V = [{}]
    for st in states:
        V[0][st] = {"prob": combine(start_p[st], emit_p[st][obs_sequence[0]]),
                    "prev": None}

    # Run Viterbi for t > 0.
    for t in range(1, len(obs_sequence)):
        V.append({})
        for st in states:
            # Find the predecessor state with the best transition score into st.
            prev_st_selected = states[0]
            max_tr_prob = combine(V[t - 1][states[0]]["prob"],
                                  trans_p[states[0]][st])
            for prev_st in states[1:]:
                tr_prob = combine(V[t - 1][prev_st]["prob"],
                                  trans_p[prev_st][st])
                if tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st

            V[t][st] = {"prob": combine(max_tr_prob, emit_p[st][obs_sequence[t]]),
                        "prev": prev_st_selected}

    # Get the most probable final state.
    # FIX: the original initialized best_st = None with max_prob = -inf and a
    # strict '>' comparison, so when every terminal score is -inf (e.g. a word
    # unseen by emit_p in log space) best_st stayed None and the backtrack
    # below raised KeyError: None.  Seeding from the first entry guarantees a
    # valid state is always selected.
    best_st = None
    max_prob = float('-inf')
    for st, data in V[-1].items():
        if best_st is None or data["prob"] > max_prob:
            max_prob = data["prob"]
            best_st = st

    # Follow the backtrack from the last observation to the first.
    opt = [best_st]
    previous = best_st
    for t in range(len(V) - 2, -1, -1):
        previous = V[t + 1][previous]["prev"]
        opt.insert(0, previous)

    return (opt, max_prob)
# Try it with normal probabilities.
sent = tagged_sentences[3]
words, gold_tags = zip(*sent)
pred_tags, prob = viterbi(words, np.array(all_words), np.array(all_tags),
                          start_p, trans_p, emit_p, logspace=False)

# Print each word alongside its gold-standard and predicted tag.
for word, gold, pred in zip(words, gold_tags, pred_tags):
    print(f'{word:20}\t{gold:10}\t{pred:10}')

# Try it with log probabilities.
sent = tagged_sentences[3]
words, gold_tags = zip(*sent)
pred_tags, prob = viterbi(words, np.array(all_words), np.array(all_tags),
                          start_p_log, trans_p_log, emit_p_log)

for word, gold, pred in zip(words, gold_tags, pred_tags):
    print(f'{word:20}\t{gold:10}\t{pred:10}')