33 Exact Sampling and Scoring
;; the hidden categories (stop marks the end of a sequence)
(def categories '(N V Adj Adv P stop))
;; the observable words
(def vocabulary '(Call me Ishmael))
(defn logsumexp
  "Compute log2 of the sum of 2^v for each v in log-vals, stably, by
   factoring out the maximum value."
  [& log-vals]
  (let [mx (apply max log-vals)]
    (+ mx
       (Math/log2
        (apply +
               (map (fn [z] (Math/pow 2 z))
                    (map (fn [x] (- x mx))
                         log-vals)))))))
(defn flip
  [p]
  (< (rand 1) p))
(declare normalize) ;; normalize is defined just below
(defn sample-categorical
  "Sample an outcome from a categorical distribution whose probabilities
   params sum to one."
  [outcomes params]
  (if (flip (first params))
    (first outcomes)
    (sample-categorical (rest outcomes)
                        (normalize (rest params)))))
(defn score-categorical
  "Return the log2 probability of outcome under a categorical distribution."
  [outcome outcomes params]
  (if (empty? params)
    (throw "no matching outcome")
    (if (= outcome (first outcomes))
      (Math/log2 (first params))
      (score-categorical outcome (rest outcomes) (rest params)))))
(defn normalize
  [params]
  (let [sum (apply + params)]
    (map (fn [x] (/ x sum)) params)))
(defn sample-gamma
  "Sample from a Gamma distribution with integer shape, as the sum of
   shape exponential draws scaled by scale. Uses log base 2, so each draw
   is off by a constant factor of log2(e); this constant cancels when the
   samples are normalized, as in sample-dirichlet below."
  [shape scale]
  (* scale
     (apply +
            (repeatedly shape
                        (fn [] (- (Math/log2 (rand))))))))
(defn sample-dirichlet
  [pseudos]
  (let [gammas (map (fn [sh] (sample-gamma sh 1))
                    pseudos)]
    (normalize gammas)))
(defn update-context
  "Add new-symbol to the context, keeping at most (order - 1) symbols by
   dropping the oldest symbol once the context is full."
  [order old-context new-symbol]
  (if (>= (count old-context) order)
    (throw "Context too long!")
    (if (= (count old-context) (- order 1))
      (concat (rest old-context) (list new-symbol))
      (concat old-context (list new-symbol)))))
(defn hmm-unfold
  "Unfold an HMM into a sequence of [state observation] pairs, ending with
   the state for which stop? returns true."
  [transition observation order context current stop?]
  (if (stop? current)
    (list current)
    (let [new-context (update-context order context current)
          nxt (transition new-context)]
      (cons [current (observation current)]
            (hmm-unfold transition
                        observation
                        order
                        new-context
                        nxt
                        stop?)))))
(defn all-but-last
  "Return l without its final element."
  [l]
  (cond (empty? l) (throw "all-but-last called on an empty list")
        (empty? (rest l)) '()
        :else (cons (first l) (all-but-last (rest l)))))
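Before turning to inference, it helps to see these helpers in action. Below is a minimal usage sketch, not part of the model definitions above: it draws random transition and observation distributions from uniform Dirichlet priors and unfolds a single sequence from the resulting bigram (order 2) HMM. The start symbol, the maker functions, and the uniform pseudo-counts are all assumptions introduced here for illustration.
;; Illustrative sketch only: sample a random order-2 HMM and unfold one
;; sequence. The start symbol and the uniform Dirichlet pseudo-counts are
;; assumptions made for this example.
(defn make-transition
  "Sample one categorical distribution over categories per conditioning
   category and return a transition function over contexts."
  []
  (let [dists (into {}
                    (map (fn [c]
                           [c (sample-dirichlet (repeat (count categories) 1))])
                         (cons 'start categories)))]
    ;; the context is a list of preceding categories; condition on the last
    (fn [context] (sample-categorical categories (dists (last context))))))
(defn make-observation
  "Sample one categorical distribution over words per category and return
   an observation function."
  []
  (let [dists (into {}
                    (map (fn [c]
                           [c (sample-dirichlet (repeat (count vocabulary) 1))])
                         categories))]
    (fn [category] (sample-categorical vocabulary (dists category)))))
(let [transition (make-transition)
      observation (make-observation)]
  (hmm-unfold transition observation
              2        ;; bigram order: contexts hold one category
              '(start) ;; initial context
              (transition '(start))
              (fn [c] (= c 'stop))))
Each run returns a list of [category word] pairs followed by the stop symbol.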
Our goal is to represent the posterior distribution over sequences of categories for an HMM/FSA.
\[\Pr(c^{(1)},\cdots,c^{(k)} \mid w^{(1)}, \cdots, w^{(k)})=\frac{\prod_{i=1}^k \Pr(w^{(i)} \mid c^{(i)}) \Pr(c^{(i)} \mid c^{(i-1)})}{\sum_{c^{\prime(1)},\cdots,c^{\prime(k)}}\prod_{i=1}^k \Pr(w^{(i)} \mid c^{\prime(i)}) \Pr(c^{\prime(i)} \mid c^{\prime(i-1)})}\]Recall that we can represent a distribution either as a sampler or a scorer. In this unit, we will look at how we can implement efficient samplers and scorers for this posterior distribution: algorithms that avoid the exponential blowup that arises if we try to represent this distribution in the naive way.
The denominator of the expression above is just the forward probability of the whole string. However, as before, this expression does not suggest an efficient way to represent the distribution. In particular, it suggests no way to sample a sequence other than to enumerate all sequences, score each one, and then renormalize using the forward probability. Since there are exponentially many sequences, this is not a practical way to sample from the posterior.
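To make this concrete, here is a sketch of the naive approach: enumerate every category sequence of the right length, score each one jointly with the string, and renormalize. The helper all-sequences and the score-sequence argument, which is assumed to return the log joint probability \(\log \Pr(c^{(1)},\cdots,c^{(k)},w^{(1)},\cdots,w^{(k)})\), are hypothetical names introduced only for this illustration.
;; Naive posterior sampling, for illustration only: the number of
;; sequences enumerated is |categories|^k.
(defn all-sequences
  "Enumerate every length-k sequence of states."
  [k states]
  (if (= k 0)
    '(())
    (mapcat (fn [c]
              (map (fn [tail] (cons c tail))
                   (all-sequences (- k 1) states)))
            states)))
(defn naive-posterior-sample
  "Sample a category sequence given words, by exhaustive enumeration.
   score-sequence is assumed to return log2 Pr(categories, words)."
  [words states score-sequence]
  (let [seqs (all-sequences (count words) states)
        log-joints (map (fn [cs] (score-sequence cs words)) seqs)
        log-z (apply logsumexp log-joints)]
    ;; renormalize in log space, then sample
    (sample-categorical seqs
                        (map (fn [lj] (Math/pow 2 (- lj log-z)))
                             log-joints))))
For the six categories above and a ten-word string, this already enumerates \(6^{10} \approx 6 \times 10^7\) sequences.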
Can we do better?
Let’s consider the posterior probability of an arbitrary state at time step \(k\). Let’s assume that we know that the state at time step \(k+1\) is equal to some particular state \(c^{(k+1)}\). We will consider the conditional distribution over states at time step \(k\), given the state \(c^{(k+1)}\) at time step \(k+1\) and the string. In other words,
\[\Pr(c^{(k)} \mid c^{(k+1)}, w^{(1)}, \cdots, w^{(n)})\]First, let’s consider the joint distribution,
\[\gamma_{k}(c^{(k)}, c^{(k+1)}) = \Pr(c^{(k)}, c^{(k+1)}, w^{(1)}, \cdots, w^{(n)})\]This is equal to
\[\mathbf{fw}(w^{(1)}, \cdots, w^{(k)}, c^{(k)})\Pr(c^{(k+1)}\mid c^{(k)})\Pr(w^{(k+1)}\mid c^{(k+1)})\mathbf{bk}(c^{(k+1)}, w^{(k+2)},\cdots,w^{(n)})\]which follows from the chain rule together with the Markov assumptions of the HMM: given \(c^{(k)}\), the prefix of the string is independent of everything later, and given \(c^{(k+1)}\), the suffix is independent of everything earlier. By the definition of conditional probability,
\[\Pr(c^{(k)} \mid c^{(k+1)}, w^{(1)}, \cdots, w^{(n)}) = \frac{\Pr(c^{(k)}, c^{(k+1)}, w^{(1)}, \cdots, w^{(n)})}{\sum_{\bar{c}^{(k)}} \Pr(\bar{c}^{(k)}, c^{(k+1)}, w^{(1)}, \cdots, w^{(n)})}\]Substituting in the expression above,
\[\Pr(c^{(k)} \mid c^{(k+1)}, w^{(1)}, \cdots, w^{(n)}) = \frac{\mathbf{fw}(w^{(1)}, \cdots, w^{(k)}, c^{(k)})\Pr(c^{(k+1)}\mid c^{(k)}) \Pr( w^{(k+1)} \mid c^{(k+1)}) \mathbf{bk}(c^{(k+1)}, w^{(k+2)},\cdots,w^{(n)})}{\sum_{\bar{c}^{(k)}} \mathbf{fw}(w^{(1)}, \cdots, w^{(k)}, \bar{c}^{(k)})\Pr(c^{(k+1)}\mid \bar{c}^{(k)}) \Pr( w^{(k+1)} \mid c^{(k+1)}) \mathbf{bk}(c^{(k+1)}, w^{(k+2)},\cdots,w^{(n)})}\]Finally, note that \(\Pr( w^{(k+1)} \mid c^{(k+1)})\) and \(\mathbf{bk}(c^{(k+1)}, w^{(k+2)},\cdots,w^{(n)})\) are constant in the expression above (they do not depend on \(c^{(k)}\)), so they cancel.
\[\Pr(c^{(k)} \mid c^{(k+1)}, w^{(1)}, \cdots, w^{(n)}) = \frac{\mathbf{fw}(w^{(1)}, \cdots, w^{(k)}, c^{(k)})\Pr(c^{(k+1)}\mid c^{(k)}) }{\sum_{\bar{c}^{(k)}} \mathbf{fw}(w^{(1)}, \cdots, w^{(k)}, \bar{c}^{(k)})\Pr(c^{(k+1)}\mid \bar{c}^{(k)})}\]Note what we have found here: we can compute the exact probability of each preceding state \(c^{(k)}\) given a specific following state \(c^{(k+1)}\) using only the forward probabilities and the transition probabilities. Thus, once we have computed the trellis of forward probabilities, we can use it to sample a complete path from the posterior distribution, working backward from the end of the string.
This algorithm is known as forward filtering, backward sampling.
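To sketch how this works in code, suppose we have a function log-fw such that (log-fw k c) returns the log forward probability \(\log \mathbf{fw}(w^{(1)},\cdots,w^{(k)},c)\) from a precomputed trellis, and a function log-transition returning \(\log \Pr(c^{(k+1)} \mid c^{(k)})\). Neither is defined in this section; both names are assumptions standing in for the forward algorithm of the previous units. All log probabilities are base 2, matching logsumexp above.
;; Forward filtering, backward sampling: a minimal sketch. log-fw and
;; log-transition are assumed to exist (the trellis would be built by the
;; forward algorithm); all log probabilities are base 2.
(defn sample-previous-state
  "Sample the state at position k given the sampled state at position k+1."
  [log-fw log-transition states k next-state]
  (let [log-weights (map (fn [c]
                           ;; fw(w(1..k), c) * Pr(next-state | c), in log space
                           (+ (log-fw k c)
                              (log-transition c next-state)))
                         states)
        log-z (apply logsumexp log-weights)]
    ;; renormalize in log space, then sample
    (sample-categorical states
                        (map (fn [lw] (Math/pow 2 (- lw log-z)))
                             log-weights))))
(defn ffbs
  "Sample a state sequence of length n from the posterior, walking
   backward from a sampled final state."
  [log-fw log-transition states n final-state]
  (loop [k (- n 1)
         next-state final-state
         path (list final-state)]
    (if (< k 1)
      path
      (let [c (sample-previous-state log-fw log-transition
                                     states k next-state)]
        (recur (- k 1) c (cons c path))))))
The final state \(c^{(n)}\) would itself be sampled in proportion to \(\mathbf{fw}(w^{(1)},\cdots,w^{(n)},c^{(n)})\). Each backward step renormalizes only as many weights as there are states, so once the trellis is built, sampling a whole sequence is linear in its length rather than exponential.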