(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
(enable-console-print!)
(defn log [a-thing]
(.log js/console a-thing))
(defn render-vega
  "Embeds the vega spec into the DOM element elem via vegaEmbed,
  attaching vega-tooltip and logging any errors."
  [spec elem]
(when spec
(let [spec (clj->js spec)
opts {:renderer "canvas"
:mode "vega"
:actions {
:export true,
:source true,
:compiled true,
:editor true}}]
(-> (js/vegaEmbed elem spec (clj->js opts))
(.then (fn [res]
(. js/vegaTooltip (vega (.-view res) spec))))
(.catch (fn [err]
(log err)))))))
(defn vega
"Reagent component that renders vega"
[spec]
(r/create-class
{:display-name "vega"
:component-did-mount (fn [this]
(render-vega spec (rd/dom-node this)))
:component-will-update (fn [this [_ new-spec]]
(render-vega new-spec (rd/dom-node this)))
:reagent-render (fn [spec]
[:div#vis])}))
;making a histogram from a list of observations
(defn list-to-hist-data-lite
  "Takes a list and returns a map in the right format for vega-lite data,
  with each list element becoming the value of a field named 'category'."
  [l]
  (defrecord rec [category])
  {:values (into [] (map ->rec l))})
(defn makehist-lite
  "Vega-lite spec for a histogram (count by 'category') of data."
  [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "category",
:type "ordinal"},
:y {:aggregate "count",
:type "quantitative"}
}
})
(defn list-to-hist-data
  "Takes a list and returns data in the right format for a vega spec:
  a raw dataset with each list element in a field named 'category',
  plus aggregated and sorted datasets derived from it."
  [l]
  (defrecord rec [category])
[{:name "raw",
:values (into [] (map ->rec l))}
{:name "aggregated"
:source "raw"
:transform
[{:as ["count"]
:type "aggregate"
:groupby ["category"]}]}
{:name "agg-sorted"
:source "aggregated"
:transform
[{:type "collect"
:sort {:field "category"}}]}
])
(defn makehist
  "Vega spec for a histogram of the 'category' field, sized by the
  number of distinct categories in data."
  [data]
(let [n (count (distinct ((data 0) :values)))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "agg-sorted", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "agg-sorted", :field "count"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "agg-sorted"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "count"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.count", :offset -2},
:text {:signal "tooltip.count"},
:fillOpacity [
{:test "isNaN(tooltip.count)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn hist
  "Builds a vega histogram component from the list of observations l."
  [l]
(-> l
list-to-hist-data
makehist
vega))
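;; usage sketch (hypothetical data): hist builds a Reagent component
;; that renders a vega histogram from a list of observations, e.g.
;; (hist '(the cat sat on the mat the cat))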
; for making bar plots
(defn list-to-barplot-data-lite
  "Takes a list of categories and a parallel list of amounts and returns
  a map in the right format for vega-lite data, with fields named
  'category' and 'amount'."
  [l m]
  (defrecord rec [category amount])
  {:values (into [] (map ->rec l m))})
(defn makebarplot-lite
  "Vega-lite spec for a bar plot of 'amount' by 'category' in data."
  [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "element", :type "ordinal"},
:y {:field "value", :type "quantitative"}
}
})
(defn list-to-barplot-data
  "Takes a list of categories and a parallel list of amounts and returns
  a dataset in the right format for a vega spec, with fields named
  'category' and 'amount'."
  [l m]
  (defrecord rec [category amount])
{:name "table",
:values (into [] (map ->rec l m))})
(defn makebarplot
  "Vega spec for a bar plot of 'amount' by 'category', sized by the
  number of categories in data."
  [data]
(let [n (count (data :values))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "table", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "table", :field "amount"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "table"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "amount"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.amount", :offset -2},
:text {:signal "tooltip.amount"},
:fillOpacity [
{:test "isNaN(tooltip.amount)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn barplot
  "Builds a vega bar plot component from categories l and amounts m."
  [l m]
(vega (makebarplot (list-to-barplot-data l m))))
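;; usage sketch (hypothetical data): barplot takes parallel lists of
;; categories and numeric amounts, e.g.
;; (barplot '(DET N V) '(4 2 1))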
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
(defn count-up-to-right
  "Counts how many times we must move up from loc before reaching a
  location that has a right sibling (0 if loc has no parent)."
  [loc]
(if (z/up loc)
(loop [x loc, pops 0]
(if (z/right x)
pops
(recur (z/up x) (inc pops))))
0))
(defn list-to-tree-spec
  "Takes a list, walks through it (with the clojure.zip library), and
  builds the record format for the spec needed for vega."
  [l]
(loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
(cond
(z/end? loc) acc
(z/end? (z/next loc))
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))})
(and (z/node loc) (not (z/branch? loc)))
(recur
(z/next loc)
(inc next-id)
(cond
(not (z/right loc))
(let [n (count-up-to-right loc)
popn (apply comp (repeat n pop))]
(some-> parent-ids not-empty popn))
(not (z/left loc))
(conj parent-ids next-id)
:else parent-ids)
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))}))
:else
(recur (z/next loc) next-id parent-ids acc))))
(defn maketree
  "Makes a vega spec for a tree, given tree-spec in the right json-like
  format, with overall width w and height h."
  [w h tree-spec]
{:$schema "https://vega.github.io/schema/vega/v5.json"
:data [{:name "tree"
:transform [{:key "id" :parentKey "parent" :type "stratify"}
{:as ["x" "y" "depth" "children"]
:method {:signal "layout"}
:size [{:signal "width"} {:signal "height"}]
:type "tree"}]
:values tree-spec
}
{:name "links"
:source "tree"
:transform [{:type "treelinks"}
{:orient "horizontal"
:shape {:signal "links"}
:type "linkpath"}]}]
:height h
:marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
:from {:data "links"}
:type "path"}
{:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
:update {:fill {:field "depth" :scale "color"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "symbol"}
{:encode {:enter {:baseline {:value "bottom"}
:font {:value "Courier"}
:fontSize {:value 14}
:angle {:value 0}
:text {:field "name"}}
:update {:align {:value "center"}
:dy {:value -6}
:opacity {:signal "labels ? 1 : 0"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "text"}]
:padding 5
:scales [{:domain {:data "tree" :field "depth"}
:name "color"
:range {:scheme "magma"}
:type "linear"
:zero true}]
:signals [{:bind {:input "checkbox"} :name "labels" :value true}
{:bind {:input "radio" :options ["tidy" "cluster"]}
:name "layout"
:value "tidy"}
{:name "links"
:value "line"}]
:width w}
)
(defn tree-depth
"get the depth of a tree (list)"
[list]
(if (seq? list)
(inc (apply max 0 (map tree-depth list)))
0))
(defn tree
"plot tree using vega"
[list]
(let [spec (list-to-tree-spec list)
h (* 30 (tree-depth list))]
(vega (maketree 700 h spec))))
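;; usage sketch (hypothetical data): tree plots a nested list as a tree, e.g.
;; (tree '(S (NP (N Ishmael)) (VP (V sailed))))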
(defn prefix?
  "True if the sequence pr is a prefix of the sequence s."
  [pr s]
  (if (> (count pr) (count s))
    false
    (if (empty? pr)
      true
      (if (= (first pr) (first s))
        (prefix? (rest pr) (rest s))
        false))))
(defn flip
  "Returns true with probability p."
  [p]
  (< (rand 1) p))
(defn normalize
  "Rescales params so that they sum to 1."
  [params]
(let [sum (apply + params)]
(map (fn [x] (/ x sum)) params)))
(defn list-foldr
  "Right fold: combines the elements of lst with f, starting from base."
  [f base lst]
(if (empty? lst)
base
(f (first lst)
(list-foldr f base (rest lst)))))
(defn sample-categorical
  "Samples an outcome from the categorical distribution defined by the
  list outcomes and the parallel probability list params."
  [outcomes params]
(if (flip (first params))
(first outcomes)
(sample-categorical (rest outcomes)
(normalize (rest params)))))
(defn score-categorical
  "Returns the probability of outcome under the categorical distribution
  defined by outcomes and params."
  [outcome outcomes params]
(if (empty? params)
(throw "no matching outcome")
(if (= outcome (first outcomes))
(first params)
(score-categorical outcome (rest outcomes) (rest params)))))
(defn score-BOW-sentence
  "Probability of the sentence sen under the bag-of-words model defined
  by vocabulary and probabilities."
  [sen]
(list-foldr
(fn [word rest-score]
(* (score-categorical word vocabulary probabilities)
rest-score))
1
sen))
(defn score-corpus
  "Probability of corpus: the product of its sentence probabilities."
  [corpus]
(list-foldr
(fn [sen rst]
(* (score-BOW-sentence sen) rst))
1
corpus))
(defn log-score-BOW-sentence
  "Log (base 2) probability of the sentence sen under the bag-of-words model."
  [sen]
(list-foldr
(fn [word rest-score]
(+ (Math/log2 (score-categorical word vocabulary probabilities))
rest-score))
0
sen))
(defn log-score-corpus
  "Log (base 2) probability of corpus: the sum of its sentence log scores."
  [corpus]
(list-foldr
(fn [sen rest-score]
(+ (log-score-BOW-sentence sen) rest-score))
0
corpus))
As we emphasized in
Classes of Formal Languages,
our primary scientific goal is to give a mathematical characterization
of universal grammar modeled—for the time being—as a
class of formal languages. Now that we have begun looking at
probabilistic formal languages defined using categorical distributions
over the vocabulary, we can easily generalize the idea of a class of
formal languages to a class of probabilistic formal languages or,
more accurately, a class of models of probabilistic languages. The
bag-of-words approach to generating and scoring sentence structures
that we have set up constitutes a class of models, where each
particular model is given by choosing some specific vocabulary and
distribution over this vocabulary.
Why should we care about classes of probabilistic models? One reason
is that by choosing such a class of models, we can begin to define
procedures for doing inference over hypotheses: for example,
procedures describing how learners might choose between models in
light of some input data. Such inference procedures can serve as
formal models of language learning and language processing. In order
to define them, we will need a way of choosing between models in a
class based on some probabilistic criterion.
Comparing Models in the Class of Categorical BOW Models
If we wish to compare how well models do with respect to a particular
training data set, one natural way to do this is by using the
likelihood of each model given the dataset. In the last chapter, we
introduced the class of categorical BOW models. Each model in this
class is parameterized or indexed by some parameter vector
\(\vec{\theta} \in \Theta\). We gave an expression for the log
likelihood of the model indexed by \(\vec{\theta}\):
\[\mathcal{L}(\vec{\theta}; \mathbf{C})\]
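To make the dependence on the parameters explicit in code, here is a
minimal sketch of a scoring function that takes the probability vector
as an argument instead of using the globally defined probabilities
(the name log-score-corpus-under is introduced here for illustration;
it reuses list-foldr, score-categorical, and Math/log2 from the
definitions above). Comparing two models in the class then amounts to
calling it with the same corpus but different parameter vectors.
(defn log-score-corpus-under
  "Log (base 2) likelihood of corpus under the categorical BOW model
  indexed by the probability vector params over outcomes."
  [outcomes params corpus]
  (list-foldr
   (fn [sen rest-score]
     (+ (list-foldr
         (fn [word sen-score]
           (+ (Math/log2 (score-categorical word outcomes params)) sen-score))
         0
         sen)
        rest-score))
   0
   corpus))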
With such a simple class of models, it may be hard to see how one
model could be much better or worse than another: they all have
very little structure. But consider the following example. Suppose our
corpus consists of seven copies of the sentence Call me Ishmael.
(def my-corpus
'((Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)))
(def vocabulary
'(Call me Ishmael Some years ago - never mind
how long precisely having little or no money
in my purse and nothing particular to interest
on shore I thought would sail about a see the
watery part of world))
(def probabilities
(repeatedly
(count vocabulary)
(fn [] (/ 1 (count vocabulary)))))
(log-score-corpus my-corpus)
Here we have replaced our corpus with many repeats of the same
sentence, while holding our vocabulary constant. The probability of
each element of our vocabulary is \(\frac{1}{|V|} = \frac{1}{39}\)
because we used a uniform distribution over the entire set of words
from before. However, our corpus uses only three of these words,
several times each. Clearly we can do better by setting the
probabilities associated with the words Call, me, and Ishmael
higher than the probabilities of the other words.
Let’s see what happens when we reparameterize our categorical to place
all of its probability mass on just the three words that appear in the
corpus.
(def vocabulary-subset '(Call me Ishmael))
(def vocabulary-subset-size (count vocabulary-subset))
(def probabilities
(concat
; uniform on subset
(repeat vocabulary-subset-size (/ 1 vocabulary-subset-size))
; zero outside subset
(repeat (- (count vocabulary) vocabulary-subset-size) 0)))
(log-score-corpus my-corpus)
As we can see, this model assigns a significantly higher score to this
corpus. This is, in fact, a much higher probability: remember that the
(base-2) log probability is the negative of the number of times we must
multiply \(\frac{1}{2}\) together to get the probability.
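Concretely, the corpus contains \(7 \times 3 = 21\) word tokens. Under
the uniform model every token has probability \(\frac{1}{39}\), while
under the reparameterized model every token has probability
\(\frac{1}{3}\), so the two log scores are roughly
\[21 \cdot \log_2 \tfrac{1}{39} \approx -111
\qquad\text{versus}\qquad
21 \cdot \log_2 \tfrac{1}{3} \approx -33.3\]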
The Principle of Maximum Likelihood
This method of choosing hypotheses is known in statistics as the
principle of maximum likelihood (we will see others later). This
principle states that we should choose the model in our class which
maximizes the likelihood of our data. If our class of models is
indexed by some parameter vector \(\vec\theta \in \Theta\) then the
principle of maximum likelihood says we should choose the model indexed
by the parameters \(\vec{\hat{\theta}}{}^{\mathrm{ML}}\) that maximize
the (log) likelihood of the model given the data \(\mathbf{C}\), i.e.,
\[\DeclareMathOperator*{\argmax}{arg\,max}
\vec{\hat{\theta}}{}^{\mathrm{ML}} = \argmax_{\vec{\theta} \in \Theta} \mathcal{L}(\vec{\theta};\mathbf{C})\]
Recall that the likelihood of the corpus under our model is:
\[\Pr(\mathbf{C} \mid \vec{\theta})
= \prod_{w \in V} \theta_{w}^{n_{w}}\]
In other words, it is just the product of the probabilities of each
word raised to the number of times that word appeared in the corpus.
Thus, the log likelihood is as follows.
\[\mathcal{L}(\vec{\theta};\mathbf{C})
= \log \prod_{w \in V} \theta_{w}^{n_{w}}
=\sum_{w \in V} n_{w}\log(\theta_{w})\]
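As a check on this identity, here is a sketch that computes
\(\sum_{w} n_{w}\log_2(\theta_{w})\) directly from word counts (the
name log-likelihood-from-counts is introduced here for illustration;
clojure.core's frequencies does the counting). It should agree with
log-score-corpus, which folds over the corpus token by token.
(defn log-likelihood-from-counts
  "Log (base 2) likelihood of corpus computed as sum_w n_w * log2(theta_w),
  where theta is given by outcomes and params."
  [outcomes params corpus]
  (let [word-counts (frequencies (apply concat corpus))]
    (reduce (fn [total [word n]]
              (+ total (* n (Math/log2 (score-categorical word outcomes params)))))
            0
            word-counts)))
;; e.g. (log-likelihood-from-counts vocabulary probabilities my-corpus)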
Which probabilities maximize this quantity? It is a bit complicated to
show (the usual proof uses Lagrange multipliers), but the
probabilities for each word which maximize the log likelihood in the
case of a BOW model are
\[\hat{\theta}{}^{\mathrm{ML}}_{w}
= \frac{n_{w}}{\sum_{w^\prime \in V}
n_{w^\prime}}
= \frac{n_{w}}{|\mathbf{C}|}\]
In other words, the optimal probabilities for each word under the
principle of maximum likelihood are simply the renormalized
counts. Now we can see why restricting the probability mass to just the
words that appear in the corpus gave us a higher likelihood: that
distribution is the maximum likelihood estimator.
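We can compute this estimate directly for our corpus. The following
sketch (the names corpus-word-counts and mle-probabilities are
introduced here for illustration) renormalizes the counts over the full
vocabulary:
;; maximum likelihood estimates: theta_w = n_w / |C|
(def corpus-word-counts (frequencies (apply concat my-corpus)))
(def mle-probabilities
  (let [total (apply + (vals corpus-word-counts))]
    (map (fn [w] (/ (get corpus-word-counts w 0) total)) vocabulary)))
Each of Call, me, and Ishmael appears 7 times out of 21 tokens, so its
estimate is \(\frac{7}{21} = \frac{1}{3}\), and every other word gets
probability 0: exactly the reparameterization that produced the higher
score above.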