(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
(enable-console-print!)
(defn log [a-thing]
(.log js/console a-thing))
(defn render-vega
  "Embeds the vega spec into the DOM element elem via vegaEmbed,
  attaching vega-tooltip and logging any errors."
  [spec elem]
(when spec
(let [spec (clj->js spec)
opts {:renderer "canvas"
:mode "vega"
:actions {
:export true,
:source true,
:compiled true,
:editor true}}]
(-> (js/vegaEmbed elem spec (clj->js opts))
(.then (fn [res]
(. js/vegaTooltip (vega (.-view res) spec))))
(.catch (fn [err]
(log err)))))))
(defn vega
"Reagent component that renders vega"
[spec]
(r/create-class
{:display-name "vega"
:component-did-mount (fn [this]
(render-vega spec (rd/dom-node this)))
:component-will-update (fn [this [_ new-spec]]
(render-vega new-spec (rd/dom-node this)))
:reagent-render (fn [spec]
[:div#vis])}))
;making a histogram from a list of observations
(defn list-to-hist-data-lite
  "Takes a list and returns a map in the right format for vega-lite data,
  with each list element becoming the value of a field named 'category'."
  [l]
  (defrecord rec [category])
  {:values (into [] (map ->rec l))})
(defn makehist-lite
  "Vega-lite spec for a histogram (count by 'category') of data."
  [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "category",
:type "ordinal"},
:y {:aggregate "count",
:type "quantitative"}
}
})
(defn list-to-hist-data
  "Takes a list and returns data in the right format for a vega spec:
  a raw dataset with each list element in a field named 'category',
  plus aggregated and sorted datasets derived from it."
  [l]
  (defrecord rec [category])
[{:name "raw",
:values (into [] (map ->rec l))}
{:name "aggregated"
:source "raw"
:transform
[{:as ["count"]
:type "aggregate"
:groupby ["category"]}]}
{:name "agg-sorted"
:source "aggregated"
:transform
[{:type "collect"
:sort {:field "category"}}]}
])
(defn makehist
  "Vega spec for a histogram of the 'category' field, sized by the
  number of distinct categories in data."
  [data]
(let [n (count (distinct ((data 0) :values)))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "agg-sorted", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "agg-sorted", :field "count"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "agg-sorted"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "count"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.count", :offset -2},
:text {:signal "tooltip.count"},
:fillOpacity [
{:test "isNaN(tooltip.count)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn hist
  "Builds a vega histogram component from the list of observations l."
  [l]
(-> l
list-to-hist-data
makehist
vega))
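;; usage sketch (hypothetical data): hist builds a Reagent component
;; that renders a vega histogram from a list of observations, e.g.
;; (hist '(the cat sat on the mat the cat))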
; for making bar plots
(defn list-to-barplot-data-lite
  "Takes a list of categories and a parallel list of amounts and returns
  a map in the right format for vega-lite data, with fields named
  'category' and 'amount'."
  [l m]
  (defrecord rec [category amount])
  {:values (into [] (map ->rec l m))})
(defn makebarplot-lite
  "Vega-lite spec for a bar plot of 'amount' by 'category' in data."
  [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "element", :type "ordinal"},
:y {:field "value", :type "quantitative"}
}
})
(defn list-to-barplot-data
  "Takes a list of categories and a parallel list of amounts and returns
  a dataset in the right format for a vega spec, with fields named
  'category' and 'amount'."
  [l m]
  (defrecord rec [category amount])
{:name "table",
:values (into [] (map ->rec l m))})
(defn makebarplot
  "Vega spec for a bar plot of 'amount' by 'category', sized by the
  number of categories in data."
  [data]
(let [n (count (data :values))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "table", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "table", :field "amount"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "table"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "amount"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.amount", :offset -2},
:text {:signal "tooltip.amount"},
:fillOpacity [
{:test "isNaN(tooltip.amount)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn barplot
  "Builds a vega bar plot component from categories l and amounts m."
  [l m]
(vega (makebarplot (list-to-barplot-data l m))))
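;; usage sketch (hypothetical data): barplot takes parallel lists of
;; categories and numeric amounts, e.g.
;; (barplot '(DET N V) '(4 2 1))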
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
(defn count-up-to-right
  "Counts how many times we must move up from loc before reaching a
  location that has a right sibling (0 if loc has no parent)."
  [loc]
(if (z/up loc)
(loop [x loc, pops 0]
(if (z/right x)
pops
(recur (z/up x) (inc pops))))
0))
(defn list-to-tree-spec
  "Takes a list, walks through it (with the clojure.zip library), and
  builds the record format for the spec needed for vega."
  [l]
(loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
(cond
(z/end? loc) acc
(z/end? (z/next loc))
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))})
(and (z/node loc) (not (z/branch? loc)))
(recur
(z/next loc)
(inc next-id)
(cond
(not (z/right loc))
(let [n (count-up-to-right loc)
popn (apply comp (repeat n pop))]
(some-> parent-ids not-empty popn))
(not (z/left loc))
(conj parent-ids next-id)
:else parent-ids)
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))}))
:else
(recur (z/next loc) next-id parent-ids acc))))
(defn maketree
  "Makes a vega spec for a tree, given tree-spec in the right json-like
  format, with overall width w and height h."
  [w h tree-spec]
{:$schema "https://vega.github.io/schema/vega/v5.json"
:data [{:name "tree"
:transform [{:key "id" :parentKey "parent" :type "stratify"}
{:as ["x" "y" "depth" "children"]
:method {:signal "layout"}
:size [{:signal "width"} {:signal "height"}]
:type "tree"}]
:values tree-spec
}
{:name "links"
:source "tree"
:transform [{:type "treelinks"}
{:orient "horizontal"
:shape {:signal "links"}
:type "linkpath"}]}]
:height h
:marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
:from {:data "links"}
:type "path"}
{:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
:update {:fill {:field "depth" :scale "color"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "symbol"}
{:encode {:enter {:baseline {:value "bottom"}
:font {:value "Courier"}
:fontSize {:value 14}
:angle {:value 0}
:text {:field "name"}}
:update {:align {:value "center"}
:dy {:value -6}
:opacity {:signal "labels ? 1 : 0"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "text"}]
:padding 5
:scales [{:domain {:data "tree" :field "depth"}
:name "color"
:range {:scheme "magma"}
:type "linear"
:zero true}]
:signals [{:bind {:input "checkbox"} :name "labels" :value true}
{:bind {:input "radio" :options ["tidy" "cluster"]}
:name "layout"
:value "tidy"}
{:name "links"
:value "line"}]
:width w}
)
(defn tree-depth
"get the depth of a tree (list)"
[list]
(if (seq? list)
(inc (apply max 0 (map tree-depth list)))
0))
(defn tree
"plot tree using vega"
[list]
(let [spec (list-to-tree-spec list)
h (* 30 (tree-depth list))]
(vega (maketree 700 h spec))))
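;; usage sketch (hypothetical data): tree plots a nested list as a tree, e.g.
;; (tree '(S (NP (N Ishmael)) (VP (V sailed))))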
(defn prefix?
  "True if the sequence pr is a prefix of the sequence s."
  [pr s]
  (if (> (count pr) (count s))
    false
    (if (empty? pr)
      true
      (if (= (first pr) (first s))
        (prefix? (rest pr) (rest s))
        false))))
(defn flip
  "Returns true with probability p."
  [p]
  (< (rand 1) p))
(defn normalize
  "Rescales params so that they sum to 1."
  [params]
(let [sum (apply + params)]
(map (fn [x] (/ x sum)) params)))
(defn list-foldr
  "Right fold: combines the elements of lst with f, starting from base."
  [f base lst]
(if (empty? lst)
base
(f (first lst)
(list-foldr f base (rest lst)))))
(defn sample-categorical
  "Samples an outcome from the categorical distribution defined by the
  list outcomes and the parallel probability list params."
  [outcomes params]
(if (flip (first params))
(first outcomes)
(sample-categorical (rest outcomes)
(normalize (rest params)))))
(defn score-categorical
  "Returns the probability of outcome under the categorical distribution
  defined by outcomes and params."
  [outcome outcomes params]
(if (empty? params)
(throw "no matching outcome")
(if (= outcome (first outcomes))
(first params)
(score-categorical outcome (rest outcomes) (rest params)))))
(defn score-BOW-sentence
  "Probability of the sentence sen under the bag-of-words model defined
  by vocabulary and probabilities."
  [sen]
(list-foldr
(fn [word rest-score]
(* (score-categorical word vocabulary probabilities)
rest-score))
1
sen))
(defn score-corpus
  "Probability of corpus: the product of its sentence probabilities."
  [corpus]
(list-foldr
(fn [sen rst]
(* (score-BOW-sentence sen) rst))
1
corpus))
(defn log-score-BOW-sentence
  "Log (base 2) probability of the sentence sen under the bag-of-words model."
  [sen]
(list-foldr
(fn [word rest-score]
(+ (Math/log2 (score-categorical word vocabulary probabilities))
rest-score))
0
sen))
(defn log-score-corpus
  "Log (base 2) probability of corpus: the sum of its sentence log scores."
  [corpus]
(list-foldr
(fn [sen rest-score]
(+ (log-score-BOW-sentence sen) rest-score))
0
corpus))
As we emphasized in
Classes of Formal Languages,
our primary scientific goal is to give a mathematical characterization
of universal grammar modeled—for the time being—as a
class of formal languages. Now that we have begun looking at
probabilistic formal languages defined using categorical distributions
over the vocabulary, we can easily generalize the idea of a class of
formal languages to a class of probabilistic formal languages or,
more accurately, a class of models of probabilistic languages. The
bag-of-words approach to generating and scoring sentence structures
that we have set up constitutes a class of models, where each
particular model is given by choosing some specific vocabulary and
distribution over this vocabulary.
Why should we care about classes of probabilistic models? One reason
is that by choosing such a class of models, we can begin to define
procedures for doing inference over hypotheses: for example,
procedures describing how learners might choose between models in
light of some input data. Such inference procedures can serve as
formal models of language learning and language processing. In order
to define them, we will need a way of choosing between models in a
class based on some probabilistic criterion.
Comparing Models in the Class of Categorical BOW Models
If we wish to compare how well models do with respect to a particular
training data set, one natural way to do this is by using the
likelihood of each model given the dataset. In the last chapter, we
introduced the class of categorical BOW models. Each model in this
class is parameterized or indexed by some parameter vector
\(\vec{\theta} \in \Theta\). We gave an expression for the log
likelihood of the model indexed by \(\vec{\theta}\):
\[\mathcal{L}(\vec{\theta}; \mathbf{C})\]
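To make the dependence on the parameters explicit in code, here is a
minimal sketch of a scoring function that takes the probability vector
as an argument instead of using the globally defined probabilities
(the name log-score-corpus-under is introduced here for illustration;
it reuses list-foldr, score-categorical, and Math/log2 from the
definitions above). Comparing two models in the class then amounts to
calling it with the same corpus but different parameter vectors.
(defn log-score-corpus-under
  "Log (base 2) likelihood of corpus under the categorical BOW model
  indexed by the probability vector params over outcomes."
  [outcomes params corpus]
  (list-foldr
   (fn [sen rest-score]
     (+ (list-foldr
         (fn [word sen-score]
           (+ (Math/log2 (score-categorical word outcomes params)) sen-score))
         0
         sen)
        rest-score))
   0
   corpus))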
With such a simple class of models, it may be hard to see how one
model could be much better or worse than another: they all have
very little structure. But consider the following example. Suppose our
corpus consists of seven copies of the sentence Call me Ishmael.
(def my-corpus
'((Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)
(Call me Ishmael)))
(def vocabulary
'(Call me Ishmael Some years ago - never mind
how long precisely having little or no money
in my purse and nothing particular to interest
on shore I thought would sail about a see the
watery part of world))
(def probabilities
(repeatedly
(count vocabulary)
(fn [] (/ 1 (count vocabulary)))))
(log-score-corpus my-corpus)
Here we have replaced our corpus with many repeats of the same
sentence, while holding our vocabulary constant. The probability of
each element of our vocabulary is \(\frac{1}{|V|} = \frac{1}{39}\)
because we used a uniform distribution over the entire set of words
from before. However, our corpus uses only three of these words,
several times each. Clearly we can do better by setting the
probabilities associated with the words Call, me, and Ishmael
higher than the probabilities of the other words.
Let’s see what happens when we reparameterize our categorical to place
all of its probability mass on just the three words that appear in the
corpus.
(def vocabulary-subset '(Call me Ishmael))
(def vocabulary-subset-size (count vocabulary-subset))
(def probabilities
(concat
; uniform on subset
(repeat vocabulary-subset-size (/ 1 vocabulary-subset-size))
; zero outside subset
(repeat (- (count vocabulary) vocabulary-subset-size) 0)))
(log-score-corpus my-corpus)
As we can see, this model assigns a significantly higher score to this
corpus. This is, in fact, a much higher probability: remember that the
(base-2) log probability is the negative of the number of times we must
multiply \(\frac{1}{2}\) together to get the probability.
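Concretely, the corpus contains \(7 \times 3 = 21\) word tokens. Under
the uniform model every token has probability \(\frac{1}{39}\), while
under the reparameterized model every token has probability
\(\frac{1}{3}\), so the two log scores are roughly
\[21 \cdot \log_2 \tfrac{1}{39} \approx -111
\qquad\text{versus}\qquad
21 \cdot \log_2 \tfrac{1}{3} \approx -33.3\]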
The Principle of Maximum Likelihood
This method of choosing hypotheses is known in statistics as the
principle of maximum likelihood (we will see others later). This
principle states that we should choose the model in our class which
maximizes the likelihood of our data. If our class of models is
indexed by some parameter vector \(\vec\theta \in \Theta\) then the
principle of maximum likelihood says we should choose the model indexed
by the parameters \(\vec{\hat{\theta}}{}^{\mathrm{ML}}\) that maximize
the (log) likelihood of the model given the data \(\mathbf{C}\), i.e.,
\[\DeclareMathOperator*{\argmax}{arg\,max}
\vec{\hat{\theta}}{}^{\mathrm{ML}} = \argmax_{\vec{\theta} \in \Theta} \mathcal{L}(\vec{\theta};\mathbf{C})\]
Recall that the likelihood of the corpus under our model is:
\[\Pr(\mathbf{C} \mid \vec{\theta})
= \prod_{w \in V} \theta_{w}^{n_{w}}\]
In other words, it is just the product of the probabilities of each
word raised to the number of times that word appeared in the corpus.
Thus, the log likelihood is as follows.
\[\mathcal{L}(\vec{\theta};\mathbf{C})
= \log \prod_{w \in V} \theta_{w}^{n_{w}}
=\sum_{w \in V} n_{w}\log(\theta_{w})\]
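As a check on this identity, here is a sketch that computes
\(\sum_{w} n_{w}\log_2(\theta_{w})\) directly from word counts (the
name log-likelihood-from-counts is introduced here for illustration;
clojure.core's frequencies does the counting). It should agree with
log-score-corpus, which folds over the corpus token by token.
(defn log-likelihood-from-counts
  "Log (base 2) likelihood of corpus computed as sum_w n_w * log2(theta_w),
  where theta is given by outcomes and params."
  [outcomes params corpus]
  (let [word-counts (frequencies (apply concat corpus))]
    (reduce (fn [total [word n]]
              (+ total (* n (Math/log2 (score-categorical word outcomes params)))))
            0
            word-counts)))
;; e.g. (log-likelihood-from-counts vocabulary probabilities my-corpus)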
Which probabilities maximize this quantity? It is a bit complicated to
show (the usual proof uses Lagrange multipliers), but the
probabilities for each word which maximize the log likelihood in the
case of a BOW model are
\[\hat{\theta}{}^{\mathrm{ML}}_{w}
= \frac{n_{w}}{\sum_{w^\prime \in V}
n_{w^\prime}}
= \frac{n_{w}}{|\mathbf{C}|}\]
In other words, the optimal probabilities for each word under the
principle of maximum likelihood are simply the renormalized
counts. Now we can see why restricting the probability mass to just the
words that appear in the corpus gave us a higher likelihood: that
distribution is the maximum likelihood estimator.
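We can compute this estimate directly for our corpus. The following
sketch (the names corpus-word-counts and mle-probabilities are
introduced here for illustration) renormalizes the counts over the full
vocabulary:
;; maximum likelihood estimates: theta_w = n_w / |C|
(def corpus-word-counts (frequencies (apply concat my-corpus)))
(def mle-probabilities
  (let [total (apply + (vals corpus-word-counts))]
    (map (fn [w] (/ (get corpus-word-counts w 0) total)) vocabulary)))
Each of Call, me, and Ishmael appears 7 times out of 21 tokens, so its
estimate is \(\frac{7}{21} = \frac{1}{3}\), and every other word gets
probability 0: exactly the reparameterization that produced the higher
score above.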