(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
(enable-console-print!)
(defn log [a-thing]
(.log js/console a-thing))
(defn render-vega [spec elem]
(when spec
(let [spec (clj->js spec)
opts {:renderer "canvas"
:mode "vega"
:actions {
:export true,
:source true,
:compiled true,
:editor true}}]
(-> (js/vegaEmbed elem spec (clj->js opts))
(.then (fn [res]
(. js/vegaTooltip (vega (.-view res) spec))))
(.catch (fn [err]
(log err)))))))
(defn vega
"Reagent component that renders vega"
[spec]
(r/create-class
{:display-name "vega"
:component-did-mount (fn [this]
(render-vega spec (rd/dom-node this)))
:component-will-update (fn [this [_ new-spec]]
(render-vega new-spec (rd/dom-node this)))
:reagent-render (fn [spec]
[:div#vis])}))
;making a histogram from a list of observations
(defn list-to-hist-data-lite [l]
""" takes a list and returns a record
in the right format for vega data,
with each list element the label to a field named 'x'"""
(defrecord rec [category])
{:values (into [] (map ->rec l))})
(defn makehist-lite [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "category",
:type "ordinal"},
:y {:aggregate "count",
:type "quantitative"}
}
})
(defn list-to-hist-data [l]
""" takes a list and returns a record
in the right format for vega data,
with each list element the label to a field named 'x'"""
(defrecord rec [category])
[{:name "raw",
:values (into [] (map ->rec l))}
{:name "aggregated"
:source "raw"
:transform
[{:as ["count"]
:type "aggregate"
:groupby ["category"]}]}
{:name "agg-sorted"
:source "aggregated"
:transform
[{:type "collect"
:sort {:field "category"}}]}
])
(defn makehist [data]
(let [n (count (distinct ((data 0) :values)))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "agg-sorted", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "agg-sorted", :field "count"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "agg-sorted"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "count"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.count", :offset -2},
:text {:signal "tooltip.count"},
:fillOpacity [
{:test "isNaN(tooltip.count)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn hist [l]
(-> l
list-to-hist-data
makehist
vega))
; for making bar plots
(defn list-to-barplot-data-lite [l m]
""" takes a list and returns a record
in the right format for vega data,
with each list element the label to a field named 'x'"""
(defrecord rec [category amount])
{:values (into [] (map ->rec l m))})
(defn makebarplot-lite [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "element", :type "ordinal"},
:y {:field "value", :type "quantitative"}
}
})
(defn list-to-barplot-data [l m]
""" takes a list and returns a record
in the right format for vega data,
with each list element the label to a field named 'x'"""
(defrecord rec [category amount])
{:name "table",
:values (into [] (map ->rec l m))})
(defn makebarplot [data]
(let [n (count (data :values))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "table", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "table", :field "amount"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "table"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "amount"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.amount", :offset -2},
:text {:signal "tooltip.amount"},
:fillOpacity [
{:test "isNaN(tooltip.amount)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn barplot [l m]
(vega (makebarplot (list-to-barplot-data l m))))
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
(defn count-up-to-right [loc]
(if (z/up loc)
(loop [x loc, pops 0]
(if (z/right x)
pops
(recur (z/up x) (inc pops))))
0))
(defn list-to-tree-spec [l]
""" takes a list and walks through it (with clojure.zip library)
and builds the record format for the spec needed to for vega"""
(loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
(cond
(z/end? loc) acc
(z/end? (z/next loc))
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))})
(and (z/node loc) (not (z/branch? loc)))
(recur
(z/next loc)
(inc next-id)
(cond
(not (z/right loc))
(let [n (count-up-to-right loc)
popn (apply comp (repeat n pop))]
(some-> parent-ids not-empty popn))
(not (z/left loc))
(conj parent-ids next-id)
:else parent-ids)
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))}))
:else
(recur (z/next loc) next-id parent-ids acc))))
(defn maketree [w h tree-spec]
""" makes vega spec for a tree given tree-spec in the right json-like format """
{:$schema "https://vega.github.io/schema/vega/v5.json"
:data [{:name "tree"
:transform [{:key "id" :parentKey "parent" :type "stratify"}
{:as ["x" "y" "depth" "children"]
:method {:signal "layout"}
:size [{:signal "width"} {:signal "height"}]
:type "tree"}]
:values tree-spec
}
{:name "links"
:source "tree"
:transform [{:type "treelinks"}
{:orient "horizontal"
:shape {:signal "links"}
:type "linkpath"}]}]
:height h
:marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
:from {:data "links"}
:type "path"}
{:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
:update {:fill {:field "depth" :scale "color"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "symbol"}
{:encode {:enter {:baseline {:value "bottom"}
:font {:value "Courier"}
:fontSize {:value 14}
:angle {:value 0}
:text {:field "name"}}
:update {:align {:signal "datum.children ? 'center' : 'center'"}
:dy {:signal "datum.children ? -6 : -6"}
:opacity {:signal "labels ? 1 : 0"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "text"}]
:padding 5
:scales [{:domain {:data "tree" :field "depth"}
:name "color"
:range {:scheme "magma"}
:type "linear"
:zero true}]
:signals [{:bind {:input "checkbox"} :name "labels" :value true}
{:bind {:input "radio" :options ["tidy" "cluster"]}
:name "layout"
:value "tidy"}
{:name "links"
:value "line"}]
:width w}
)
(defn tree-depth
"get the depth of a tree (list)"
[list]
(if (seq? list)
(inc (apply max 0 (map tree-depth list)))
0))
(defn tree
"plot tree using vega"
[list]
(let [spec (list-to-tree-spec list)
h (* 30 (tree-depth list))]
(vega (maketree 700 h spec))))
As emphasized in Classes of Languages and Universal
Grammar, we are really interested in the
question of defining a modeling framework that captures possible human
languages, rather than individual human languages. From this
perspective, it is useful to ask what class of languages can be
defined by strictly-local grammars and automata.
The strictly \(n\)-local languages over vocabulary \(V\) are those
languages for which there exist strictly \(n\)-local grammars.
\[\mathrm{SL}_n = \left\{L \in \mathcal{P}({V^{*}}) \,\middle|\,
(\exists
\mathcal{G})[L=L(\mathcal{G}) \wedge \mathcal{G}\
\mathrm{is\ strictly}\ n\mathrm{-local} ]\right\}\]
In other words, the class of all SL\(_n\) languages (with respect to
alphabet \(V\)) is the class of all strictly \(n\)-local languages.
The strictly local languages are the union of all the strictly
\(n\)-local languages.
\[\mathrm{SL} = \bigcup\limits_{n \in \mathbb{N}} \mathrm{SL}_{n}\]
Hierarchies
One of the dual goals of this course is to understand how different
kinds of approaches to grammar can define different kinds of patterns
in strings or sentences. One of the most useful tools in the toolbox
of formal language theory is the notion of a hierarchy of classes of
formal languages. A hierarchy of formal languages (or models), is an
ordered sequence of classes of formal languages where classes later in
the sequence contain classes earlier in the sequence. This means that
patterns that can be captured by models lower in the hierarchy can be
captured by models higher in the hierarchy, but not vice versa.
The strictly-local languages represent our first example of a
hierarchy of such formal languages.
To see this, consider the relationship between \(\mathrm{SL}_n\) and
\(\mathrm{SL}_l\) for \(n \neq l\)? In general, we claim that
\(\mathrm{SL}_n \subsetneq \mathrm{SL}_{n+i} \forall n > 1, i \geq
1\). In other words,
\[\mathrm{SL}_1 \subsetneq \mathrm{SL}_2 \subsetneq \mathrm{SL}_3
\subsetneq \dots\]
How can we prove this? We need to show two things.
First, we need to show that each class in the hierarchy is contained
within the subsequent class. That is, \(\forall n[(\forall i\geq 1)[L
\in \mathrm{SL}_n \implies L \in \mathrm{SL}_{n+i}]]\). Then we need
to show that each class is proper subset of the next. That is,
\(\forall n[(\exists L)[L \in \mathrm{SL}_{n+1} \wedge L \notin
\mathrm{SL}_{n}]]\).
-
Proof part 1: \(\forall n[(\forall i\geq 1)[L \in \mathrm{SL}_n \implies L \in \mathrm{SL}_{n+i}]]\)
To prove this, we construct a strictly \((n+i)\)-local
grammar/automaton, \(\langle V_{n+i}, T_{n+i}\rangle\), from a
strictly \(n\)-local grammar/automaton \(\langle V_n,
T_n\rangle\).
Set \(V_{n+i} = V_{n}\) and let \(T_{n+i} =
\mathbf{F}_{n+i}(L(\langle V_n, T_n\rangle))\). We claim that \(L(\langle V_{n+i}, T_{n+i}\rangle) =L(\langle V_n, T_n\rangle)\). First, consider an arbitrary string \(x \in L(\langle V_n,
T_n\rangle)\), since \(\mathbf{F}_{n+i}\) reads off all of the
\(n+i\) factors in \(x\), then \(x\) must be in \(L(\langle
V_{n+i}, T_{n+i})\rangle\) since all of its factors must be in
\(T_{n+i}\). Thus \(L(\langle V_{n}, T_{n}\rangle) \subseteq
L(\langle V_{n+i}, T_{n+i}\rangle)\).
It suffices to show that \(L(\langle V_{n+i}, T_{n+i}\rangle)
\subseteq L(\langle V_{n}, T_{n}\rangle)\). We show this by
contradiction. Choose an arbitrary string \(y \in L(\langle
V_{n+i}, T_{n+i}\rangle)\). If \(y \notin L(\langle V_{n},
T_{n}\rangle)\) then it must contain a factor, \(z\) that is not
in \(T_{n}\). However, because \(T_{n+i} =
\mathbf{F}_{n+i}(L(\langle V_n, T_n\rangle))\),
\(\mathbf{F}_n(T_{n+i})\) cannot add any factors not in \(T_{n}\),
thus \(z\) could not be part of \(y\).
□
-
Proof part 2: \(\forall n[(\exists L)[L \in \mathrm{SL}_{n+1} \wedge L \notin \mathrm{SL}_{n}]]\)
We leave this second half of the proof as an exercise.
Weak Generative Capacity
It is useful at this point to introduce some additional terminology.
The generative capacity of a formalism (type of grammar or type of
automaton) defined as the sets that that formalism can be used to
define. When we specifically care about the string languages definable
by a formalism, the term weak generative capacity is used. In this
course, we will often examine the weak generative capacity of various
formalisms—finding surprising cases where similar formalisms define
different formal languages and different formalisms define identical
sets of formal languages. Weak generative capacity is often contrasted
with strong generative capacity which is typically used to mean the
capacity of a formalism to generate certain strings using some
particular kind of computation (e.g., using certain computation
graphs).
← 19 Dependencies between Words
21 N-gram Models →