(ns foundations.computational.linguistics (:require [reagent.core :as r] [reagent.dom :as rd] [clojure.zip :as z] [clojure.pprint :refer [pprint]] [clojure.string :refer [index-of]] ;[clojure.string :as str] )) (enable-console-print!) (defn log [a-thing] (.log js/console a-thing))

(defn render-vega [spec elem] (when spec (let [spec (clj->js spec) opts {:renderer "canvas" :mode "vega" :actions { :export true, :source true, :compiled true, :editor true}}] (-> (js/vegaEmbed elem spec (clj->js opts)) (.then (fn [res] (. js/vegaTooltip (vega (.-view res) spec)))) (.catch (fn [err] (log err))))))) (defn vega "Reagent component that renders vega" [spec] (r/create-class {:display-name "vega" :component-did-mount (fn [this] (render-vega spec (rd/dom-node this))) :component-will-update (fn [this [_ new-spec]] (render-vega new-spec (rd/dom-node this))) :reagent-render (fn [spec] [:div#vis])})) ;making a histogram from a list of observations (defn list-to-hist-data-lite [l] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category]) {:values (into [] (map ->rec l))}) (defn makehist-lite [data] { :$schema "https://vega.github.io/schema/vega-lite/v4.json", :data data, :mark "bar", :encoding { :x {:field "category", :type "ordinal"}, :y {:aggregate "count", :type "quantitative"} } }) (defn list-to-hist-data [l] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category]) [{:name "raw", :values (into [] (map ->rec l))} {:name "aggregated" :source "raw" :transform [{:as ["count"] :type "aggregate" :groupby ["category"]}]} {:name "agg-sorted" :source "aggregated" :transform [{:type "collect" :sort {:field "category"}}]} ]) (defn makehist [data] (let [n (count (distinct ((data 0) :values))) h 200 pad 5 w (if (< n 20) (* n 35) (- 700 (* 2 pad)))] { :$schema "https://vega.github.io/schema/vega/v5.json", :width w, :height h, :padding pad, :data data, :signals [ {:name "tooltip", :value {}, :on [{:events "rect:mouseover", :update "datum"}, {:events "rect:mouseout", :update "{}"}]} ], :scales [ {:name "xscale", :type "band", :domain {:data "agg-sorted", :field "category"}, :range "width", :padding 0.05, :round true}, {:name "yscale", :domain {:data "agg-sorted", :field "count"}, :nice true, :range "height"} ], :axes [ { :orient "bottom", :scale "xscale" }, { :orient "left", :scale "yscale" } ], :marks [ {:type "rect", :from {:data "agg-sorted"}, :encode { :enter { :x {:scale "xscale", :field "category"}, :width {:scale "xscale", :band 1}, :y {:scale "yscale", :field "count"}, :y2 {:scale "yscale", :value 0} }, :update {:fill {:value "steelblue"}}, :hover {:fill {:value "green"}} } }, {:type "text", :encode { :enter { :align {:value "center"}, :baseline {:value "bottom"}, :fill {:value "#333"} }, :update { :x {:scale "xscale", :signal "tooltip.category", :band 0.5}, :y {:scale "yscale", :signal "tooltip.count", :offset -2}, :text {:signal "tooltip.count"}, :fillOpacity [ {:test "isNaN(tooltip.count)", :value 0}, {:value 1} ] } } } ] })) (defn hist [l] (-> l list-to-hist-data makehist vega)) ; for making bar plots (defn list-to-barplot-data-lite [l m] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category amount]) {:values (into [] (map ->rec l m))}) (defn makebarplot-lite [data] { :$schema "https://vega.github.io/schema/vega-lite/v4.json", :data data, :mark "bar", :encoding { :x {:field "element", :type "ordinal"}, :y {:field "value", :type "quantitative"} } }) (defn list-to-barplot-data [l m] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category amount]) {:name "table", :values (into [] (map ->rec l m))}) (defn makebarplot [data] (let [n (count (data :values)) h 200 pad 5 w (if (< n 20) (* n 35) (- 700 (* 2 pad)))] { :$schema "https://vega.github.io/schema/vega/v5.json", :width w, :height h, :padding pad, :data data, :signals [ {:name "tooltip", :value {}, :on [{:events "rect:mouseover", :update "datum"}, {:events "rect:mouseout", :update "{}"}]} ], :scales [ {:name "xscale", :type "band", :domain {:data "table", :field "category"}, :range "width", :padding 0.05, :round true}, {:name "yscale", :domain {:data "table", :field "amount"}, :nice true, :range "height"} ], :axes [ { :orient "bottom", :scale "xscale" }, { :orient "left", :scale "yscale" } ], :marks [ {:type "rect", :from {:data "table"}, :encode { :enter { :x {:scale "xscale", :field "category"}, :width {:scale "xscale", :band 1}, :y {:scale "yscale", :field "amount"}, :y2 {:scale "yscale", :value 0} }, :update {:fill {:value "steelblue"}}, :hover {:fill {:value "green"}} } }, {:type "text", :encode { :enter { :align {:value "center"}, :baseline {:value "bottom"}, :fill {:value "#333"} }, :update { :x {:scale "xscale", :signal "tooltip.category", :band 0.5}, :y {:scale "yscale", :signal "tooltip.amount", :offset -2}, :text {:signal "tooltip.amount"}, :fillOpacity [ {:test "isNaN(tooltip.amount)", :value 0}, {:value 1} ] } } } ] })) (defn barplot [l m] (vega (makebarplot (list-to-barplot-data l m)))) ; now, for tree making ;(thanks to Taylor Wood's answer in this thread on stackoverflow: ; https://stackoverflow.com/questions/57911965) (defn count-up-to-right [loc] (if (z/up loc) (loop [x loc, pops 0] (if (z/right x) pops (recur (z/up x) (inc pops)))) 0)) (defn list-to-tree-spec [l] """ takes a list and walks through it (with clojure.zip library) and builds the record format for the spec needed to for vega""" (loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []] (cond (z/end? loc) acc (z/end? (z/next loc)) (conj acc {:id (str next-id) :name (str (z/node loc)) :parent (when (seq parent-ids) (str (peek parent-ids)))}) (and (z/node loc) (not (z/branch? loc))) (recur (z/next loc) (inc next-id) (cond (not (z/right loc)) (let [n (count-up-to-right loc) popn (apply comp (repeat n pop))] (some-> parent-ids not-empty popn)) (not (z/left loc)) (conj parent-ids next-id) :else parent-ids) (conj acc {:id (str next-id) :name (str (z/node loc)) :parent (when (seq parent-ids) (str (peek parent-ids)))})) :else (recur (z/next loc) next-id parent-ids acc)))) (defn maketree [w h tree-spec] """ makes vega spec for a tree given tree-spec in the right json-like format """ {:$schema "https://vega.github.io/schema/vega/v5.json" :data [{:name "tree" :transform [{:key "id" :parentKey "parent" :type "stratify"} {:as ["x" "y" "depth" "children"] :method {:signal "layout"} :size [{:signal "width"} {:signal "height"}] :type "tree"}] :values tree-spec } {:name "links" :source "tree" :transform [{:type "treelinks"} {:orient "horizontal" :shape {:signal "links"} :type "linkpath"}]}] :height h :marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}} :from {:data "links"} :type "path"} {:encode {:enter {:size {:value 50} :stroke {:value "#fff"}} :update {:fill {:field "depth" :scale "color"} :x {:field "x"} :y {:field "y"}}} :from {:data "tree"} :type "symbol"} {:encode {:enter {:baseline {:value "bottom"} :font {:value "Courier"} :fontSize {:value 14} :angle {:value 0} :text {:field "name"}} :update {:align {:signal "datum.children ? 'center' : 'center'"} :dy {:signal "datum.children ? -6 : -6"} :opacity {:signal "labels ? 1 : 0"} :x {:field "x"} :y {:field "y"}}} :from {:data "tree"} :type "text"}] :padding 5 :scales [{:domain {:data "tree" :field "depth"} :name "color" :range {:scheme "magma"} :type "linear" :zero true}] :signals [{:bind {:input "checkbox"} :name "labels" :value true} {:bind {:input "radio" :options ["tidy" "cluster"]} :name "layout" :value "tidy"} {:name "links" :value "line"}] :width w} ) (defn tree-depth "get the depth of a tree (list)" [list] (if (seq? list) (inc (apply max 0 (map tree-depth list))) 0)) (defn tree "plot tree using vega" [list] (let [spec (list-to-tree-spec list) h (* 30 (tree-depth list))] (vega (maketree 700 h spec))))

← →

As emphasized in Classes of Languages and Universal Grammar, we are really interested in the question of defining a modeling framework that captures possible human languages, rather than individual human languages. From this perspective, it is useful to ask what class of languages can be defined by strictly-local grammars and automata.

The strictly $n$-local languages over vocabulary $V$ are those languages for which there exist strictly $n$-local grammars.

\[\mathrm{SL}_n = \left\{L \in \mathcal{P}({V^{*}}) \,\middle|\, (\exists \mathcal{G})[L=L(\mathcal{G}) \wedge \mathcal{G}\ \mathrm{is\ strictly}\ n\mathrm{-local} ]\right\}\]

In other words, the class of all SL$_n$ languages (with respect to alphabet $V$) is the class of all strictly $n$-local languages.

The strictly local languages are the union of all the strictly $n$-local languages.

\[\mathrm{SL} = \bigcup\limits_{n \in \mathbb{N}} \mathrm{SL}_{n}\]

Hierarchies

One of the dual goals of this course is to understand how different kinds of approaches to grammar can define different kinds of patterns in strings or sentences. One of the most useful tools in the toolbox of formal language theory is the notion of a hierarchy of classes of formal languages. A hierarchy of formal languages (or models), is an ordered sequence of classes of formal languages where classes later in the sequence contain classes earlier in the sequence. This means that patterns that can be captured by models lower in the hierarchy can be captured by models higher in the hierarchy, but not vice versa.

The strictly-local languages represent our first example of a hierarchy of such formal languages.

To see this, consider the relationship between $\mathrm{SL}_n$ and $\mathrm{SL}_l$ for $n \neq l$? In general, we claim that $\mathrm{SL}_n \subsetneq \mathrm{SL}_{n+i} \forall n > 1, i \geq 1$. In other words,

\[\mathrm{SL}_1 \subsetneq \mathrm{SL}_2 \subsetneq \mathrm{SL}_3 \subsetneq \dots\]

How can we prove this? We need to show two things.

First, we need to show that each class in the hierarchy is contained within the subsequent class. That is, $\forall n[(\forall i\geq 1)[L \in \mathrm{SL}_n \implies L \in \mathrm{SL}_{n+i}]]$. Then we need to show that each class is proper subset of the next. That is, $\forall n[(\exists L)[L \in \mathrm{SL}_{n+1} \wedge L \notin \mathrm{SL}_{n}]]$.

Proof part 1: $\forall n[(\forall i\geq 1)[L \in \mathrm{SL}_n \implies L \in \mathrm{SL}_{n+i}]]$

To prove this, we construct a strictly $(n+i)$-local grammar/automaton, $\langle V_{n+i}, T_{n+i}\rangle$, from a strictly $n$-local grammar/automaton $\langle V_n, T_n\rangle$.

Set $V_{n+i} = V_{n}$ and let $T_{n+i} = \mathbf{F}_{n+i}(L(\langle V_n, T_n\rangle))$. We claim that $L(\langle V_{n+i}, T_{n+i}\rangle) =L(\langle V_n, T_n\rangle)$. First, consider an arbitrary string $x \in L(\langle V_n, T_n\rangle)$, since $\mathbf{F}_{n+i}$ reads off all of the $n+i$ factors in $x$, then $x$ must be in $L(\langle V_{n+i}, T_{n+i})\rangle$ since all of its factors must be in $T_{n+i}$. Thus $L(\langle V_{n}, T_{n}\rangle) \subseteq L(\langle V_{n+i}, T_{n+i}\rangle)$.

It suffices to show that $L(\langle V_{n+i}, T_{n+i}\rangle) \subseteq L(\langle V_{n}, T_{n}\rangle)$. We show this by contradiction. Choose an arbitrary string $y \in L(\langle V_{n+i}, T_{n+i}\rangle)$. If $y \notin L(\langle V_{n}, T_{n}\rangle)$ then it must contain a factor, $z$ that is not in $T_{n}$. However, because $T_{n+i} = \mathbf{F}_{n+i}(L(\langle V_n, T_n\rangle))$, $\mathbf{F}_n(T_{n+i})$ cannot add any factors not in $T_{n}$, thus $z$ could not be part of $y$.

□
Proof part 2: $\forall n[(\exists L)[L \in \mathrm{SL}_{n+1} \wedge L \notin \mathrm{SL}_{n}]]$

We leave this second half of the proof as an exercise.

Weak Generative Capacity

It is useful at this point to introduce some additional terminology. The generative capacity of a formalism (type of grammar or type of automaton) defined as the sets that that formalism can be used to define. When we specifically care about the string languages definable by a formalism, the term weak generative capacity is used. In this course, we will often examine the weak generative capacity of various formalisms—finding surprising cases where similar formalisms define different formal languages and different formalisms define identical sets of formal languages. Weak generative capacity is often contrasted with strong generative capacity which is typically used to mean the capacity of a formalism to generate certain strings using some particular kind of computation (e.g., using certain computation graphs).

← 19 Dependencies between Words 21 N-gram Models →

20 Strictly-Local Languages, Formal Hierarchies, and Weak Generative Capacity

Hierarchies

Weak Generative Capacity