(ns foundations.computational.linguistics (:require [reagent.core :as r] [reagent.dom :as rd] [clojure.zip :as z] [clojure.pprint :refer [pprint]] [clojure.string :refer [index-of]] ;[clojure.string :as str] )) (enable-console-print!) (defn log [a-thing] (.log js/console a-thing))

(defn render-vega [spec elem] (when spec (let [spec (clj->js spec) opts {:renderer "canvas" :mode "vega" :actions { :export true, :source true, :compiled true, :editor true}}] (-> (js/vegaEmbed elem spec (clj->js opts)) (.then (fn [res] (. js/vegaTooltip (vega (.-view res) spec)))) (.catch (fn [err] (log err))))))) (defn vega "Reagent component that renders vega" [spec] (r/create-class {:display-name "vega" :component-did-mount (fn [this] (render-vega spec (rd/dom-node this))) :component-will-update (fn [this [_ new-spec]] (render-vega new-spec (rd/dom-node this))) :reagent-render (fn [spec] [:div#vis])})) ;making a histogram from a list of observations (defn list-to-hist-data-lite [l] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category]) {:values (into [] (map ->rec l))}) (defn makehist-lite [data] { :$schema "https://vega.github.io/schema/vega-lite/v4.json", :data data, :mark "bar", :encoding { :x {:field "category", :type "ordinal"}, :y {:aggregate "count", :type "quantitative"} } }) (defn list-to-hist-data [l] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category]) [{:name "raw", :values (into [] (map ->rec l))} {:name "aggregated" :source "raw" :transform [{:as ["count"] :type "aggregate" :groupby ["category"]}]} {:name "agg-sorted" :source "aggregated" :transform [{:type "collect" :sort {:field "category"}}]} ]) (defn makehist [data] (let [n (count (distinct ((data 0) :values))) h 200 pad 5 w (if (< n 20) (* n 35) (- 700 (* 2 pad)))] { :$schema "https://vega.github.io/schema/vega/v5.json", :width w, :height h, :padding pad, :data data, :signals [ {:name "tooltip", :value {}, :on [{:events "rect:mouseover", :update "datum"}, {:events "rect:mouseout", :update "{}"}]} ], :scales [ {:name "xscale", :type "band", :domain {:data "agg-sorted", :field "category"}, :range "width", :padding 0.05, :round true}, {:name "yscale", :domain {:data "agg-sorted", :field "count"}, :nice true, :range "height"} ], :axes [ { :orient "bottom", :scale "xscale" }, { :orient "left", :scale "yscale" } ], :marks [ {:type "rect", :from {:data "agg-sorted"}, :encode { :enter { :x {:scale "xscale", :field "category"}, :width {:scale "xscale", :band 1}, :y {:scale "yscale", :field "count"}, :y2 {:scale "yscale", :value 0} }, :update {:fill {:value "steelblue"}}, :hover {:fill {:value "green"}} } }, {:type "text", :encode { :enter { :align {:value "center"}, :baseline {:value "bottom"}, :fill {:value "#333"} }, :update { :x {:scale "xscale", :signal "tooltip.category", :band 0.5}, :y {:scale "yscale", :signal "tooltip.count", :offset -2}, :text {:signal "tooltip.count"}, :fillOpacity [ {:test "isNaN(tooltip.count)", :value 0}, {:value 1} ] } } } ] })) (defn hist [l] (-> l list-to-hist-data makehist vega)) ; for making bar plots (defn list-to-barplot-data-lite [l m] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category amount]) {:values (into [] (map ->rec l m))}) (defn makebarplot-lite [data] { :$schema "https://vega.github.io/schema/vega-lite/v4.json", :data data, :mark "bar", :encoding { :x {:field "element", :type "ordinal"}, :y {:field "value", :type "quantitative"} } }) (defn list-to-barplot-data [l m] """ takes a list and returns a record in the right format for vega data, with each list element the label to a field named 'x'""" (defrecord rec [category amount]) {:name "table", :values (into [] (map ->rec l m))}) (defn makebarplot [data] (let [n (count (data :values)) h 200 pad 5 w (if (< n 20) (* n 35) (- 700 (* 2 pad)))] { :$schema "https://vega.github.io/schema/vega/v5.json", :width w, :height h, :padding pad, :data data, :signals [ {:name "tooltip", :value {}, :on [{:events "rect:mouseover", :update "datum"}, {:events "rect:mouseout", :update "{}"}]} ], :scales [ {:name "xscale", :type "band", :domain {:data "table", :field "category"}, :range "width", :padding 0.05, :round true}, {:name "yscale", :domain {:data "table", :field "amount"}, :nice true, :range "height"} ], :axes [ { :orient "bottom", :scale "xscale" }, { :orient "left", :scale "yscale" } ], :marks [ {:type "rect", :from {:data "table"}, :encode { :enter { :x {:scale "xscale", :field "category"}, :width {:scale "xscale", :band 1}, :y {:scale "yscale", :field "amount"}, :y2 {:scale "yscale", :value 0} }, :update {:fill {:value "steelblue"}}, :hover {:fill {:value "green"}} } }, {:type "text", :encode { :enter { :align {:value "center"}, :baseline {:value "bottom"}, :fill {:value "#333"} }, :update { :x {:scale "xscale", :signal "tooltip.category", :band 0.5}, :y {:scale "yscale", :signal "tooltip.amount", :offset -2}, :text {:signal "tooltip.amount"}, :fillOpacity [ {:test "isNaN(tooltip.amount)", :value 0}, {:value 1} ] } } } ] })) (defn barplot [l m] (vega (makebarplot (list-to-barplot-data l m)))) ; now, for tree making ;(thanks to Taylor Wood's answer in this thread on stackoverflow: ; https://stackoverflow.com/questions/57911965) (defn count-up-to-right [loc] (if (z/up loc) (loop [x loc, pops 0] (if (z/right x) pops (recur (z/up x) (inc pops)))) 0)) (defn list-to-tree-spec [l] """ takes a list and walks through it (with clojure.zip library) and builds the record format for the spec needed to for vega""" (loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []] (cond (z/end? loc) acc (z/end? (z/next loc)) (conj acc {:id (str next-id) :name (str (z/node loc)) :parent (when (seq parent-ids) (str (peek parent-ids)))}) (and (z/node loc) (not (z/branch? loc))) (recur (z/next loc) (inc next-id) (cond (not (z/right loc)) (let [n (count-up-to-right loc) popn (apply comp (repeat n pop))] (some-> parent-ids not-empty popn)) (not (z/left loc)) (conj parent-ids next-id) :else parent-ids) (conj acc {:id (str next-id) :name (str (z/node loc)) :parent (when (seq parent-ids) (str (peek parent-ids)))})) :else (recur (z/next loc) next-id parent-ids acc)))) (defn maketree [w h tree-spec] """ makes vega spec for a tree given tree-spec in the right json-like format """ {:$schema "https://vega.github.io/schema/vega/v5.json" :data [{:name "tree" :transform [{:key "id" :parentKey "parent" :type "stratify"} {:as ["x" "y" "depth" "children"] :method {:signal "layout"} :size [{:signal "width"} {:signal "height"}] :type "tree"}] :values tree-spec } {:name "links" :source "tree" :transform [{:type "treelinks"} {:orient "horizontal" :shape {:signal "links"} :type "linkpath"}]}] :height h :marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}} :from {:data "links"} :type "path"} {:encode {:enter {:size {:value 50} :stroke {:value "#fff"}} :update {:fill {:field "depth" :scale "color"} :x {:field "x"} :y {:field "y"}}} :from {:data "tree"} :type "symbol"} {:encode {:enter {:baseline {:value "bottom"} :font {:value "Courier"} :fontSize {:value 14} :angle {:value 0} :text {:field "name"}} :update {:align {:signal "datum.children ? 'center' : 'center'"} :dy {:signal "datum.children ? -6 : -6"} :opacity {:signal "labels ? 1 : 0"} :x {:field "x"} :y {:field "y"}}} :from {:data "tree"} :type "text"}] :padding 5 :scales [{:domain {:data "tree" :field "depth"} :name "color" :range {:scheme "magma"} :type "linear" :zero true}] :signals [{:bind {:input "checkbox"} :name "labels" :value true} {:bind {:input "radio" :options ["tidy" "cluster"]} :name "layout" :value "tidy"} {:name "links" :value "line"}] :width w} ) (defn tree-depth "get the depth of a tree (list)" [list] (if (seq? list) (inc (apply max 0 (map tree-depth list))) 0)) (defn tree "plot tree using vega" [list] (let [spec (list-to-tree-spec list) h (* 30 (tree-depth list))] (vega (maketree 700 h spec))))

← →


(defn prefix? [pr str]
  (if (> (count pr) (count str))
      false
      (if (empty? pr)
          true
          (if (= (first pr) (first str))
              (prefix? (rest pr) (rest str))
              false))))

We saw in the preceding section that we cannot use finite formal languages as models of natural language sentences. We will need infinite formal languages to model some natural languages like English. But how do we define an infinite language?

It is here that our notion of a model of computation become indispensable. While we cannot directly list the elements of an infinite formal language, we can write a computer program that characterizes the set in some way—an intensional definition of the language.

Take the example of the 2-alternating language $L=(ab)^*$, that is, the set of all strings which consist of $ab$ concatenated with itself some number of times, e.g., $\{\epsilon, ab, abab, ababab, ...\}$. How could we characterize this set with a computer program? In general, there are two main approaches: recognition and generation.

Language Recognizers

The first approach, language recognition solves the problem by implementing a program which tells us whether or not a given string is in the language in question, that is, by implementing the characteristic function of the target language. Recall that a procedure that returns a boolean value is called a predicate. How do we write a predicate for the language $L=(ab)^*$? We will need some way to iterate through all of the different strings in this language—up to any length. How can we do this? Using recursion, of course!

(defn in-ab*? [str]
    (if (empty? str)
        true
        (if (prefix? '(a b) str)
            (in-ab*? (rest (rest str)))
            false)))

Let’s break this example down. The function is recursive. The most basic case is if the string is empty, that is, if the formal string is equal to $\epsilon$.

(in-ab*? '())

Next consider the case where the string isn’t empty. Now, we need to check if the beginning of the string is equal to $ab$. If it isn’t, then the result is simply false.

(in-ab*? '(b b a b))

If the prefix of the string is equal to $ab$, we remove it and continue recursing on the remainder, repeating our checks as we go. This is just another example of how we can destructure an object via recursion. If we can decompose our string into a sequence of (a b) substrings, it is well-formed.

(in-ab*? '(a b a b a b))

Otherwise, it is ill-formed.

(in-ab*? '(a b a b a a))

Language Generators

Another way to specify a formal language using a computer program is by generation—providing a program which constructs or generates the elements of the set.

Consider again our formal language $(ab)^*$, how can we construct the strings in this set? If we knew the number of copies $n$ of the string $ab$ in advance, we could easily write a procedure that generated the target string.

(defn generate-ab* [n]
  (if (= n 0)
    '()
    (concat '(a b) (generate-ab* (- n 1)))))

(generate-ab* 10)

Of course, this procedure doesn’t directly generate the whole set $(ab)^*$ directly since it requires input $n$. However, in a certain sense it does precisely characterize the entire set. In particular, this function provides a mapping from an infinite set which we understand well, the natural numbers $\mathbb{N}$, to the set of strings in $(ab)^*$. In fact, this function is a bijection (one to one). For each natural number $n$, it returns a single element of $(ab)^*$—namely, the string with $n$ copies of $ab$—and for each string in $(ab)^*$ there is a single natural number which describes it.

There are at least two ways in which we can use such a mapping. First, since we know how to count from $0$ to any natural number, no matter how high, we could use this procedure to generate any subset of elements of the set $(ab)^*$ up to some number of repetitions of $ab$, in order—that is $\{(ab)^n \mid n \leq k\}$ for some $k$. This perspective on characterizing the infinite set is known as enumeration.

Second, we are free to imagine that there is another, unknown, process which somehow chooses the natural number $n$. This choice process can work in any way—for example, it could sample $n$ from some probability distribution, or, alternatively, choose it using some unknown semi-magical process. The critical point is that we can implement a generator like this without understanding how it will be used by a caller who has some way of choosing $n$. The important thing is that however we choose $n$ we get a member of the set $(ab)^*$, and we can choose whatever $n$ we like to get any member we like. This perspective is called nondeterministic choice.

In some sense, the nondetermistic choice perspective is a more fundamental characterization of the set. To enumerate the set, we have to pick some particular ordering of $n$ (for instance counting from $0$ upwards). But when we think about the language $(ab)^*$, the order in which we list the elements doesn’t matter. The nondeterministic choice perspective emphasizes that we don’t really care how you pick this order; we just care which strings are in the set and which ones are not.

Recognizers and generators are both ways of characterizing sets intensionally—by description—rather than extensionally, that is, by listing. They differ in one important way. While recognizers give you a yes or no answer (at least when the language is decidable ), generators only give you a characterization for the strings that are actually in the language.

Is it possible to convert between these perspectives?

Generative Models of Language

Modern linguistics is often called generative linguistics. This name stems from the use of generators as models of linguistic structure. The field actually uses both the generator and recognizer perspectives to define models of language. This dual perspective also corresponds to the dual problem of explaining both language production (generation) and comprehension (recognition). This perspective also underlies the difference between directed and undirected models in probabilistic modelling and is related to the distinction between proof and model theory in formal logic. The important thing is that we will try to define finite programs that characterize the set of possible English sentences. In the next units, we will start building such models of natural language sentence structure.

← 6 Universal Grammar and Classes of Formal Languages 8 Distributions over Languages →

7 Infinite Languages and Recursion

Language Recognizers

Language Generators

Generative Models of Language