(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
(enable-console-print!)
(defn log [a-thing]
(.log js/console a-thing))
(defn render-vega [spec elem]
(when spec
(let [spec (clj->js spec)
opts {:renderer "canvas"
:mode "vega"
:actions {
:export true,
:source true,
:compiled true,
:editor true}}]
(-> (js/vegaEmbed elem spec (clj->js opts))
(.then (fn [res]
(. js/vegaTooltip (vega (.-view res) spec))))
(.catch (fn [err]
(log err)))))))
(defn vega
"Reagent component that renders vega"
[spec]
(r/create-class
{:display-name "vega"
:component-did-mount (fn [this]
(render-vega spec (rd/dom-node this)))
:component-will-update (fn [this [_ new-spec]]
(render-vega new-spec (rd/dom-node this)))
:reagent-render (fn [spec]
[:div#vis])}))
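;; Usage sketch (assumptions: an element with id "app" exists in the host
;; page, and `some-spec` is a placeholder for any of the vega specs built
;; below):
(comment
  (rd/render [vega some-spec]
             (.getElementById js/document "app")))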
;making a histogram from a list of observations
(defn list-to-hist-data-lite
  "Takes a list and returns a map in the right format for vega-lite data,
  with each list element the value of a field named 'category'."
  [l]
  {:values (into [] (map (fn [x] {:category x}) l))})
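;; Illustrative REPL output for the helper above:
;; (list-to-hist-data-lite '("DT" "NN" "DT"))
;; => {:values [{:category "DT"} {:category "NN"} {:category "DT"}]}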
(defn makehist-lite [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "category",
:type "ordinal"},
:y {:aggregate "count",
:type "quantitative"}
}
})
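;; The lite pieces compose directly; the resulting vega-lite spec can be
;; handed to the `vega` component above (sketch):
(comment
  [vega (makehist-lite (list-to-hist-data-lite '("DT" "NN" "DT" "VB" "NN")))])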
(defn list-to-hist-data
  "Takes a list and returns vega data-set definitions: the raw values (each
  list element the value of a field named 'category'), an aggregate counting
  each category, and a sorted copy of that aggregate."
  [l]
  [{:name "raw",
    :values (into [] (map (fn [x] {:category x}) l))}
{:name "aggregated"
:source "raw"
:transform
[{:as ["count"]
:type "aggregate"
:groupby ["category"]}]}
{:name "agg-sorted"
:source "aggregated"
:transform
[{:type "collect"
:sort {:field "category"}}]}
])
(defn makehist [data]
(let [n (count (distinct ((data 0) :values)))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "agg-sorted", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "agg-sorted", :field "count"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "agg-sorted"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "count"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.count", :offset -2},
:text {:signal "tooltip.count"},
:fillOpacity [
{:test "isNaN(tooltip.count)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn hist [l]
(-> l
list-to-hist-data
makehist
vega))
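;; Usage sketch: histogram over a list of category labels (assumes an "app"
;; element as in the earlier example):
(comment
  (rd/render [hist '("DT" "NN" "VB" "DT" "NN" "DT")]
             (.getElementById js/document "app")))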
; for making bar plots
(defn list-to-barplot-data-lite
  "Takes a list of labels and a list of values and returns a map in the right
  format for vega-lite data, with fields named 'category' and 'amount'."
  [l m]
  {:values (into [] (map (fn [c a] {:category c, :amount a}) l m))})
(defn makebarplot-lite [data]
{
:$schema "https://vega.github.io/schema/vega-lite/v4.json",
:data data,
:mark "bar",
:encoding {
:x {:field "element", :type "ordinal"},
:y {:field "value", :type "quantitative"}
}
})
(defn list-to-barplot-data
  "Takes a list of labels and a list of values and returns a map in the right
  format for vega data, with fields named 'category' and 'amount'."
  [l m]
  {:name "table",
   :values (into [] (map (fn [c a] {:category c, :amount a}) l m))})
(defn makebarplot [data]
(let [n (count (data :values))
h 200
pad 5
w (if (< n 20) (* n 35) (- 700 (* 2 pad)))]
{
:$schema "https://vega.github.io/schema/vega/v5.json",
:width w,
:height h,
:padding pad,
:data data,
:signals [
{:name "tooltip",
:value {},
:on [{:events "rect:mouseover", :update "datum"},
{:events "rect:mouseout", :update "{}"}]}
],
:scales [
{:name "xscale",
:type "band",
:domain {:data "table", :field "category"},
:range "width",
:padding 0.05,
:round true},
{:name "yscale",
:domain {:data "table", :field "amount"},
:nice true,
:range "height"}
],
:axes [
{ :orient "bottom", :scale "xscale" },
{ :orient "left", :scale "yscale" }
],
:marks [
{:type "rect",
:from {:data "table"},
:encode {
:enter {
:x {:scale "xscale", :field "category"},
:width {:scale "xscale", :band 1},
:y {:scale "yscale", :field "amount"},
:y2 {:scale "yscale", :value 0}
},
:update {:fill {:value "steelblue"}},
:hover {:fill {:value "green"}}
}
},
{:type "text",
:encode {
:enter {
:align {:value "center"},
:baseline {:value "bottom"},
:fill {:value "#333"}
},
:update {
:x {:scale "xscale", :signal "tooltip.category", :band 0.5},
:y {:scale "yscale", :signal "tooltip.amount", :offset -2},
:text {:signal "tooltip.amount"},
:fillOpacity [
{:test "isNaN(tooltip.amount)", :value 0},
{:value 1}
]
}
}
}
]
}))
(defn barplot [l m]
(vega (makebarplot (list-to-barplot-data l m))))
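;; Usage sketch: bar plot from parallel lists of (hypothetical) categories
;; and counts (assumes an "app" element as above):
(comment
  (rd/render [barplot '("the" "dog" "sleeps") '(12 5 3)]
             (.getElementById js/document "app")))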
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
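;; helper: number of ups from loc needed to reach a location with a right sibling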
(defn count-up-to-right [loc]
(if (z/up loc)
(loop [x loc, pops 0]
(if (z/right x)
pops
(recur (z/up x) (inc pops))))
0))
(defn list-to-tree-spec
  "Takes a nested list and walks through it (with the clojure.zip library),
  building the id/name/parent records that the vega tree spec needs."
  [l]
(loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
(cond
(z/end? loc) acc
(z/end? (z/next loc))
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))})
(and (z/node loc) (not (z/branch? loc)))
(recur
(z/next loc)
(inc next-id)
(cond
(not (z/right loc))
(let [n (count-up-to-right loc)
popn (apply comp (repeat n pop))]
(some-> parent-ids not-empty popn))
(not (z/left loc))
(conj parent-ids next-id)
:else parent-ids)
(conj acc
{:id (str next-id)
:name (str (z/node loc))
:parent (when (seq parent-ids)
(str (peek parent-ids)))}))
:else
(recur (z/next loc) next-id parent-ids acc))))
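;; Illustrative REPL output (each node gets an id, its printed name, and the
;; id of its parent in the tree):
;; (list-to-tree-spec '(S (NP John) (VP sleeps)))
;; => [{:id "0", :name "S", :parent nil}
;;     {:id "1", :name "NP", :parent "0"}
;;     {:id "2", :name "John", :parent "1"}
;;     {:id "3", :name "VP", :parent "0"}
;;     {:id "4", :name "sleeps", :parent "3"}]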
(defn maketree
  "Makes a vega spec for a tree, given tree-spec in the id/name/parent format
  produced by list-to-tree-spec."
  [w h tree-spec]
{:$schema "https://vega.github.io/schema/vega/v5.json"
:data [{:name "tree"
:transform [{:key "id" :parentKey "parent" :type "stratify"}
{:as ["x" "y" "depth" "children"]
:method {:signal "layout"}
:size [{:signal "width"} {:signal "height"}]
:type "tree"}]
:values tree-spec
}
{:name "links"
:source "tree"
:transform [{:type "treelinks"}
{:orient "horizontal"
:shape {:signal "links"}
:type "linkpath"}]}]
:height h
:marks [{:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
:from {:data "links"}
:type "path"}
{:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
:update {:fill {:field "depth" :scale "color"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "symbol"}
{:encode {:enter {:baseline {:value "bottom"}
:font {:value "Courier"}
:fontSize {:value 14}
:angle {:value 0}
:text {:field "name"}}
:update {:align {:value "center"}
:dy {:value -6}
:opacity {:signal "labels ? 1 : 0"}
:x {:field "x"}
:y {:field "y"}}}
:from {:data "tree"}
:type "text"}]
:padding 5
:scales [{:domain {:data "tree" :field "depth"}
:name "color"
:range {:scheme "magma"}
:type "linear"
:zero true}]
:signals [{:bind {:input "checkbox"} :name "labels" :value true}
{:bind {:input "radio" :options ["tidy" "cluster"]}
:name "layout"
:value "tidy"}
{:name "links"
:value "line"}]
:width w}
)
(defn tree-depth
"get the depth of a tree (list)"
[list]
(if (seq? list)
(inc (apply max 0 (map tree-depth list)))
0))
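;; e.g. (tree-depth '(S (NP John) (VP sleeps))) ;; => 2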
(defn tree
"plot tree using vega"
[list]
(let [spec (list-to-tree-spec list)
h (* 30 (tree-depth list))]
(vega (maketree 700 h spec))))
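;; Usage sketch: render the parse tree for "John loves Mary", the example
;; sentence discussed below (assumes an "app" element as above):
(comment
  (rd/render [tree '(S (NP (N John)) (VP (V loves) (NP (N Mary))))]
             (.getElementById js/document "app")))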
The goal of modern generative linguistics is to achieve a precise
computational understanding of how language works: How do speakers
turn the meanings they wish to communicate into utterances that can be
spoken, written, or signed? How do listeners map these incoming
signals to the meanings that they understand? And how do learners come
to acquire the systems necessary to solve these problems?
Clearly, these are big questions whose answers will involve many
components, ranging from understanding the perceptual systems of the
human mind to having clearer ideas about what it means to
“mean.” As with all complex scientific problems, we need to make some
simplifying assumptions in order to make progress. Generally speaking,
building scientific theories involves two kinds of mutually
reinforcing work: We delimit the empirical phenomena that we wish to
explain and we create simplified models of the world to explain these
phenomena. These processes feed one another: Without some set of
theoretical background assumptions it is impossible to define what
counts as a phenomenon. Similarly, the phenomena we wish to explain
will dictate much about the modeling assumptions we must make. Both
of these kinds of scientific work thus involve a loop of idealization,
simplification of the complexity of the real world, and
evaluation. Both are critical to building scientific theories.
What kinds of phenomena do we wish to explain in this course? What
sorts of simplifications will we make in defining our phenomena, and
what sorts of idealizations will we need in developing our models?
Our first simplifying assumption is that we will focus on just the
problem of explaining the structure of sentences. Consider the
following English sentence.
John loves Mary.
What can we say about a sentence like this? First, it is obviously
built from a set of basic units like words and morphemes. Second,
the meaning of the sentence is compositional: The meaning of the
whole sentence results from the meanings of the individual words
together with the way in which they are combined. For example,
Mary loves John.
means something quite different from John loves Mary. As English
speakers, we know something that tells us that the ordering of the
words affects their combined meaning in systematic and predictable
ways. Many combinations of words are not valid in English at all.
* loves Mary John.
Here we follow the linguistic convention of placing the symbol *
at the beginning of a sequence that is not a possible English sentence
or, in technical terms, is ungrammatical. While we might be able to
guess what the speaker intended (or maybe not) if we heard this
sequence of words, we know that it isn’t valid English. In the
fifties, Chomsky gave the following famous trio of examples, which
illustrates this point more forcefully.
Revolutionary new ideas appear infrequently.
Colorless green ideas sleep furiously.
* Furiously sleep ideas green colorless.
In the preceding examples, the first two sentences are well-formed in
English, while the last is not. What is striking is that, despite
being well-formed, the second sentence, unlike the first, doesn’t
seem to mean anything sensible. Chomsky used this example
to illustrate the point that whatever principles tell us what a
possible English sentence is, they must be at least partially
independent of whether or not the sequence has a definite meaning or
is otherwise useful.
Another famous example comes from Lewis Carroll’s poem Jabberwocky,
which begins:
’Twas brillig, and the slithy toves
Did gyre and gimble in the wabe;
All mimsy were the borogoves,
And the mome raths outgrabe.
These examples suggest that we might make a start on our study
of sentence structure by asking which sequences of words are
possible, or grammatical, English sentences and which are not.
With this in mind, in this course we will simplify our empirical
problem to focus just on the domain of sentence structure, known in
linguistics as syntax, and on the question of the grammaticality of
particular sequences of words.
Note that in choosing to study the concept of grammaticality we are
setting aside many important questions, including even the very
question we set out to answer at the beginning of this chapter: how
form maps to meaning and vice versa! We are also simplifying greatly:
the concept of grammaticality implies that strings of words can
always be categorized as possible or impossible English sentences.
But it is very easy to find cases where the answer isn’t clear cut.
More people have visited Berlin than I have.
This sentence sounds correct, but it has the property that it doesn’t
seem to mean anything! Our intuitions about possible and impossible
sentences come in degrees, not as a simple binary
distinction. Nevertheless, it is still useful to start with the
idealization that grammaticality can be captured as a binary distinction.
It is very likely that the phenomenon of grammaticality as exemplified
above isn’t a single phenomenon, but many. For example,
the problems with the following two sentences seem to differ in
kind rather than being the same form of ungrammaticality.
* The man walk quickly.
* Man the quickly walks.
Here we have the intuition that the subject-verb agreement error in
the first example is quite different from the word salad in the
second example. This suggests that there are multiple mental
processes involved in understanding language, each of which might
have a somewhat different notion of well-formedness.
This is all to emphasize, right from the beginning, that we are making
drastic empirical simplifications. We are introducing concepts that
are somewhat vaguely defined, that capture only limited empirical
phenomena, and whose relationship with the data may be
complex. Whether grammaticality turns out to be a useful theoretical
concept will depend largely on whether or not it suggests useful
models, leads to interesting predictions, and generates valuable
questions and refinements.