(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
;; Route println/print output to the browser's JavaScript console.
(enable-console-print!)
(defn log
  "Write a single value to the browser's JavaScript console."
  [a-thing]
  (js/console.log a-thing))
(defn render-vega
  "Embed the vega `spec` into the DOM element `elem` via vegaEmbed.
  On success, attaches a vega-tooltip handler to the resulting view;
  on failure, logs the error. No-op when `spec` is nil."
  [spec elem]
  (when spec
    (let [js-spec (clj->js spec)
          js-opts (clj->js {:renderer "canvas"
                            :mode "vega"
                            :actions {:export true
                                      :source true
                                      :compiled true
                                      :editor true}})
          embedded (js/vegaEmbed elem js-spec js-opts)]
      (.catch (.then embedded
                     (fn [res]
                       (. js/vegaTooltip (vega (.-view res) js-spec))))
              (fn [err]
                (log err))))))
(defn vega
  "Reagent component that renders a vega `spec` into a div.
  Re-renders the visualization whenever the spec argument changes."
  [spec]
  (r/create-class
   {:display-name "vega"
    :component-did-mount
    (fn [this]
      (render-vega spec (rd/dom-node this)))
    ;; :component-will-update is deprecated in React 16.3+; use
    ;; :component-did-update and read the *current* argv from the
    ;; component instead.
    :component-did-update
    (fn [this _old-argv]
      (let [[_ new-spec] (r/argv this)]
        (render-vega new-spec (rd/dom-node this))))
    :reagent-render
    (fn [spec]
      [:div#vis])}))
;making a histogram from a list of observations
(defn list-to-hist-data-lite
  "Takes a list `l` and returns a map in the vega-lite data format:
  each list element becomes the value of a field named 'category'."
  [l]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:values (into [] (map (fn [e] {:category e}) l))})
(defn makehist-lite
  "Build a vega-lite bar-chart (histogram) spec that counts occurrences
  of each 'category' value in `data`."
  [data]
  {:$schema "https://vega.github.io/schema/vega-lite/v4.json"
   :data data
   :mark "bar"
   :encoding {:x {:field "category" :type "ordinal"}
              :y {:aggregate "count" :type "quantitative"}}})
(defn list-to-hist-data
  "Takes a list `l` and returns a vector of vega data-set definitions:
  a 'raw' set with each element under field 'category', an 'aggregated'
  set with per-category counts, and an 'agg-sorted' set sorted by
  category."
  [l]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  [{:name "raw"
    :values (into [] (map (fn [e] {:category e}) l))}
   {:name "aggregated"
    :source "raw"
    :transform [{:as ["count"]
                 :type "aggregate"
                 :groupby ["category"]}]}
   {:name "agg-sorted"
    :source "aggregated"
    :transform [{:type "collect"
                 :sort {:field "category"}}]}])
(defn makehist
  "Build a full vega (v5) bar-chart spec from the data-set definitions
  produced by list-to-hist-data. Bar width adapts to the number of
  distinct categories; hovering a bar shows its count as a text label."
  [data]
  (let [n-cats (count (distinct ((data 0) :values)))
        height 200
        pad    5
        ;; 35px per bar for small category counts, else a fixed width.
        width  (if (< n-cats 20) (* n-cats 35) (- 700 (* 2 pad)))]
    {:$schema "https://vega.github.io/schema/vega/v5.json"
     :width width
     :height height
     :padding pad
     :data data
     :signals [{:name "tooltip"
                :value {}
                :on [{:events "rect:mouseover" :update "datum"}
                     {:events "rect:mouseout" :update "{}"}]}]
     :scales [{:name "xscale"
               :type "band"
               :domain {:data "agg-sorted" :field "category"}
               :range "width"
               :padding 0.05
               :round true}
              {:name "yscale"
               :domain {:data "agg-sorted" :field "count"}
               :nice true
               :range "height"}]
     :axes [{:orient "bottom" :scale "xscale"}
            {:orient "left" :scale "yscale"}]
     :marks [{:type "rect"
              :from {:data "agg-sorted"}
              :encode {:enter {:x {:scale "xscale" :field "category"}
                               :width {:scale "xscale" :band 1}
                               :y {:scale "yscale" :field "count"}
                               :y2 {:scale "yscale" :value 0}}
                       :update {:fill {:value "steelblue"}}
                       :hover {:fill {:value "green"}}}}
             {:type "text"
              :encode {:enter {:align {:value "center"}
                               :baseline {:value "bottom"}
                               :fill {:value "#333"}}
                       :update {:x {:scale "xscale" :signal "tooltip.category" :band 0.5}
                                :y {:scale "yscale" :signal "tooltip.count" :offset -2}
                                :text {:signal "tooltip.count"}
                                :fillOpacity [{:test "isNaN(tooltip.count)" :value 0}
                                              {:value 1}]}}}]}))
(defn hist
  "Render a histogram of the values in list `l` as a reagent vega
  component."
  [l]
  (vega (makehist (list-to-hist-data l))))
; for making bar plots
(defn list-to-barplot-data-lite
  "Takes a list of labels `l` and a parallel list of amounts `m` and
  returns a map in the vega-lite data format, with fields 'category'
  and 'amount'."
  [l m]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:values (into [] (map (fn [c a] {:category c :amount a}) l m))})
(defn makebarplot-lite
  "Build a vega-lite bar-chart spec for data with 'category' and
  'amount' fields, as produced by list-to-barplot-data-lite."
  [data]
  {:$schema "https://vega.github.io/schema/vega-lite/v4.json"
   :data data
   :mark "bar"
   ;; Field names must match the data produced by
   ;; list-to-barplot-data-lite ("category"/"amount"); the previous
   ;; "element"/"value" encodings referenced fields that do not exist.
   :encoding {:x {:field "category" :type "ordinal"}
              :y {:field "amount" :type "quantitative"}}})
(defn list-to-barplot-data
  "Takes a list of labels `l` and a parallel list of amounts `m` and
  returns a vega data-set named 'table' with fields 'category' and
  'amount'."
  [l m]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:name "table"
   :values (into [] (map (fn [c a] {:category c :amount a}) l m))})
(defn makebarplot
  "Build a full vega (v5) bar-chart spec from the 'table' data-set
  produced by list-to-barplot-data. Bar width adapts to the number of
  rows; hovering a bar shows its amount as a text label."
  [data]
  (let [n-rows (count (data :values))
        height 200
        pad    5
        ;; 35px per bar for small row counts, else a fixed width.
        width  (if (< n-rows 20) (* n-rows 35) (- 700 (* 2 pad)))]
    {:$schema "https://vega.github.io/schema/vega/v5.json"
     :width width
     :height height
     :padding pad
     :data data
     :signals [{:name "tooltip"
                :value {}
                :on [{:events "rect:mouseover" :update "datum"}
                     {:events "rect:mouseout" :update "{}"}]}]
     :scales [{:name "xscale"
               :type "band"
               :domain {:data "table" :field "category"}
               :range "width"
               :padding 0.05
               :round true}
              {:name "yscale"
               :domain {:data "table" :field "amount"}
               :nice true
               :range "height"}]
     :axes [{:orient "bottom" :scale "xscale"}
            {:orient "left" :scale "yscale"}]
     :marks [{:type "rect"
              :from {:data "table"}
              :encode {:enter {:x {:scale "xscale" :field "category"}
                               :width {:scale "xscale" :band 1}
                               :y {:scale "yscale" :field "amount"}
                               :y2 {:scale "yscale" :value 0}}
                       :update {:fill {:value "steelblue"}}
                       :hover {:fill {:value "green"}}}}
             {:type "text"
              :encode {:enter {:align {:value "center"}
                               :baseline {:value "bottom"}
                               :fill {:value "#333"}}
                       :update {:x {:scale "xscale" :signal "tooltip.category" :band 0.5}
                                :y {:scale "yscale" :signal "tooltip.amount" :offset -2}
                                :text {:signal "tooltip.amount"}
                                :fillOpacity [{:test "isNaN(tooltip.amount)" :value 0}
                                              {:value 1}]}}}]}))
(defn barplot
  "Render a bar plot of amounts `m` labeled by `l` as a reagent vega
  component."
  [l m]
  (-> (list-to-barplot-data l m)
      makebarplot
      vega))
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
(defn count-up-to-right
  "Count the number of z/up moves from `loc` needed to reach a location
  that has a right sibling. Returns 0 when `loc` is the root.
  If no ancestor has a right sibling, returns the number of moves to the
  root rather than looping forever (z/up returns nil at the root, and
  the previous version recurred on nil indefinitely)."
  [loc]
  (if (z/up loc)
    (loop [x loc, pops 0]
      (cond
        ;; Walked past the root without finding a right sibling: stop.
        (nil? x) pops
        (z/right x) pops
        :else (recur (z/up x) (inc pops))))
    0))
(defn list-to-tree-spec
  "Takes a nested list and walks it (with the clojure.zip library),
  building the vector of {:id :name :parent} maps that vega's stratify
  transform needs. The head of each list is treated as the node label
  and the remaining elements as its children."
  [l]
  ;; Note: the docstring above replaces stray Python-style triple-quoted
  ;; strings that were evaluated and discarded in the function body.
  (loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
    (cond
      ;; Finished walking the whole tree.
      (z/end? loc) acc
      ;; Last node: record it and return the accumulated spec.
      (z/end? (z/next loc))
      (conj acc
            {:id (str next-id)
             :name (str (z/node loc))
             :parent (when (seq parent-ids)
                       (str (peek parent-ids)))})
      ;; A leaf (non-branch, non-nil) node: record it and maintain the
      ;; stack of pending parent ids.
      (and (z/node loc) (not (z/branch? loc)))
      (recur
       (z/next loc)
       (inc next-id)
       (cond
         ;; Last sibling: pop one parent per level we are about to leave.
         (not (z/right loc))
         (let [n (count-up-to-right loc)
               popn (apply comp (repeat n pop))]
           (some-> parent-ids not-empty popn))
         ;; First sibling (the list head): becomes parent of what follows.
         (not (z/left loc))
         (conj parent-ids next-id)
         :else parent-ids)
       (conj acc
             {:id (str next-id)
              :name (str (z/node loc))
              :parent (when (seq parent-ids)
                        (str (peek parent-ids)))}))
      ;; Branch locations themselves carry no label; skip them.
      :else
      (recur (z/next loc) next-id parent-ids acc))))
(defn maketree
  "Makes a vega (v5) spec for a tree diagram of width `w` and height `h`,
  given `tree-spec` in the stratify-friendly format produced by
  list-to-tree-spec (a vector of {:id :name :parent} maps).
  Includes signals for toggling labels and switching tidy/cluster layout."
  [w h tree-spec]
  {:$schema "https://vega.github.io/schema/vega/v5.json"
   :data [{:name "tree"
           :transform [{:key "id" :parentKey "parent" :type "stratify"}
                       {:as ["x" "y" "depth" "children"]
                        :method {:signal "layout"}
                        :size [{:signal "width"} {:signal "height"}]
                        :type "tree"}]
           :values tree-spec}
          {:name "links"
           :source "tree"
           :transform [{:type "treelinks"}
                       {:orient "horizontal"
                        :shape {:signal "links"}
                        :type "linkpath"}]}]
   :height h
   :marks [;; Edges between parent and child nodes.
           {:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
            :from {:data "links"}
            :type "path"}
           ;; One symbol per node, colored by depth.
           {:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
                     :update {:fill {:field "depth" :scale "color"}
                              :x {:field "x"}
                              :y {:field "y"}}}
            :from {:data "tree"}
            :type "symbol"}
           ;; Node labels, toggled by the "labels" checkbox signal.
           ;; (The original used constant ternaries like
           ;; "datum.children ? 'center' : 'center'"; simplified to the
           ;; equivalent constant values.)
           {:encode {:enter {:baseline {:value "bottom"}
                             :font {:value "Courier"}
                             :fontSize {:value 14}
                             :angle {:value 0}
                             :text {:field "name"}}
                     :update {:align {:value "center"}
                              :dy {:value -6}
                              :opacity {:signal "labels ? 1 : 0"}
                              :x {:field "x"}
                              :y {:field "y"}}}
            :from {:data "tree"}
            :type "text"}]
   :padding 5
   :scales [{:domain {:data "tree" :field "depth"}
             :name "color"
             :range {:scheme "magma"}
             :type "linear"
             :zero true}]
   :signals [{:bind {:input "checkbox"} :name "labels" :value true}
             {:bind {:input "radio" :options ["tidy" "cluster"]}
              :name "layout"
              :value "tidy"}
             {:name "links"
              :value "line"}]
   :width w})
(defn tree-depth
  "Depth of a nested list: atoms have depth 0, and a seq is one more
  than its deepest element (so an empty seq has depth 1)."
  [list]
  (if-not (seq? list)
    0
    (->> list
         (map tree-depth)
         (apply max 0)
         inc)))
(defn tree
  "Plot a nested list as a tree using vega. The plot height scales with
  the depth of the tree (30px per level); width is fixed at 700px."
  [list]
  (vega (maketree 700
                  (* 30 (tree-depth list))
                  (list-to-tree-spec list))))
This is a course on the foundations of computational linguistics.
Goals
Most courses taught on computational linguistics or natural language processing
tend to be fairly advanced survey courses, assuming a large amount of
knowledge of computer science and covering a variety of somewhat unrelated
modeling techniques, problem domains, and tasks. This course takes a different
approach, seeking to build up a small set of conceptual tools and empirical
methods from the ground up.
Some specific goals of the course include:
- Learning foundational conceptual tools needed to understand computational
models of linguistic structure.
- Understanding the notion of a model of computation and how it can be applied
to understand aspects of linguistic structure.
- Understanding how to develop empirical evaluations of such models.
- Learning a bunch of specific, useful skills such as programming in
Clojure.
Content
The modern study of language originated in the 1950s when tools from the
recently-emerged theory of computation started to be applied to the problem of
language. One important early result in this area was the Chomsky Hierarchy
which gave a containment hierarchy of phrase-structure grammars or grammars
built out of rewrite rules like the following:
\[\texttt{S} \longrightarrow \texttt{NP}\ \texttt{VP}\]
Chomsky showed that restricting the form of these rules into four classes
(left-linear, context-free, context-sensitive, and unrestricted) gave
rise to classes of formal languages which were strictly nested.
\[\mathcal{L}_{\texttt{LL}} \subset \mathcal{L}_{\texttt{CF}} \subset
\mathcal{L}_{\texttt{CS}} \subset \mathcal{L}_{\texttt{UR}}\]
The Chomsky hierarchy remains a foundational result in theoretical computer
science, and forms a basic entry point for studying formal grammars in
linguistics. The conceptual tools underlying the hierarchy are useful for
understanding even modern systems such as recurrent or recursive neural networks
and transformers.
In COMPLING 445/645 we will study the Chomsky Hierarchy up to the left-linear
languages. We will cover refinements of this hierarchy and how to work with its
extensions using probability. Along the way, we will introduce the foundations
of probability theory and Bayesian inference.
What this course is not
- A survey of current approaches in NLP (see COMP 550 below).
- A course about some specific software (e.g., a Python library).
- A course that expects a lot of background knowledge (we will try to
build everything up from first concepts, but it will be fast).
What you need: curiosity, ambition, fearlessness, and ability to use Google to
find answers to technical problems.
What is the difference between various CL/NLP offerings at McGill?
McGill now offers a number of courses in CL/NLP that cover somewhat overlapping
but mostly complementary material, with differences in assumed background
knowledge and focus. While some topics will appear in several courses, the
emphasis will be different enough that you could fruitfully take all courses (if
you are an undergraduate student).
-
COMP 550 NLP (Prof. Jackie
Cheung): This course assumes more computational background, including deep
familiarity with probability and algorithms. COMP 550 focuses on technological
perspectives of natural language processing as a subdiscipline of artificial
intelligence. It includes a strong machine learning component. It covers
computational semantics, discourse, and applications such as automatic
summarization and machine translation, which COMP/LING 445 does not. COMP/LING
445 would be a valuable class to have taken before 550.
-
COMP 445 / LING 445/645
Computational Linguistics (Dr. Eva Portelance): This course goes in depth into
the fundamental and formal mathematical and linguistic principles that underlie
modern computational linguistics, with an emphasis on formalization of
linguistic analysis. It draws connections to the theory of computation (automata
theory), and rigorously derives some of the models that form the basic analytical
toolbox of computational linguistics, which COMP 550 does not. The tools taught
in this course can serve as the foundation for later coursework in computational
linguistics and NLP.
-
COMP 345 / LING 345 From Language to Data
Science (Prof. Siva Reddy/Timothy O’Donnell): This course is a hands-on
introduction to working with linguistic data. It assumes less programming and
mathematical background than either COMP 550 or COMP/LING 445. It is the least
advanced and most applied of the three courses. Its focus will be on the data
science of language and as such it will make use of both engineering examples as
well as scientific examples, like working with data derived from
psycholinguistic experiments.
-
COMP/LING 596/484 Probabilistic Programming (Prof. Timothy O’Donnell):
This course extends some of the tools developed in 445 in the direction of
probabilistic programming languages. Probabilistic models are a fundamental tool
in computational cognitive science and artificial intelligence. In recent years,
a new paradigm has emerged for specifying and deploying such models:
probabilistic programming. Probabilistic programming refers to the use of
high-level programming languages which are designed specifically to express
probabilistic models, and which are often understood as expressing a "natively
probabilistic" model of computation. This course examines a number of topics in
probabilistic modeling and inference from the perspective of probabilistic
programming, using examples drawn from AI and computational cognitive science.
COMP/LING 445 provides a good conceptual foundation for this course.
-
COMP/LING 596/483 Computational Linguistics 2 (Prof. Timothy O’Donnell):
This course is the direct extension of COMP/LING 445, covering the
second half of this textbook.
-
COMP 599 / LING 484/782
Natural Language Understanding with Deep Learning / Computational Semantics
(Prof. Siva Reddy): This course is a more advanced seminar-style course in
natural language understanding using tools from neural networks and deep
learning. The course examines how concepts in NLU such as meaning or
applications such as question answering, have shifted in recent years, what has
been gained with each paradigm shift, and what has been lost. The course
critically evaluates existing ideas and tries to come up with new ideas that
challenge existing limitations. COMP 550 and any of the advanced courses above
would be helpful as starting points for this.
Programming
We will use Clojure, a LISP based on the \(\lambda\)-calculus. Clojure
has several advantages.
- It is functional
- It is simple to learn, but powerful.
- It is increasingly important (especially thanks to ClojureScript).
- LISP and the \(\lambda\)-calculus are pervasive in linguistics and have been
used frequently in the history of AI.
- It is related to a number of probabilistic programming languages.
- It allows metalinguistic abstraction.
Resources
General LISP and Clojure learning resources:
- SICP — Structure and Interpretation of Computer Programs, by Abelson & Sussman. This book is for MIT Scheme, but still one of the best programming books ever written. See also the accompanying video lectures (1986).
- SICP in Clojure — a version of SICP with MIT Scheme code translated into Clojure.
- Clojure
- ClojureScript
2 Models of Computation →