(ns foundations.computational.linguistics
(:require [reagent.core :as r]
[reagent.dom :as rd]
[clojure.zip :as z]
[clojure.pprint :refer [pprint]]
[clojure.string :refer [index-of]]
;[clojure.string :as str]
))
;; Route println/print output to the browser's JavaScript console.
(enable-console-print!)
(defn log
  "Write a single value to the browser's JavaScript console."
  [a-thing]
  (js/console.log a-thing))
(defn render-vega
  "Embed the vega `spec` into the DOM element `elem` via vegaEmbed.
  On success, attaches a vega-tooltip handler to the resulting view;
  on failure, logs the error. No-op when `spec` is nil."
  [spec elem]
  (when spec
    (let [js-spec (clj->js spec)
          js-opts (clj->js {:renderer "canvas"
                            :mode "vega"
                            :actions {:export true
                                      :source true
                                      :compiled true
                                      :editor true}})
          embedded (js/vegaEmbed elem js-spec js-opts)]
      (.catch (.then embedded
                     (fn [res]
                       (. js/vegaTooltip (vega (.-view res) js-spec))))
              (fn [err]
                (log err))))))
(defn vega
  "Reagent component that renders a vega `spec` into a div.
  Re-renders the visualization whenever the spec argument changes."
  [spec]
  (r/create-class
   {:display-name "vega"
    :component-did-mount
    (fn [this]
      (render-vega spec (rd/dom-node this)))
    ;; :component-will-update is deprecated in React 16.3+; use
    ;; :component-did-update and read the *current* argv from the
    ;; component instead.
    :component-did-update
    (fn [this _old-argv]
      (let [[_ new-spec] (r/argv this)]
        (render-vega new-spec (rd/dom-node this))))
    :reagent-render
    (fn [spec]
      [:div#vis])}))
;making a histogram from a list of observations
(defn list-to-hist-data-lite
  "Takes a list `l` and returns a map in the vega-lite data format:
  each list element becomes the value of a field named 'category'."
  [l]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:values (into [] (map (fn [e] {:category e}) l))})
(defn makehist-lite
  "Build a vega-lite bar-chart (histogram) spec that counts occurrences
  of each 'category' value in `data`."
  [data]
  {:$schema "https://vega.github.io/schema/vega-lite/v4.json"
   :data data
   :mark "bar"
   :encoding {:x {:field "category" :type "ordinal"}
              :y {:aggregate "count" :type "quantitative"}}})
(defn list-to-hist-data
  "Takes a list `l` and returns a vector of vega data-set definitions:
  a 'raw' set with each element under field 'category', an 'aggregated'
  set with per-category counts, and an 'agg-sorted' set sorted by
  category."
  [l]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  [{:name "raw"
    :values (into [] (map (fn [e] {:category e}) l))}
   {:name "aggregated"
    :source "raw"
    :transform [{:as ["count"]
                 :type "aggregate"
                 :groupby ["category"]}]}
   {:name "agg-sorted"
    :source "aggregated"
    :transform [{:type "collect"
                 :sort {:field "category"}}]}])
(defn makehist
  "Build a full vega (v5) bar-chart spec from the data-set definitions
  produced by list-to-hist-data. Bar width adapts to the number of
  distinct categories; hovering a bar shows its count as a text label."
  [data]
  (let [n-cats (count (distinct ((data 0) :values)))
        height 200
        pad    5
        ;; 35px per bar for small category counts, else a fixed width.
        width  (if (< n-cats 20) (* n-cats 35) (- 700 (* 2 pad)))]
    {:$schema "https://vega.github.io/schema/vega/v5.json"
     :width width
     :height height
     :padding pad
     :data data
     :signals [{:name "tooltip"
                :value {}
                :on [{:events "rect:mouseover" :update "datum"}
                     {:events "rect:mouseout" :update "{}"}]}]
     :scales [{:name "xscale"
               :type "band"
               :domain {:data "agg-sorted" :field "category"}
               :range "width"
               :padding 0.05
               :round true}
              {:name "yscale"
               :domain {:data "agg-sorted" :field "count"}
               :nice true
               :range "height"}]
     :axes [{:orient "bottom" :scale "xscale"}
            {:orient "left" :scale "yscale"}]
     :marks [{:type "rect"
              :from {:data "agg-sorted"}
              :encode {:enter {:x {:scale "xscale" :field "category"}
                               :width {:scale "xscale" :band 1}
                               :y {:scale "yscale" :field "count"}
                               :y2 {:scale "yscale" :value 0}}
                       :update {:fill {:value "steelblue"}}
                       :hover {:fill {:value "green"}}}}
             {:type "text"
              :encode {:enter {:align {:value "center"}
                               :baseline {:value "bottom"}
                               :fill {:value "#333"}}
                       :update {:x {:scale "xscale" :signal "tooltip.category" :band 0.5}
                                :y {:scale "yscale" :signal "tooltip.count" :offset -2}
                                :text {:signal "tooltip.count"}
                                :fillOpacity [{:test "isNaN(tooltip.count)" :value 0}
                                              {:value 1}]}}}]}))
(defn hist
  "Render a histogram of the values in list `l` as a reagent vega
  component."
  [l]
  (vega (makehist (list-to-hist-data l))))
; for making bar plots
(defn list-to-barplot-data-lite
  "Takes a list of labels `l` and a parallel list of amounts `m` and
  returns a map in the vega-lite data format, with fields 'category'
  and 'amount'."
  [l m]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:values (into [] (map (fn [c a] {:category c :amount a}) l m))})
(defn makebarplot-lite
  "Build a vega-lite bar-chart spec for data with 'category' and
  'amount' fields, as produced by list-to-barplot-data-lite."
  [data]
  {:$schema "https://vega.github.io/schema/vega-lite/v4.json"
   :data data
   :mark "bar"
   ;; Field names must match the data produced by
   ;; list-to-barplot-data-lite ("category"/"amount"); the previous
   ;; "element"/"value" encodings referenced fields that do not exist.
   :encoding {:x {:field "category" :type "ordinal"}
              :y {:field "amount" :type "quantitative"}}})
(defn list-to-barplot-data
  "Takes a list of labels `l` and a parallel list of amounts `m` and
  returns a vega data-set named 'table' with fields 'category' and
  'amount'."
  [l m]
  ;; Plain maps serialize (via clj->js) to the same JSON as the former
  ;; in-body defrecord, without re-defining a type on every call.
  {:name "table"
   :values (into [] (map (fn [c a] {:category c :amount a}) l m))})
(defn makebarplot
  "Build a full vega (v5) bar-chart spec from the 'table' data-set
  produced by list-to-barplot-data. Bar width adapts to the number of
  rows; hovering a bar shows its amount as a text label."
  [data]
  (let [n-rows (count (data :values))
        height 200
        pad    5
        ;; 35px per bar for small row counts, else a fixed width.
        width  (if (< n-rows 20) (* n-rows 35) (- 700 (* 2 pad)))]
    {:$schema "https://vega.github.io/schema/vega/v5.json"
     :width width
     :height height
     :padding pad
     :data data
     :signals [{:name "tooltip"
                :value {}
                :on [{:events "rect:mouseover" :update "datum"}
                     {:events "rect:mouseout" :update "{}"}]}]
     :scales [{:name "xscale"
               :type "band"
               :domain {:data "table" :field "category"}
               :range "width"
               :padding 0.05
               :round true}
              {:name "yscale"
               :domain {:data "table" :field "amount"}
               :nice true
               :range "height"}]
     :axes [{:orient "bottom" :scale "xscale"}
            {:orient "left" :scale "yscale"}]
     :marks [{:type "rect"
              :from {:data "table"}
              :encode {:enter {:x {:scale "xscale" :field "category"}
                               :width {:scale "xscale" :band 1}
                               :y {:scale "yscale" :field "amount"}
                               :y2 {:scale "yscale" :value 0}}
                       :update {:fill {:value "steelblue"}}
                       :hover {:fill {:value "green"}}}}
             {:type "text"
              :encode {:enter {:align {:value "center"}
                               :baseline {:value "bottom"}
                               :fill {:value "#333"}}
                       :update {:x {:scale "xscale" :signal "tooltip.category" :band 0.5}
                                :y {:scale "yscale" :signal "tooltip.amount" :offset -2}
                                :text {:signal "tooltip.amount"}
                                :fillOpacity [{:test "isNaN(tooltip.amount)" :value 0}
                                              {:value 1}]}}}]}))
(defn barplot
  "Render a bar plot of amounts `m` labeled by `l` as a reagent vega
  component."
  [l m]
  (-> (list-to-barplot-data l m)
      makebarplot
      vega))
; now, for tree making
;(thanks to Taylor Wood's answer in this thread on stackoverflow:
; https://stackoverflow.com/questions/57911965)
(defn count-up-to-right
  "Count the number of z/up moves from `loc` needed to reach a location
  that has a right sibling. Returns 0 when `loc` is the root.
  If no ancestor has a right sibling, returns the number of moves to the
  root rather than looping forever (z/up returns nil at the root, and
  the previous version recurred on nil indefinitely)."
  [loc]
  (if (z/up loc)
    (loop [x loc, pops 0]
      (cond
        ;; Walked past the root without finding a right sibling: stop.
        (nil? x) pops
        (z/right x) pops
        :else (recur (z/up x) (inc pops))))
    0))
(defn list-to-tree-spec
  "Takes a nested list and walks it (with the clojure.zip library),
  building the vector of {:id :name :parent} maps that vega's stratify
  transform needs. The head of each list is treated as the node label
  and the remaining elements as its children."
  [l]
  ;; Note: the docstring above replaces stray Python-style triple-quoted
  ;; strings that were evaluated and discarded in the function body.
  (loop [loc (z/seq-zip l), next-id 0, parent-ids [], acc []]
    (cond
      ;; Finished walking the whole tree.
      (z/end? loc) acc
      ;; Last node: record it and return the accumulated spec.
      (z/end? (z/next loc))
      (conj acc
            {:id (str next-id)
             :name (str (z/node loc))
             :parent (when (seq parent-ids)
                       (str (peek parent-ids)))})
      ;; A leaf (non-branch, non-nil) node: record it and maintain the
      ;; stack of pending parent ids.
      (and (z/node loc) (not (z/branch? loc)))
      (recur
       (z/next loc)
       (inc next-id)
       (cond
         ;; Last sibling: pop one parent per level we are about to leave.
         (not (z/right loc))
         (let [n (count-up-to-right loc)
               popn (apply comp (repeat n pop))]
           (some-> parent-ids not-empty popn))
         ;; First sibling (the list head): becomes parent of what follows.
         (not (z/left loc))
         (conj parent-ids next-id)
         :else parent-ids)
       (conj acc
             {:id (str next-id)
              :name (str (z/node loc))
              :parent (when (seq parent-ids)
                        (str (peek parent-ids)))}))
      ;; Branch locations themselves carry no label; skip them.
      :else
      (recur (z/next loc) next-id parent-ids acc))))
(defn maketree
  "Makes a vega (v5) spec for a tree diagram of width `w` and height `h`,
  given `tree-spec` in the stratify-friendly format produced by
  list-to-tree-spec (a vector of {:id :name :parent} maps).
  Includes signals for toggling labels and switching tidy/cluster layout."
  [w h tree-spec]
  {:$schema "https://vega.github.io/schema/vega/v5.json"
   :data [{:name "tree"
           :transform [{:key "id" :parentKey "parent" :type "stratify"}
                       {:as ["x" "y" "depth" "children"]
                        :method {:signal "layout"}
                        :size [{:signal "width"} {:signal "height"}]
                        :type "tree"}]
           :values tree-spec}
          {:name "links"
           :source "tree"
           :transform [{:type "treelinks"}
                       {:orient "horizontal"
                        :shape {:signal "links"}
                        :type "linkpath"}]}]
   :height h
   :marks [;; Edges between parent and child nodes.
           {:encode {:update {:path {:field "path"} :stroke {:value "#ccc"}}}
            :from {:data "links"}
            :type "path"}
           ;; One symbol per node, colored by depth.
           {:encode {:enter {:size {:value 50} :stroke {:value "#fff"}}
                     :update {:fill {:field "depth" :scale "color"}
                              :x {:field "x"}
                              :y {:field "y"}}}
            :from {:data "tree"}
            :type "symbol"}
           ;; Node labels, toggled by the "labels" checkbox signal.
           ;; (The original used constant ternaries like
           ;; "datum.children ? 'center' : 'center'"; simplified to the
           ;; equivalent constant values.)
           {:encode {:enter {:baseline {:value "bottom"}
                             :font {:value "Courier"}
                             :fontSize {:value 14}
                             :angle {:value 0}
                             :text {:field "name"}}
                     :update {:align {:value "center"}
                              :dy {:value -6}
                              :opacity {:signal "labels ? 1 : 0"}
                              :x {:field "x"}
                              :y {:field "y"}}}
            :from {:data "tree"}
            :type "text"}]
   :padding 5
   :scales [{:domain {:data "tree" :field "depth"}
             :name "color"
             :range {:scheme "magma"}
             :type "linear"
             :zero true}]
   :signals [{:bind {:input "checkbox"} :name "labels" :value true}
             {:bind {:input "radio" :options ["tidy" "cluster"]}
              :name "layout"
              :value "tidy"}
             {:name "links"
              :value "line"}]
   :width w})
(defn tree-depth
  "Depth of a nested list: atoms have depth 0, and a seq is one more
  than its deepest element (so an empty seq has depth 1)."
  [list]
  (if-not (seq? list)
    0
    (->> list
         (map tree-depth)
         (apply max 0)
         inc)))
(defn tree
  "Plot a nested list as a tree using vega. The plot height scales with
  the depth of the tree (30px per level); width is fixed at 700px."
  [list]
  (vega (maketree 700
                  (* 30 (tree-depth list))
                  (list-to-tree-spec list))))
This is a course on the foundations of computational linguistics.
Goals
Most courses taught on computational linguistics or natural language processing
tend to be fairly advanced survey courses, assuming a large amount of
knowledge of computer science and covering a variety of somewhat unrelated
modeling techniques, problem domains, and tasks. This course takes a different
approach, seeking to build up a small set of conceptual tools and empirical
methods from the ground up.
Some specific goals of the course include:
- Learning foundational conceptual tools needed to understand computational
models of linguistic structure.
- Understanding the notion of a model of computation and how it can be applied
to understand aspects of linguistic structure.
- Understanding how to develop empirical evaluations of such models.
- Learning a bunch of specific, useful skills such as programming in
Clojure.
Content
The modern study of language originated in the 1950s when tools from the
recently-emerged theory of computation started to be applied to the problem of
language. One important early result in this area was the Chomsky Hierarchy
which gave a containment hierarchy of phrase-structure grammars or grammars
built out of rewrite rules like the following:
\[\texttt{S} \longrightarrow \texttt{NP}\ \texttt{VP}\]
Chomsky showed that restricting the form of these rules into four classes
(left-linear, context-free, context-sensitive, and unrestricted) gave
rise to classes of formal languages which were strictly nested.
\[\mathcal{L}_{\texttt{LL}} \subset \mathcal{L}_{\texttt{CF}} \subset
\mathcal{L}_{\texttt{CS}} \subset \mathcal{L}_{\texttt{UR}}\]
The Chomsky hierarchy remains a foundational result in theoretical computer
science, and forms a basic entry point for studying formal grammars in
linguistics. The conceptual tools underlying the hierarchy are useful for
understanding even modern systems such as recurrent or recursive neural networks
and transformers.
In COMPLING 445/645 we will study the Chomsky Hierarchy up to the left-linear
languages. We will cover refinements of this hierarchy and how to work with its
extensions using probability. Along the way, we will introduce the foundations
of probability theory and Bayesian inference.
What this course is not
- A survey of current approaches in NLP (see COMP 550 below).
- A course about some specific software (e.g., a Python library).
- A course that expects a lot of background knowledge (we will try to
build everything up from first concepts, but it will be fast).
What you need: curiosity, ambition, fearlessness, and ability to use Google to
find answers to technical problems.
What is the difference between various CL/NLP offerings at McGill?
McGill now offers a number of courses in CL/NLP that cover somewhat overlapping
but mostly complementary material, with differences in assumed background
knowledge and focus. While some topics will appear in several courses, the
emphasis will be different enough that you could fruitfully take all courses (if
you are an undergraduate student).
-
COMP 550 NLP (Prof. Jackie
Cheung): This course assumes more computational background, including deep
familiarity with probability and algorithms. COMP 550 focuses on technological
perspectives of natural language processing as a subdiscipline of artificial
intelligence. It includes a strong machine learning component. It covers
computational semantics, discourse, and applications such as automatic
summarization and machine translation, which COMP/LING 445 does not. COMP/LING
445 would be a valuable class to have taken before 550.
-
COMP 445 / LING 445/645
Computational Linguistics (Dr. Eva Portelance): This course goes in depth into
the fundamental and formal mathematical and linguistic principles that underlie
modern computational linguistics, with an emphasis on formalization of
linguistic analysis. It draws connections to the theory of computation (automata
theory), and rigorously derives some of the models that form the basic analytical
toolbox of computational linguistics, which COMP 550 does not. The tools taught
in this course can serve as the foundation for later coursework in computational
linguistics and NLP.
-
COMP 345 / LING 345 From Language to Data
Science (Prof. Siva Reddy/Timothy O’Donnell): This course is a hands-on
introduction to working with linguistic data. It assumes less programming and
mathematical background than either COMP 550 or COMP/LING 445. It is the least
advanced and most applied of the three courses. Its focus will be on the data
science of language and as such it will make use of both engineering examples as
well as scientific examples, like working with data derived from
psycholinguistic experiments.
-
COMP/LING 596/484 Probabilistic Programming (Prof. Timothy O’Donnell):
This course extends some of the tools developed in 445 in the direction of
probabilistic programming languages. Probabilistic models are a fundamental tool
in computational cognitive science and artificial intelligence. In recent years,
a new paradigm has emerged for specifying and deploying such models:
probabilistic programming. Probabilistic programming refers to the use of
high-level programming languages which are designed specifically to express
probabilistic models, and which are often understood as expressing a "natively
probabilistic" model of computation. This course examines a number of topics in
probabilistic modeling and inference from the perspective of probabilistic
programming, using examples drawn from AI and computational cognitive science.
COMP/LING 445 provides a good conceptual foundation for this course.
-
COMP/LING 596/483 Computational Linguistics 2 (Prof. Timothy O’Donnell):
This course is the direct extension of COMP/LING 445, covering the
second half of this textbook.
-
COMP 599 / LING 484/782
Natural Language Understanding with Deep Learning / Computational Semantics
(Prof. Siva Reddy): This course is a more advanced seminar-style course in
natural language understanding using tools from neural networks and deep
learning. The course examines how concepts in NLU such as meaning or
applications such as question answering, have shifted in recent years, what has
been gained with each paradigm shift, and what has been lost. The course
critically evaluates existing ideas and tries to come up with new ideas that
challenge existing limitations. COMP 550 and any of the advanced courses above
would be helpful as starting points for this.
Programming
We will use Clojure, a LISP based on the \(\lambda\)-calculus. Clojure
has several advantages.
- It is functional
- It is simple to learn, but powerful.
- It is increasingly important (especially thanks to ClojureScript).
- LISP and the \(\lambda\)-calculus are pervasive in linguistics and have been
used frequently in the history of AI.
- It is related to a number of probabilistic programming languages.
- It allows metalinguistic abstraction.
Resources
General LISP and Clojure learning resources:
- SICP — Structure and Interpretation of Computer Programs, by Abelson & Sussman. This book is for MIT Scheme, but still one of the best programming books ever written. See also the accompanying video lectures (1986).
- SICP in Clojure — a version of SICP with MIT Scheme code translated into Clojure.
- Clojure
- ClojureScript
2 Models of Computation →