Source code for lambeq.bobcat.parser

# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from collections.abc import Iterable, Mapping
from dataclasses import dataclass, field
import math
from typing import overload, Tuple

from lambeq.bobcat.grammar import Grammar
from lambeq.bobcat.lexicon import Atom, Category
from lambeq.bobcat.lexicon import CATEGORIES
from lambeq.bobcat.rules import Rules
from lambeq.bobcat.tree import Dependency, Lexical, ParseTree

# A chart span, written as a (start, end) pair of word indices.
SpanT = Tuple[int, int]

# Lowest possible score; stands in wherever no finite log probability
# is available.
NEGATIVE_INFINITY = -math.inf


@dataclass
class Supertag:
    """A string category, annotated with its log probability.

    Attributes
    ----------
    category : str
        The category, as a string.
    probability : float
        The log probability of the category.

    """
    category: str
    probability: float


@dataclass
class Sentence:
    """An input sentence.

    Attributes
    ----------
    words : list of str
        The tokens in the sentence.
    input_supertags : list of list of Supertag
        A list of supertags for each word.
    span_scores : dict of tuple of int and int to dict of int to float
        Mapping of a span to a dict of category (indices) mapped to
        their log probability.

    Raises
    ------
    ValueError
        If `words` and `input_supertags` differ in length.

    """

    words: list[str]
    input_supertags: list[list[Supertag]]
    span_scores: dict[SpanT, dict[int, float]]

    def __post_init__(self) -> None:
        """Check that there is one supertag list per word."""
        if len(self.words) != len(self.input_supertags):
            raise ValueError(
                    '`words` must be the same length as `input_supertags`')

    def __len__(self) -> int:
        """Return the number of words in the sentence."""
        return len(self.words)


@dataclass
class Cell:
    """A single chart cell holding the candidate trees for one span.

    Trees are kept ordered from highest to lowest score and at most one
    tree is stored per category. The list is cut back to the beam size
    whenever the tree just outside the beam scores strictly lower than
    the last tree inside it, so ties at the bottom of the beam survive
    and the cell may temporarily hold more trees than the beam size.

    """

    beam_size: int
    trees: list[ParseTree] = field(default_factory=list)
    trees_map: dict[Category, ParseTree] = field(default_factory=dict)
    min_score: float = NEGATIVE_INFINITY

    def find(self, score: float) -> int:
        """Return an index at which a tree scoring `score` fits.

        If some stored tree has exactly this score, the index of one
        such tree is returned (not necessarily the first); otherwise
        the result is the insertion point that keeps the list in
        descending score order.

        """
        ranked = self.trees
        low, high = 0, len(ranked)
        while low < high:
            middle = (low + high) // 2
            pivot = ranked[middle].score
            if pivot == score:
                return middle
            if pivot < score:
                # Everything from `middle` onwards scores lower.
                high = middle
            else:
                low = middle + 1
        return low

    def _discard(self, stale: ParseTree) -> None:
        """Remove `stale` from the sorted list, matching by identity."""
        ranked = self.trees
        stale_score = stale.score
        pivot = self.find(stale_score)

        # `find` may land anywhere inside a run of equal scores, so scan
        # forwards through the run first...
        for idx in range(pivot, len(ranked)):
            if ranked[idx] is stale:
                del ranked[idx]
                return
            if ranked[idx].score != stale_score:
                break

        # ...and then backwards over whatever precedes the pivot.
        for idx in reversed(range(pivot)):
            if ranked[idx] is stale:
                del ranked[idx]
                return

    def add(self, to_add: Iterable[ParseTree]) -> int:
        """Merge the given trees into the cell and apply the beam.

        Candidates are processed from best to worst. A candidate whose
        category is already present only replaces the stored tree when
        it scores strictly higher. After every insertion the beam is
        re-applied and `min_score` is updated to the score of the last
        tree inside the beam.

        Returns
        -------
        int
            The number of trees inserted, minus the number of trees
            pruned by the beam.

        """
        candidates = sorted(to_add, key=lambda tree: -tree.score)

        ranked = self.trees
        by_cat = self.trees_map
        beam = self.beam_size
        net_added = 0

        for candidate in candidates:
            new_score = candidate.score

            # The beam is full and this candidate (hence every later
            # one) is below its lowest tree: nothing more can get in.
            if len(ranked) >= beam and new_score < ranked[-1].score:
                break

            incumbent = by_cat.get(candidate.cat)
            if incumbent is not None:
                # Keep the stored tree unless the candidate beats it.
                if not new_score > incumbent.score:
                    continue
                self._discard(incumbent)

            ranked.insert(self.find(new_score), candidate)
            by_cat[candidate.cat] = candidate
            net_added += 1

            try:
                cutoff = self.min_score = ranked[beam - 1].score
                if ranked[beam].score < cutoff:
                    net_added -= len(ranked) - beam
                    for dropped in ranked[beam:]:
                        del by_cat[dropped.cat]
                    del ranked[beam:]
            except IndexError:
                # Fewer trees than the beam allows: nothing to prune.
                pass
        return net_added


@dataclass
class Chart:
    """The parse chart: a mapping from spans to cells.

    The key (i, j) stands for the phrase running from the ith word to
    the jth word (both inclusive), counting words from 0.

    """

    beam_size: int
    chart: dict[SpanT, Cell] = field(default_factory=dict)

    parse_tree_count: int = 0

    def __getitem__(self, index: SpanT) -> list[ParseTree]:
        """Return the trees stored for a span (`KeyError` if none)."""
        cell = self.chart[index]
        return cell.trees

    def min_score(self, start: int, end: int) -> float:
        """Return the lowest score a tree needs to enter the given cell.

        A span that has no cell yet accepts any score.

        """
        cell = self.chart.get((start, end))
        if cell is None:
            return NEGATIVE_INFINITY
        return cell.min_score

    def add(self, start: int, end: int, to_add: Iterable[ParseTree]) -> None:
        """Add parse trees to the cell for a span, creating it if needed.

        The running `parse_tree_count` is adjusted by whatever the cell
        reports as added. Nothing happens if `to_add` is falsy.

        """
        if not to_add:
            return

        span = (start, end)
        cell = self.chart.get(span)
        if cell is None:
            cell = Cell(self.beam_size)
            self.chart[span] = cell

        self.parse_tree_count += cell.add(to_add)


@dataclass
class ParseResult:
    """The result of a parse.

    This acts as a list of the most probable parse trees, in order, i.e.
    use `parse_result[0]` to access the most probable parse tree.

    Parameters
    ----------
    chart : Chart
        The parse chart.

    Attributes
    ----------
    words : list[str]
        The words in the sentence.
    root : list[ParseTree]
        The most probable parse trees, in order.

    """
    chart: Chart
    words: list[str] = field(init=False)
    root: list[ParseTree] = field(init=False)

    def __post_init__(self) -> None:
        """Recover the words and the root trees from the chart."""
        # The words are read off the single-word cells (0, 0), (1, 1),
        # ... until the first span that has no cell.
        self.words = []
        while True:
            try:
                tree = self.chart[len(self.words), len(self.words)][0]
            except KeyError:
                break
            else:
                # Follow left children down to the node holding the word.
                while tree.left:
                    tree = tree.left
                self.words.append(tree.word)
        try:
            # The cell spanning every recovered word holds the full parses.
            self.root = self.chart[0, len(self.words) - 1]
        except KeyError:
            self.root = []

    def __bool__(self) -> bool:
        """Return whether at least one full parse was found."""
        return len(self.root) != 0

    def __len__(self) -> int:
        """Return the number of full parses."""
        return len(self.root)

    @overload
    def __getitem__(self, index: int) -> ParseTree: ...

    @overload
    def __getitem__(self, index: slice) -> list[ParseTree]: ...

    def __getitem__(self, index: int | slice) -> ParseTree | list[ParseTree]:
        """Index or slice the list of full parses."""
        return self.root[index]

    def deps(
        self,
        tree: ParseTree | None = None
    ) -> tuple[list[Dependency], list[str]]:  # pragma: no cover
        """Get the dependencies and output tags of the parse.

        If `tree` is not specified, then this looks for the best scoring
        tree at the root of the parse; if there is none, then it
        amalgamates results from the best-scoring trees in the chart.

        """
        if tree is None:
            try:
                tree = self.root[0]
            except IndexError:
                return self._skim_deps()
        return tree.deps_and_tags

    def _skim_deps(
        self,
        start: int = 0,
        end: int | None = None
    ) -> tuple[list[Dependency], list[str]]:  # pragma: no cover
        """Collect dependencies and tags from partial parses.

        Picks the best-scoring top tree among the widest spans inside
        [start, end] that have a cell, then recurses on the words to
        the left and to the right of that span.

        """
        if end is None:
            end = len(self.words) - 1

        if start > end:
            return [], []

        result_start = result_end = max_tree = None
        # First tree seen on the widest available span. It is only used
        # when no tree beats NEGATIVE_INFINITY (i.e. every candidate
        # scores -inf), a case that would otherwise leave `max_tree`
        # unset and fail on the index arithmetic below.
        fallback = None
        for span_length in reversed(range(end + 1 - start)):
            max_score = NEGATIVE_INFINITY
            for i in range(start, end + 1 - span_length):
                try:
                    trees = self.chart[i, i + span_length]
                except KeyError:
                    continue

                tree = trees[0]
                if fallback is None:
                    fallback = (tree, i, i + span_length)
                if tree.score > max_score:
                    max_score = tree.score
                    max_tree = tree

                    result_start = i
                    result_end = i + span_length
            if max_tree:
                break

        if max_tree is None:
            if fallback is None:
                # No cell at all inside the requested range.
                return [], []
            max_tree, result_start, result_end = fallback

        left_deps, left_tags = self._skim_deps(start, result_start - 1)
        tree_deps, tree_tags = max_tree.deps_and_tags
        right_deps, right_tags = self._skim_deps(result_end + 1, end)
        return (left_deps + tree_deps + right_deps,
                left_tags + tree_tags + right_tags)


class ChartParser:
    """A bottom-up chart parser with a per-cell beam.

    Calling the parser on a `Sentence` first fills the single-word
    cells from the input supertags, then builds longer spans by
    combining trees from adjacent sub-spans, scoring each new tree with
    the sentence's span scores.

    """

    def __init__(self,
                 grammar: Grammar,
                 cats: Iterable[str],
                 root_cats: Iterable[str] | None,
                 eisner_normal_form: bool,
                 max_parse_trees: int,
                 beam_size: int,
                 input_tag_score_weight: float,
                 missing_cat_score: float,
                 missing_span_score: float) -> None:
        """Set up the parser.

        Parameters
        ----------
        grammar : Grammar
            The grammar; its `categories` mapping (plain category to
            marked-up category) is parsed into `self.categories`, and
            it is also handed to `Rules`.
        cats : iterable of str
            The result categories, in ID order: the position of each
            entry is the ID looked up in a span's scores. An entry
            containing `::` is a unary chain, and a single entry ending
            in `[conj]` is a conjunction category.
        root_cats : iterable of str, optional
            The categories allowed at the root; see `set_root_cats`.
        eisner_normal_form : bool
            Passed through to `Rules`.
        max_parse_trees : int
            Parsing stops once the chart's tree count exceeds this.
        beam_size : int
            The beam size of every chart cell.
        input_tag_score_weight : float
            Multiplier applied to each input supertag's log probability.
        missing_cat_score : float
            Probability used for a category (chain) that has no ID. Its
            log is stored; a value `math.log` rejects gives `-inf`.
        missing_span_score : float
            Probability used for an ID absent from a span's scores. Its
            log is stored; a value `math.log` rejects gives `-inf`.

        """
        self.max_parse_trees = max_parse_trees

        self.categories = {
            plain_cat: Category.parse(markedup_cat)
            for plain_cat, markedup_cat in grammar.categories.items()
        }

        self.rules = Rules(eisner_normal_form, grammar, self.categories)

        self.input_tag_score_weight = input_tag_score_weight
        self.beam_size = beam_size

        # The two fallback scores are given as probabilities; non-positive
        # values have no logarithm and are mapped to negative infinity.
        try:
            self.missing_cat_score = math.log(missing_cat_score)
        except ValueError:
            self.missing_cat_score = NEGATIVE_INFINITY

        try:
            self.missing_span_score = math.log(missing_span_score)
        except ValueError:
            self.missing_span_score = NEGATIVE_INFINITY

        CONJ_TAG = '[conj]'
        self.result_cats: dict[tuple[str, tuple[Category, ...]], int] = {}
        for cat_id, cat_str in enumerate(cats):
            chain = cat_str.split('::')
            res_cats: tuple[Category, ...]
            if len(chain) == 1 and chain[0].endswith(CONJ_TAG):
                # `X[conj]` is stored as the modifier category `X\X`,
                # bracketing X first when it is itself complex.
                base_cat = chain[0][:-len(CONJ_TAG)]
                if '/' in base_cat or '\\' in base_cat:
                    base_cat = f'({base_cat})'
                cat_modified = fr'({base_cat}\{base_cat})'
                res_cats = (Category.parse(cat_modified),)
                label = 'conj'
            else:
                res_cats = tuple(map(Category.parse, chain))
                label = 'unary' if len(chain) > 1 else 'binary'
            self.result_cats[label, res_cats] = cat_id

        self.set_root_cats(root_cats)

    def set_root_cats(self,
                      root_cats: Iterable[Category | str] | None) -> None:
        """Set the categories allowed at the root of a parse.

        Parameters
        ----------
        root_cats : iterable of Category or str, optional
            `Category` objects are used as they are; anything else is
            looked up in `CATEGORIES`. `None` disables root filtering.

        Raises
        ------
        ValueError
            If a given category is not found in `CATEGORIES`.

        """
        if root_cats is None:
            self.root_cats = None
        else:
            try:
                self.root_cats = [(cat if isinstance(cat, Category)
                                   else CATEGORIES[cat, 0])
                                  for cat in root_cats]
            except KeyError as e:
                raise ValueError('Grammar does not contain root category: '
                                 f'{repr(e.args[0])}') from e

    def filter_root(self, trees: list[ParseTree]) -> list[ParseTree]:
        """Keep only the trees whose category matches a root category.

        If no root categories are set, `trees` is returned unchanged.

        """
        if self.root_cats is None:
            return trees
        return [tree for tree in trees
                if any(cat.matches(tree.cat) for cat in self.root_cats)]

    def __call__(self, sentence: Sentence) -> ParseResult:
        """Parse a sentence."""
        chart = Chart(self.beam_size)
        num_words = len(sentence)

        # Single-word cells: one lexical tree per input supertag.
        for i, (word, supertags) in enumerate(zip(sentence.words,
                                                  sentence.input_supertags)):
            results = []
            for supertag in supertags:
                # NOTE(review): the third argument looks like a 1-based
                # word position -- confirm against `Lexical`.
                tree = Lexical(self.categories[supertag.category], word, i + 1)
                tree.score = self.input_tag_score_weight * supertag.probability
                results.append(tree)

            try:
                span_scores = sentence.span_scores[i, i]
            except KeyError:
                # No scores for this word: keep the lexical trees only.
                pass
            else:
                if num_words > 1:
                    results += self.rules.type_change(results)
                    results += self.rules.type_raise(results)

                    # Only the unary trees just added have a left child.
                    for tree in results:
                        if tree.left:
                            self.calc_score_unary(tree, span_scores)

            # filter root cats
            if num_words == 1:
                results = self.filter_root(results)
            chart.add(i, i, results)

        for span_length in range(1, num_words):
            for end in range(span_length, num_words):
                # Once over budget no further span would be processed,
                # so the chart can be returned as it stands.
                if chart.parse_tree_count > self.max_parse_trees:
                    return ParseResult(chart)

                start = end - span_length

                try:
                    span_scores = sentence.span_scores[start, end]
                except KeyError:
                    continue

                # Upper bound on the span's contribution to any score.
                max_span_score = max((self.missing_cat_score,
                                      self.missing_span_score,
                                      *span_scores.values()))
                for split in range(start + 1, end + 1):
                    try:
                        left_trees = chart[start, split - 1]
                        right_trees = chart[split, end]
                    except KeyError:
                        continue

                    for left in left_trees:
                        for right in right_trees:
                            max_score = (left.score
                                         + right.score
                                         + max_span_score)
                            # Trees are sorted by descending score, so
                            # later `right` trees cannot do better.
                            if max_score < chart.min_score(start, end):
                                break

                            results = self.rules.combine(left, right)

                            if results and num_words > span_length + 1:
                                results += self.rules.type_change(results)
                                results += self.rules.type_raise(results)

                            # filter root cats
                            if span_length == num_words - 1:
                                results = self.filter_root(results)

                            for tree in results:
                                if tree.right:
                                    self.calc_score_binary(tree, span_scores)
                                else:
                                    self.calc_score_unary(tree, span_scores)
                            chart.add(start, end, results)
        return ParseResult(chart)

    def calc_score_unary(self,
                         tree: ParseTree,
                         span_scores: Mapping[int, float]) -> None:
        """Calculate the score for a unary tree (chain).

        The score is written to `tree.score`: the score of the tree at
        the bottom of the chain (or the sum of its children's scores if
        that tree is binary) plus the span score of the chain.

        """
        left = tree.left
        res_cat: tuple[str, tuple[Category, ...]]
        if left.right is None and left.left is not None:
            # The child is unary as well: a chain of two unary steps.
            base = left.left
            res_cat = ('unary', (tree.cat, left.cat, left.left.cat))
        else:
            base = left
            res_cat = ('unary', (tree.cat, left.cat))

        if base.right is not None:
            tree.score = base.left.score + base.right.score
        else:
            tree.score = base.score

        cat_id = self.result_cats.get(res_cat)
        tree.score += self.get_span_score(span_scores, cat_id)

    def calc_score_binary(self,
                          tree: ParseTree,
                          span_scores: Mapping[int, float]) -> None:
        """Calculate the score for a binary tree.

        The score is written to `tree.score`: the sum of both
        children's scores and the span score of the tree's category.

        """
        cat = tree.cat
        if tree.coordinated:
            cat_id = self.result_cats.get(('conj', (cat,)))
        else:
            try:
                cat_id = self.result_cats['binary', (cat,)]
            except KeyError:
                if cat.atom == Atom.NP:
                    # Retry with a category built from the atom alone.
                    cat_no_nb = Category(cat.atom)
                    cat_id = self.result_cats.get(('binary', (cat_no_nb,)))
                else:
                    cat_id = None

        tree.score = (tree.left.score
                      + tree.right.score
                      + self.get_span_score(span_scores, cat_id))

    def get_span_score(self,
                       span_scores: Mapping[int, float],
                       cat_id: int | None) -> float:
        """Get the score in a span for a category (chain) ID.

        Returns `missing_cat_score` if `cat_id` is `None`, and
        `missing_span_score` if the ID is not in `span_scores`.

        """
        if cat_id is None:
            return self.missing_cat_score
        try:
            return span_scores[cat_id]
        except KeyError:
            return self.missing_span_score