Source code for lambeq.text2diagram.tree_reader

# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from collections.abc import Callable
from enum import Enum

__all__ = ['TreeReader', 'TreeReaderMode']

from lambeq.backend.grammar import Box, Diagram, Id, Ty, Word
from lambeq.core.types import AtomicType
from lambeq.core.utils import SentenceType
from lambeq.text2diagram.base import Reader
from lambeq.text2diagram.bobcat_parser import BobcatParser
from lambeq.text2diagram.ccg_parser import CCGParser
from lambeq.text2diagram.ccg_rule import CCGRule
from lambeq.text2diagram.ccg_tree import CCGTree

S = AtomicType.SENTENCE


[docs]class TreeReaderMode(Enum):
    """An enumeration for :py:class:`TreeReader`.

    The words in the tree diagram can be combined using 3 modes:

    .. glossary::

        NO_TYPE
            The 'no type' mode names every rule box :py:obj:`UNIBOX`.

        RULE_ONLY
            The 'rule name' mode names every rule box based on the name
            of the original CCG rule. For example, for the forward
            application rule :py:obj:`FA(N << N)`, the rule box will be
            named :py:obj:`FA`.

        RULE_TYPE
            The 'rule type' mode names every rule box based on the name
            and type of the original CCG rule. For example, for the
            forward application rule :py:obj:`FA(N << N)`, the rule box
            will be named :py:obj:`FA(N << N)`.

        HEIGHT
            The 'height' mode names every rule box based on the
            tree height of its subtree. For example, a rule
            box directly combining two words will be named
            :py:obj:`layer_1`.

    """
    NO_TYPE = 0
    RULE_ONLY = 1
    RULE_TYPE = 2
    HEIGHT = 3


[docs]class TreeReader(Reader):
    """A reader that combines words according to a parse tree."""

[docs]    def __init__(
        self,
        ccg_parser: CCGParser | Callable[[], CCGParser] = BobcatParser,
        mode: TreeReaderMode = TreeReaderMode.NO_TYPE,
        word_type: Ty = S
    ) -> None:
        """Initialise a tree reader.

        Parameters
        ----------
        ccg_parser : CCGParser or callable, default: BobcatParser
            A :py:class:`CCGParser` object or a function that returns
            it. The parse tree produced by the parser is used to
            generate the tree diagram.
        mode : TreeReaderMode, default: TreeReaderMode.NO_TYPE
            Determines what boxes are used to combine the tree.
            See :py:class:`TreeReaderMode` for options.
        word_type : Ty, default: core.types.AtomicType.SENTENCE
            The type of each word box. By default, it uses the sentence
            type from :py:class:`.core.types.AtomicType`.

        """
        if not isinstance(mode, TreeReaderMode):
            raise ValueError(f'Mode must be one of {self.available_modes()}.')
        if not isinstance(ccg_parser, CCGParser):
            if not callable(ccg_parser):
                raise ValueError(f'{ccg_parser} should be a CCGParser or a '
                                 'function that returns a CCGParser.')
            ccg_parser = ccg_parser()
            if not isinstance(ccg_parser, CCGParser):
                raise ValueError(f'{ccg_parser} should be a CCGParser or a '
                                 'function that returns a CCGParser.')
        self.ccg_parser = ccg_parser
        self.mode = mode
        self.word_type = word_type

[docs]    @classmethod
    def available_modes(cls) -> list[str]:
        """The list of modes for initialising a tree reader."""
        return list(TreeReaderMode)

[docs]    @staticmethod
    def tree2diagram(tree: CCGTree,
                     mode: TreeReaderMode = TreeReaderMode.NO_TYPE,
                     word_type: Ty = S,
                     suppress_exceptions: bool = False) -> Diagram | None:
        """Convert a :py:class:`~.CCGTree` into a
        :py:class:`~lambeq.backend.grammar.Diagram` .

        This produces a tree-shaped diagram based on the output of the
        CCG parser.

        Parameters
        ----------
        tree : :py:class:`~.CCGTree`
            The CCG tree to be converted.
        mode : TreeReaderMode, default: TreeReaderMode.NO_TYPE
            Determines what boxes are used to combine the tree.
            See :py:class:`TreeReaderMode` for options.
        word_type : Ty, default: core.types.AtomicType.SENTENCE
            The type of each word box. By default, it uses the sentence
            type from :py:class:`.core.types.AtomicType`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, instead of raising an exception,
            its return entry is :py:obj:`None`.

        Returns
        -------
        :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagram, or :py:obj:`None` on failure.

        """

        try:
            return TreeReader._tree2diagram(tree._resolved(), mode, word_type)
        except Exception as e:
            if suppress_exceptions:
                return None
            else:
                raise e

    @staticmethod
    def _tree2diagram(tree: CCGTree,
                      mode: TreeReaderMode = TreeReaderMode.NO_TYPE,
                      word_type: Ty = S) -> Diagram:
        if tree.rule == CCGRule.LEXICAL:
            return Word(tree.text, word_type).to_diagram()
        else:
            dom = word_type ** len(tree.children)
            cod = word_type

            if mode == TreeReaderMode.NO_TYPE:
                name = 'UNIBOX'
            elif mode == TreeReaderMode.HEIGHT:
                name = f'layer_{tree.height}'
            elif mode == TreeReaderMode.RULE_ONLY:
                name = tree.rule.value
            else:
                assert mode == TreeReaderMode.RULE_TYPE
                types = ', '.join(str(child.biclosed_type)
                                  for child in tree.children)
                name = f'{tree.rule.value}({types})'

            children = [TreeReader._tree2diagram(child, mode, word_type)
                        for child in tree.children]
            return Id().tensor(*children) >> Box(name, dom, cod)

[docs]    def sentence2diagram(self,
                         sentence: SentenceType,
                         tokenised: bool = False,
                         collapse_noun_phrases: bool = True,
                         suppress_exceptions: bool = False) -> Diagram | None:
        """Parse a sentence into a lambeq diagram.

        This produces a tree-shaped diagram based on the output of the
        CCG parser.

        Parameters
        ----------
        sentence : str or list of str
            The sentence to be parsed.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
        collapse_noun_phrases : bool, default: True
            If set, then before converting each tree to a diagram, any
            noun phrase types in the tree are changed into nouns. This
            includes sub-types, e.g. `S/NP` becomes `S/N`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, instead of raising an exception,
            its return entry is :py:obj:`None`.

        Returns
        -------
        :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagram, or :py:obj:`None` on failure.

        """

        tree = self.ccg_parser.sentence2tree(
            sentence=sentence,
            tokenised=tokenised,
            suppress_exceptions=suppress_exceptions)

        if tree is None:
            return None

        if collapse_noun_phrases:
            tree = tree.collapse_noun_phrases()

        return self.tree2diagram(tree,
                                 mode=self.mode,
                                 word_type=self.word_type,
                                 suppress_exceptions=suppress_exceptions)