# Source code for lambeq.text2diagram.ccg_parser

# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

__all__ = ['CCGParser']

from abc import abstractmethod
from collections.abc import Iterable
import sys

from tqdm.auto import tqdm

from lambeq.backend.grammar import Diagram
from lambeq.core.globals import VerbosityLevel
from lambeq.core.utils import (SentenceBatchType, SentenceType,
                               tokenised_sentence_type_check)
from lambeq.text2diagram.base import Reader
from lambeq.text2diagram.ccg_tree import CCGTree


class CCGParser(Reader):
    """Base class for CCG parsers.

    Concrete parsers implement :py:meth:`sentences2trees`. The
    single-sentence and diagram-producing methods defined here are all
    built on top of that one method.

    """

    # Class-level default verbosity; :py:meth:`sentences2diagrams` falls
    # back to this attribute when called with ``verbose=None``.
    verbose = VerbosityLevel.SUPPRESS.value

    @abstractmethod
    def __init__(self,
                 root_cats: Iterable[str] | None = None,
                 verbose: str = VerbosityLevel.SUPPRESS.value) -> None:
        """Initialise the CCG parser."""

    @abstractmethod
    def sentences2trees(self,
                        sentences: SentenceBatchType,
                        tokenised: bool = False,
                        suppress_exceptions: bool = False,
                        verbose: str | None = None) -> list[CCGTree | None]:
        """Parse multiple sentences into a list of :py:class:`.CCGTree` s.

        Parameters
        ----------
        sentences : list of str, or list of list of str
            The sentences to be parsed, passed either as strings or as
            lists of tokens.
        tokenised : bool, default: False
            Whether each sentence has been passed as a list of tokens.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, instead of raising an exception,
            its return entry is :py:obj:`None`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. Not all parsers
            implement all three levels of progress reporting, see the
            respective documentation for each parser. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        list of CCGTree or None
            The parsed trees. May contain :py:obj:`None` if exceptions
            are suppressed.

        """

    @staticmethod
    def _single_sentence_batch(sentence: SentenceType,
                               tokenised: bool) -> SentenceBatchType:
        """Validate one sentence and wrap it in a one-element batch.

        Parameters
        ----------
        sentence : str or list of str
            The sentence to validate.
        tokenised : bool
            Whether `sentence` is expected to be a list of tokens.

        Returns
        -------
        list of str, or list of list of str
            A list holding the single sentence. For tokenised input
            every token is converted with :py:class:`str` first.

        Raises
        ------
        ValueError
            If the type of `sentence` does not agree with `tokenised`.

        """
        if tokenised:
            if not tokenised_sentence_type_check(sentence):
                raise ValueError('`tokenised` set to `True`, but variable '
                                 '`sentence` does not have type '
                                 '`list[str]`.')
            tokens: list[str] = [str(token) for token in sentence]
            return [tokens]

        if not isinstance(sentence, str):
            raise ValueError('`tokenised` set to `False`, but variable '
                             '`sentence` does not have type `str`.')
        return [sentence]

    def sentence2tree(self,
                      sentence: SentenceType,
                      tokenised: bool = False,
                      suppress_exceptions: bool = False) -> CCGTree | None:
        """Parse a sentence into a :py:class:`.CCGTree`.

        Parameters
        ----------
        sentence : str or list of str
            The sentence to be parsed, passed either as a string, or as
            a list of tokens.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if
            the sentence fails to parse, instead of raising an
            exception, returns :py:obj:`None`.

        Returns
        -------
        CCGTree or None
            The parsed tree, or :py:obj:`None` on failure.

        Raises
        ------
        ValueError
            If the type of `sentence` does not agree with `tokenised`.

        """
        batch = self._single_sentence_batch(sentence, tokenised)
        # Progress reporting is always suppressed for a single sentence.
        return self.sentences2trees(
                        batch,
                        suppress_exceptions=suppress_exceptions,
                        tokenised=tokenised,
                        verbose=VerbosityLevel.SUPPRESS.value)[0]

    def sentences2diagrams(self,
                           sentences: SentenceBatchType,
                           tokenised: bool = False,
                           planar: bool = False,
                           collapse_noun_phrases: bool = True,
                           suppress_exceptions: bool = False,
                           verbose: str | None = None) -> list[Diagram | None]:
        """Parse multiple sentences into a list of lambeq diagrams.

        Parameters
        ----------
        sentences : list of str, or list of list of str
            The sentences to be parsed.
        tokenised : bool, default: False
            Whether each sentence has been passed as a list of tokens.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        collapse_noun_phrases : bool, default: True
            If set, then before converting each tree to a diagram, any
            noun phrase types in the tree are changed into nouns. This
            includes sub-types, e.g. `S/NP` becomes `S/N`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, or its tree fails to convert to a
            diagram, instead of raising an exception, its return entry
            is :py:obj:`None`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. Not all parsers
            implement all three levels of progress reporting, see the
            respective documentation for each parser. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        list of :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagrams. May contain :py:obj:`None` if
            exceptions are suppressed.

        """
        # Forward `verbose` untouched (possibly None) so that the parser
        # implementation applies its own fallback.
        trees = self.sentences2trees(sentences,
                                     suppress_exceptions=suppress_exceptions,
                                     tokenised=tokenised,
                                     verbose=verbose)
        if verbose is None:
            verbose = self.verbose
        # Compare by value: `verbose` is a plain string, and an equal
        # string is not guaranteed to be the same object.
        if verbose == VerbosityLevel.TEXT.value:
            print('Turning parse trees to diagrams.', file=sys.stderr)

        diagrams: list[Diagram | None] = []
        for tree in tqdm(
                trees,
                desc='Parse trees to diagrams',
                leave=False,
                disable=verbose != VerbosityLevel.PROGRESS.value):
            if tree is None:
                # Parsing already failed for this sentence.
                diagrams.append(None)
                continue
            try:
                diagram = tree.to_diagram(
                    planar=planar,
                    collapse_noun_phrases=collapse_noun_phrases
                )
            except Exception:
                if not suppress_exceptions:
                    # Bare `raise` keeps the original traceback intact.
                    raise
                diagrams.append(None)
            else:
                diagrams.append(diagram)
        return diagrams

    def sentence2diagram(self,
                         sentence: SentenceType,
                         tokenised: bool = False,
                         planar: bool = False,
                         collapse_noun_phrases: bool = True,
                         suppress_exceptions: bool = False) -> Diagram | None:
        """Parse a sentence into a lambeq diagram.

        Parameters
        ----------
        sentence : str or list of str
            The sentence to be parsed.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        collapse_noun_phrases : bool, default: True
            If set, then before converting the tree to a diagram, all
            noun phrase types in the tree are changed into nouns. This
            includes sub-types, e.g. `S/NP` becomes `S/N`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if
            the sentence fails to parse, instead of raising an
            exception, returns :py:obj:`None`.

        Returns
        -------
        :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagram, or :py:obj:`None` on failure.

        Raises
        ------
        ValueError
            If the type of `sentence` does not agree with `tokenised`.

        """
        batch = self._single_sentence_batch(sentence, tokenised)
        # Progress reporting is always suppressed for a single sentence.
        return self.sentences2diagrams(
                        batch,
                        planar=planar,
                        collapse_noun_phrases=collapse_noun_phrases,
                        suppress_exceptions=suppress_exceptions,
                        tokenised=tokenised,
                        verbose=VerbosityLevel.SUPPRESS.value)[0]