# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
__all__ = ['CCGParser']
from abc import abstractmethod
from collections.abc import Iterable
import sys
from tqdm.auto import tqdm
from lambeq.backend.grammar import Diagram
from lambeq.core.globals import VerbosityLevel
from lambeq.core.utils import (SentenceBatchType, SentenceType,
tokenised_sentence_type_check)
from lambeq.text2diagram.base import Reader
from lambeq.text2diagram.ccg_tree import CCGTree
class CCGParser(Reader):
    """Base class for CCG parsers.

    Concrete parsers implement :py:meth:`sentences2trees`. The
    single-sentence and diagram-producing methods defined here are all
    built on top of that one method.
    """

    # Default verbosity; `sentences2diagrams` falls back to this value
    # when it is called with `verbose=None`.
    verbose = VerbosityLevel.SUPPRESS.value

    @abstractmethod
    def __init__(self,
                 root_cats: Iterable[str] | None = None,
                 verbose: str = VerbosityLevel.SUPPRESS.value) -> None:
        """Initialise the CCG parser.

        Parameters
        ----------
        root_cats : iterable of str, optional
            Root categories, as the name suggests. This base class
            does not interpret them; see the concrete parser for how
            they are used.
        verbose : str, default: ``VerbosityLevel.SUPPRESS.value``
            See :py:class:`VerbosityLevel` for options.

        """

    @abstractmethod
    def sentences2trees(self,
                        sentences: SentenceBatchType,
                        tokenised: bool = False,
                        suppress_exceptions: bool = False,
                        verbose: str | None = None) -> list[CCGTree | None]:
        """Parse multiple sentences into a list of :py:class:`.CCGTree` s.

        Parameters
        ----------
        sentences : list of str, or list of list of str
            The sentences to be parsed, passed either as strings or as
            lists of tokens.
        tokenised : bool, default: False
            Whether each sentence has been passed as a list of tokens.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, instead of raising an exception,
            its return entry is :py:obj:`None`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. Not all parsers
            implement all three levels of progress reporting, see the
            respective documentation for each parser. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        list of CCGTree or None
            The parsed trees. May contain :py:obj:`None` if exceptions
            are suppressed.

        """

    def sentence2tree(self,
                      sentence: SentenceType,
                      tokenised: bool = False,
                      suppress_exceptions: bool = False) -> CCGTree | None:
        """Parse a sentence into a :py:class:`.CCGTree`.

        The sentence is wrapped in a one-element batch and handed to
        :py:meth:`sentences2trees` with progress reporting suppressed.

        Parameters
        ----------
        sentence : str, list[str]
            The sentence to be parsed, passed either as a string, or as
            a list of tokens.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if
            the sentence fails to parse, instead of raising an
            exception, returns :py:obj:`None`.

        Returns
        -------
        CCGTree or None
            The parsed tree, or :py:obj:`None` on failure.

        Raises
        ------
        ValueError
            If the type of `sentence` does not agree with `tokenised`.

        """
        if tokenised:
            if not tokenised_sentence_type_check(sentence):
                raise ValueError('`tokenised` set to `True`, but variable '
                                 '`sentence` does not have type '
                                 '`list[str]`.')
            # Coerce every token to `str` before handing it to the parser.
            sent: list[str] = [str(token) for token in sentence]
            return self.sentences2trees(
                [sent],
                suppress_exceptions=suppress_exceptions,
                tokenised=tokenised,
                verbose=VerbosityLevel.SUPPRESS.value)[0]

        if not isinstance(sentence, str):
            raise ValueError('`tokenised` set to `False`, but variable '
                             '`sentence` does not have type `str`.')
        return self.sentences2trees(
            [sentence],
            suppress_exceptions=suppress_exceptions,
            tokenised=tokenised,
            verbose=VerbosityLevel.SUPPRESS.value)[0]

    def sentences2diagrams(self,
                           sentences: SentenceBatchType,
                           tokenised: bool = False,
                           planar: bool = False,
                           collapse_noun_phrases: bool = True,
                           suppress_exceptions: bool = False,
                           verbose: str | None = None
                           ) -> list[Diagram | None]:
        """Parse multiple sentences into a list of lambeq diagrams.

        The sentences are first parsed with :py:meth:`sentences2trees`
        and each resulting tree is then converted to a diagram.

        Parameters
        ----------
        sentences : list of str, or list of list of str
            The sentences to be parsed.
        tokenised : bool, default: False
            Whether each sentence has been passed as a list of tokens.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        collapse_noun_phrases : bool, default: True
            If set, then before converting each tree to a diagram, any
            noun phrase types in the tree are changed into nouns. This
            includes sub-types, e.g. `S/NP` becomes `S/N`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if a
            sentence fails to parse, instead of raising an exception,
            its return entry is :py:obj:`None`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. Not all parsers
            implement all three levels of progress reporting, see the
            respective documentation for each parser. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        list of :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagrams. May contain :py:obj:`None` if
            exceptions are suppressed.

        """
        # `verbose` is forwarded as given (possibly None); only the
        # tree-to-diagram stage below resolves the fallback itself.
        trees = self.sentences2trees(sentences,
                                     suppress_exceptions=suppress_exceptions,
                                     tokenised=tokenised,
                                     verbose=verbose)

        if verbose is None:
            verbose = self.verbose
        # Compare by value: `verbose` is a plain string supplied by the
        # caller, so an identity check would depend on string interning.
        if verbose == VerbosityLevel.TEXT.value:
            print('Turning parse trees to diagrams.', file=sys.stderr)

        diagrams: list[Diagram | None] = []
        for tree in tqdm(
                trees,
                desc='Parse trees to diagrams',
                leave=False,
                disable=verbose != VerbosityLevel.PROGRESS.value):
            # A missing tree, or a suppressed conversion failure, both
            # leave a `None` placeholder so the output stays aligned
            # with the input sentences.
            diagram: Diagram | None = None
            if tree is not None:
                try:
                    diagram = tree.to_diagram(
                        planar=planar,
                        collapse_noun_phrases=collapse_noun_phrases
                    )
                except Exception:
                    # Deliberately broad: any conversion failure is
                    # either suppressed or re-raised unchanged.
                    if not suppress_exceptions:
                        raise
            diagrams.append(diagram)
        return diagrams

    def sentence2diagram(self,
                         sentence: SentenceType,
                         tokenised: bool = False,
                         planar: bool = False,
                         collapse_noun_phrases: bool = True,
                         suppress_exceptions: bool = False) -> Diagram | None:
        """Parse a sentence into a lambeq diagram.

        The sentence is wrapped in a one-element batch and handed to
        :py:meth:`sentences2diagrams` with progress reporting
        suppressed.

        Parameters
        ----------
        sentence : str or list of str
            The sentence to be parsed.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        collapse_noun_phrases : bool, default: True
            If set, then before converting the tree to a diagram, all
            noun phrase types in the tree are changed into nouns. This
            includes sub-types, e.g. `S/NP` becomes `S/N`.
        suppress_exceptions : bool, default: False
            Whether to suppress exceptions. If :py:obj:`True`, then if
            the sentence fails to parse, instead of raising an
            exception, returns :py:obj:`None`.

        Returns
        -------
        :py:class:`lambeq.backend.grammar.Diagram` or None
            The parsed diagram, or :py:obj:`None` on failure.

        Raises
        ------
        ValueError
            If the type of `sentence` does not agree with `tokenised`.

        """
        if tokenised:
            if not tokenised_sentence_type_check(sentence):
                raise ValueError('`tokenised` set to `True`, but variable '
                                 '`sentence` does not have type '
                                 '`list[str]`.')
            # Coerce every token to `str` before handing it to the parser.
            sent: list[str] = [str(token) for token in sentence]
            return self.sentences2diagrams(
                [sent],
                planar=planar,
                collapse_noun_phrases=collapse_noun_phrases,
                suppress_exceptions=suppress_exceptions,
                tokenised=tokenised,
                verbose=VerbosityLevel.SUPPRESS.value)[0]

        if not isinstance(sentence, str):
            raise ValueError('`tokenised` set to `False`, but variable '
                             '`sentence` does not have type `str`.')
        return self.sentences2diagrams(
            [sentence],
            planar=planar,
            collapse_noun_phrases=collapse_noun_phrases,
            suppress_exceptions=suppress_exceptions,
            tokenised=tokenised,
            verbose=VerbosityLevel.SUPPRESS.value)[0]