Source code for lambeq.text2diagram.ccgbank_parser

# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CCGBank parser
==============

The CCGBank is a translation of the Penn Treebank into a corpus of
Combinatory Categorial Grammar derivations, created by Julia Hockenmaier
and Mark Steedman.

This module provides a parser that automatically turns parses from
CCGBank into :py:class:`.CCGTree` s.

"""

from __future__ import annotations

__all__ = ['CCGBankParseError', 'CCGBankParser']

from collections.abc import Iterator
from pathlib import Path
import re
import sys
from typing import TYPE_CHECKING

from tqdm.auto import tqdm

from lambeq.core.globals import VerbosityLevel
from lambeq.core.utils import SentenceBatchType
from lambeq.text2diagram.ccg_parser import CCGParser
from lambeq.text2diagram.ccg_rule import CCGRule
from lambeq.text2diagram.ccg_tree import CCGTree
from lambeq.text2diagram.ccg_type import CCGType
from lambeq.typing import StrPathT

if TYPE_CHECKING:
    from lambeq.backend.grammar import Diagram


class CCGBankParseError(Exception):
    """Error raised if parsing fails in CCGBank.

    Attributes
    ----------
    sentence : str
        The text that failed to parse, or an empty string when only a
        message was supplied.
    message : str
        Description of the failure.

    """

    def __init__(self, sentence: str = '', message: str = '') -> None:
        """Create the error from a message, optionally with a sentence.

        Parameters
        ----------
        sentence : str, default: ''
            The text that failed to parse. When `message` is empty,
            this argument is taken to be the message instead.
        message : str, default: ''
            Description of the failure.

        """
        # A single argument is interpreted as the message, so that
        # `CCGBankParseError('something went wrong')` works as expected.
        self.sentence, self.message = (
            (sentence, message) if message else ('', sentence)
        )

    def __str__(self) -> str:
        """Return the message, prefixed by the sentence if one is set."""
        if not self.sentence:
            return self.message
        return f'Failed to parse {self.sentence!r}: {self.message}.'


[docs]class CCGBankParser(CCGParser):
    """A parser for CCGBank trees."""

    # Matches one atomic category: `N`, `NP`, `S` or `PP`, optionally
    # followed by a bracketed lowercase feature (e.g. `S[dcl]`), or one
    # of `conj`, `LRB`, `RRB`, or a single character among `,.:;`.
    # The `bare_cat` group holds the category without its feature and
    # is unset for the other alternatives. `_map_atomic_type` applies
    # this pattern with `fullmatch`.
    ccg_type_regex = re.compile(
            r'((?P<bare_cat>N|NP|S|PP)(\[[a-z]+])?|conj|LRB|RRB|[,.:;])')

    # Matches an ID header line and captures the identifier in the `id`
    # group. `re.DOTALL` lets the trailing `.*` also consume a newline,
    # so `fullmatch` succeeds on lines read directly from a file.
    id_regex = re.compile(r"""ID=(?P<id>\S+)  # line begins with "ID=<id>"
                              .*              # rest of the line is ignored
                           """, re.DOTALL | re.VERBOSE)

    # Matches the header of a single tree node. For a leaf the match
    # also consumes the closing ")"; for an internal node it stops after
    # ">", leaving the children and the closing ")" to be handled by
    # `_build_ccgtree`. Trailing whitespace is consumed in both cases.
    tree_regex = re.compile(r"""
        \(<                                 # begin with literal "(<"
           ((?P<is_leaf>        L)|T) \s+   # "L" or "T" depending on node type
           (?P<ccg_str>          \S+) \s+   # the CCG category
           (?(is_leaf)                      # if node is a leaf, then the
               (?P<mod_pos>      \S+) \s+   #    following 4 fields are present
               (?P<orig_pos>     \S+) \s+
               (?P<word>         \S+) \s+
               (?P<pred_arg_cat> \S+)
           |                                # otherwise, the following 2 fields
               (?P<head>         0|1) \s+   #    are present
               (?P<children>     \d+)
           )
          >                                 # close with ">"
        (?(is_leaf)\)|) \s*                 # if node is a leaf, then there is
                                            #    a matching ")"
        """, re.VERBOSE)

    # Leaf words that `_build_ccgtree` replaces with the corresponding
    # literal bracket character; any other word is kept unchanged.
    escaped_words = {'-LCB-': '{', '-RCB-': '}', '-LRB-': '(', '-RRB-': ')'}

[docs]    def __init__(self,
                 root: StrPathT,
                 verbose: str = VerbosityLevel.SUPPRESS.value) -> None:
        """Initialise a CCGBank parser.

        Parameters
        ----------
        root : str or os.PathLike
            Path to the root of the corpus. The sections must be located
            in `<root>/data/AUTO`.
        verbose : str, default: 'suppress',
            See :py:class:`VerbosityLevel` for options.

        """
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        self.root = Path(root)
        self.verbose = verbose

[docs]    def section2trees(self,
                      section_id: int,
                      suppress_exceptions: bool = False,
                      verbose: str | None = None) -> dict[str, CCGTree | None]:
        """Parse a CCGBank section into trees.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : dict
            A dictionary of trees labelled by their ID in CCGBank. If a
            tree fails to parse and exceptions are suppressed, that
            entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return {key: tree for key, tree in self.section2trees_gen(
            section_id,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose)}

[docs]    def section2trees_gen(
        self,
        section_id: int,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, CCGTree | None]]:
        """Parse a CCGBank section into trees, given as a generator.

        The generator only reads data when it is accessed, providing the
        user with control over the reading process.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, tree : tuple of str and CCGTree
            ID in CCGBank and the corresponding tree. If a
            tree fails to parse and exceptions are suppressed, that
            entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        if verbose is None:
            verbose = self.verbose
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        path = self.root / 'data' / 'AUTO' / f'{section_id:02}'
        for file in sorted(path.iterdir()):
            with open(file) as f:
                if verbose == VerbosityLevel.TEXT.value:
                    print(f'Parsing `{file}`', file=sys.stderr)
                line_no = 0
                for line in tqdm(
                        f,
                        desc=f'Parsing `{file}`',
                        leave=False,
                        disable=verbose != VerbosityLevel.PROGRESS.value):
                    line_no += 1
                    match = self.id_regex.fullmatch(line)
                    if match:
                        line_no += 1
                        tree = None
                        try:
                            tree = self.sentence2tree(next(f).strip())
                        except CCGBankParseError as e:
                            if not suppress_exceptions:
                                raise CCGBankParseError(
                                    f'Failed to parse tree in `{file}` '
                                    f'line {line_no}: {e.message}'
                                ) from e
                        yield match['id'], tree
                    elif not suppress_exceptions:
                        raise CCGBankParseError('Failed to parse ID in '
                                                f'`{file}` line {line_no}')

[docs]    def section2diagrams(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> dict[str, Diagram | None]:
        """Parse a CCGBank section into diagrams.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.
        Returns
        -------
        diagrams : dict
            A dictionary of diagrams labelled by their ID in CCGBank. If
            a diagram fails to draw and exceptions are suppressed, that
            entry is replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return {key: diagram for key, diagram in self.section2diagrams_gen(
            section_id,
            planar=planar,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose)}

[docs]    def section2diagrams_gen(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, Diagram | None]]:
        """Parse a CCGBank section into diagrams, given as a generator.

        The generator only reads data when it is accessed, providing the
        user with control over the reading process.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, diagram : tuple of str and Diagram
            ID in CCGBank and the corresponding diagram. If a
            diagram fails to draw and exceptions are suppressed, that
            entry is replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        if verbose is None:
            verbose = self.verbose
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        trees = self.section2trees_gen(section_id,
                                       suppress_exceptions,
                                       verbose=verbose)
        for k, tree in trees:
            if tree is not None:
                try:
                    diagram = tree.to_diagram(planar)
                except Exception as e:
                    if suppress_exceptions:
                        diagram = None
                    else:
                        raise e
            else:
                diagram = None
            yield k, diagram

[docs]    def sentences2trees(self,
                        sentences: SentenceBatchType,
                        tokenised: bool = False,
                        suppress_exceptions: bool = False,
                        verbose: str | None = None) -> list[CCGTree | None]:
        """Parse a CCGBank sentence derivation into a CCGTree.

        The sentence must be in the format outlined in the CCGBank
        manual section D.2 and not just a list of words.

        Parameters
        ----------
        sentences : list of str
            List of sentences to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
            For CCGBankParser, it should be kept `False`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : list of CCGTree
            A list of trees. If a tree fails to parse and exceptions are
            suppressed, that entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.
        ValueError
            If `tokenised` flag is True (not valid for CCGBankParser).

        """
        if verbose is None:
            verbose = self.verbose
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        trees = []
        for sentence in sentences:
            if tokenised:
                raise ValueError('`tokenised` set to `True`, but this is not '
                                 'a valid value for CCGBankParser.')
            assert isinstance(sentence, str)
            tree = None
            try:
                tree, pos = CCGBankParser._build_ccgtree(sentence, 0)
                if pos < len(sentence):
                    raise CCGBankParseError(f'extra text from index {pos+1} - '
                                            f'{repr(sentence[pos:])}')
            except Exception as e:
                if not suppress_exceptions:
                    raise CCGBankParseError(sentence, str(e)) from e
            trees.append(tree)
        return trees

    @staticmethod
    def _build_ccgtree(sentence: str, start: int) -> tuple[CCGTree, int]:
        """Recursively build the tree whose node starts at `start`.

        Parameters
        ----------
        sentence : str
            The full derivation string.
        start : int
            Index in `sentence` at which the node to parse begins.

        Returns
        -------
        tree : CCGTree
            The tree rooted at the parsed node. The regex match of the
            node is stored in its metadata under the key 'original'.
        pos : int
            Index of the first character after the node and any
            whitespace that follows it.

        Raises
        ------
        CCGBankParseError
            If no node can be matched at `start`, or if the string ends
            before an internal node is closed.

        """
        tree_match = CCGBankParser.tree_regex.match(sentence, pos=start)
        if not tree_match:
            raise CCGBankParseError(f'malformed tree from index {start+1} - '
                                    f'{repr(sentence[start:])}')

        ccg_str = tree_match['ccg_str']
        if ccg_str == r'((S[b]\NP)/NP)/':  # fix mistake in CCGBank
            ccg_str = r'(S[b]\NP)/NP'
        biclosed_type = CCGType.parse(
            ccg_str,
            map_atomic=CCGBankParser._map_atomic_type
        )
        pos = tree_match.end()

        if tree_match['is_leaf']:
            # The leaf match already consumed its closing ")" and any
            # trailing whitespace.
            word = tree_match['word']
            word = CCGBankParser.escaped_words.get(word, word)
            leaf = CCGTree(text=word,
                           biclosed_type=biclosed_type,
                           metadata={'original': tree_match})
            return leaf, pos

        end = len(sentence)
        children = []
        while True:
            if pos >= end:
                # Without this check a truncated derivation would
                # surface as a bare IndexError.
                raise CCGBankParseError(
                    f'unclosed tree from index {start+1} - '
                    f'{repr(sentence[start:])}'
                )
            if sentence[pos] == ')':
                break
            child, pos = CCGBankParser._build_ccgtree(sentence, pos)
            children.append(child)

        rule = CCGRule.infer_rule([child.biclosed_type
                                   for child in children],
                                  biclosed_type)
        ccg_tree = CCGTree(rule=rule,
                           biclosed_type=biclosed_type,
                           children=children,
                           metadata={'original': tree_match})

        # Consume the closing ")" and any whitespace after it, mirroring
        # what the regex does for leaves.
        pos += 1
        while pos < end and sentence[pos].isspace():
            pos += 1
        return ccg_tree, pos

    @staticmethod
    def _map_atomic_type(cat: str) -> str:
        """Translate an atomic CCGBank category into a CCGType name.

        Parameters
        ----------
        cat : str
            An atomic category, optionally carrying a bracketed feature
            (which is ignored).

        Returns
        -------
        str
            The name of the matching :py:class:`CCGType` member.
            Anything accepted by `ccg_type_regex` that is not a noun,
            noun phrase, sentence, prepositional phrase or conjunction
            maps to the punctuation type.

        Raises
        ------
        CCGBankParseError
            If `cat` is not matched by `ccg_type_regex`.

        """
        matched = CCGBankParser.ccg_type_regex.fullmatch(cat)
        if matched is None:
            raise CCGBankParseError(f'failed to parse atomic type {repr(cat)}')

        named_types = {
            'N': CCGType.NOUN,
            'NP': CCGType.NOUN_PHRASE,
            'S': CCGType.SENTENCE,
            'PP': CCGType.PREPOSITIONAL_PHRASE,
            'conj': CCGType.CONJUNCTION,
        }
        # `bare_cat` is only set for N/NP/S/PP; fall back to the raw
        # string for `conj`, brackets and punctuation.
        key = matched['bare_cat'] or cat
        return named_types.get(key, CCGType.PUNCTUATION).name