Source code for lambeq.text2diagram.ccgbank_parser

# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CCGBank parser
==============

The CCGBank is a translation of the Penn Treebank into a corpus of
Combinatory Categorial Grammar derivations, created by Julia Hockenmaier
and Mark Steedman.

This module provides a parser that automatically turns parses from
CCGBank into :py:class:`.CCGTree` s.

"""

from __future__ import annotations

__all__ = ['CCGBankParseError', 'CCGBankParser']

from collections.abc import Iterator
from pathlib import Path
import re
import sys
from typing import TYPE_CHECKING

from tqdm.auto import tqdm

from lambeq.core.globals import VerbosityLevel
from lambeq.core.utils import SentenceBatchType
from lambeq.text2diagram.ccg_parser import CCGParser
from lambeq.text2diagram.ccg_rule import CCGRule
from lambeq.text2diagram.ccg_tree import CCGTree
from lambeq.text2diagram.ccg_type import CCGType
from lambeq.typing import StrPathT

if TYPE_CHECKING:
    from lambeq.backend.grammar import Diagram


class CCGBankParseError(Exception):
    """Error raised if parsing fails in CCGBank.

    Attributes
    ----------
    sentence : str
        The text that failed to parse, or the empty string when the
        error was created with a message only.
    message : str
        Description of the failure.

    """

    def __init__(self, sentence: str = '', message: str = '') -> None:
        """Initialise the error.

        Parameters
        ----------
        sentence : str, default: ''
            The text that failed to parse. If `message` is empty, this
            argument is interpreted as the message instead.
        message : str, default: ''
            Description of the failure.

        """
        if message:
            self.sentence = sentence
            self.message = message
            # Forward the caller's arguments so that `args` and `repr`
            # carry the error details like any other exception.
            super().__init__(sentence, message)
        else:
            # Single-argument form: the only argument is the message.
            self.sentence = ''
            self.message = sentence
            super().__init__(sentence)

    def __str__(self) -> str:
        """Return the message, prefixed by the sentence if one is set."""
        if self.sentence:
            return f'Failed to parse {repr(self.sentence)}: {self.message}.'
        return self.message
class CCGBankParser(CCGParser):
    """A parser for CCGBank trees."""

    # Atomic categories accepted by `_map_atomic_type`; an optional
    # lowercase feature in square brackets may follow N, NP, S or PP.
    ccg_type_regex = re.compile(
        r'((?P<bare_cat>N|NP|S|PP)(\[[a-z]+])?|conj|LRB|RRB|[,.:;])')

    id_regex = re.compile(r"""ID=(?P<id>\S+)    # line begins with "ID=<id>"
                              .*                # rest of the line is ignored
                           """, re.DOTALL | re.VERBOSE)

    tree_regex = re.compile(r"""
        \(<                                 # begin with literal "(<"
            ((?P<is_leaf> L)|T)       \s+   # "L" or "T" depending on node type
            (?P<ccg_str> \S+)         \s+   # the CCG category
            (?(is_leaf)                     # if node is a leaf, then the
                (?P<mod_pos> \S+)     \s+   # following 4 fields are present
                (?P<orig_pos> \S+)    \s+
                (?P<word> \S+)        \s+
                (?P<pred_arg_cat> \S+)
            |                               # otherwise, the following 2 fields
                (?P<head> 0|1)        \s+   # are present
                (?P<children> \d+)
            )
        >                                   # close with ">"
        (?(is_leaf)\)|)               \s*   # if node is a leaf, then there is
                                            # a matching ")"
        """, re.VERBOSE)

    # Bracket tokens as written in the corpus, mapped to the literal
    # character used as the leaf text.
    escaped_words = {'-LCB-': '{', '-RCB-': '}', '-LRB-': '(', '-RRB-': ')'}

    def __init__(self,
                 root: StrPathT,
                 verbose: str = VerbosityLevel.SUPPRESS.value) -> None:
        """Initialise a CCGBank parser.

        Parameters
        ----------
        root : str or os.PathLike
            Path to the root of the corpus. The sections must be
            located in `<root>/data/AUTO`.
        verbose : str, default: 'suppress',
            See :py:class:`VerbosityLevel` for options.

        Raises
        ------
        ValueError
            If `verbose` is not a valid verbosity value.

        """
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        self.root = Path(root)
        self.verbose = verbose

    def _resolve_verbose(self, verbose: str | None) -> str:
        """Return the verbosity to use for a call, after validating it.

        Falls back to the :py:attr:`verbose` attribute of the parser
        when `verbose` is :py:obj:`None`.

        Raises
        ------
        ValueError
            If the resulting value is not a valid verbosity value.

        """
        if verbose is None:
            verbose = self.verbose
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        return verbose

    def section2trees(self,
                      section_id: int,
                      suppress_exceptions: bool = False,
                      verbose: str | None = None
                      ) -> dict[str, CCGTree | None]:
        """Parse a CCGBank section into trees.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : dict
            A dictionary of trees labelled by their ID in CCGBank. If a
            tree fails to parse and exceptions are suppressed, that
            entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return dict(self.section2trees_gen(
            section_id,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose))

    def section2trees_gen(
        self,
        section_id: int,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, CCGTree | None]]:
        """Parse a CCGBank section into trees, given as a generator.

        The generator only reads data when it is accessed, providing
        the user with control over the reading process.

        Every file in `<root>/data/AUTO/<section_id>` is read in sorted
        order. Each line matching the ID pattern is expected to be
        followed by one line holding the derivation.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, tree : tuple of str and CCGTree
            ID in CCGBank and the corresponding tree. If a tree fails
            to parse and exceptions are suppressed, that entry is
            :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        verbose = self._resolve_verbose(verbose)

        path = self.root / 'data' / 'AUTO' / f'{section_id:02}'
        for file in sorted(path.iterdir()):
            with open(file) as f:
                if verbose == VerbosityLevel.TEXT.value:
                    print(f'Parsing `{file}`', file=sys.stderr)
                line_no = 0
                for line in tqdm(
                        f,
                        desc=f'Parsing `{file}`',
                        leave=False,
                        disable=verbose != VerbosityLevel.PROGRESS.value):
                    line_no += 1
                    match = self.id_regex.fullmatch(line)
                    if match:
                        # The derivation sits on the line after the ID,
                        # so it is pulled from the same file iterator.
                        line_no += 1
                        tree = None
                        try:
                            # A bare `next(f)` at end of file would leak
                            # StopIteration out of this generator, which
                            # Python turns into a RuntimeError.
                            derivation = next(f, None)
                            if derivation is None:
                                raise CCGBankParseError(
                                    'missing derivation after ID line')
                            tree = self.sentence2tree(derivation.strip())
                        except CCGBankParseError as e:
                            if not suppress_exceptions:
                                raise CCGBankParseError(
                                    f'Failed to parse tree in `{file}` '
                                    f'line {line_no}: {e.message}'
                                ) from e
                        yield match['id'], tree
                    elif not suppress_exceptions:
                        raise CCGBankParseError('Failed to parse ID in '
                                                f'`{file}` line {line_no}')

    def section2diagrams(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> dict[str, Diagram | None]:
        """Parse a CCGBank section into diagrams.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        diagrams : dict
            A dictionary of diagrams labelled by their ID in CCGBank.
            If a diagram fails to draw and exceptions are suppressed,
            that entry is replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return dict(self.section2diagrams_gen(
            section_id,
            planar=planar,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose))

    def section2diagrams_gen(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, Diagram | None]]:
        """Parse a CCGBank section into diagrams, given as a generator.

        The generator only reads data when it is accessed, providing
        the user with control over the reading process.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, diagram : tuple of str and Diagram
            ID in CCGBank and the corresponding diagram. If a diagram
            fails to draw and exceptions are suppressed, that entry is
            replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        verbose = self._resolve_verbose(verbose)

        trees = self.section2trees_gen(
            section_id,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose)
        for key, tree in trees:
            diagram = None
            if tree is not None:
                try:
                    diagram = tree.to_diagram(planar)
                except Exception:
                    # Any conversion failure is either swallowed (the
                    # entry stays None) or re-raised unchanged.
                    if not suppress_exceptions:
                        raise
            yield key, diagram

    def sentences2trees(self,
                        sentences: SentenceBatchType,
                        tokenised: bool = False,
                        suppress_exceptions: bool = False,
                        verbose: str | None = None) -> list[CCGTree | None]:
        """Parse a CCGBank sentence derivation into a CCGTree.

        The sentence must be in the format outlined in the CCGBank
        manual section D.2 and not just a list of words.

        Parameters
        ----------
        sentences : list of str
            List of sentences to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
            For CCGBankParser, it should be kept `False`.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : list of CCGTree
            A list of trees. If a tree fails to parse and exceptions
            are suppressed, that entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.
        ValueError
            If `tokenised` flag is True (not valid for CCGBankParser).

        """
        # The value is not used below; it is only validated.
        self._resolve_verbose(verbose)

        trees: list[CCGTree | None] = []
        for sentence in sentences:
            # Checked per sentence, so an empty batch still returns an
            # empty list whatever the value of `tokenised`.
            if tokenised:
                raise ValueError('`tokenised` set to `True`, but this is not '
                                 'a valid value for CCGBankParser.')
            assert isinstance(sentence, str)

            tree: CCGTree | None = None
            try:
                tree, pos = CCGBankParser._build_ccgtree(sentence, 0)
                if pos < len(sentence):
                    raise CCGBankParseError(f'extra text from index {pos+1} - '
                                            f'{repr(sentence[pos:])}')
            except Exception as e:
                if not suppress_exceptions:
                    raise CCGBankParseError(sentence, str(e)) from e
                # `tree` may already hold a tree if the failure was the
                # trailing-text check; a failed parse must yield None.
                tree = None
            trees.append(tree)
        return trees

    @staticmethod
    def _build_ccgtree(sentence: str, start: int) -> tuple[CCGTree, int]:
        """Recursively build the tree for the node starting at `start`.

        Parameters
        ----------
        sentence : str
            The full derivation string.
        start : int
            Index in `sentence` at which the node header begins.

        Returns
        -------
        tree, pos : tuple of CCGTree and int
            The tree for this node and the index from which parsing
            should continue.

        Raises
        ------
        CCGBankParseError
            If the node header is malformed, an atomic type is not
            recognised, or the string ends before an internal node is
            closed.

        """
        tree_match = CCGBankParser.tree_regex.match(sentence, pos=start)
        if not tree_match:
            raise CCGBankParseError(f'malformed tree from index {start+1} - '
                                    f'{repr(sentence[start:])}')

        ccg_str = tree_match['ccg_str']
        if ccg_str == r'((S[b]\NP)/NP)/':  # fix mistake in CCGBank
            ccg_str = r'(S[b]\NP)/NP'
        biclosed_type = CCGType.parse(
            ccg_str,
            map_atomic=CCGBankParser._map_atomic_type
        )

        pos = tree_match.end()
        if tree_match['is_leaf']:
            word = tree_match['word']
            # Replace escaped bracket tokens by the literal character.
            word = CCGBankParser.escaped_words.get(word, word)
            ccg_tree = CCGTree(text=word,
                               biclosed_type=biclosed_type,
                               metadata={'original': tree_match})
        else:
            children = []
            while True:
                if pos >= len(sentence):
                    # Without this check a truncated derivation fails
                    # with an uninformative IndexError.
                    raise CCGBankParseError(
                        f'unexpected end of input at index {pos+1} - '
                        'missing closing ")"')
                if sentence[pos] == ')':
                    break
                child, pos = CCGBankParser._build_ccgtree(sentence, pos)
                children.append(child)

            rule = CCGRule.infer_rule([child.biclosed_type
                                       for child in children],
                                      biclosed_type)
            ccg_tree = CCGTree(rule=rule,
                               biclosed_type=biclosed_type,
                               children=children,
                               metadata={'original': tree_match})
            # Step over the closing ")" and the one character after it.
            pos += 2
        return ccg_tree, pos

    @staticmethod
    def _map_atomic_type(cat: str) -> str:
        """Map a CCGBank atomic category to a :py:class:`CCGType` name.

        Any bracketed feature on N, NP, S or PP is dropped before the
        lookup. Accepted categories other than those and `conj` map to
        the punctuation type.

        Raises
        ------
        CCGBankParseError
            If `cat` does not match `ccg_type_regex`.

        """
        match = CCGBankParser.ccg_type_regex.fullmatch(cat)
        if not match:
            raise CCGBankParseError(f'failed to parse atomic type {repr(cat)}')

        cat = match['bare_cat'] or cat
        names = {
            'N': CCGType.NOUN.name,
            'NP': CCGType.NOUN_PHRASE.name,
            'S': CCGType.SENTENCE.name,
            'PP': CCGType.PREPOSITIONAL_PHRASE.name,
            'conj': CCGType.CONJUNCTION.name,
        }
        return names.get(cat, CCGType.PUNCTUATION.name)