# Copyright 2021-2023 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CCGBank parser
==============
The CCGBank is a translation of the Penn Treebank into a corpus of
Combinatory Categorial Grammar derivations, created by Julia Hockenmaier
and Mark Steedman.
This module provides a parser that automatically turns parses from
CCGBank into :py:class:`.CCGTree` s.
"""
from __future__ import annotations
__all__ = ['CCGBankParseError', 'CCGBankParser']
from collections.abc import Iterator
from pathlib import Path
import re
import sys
from discopy.biclosed import Ty
from discopy.rigid import Diagram
from tqdm.auto import tqdm
from lambeq.core.globals import VerbosityLevel
from lambeq.core.utils import SentenceBatchType
from lambeq.text2diagram.ccg_parser import CCGParser
from lambeq.text2diagram.ccg_rule import CCGRule
from lambeq.text2diagram.ccg_tree import CCGTree
from lambeq.text2diagram.ccg_types import CCGAtomicType, str2biclosed
from lambeq.typing import StrPathT
class CCGBankParseError(Exception):
    """Error raised if parsing fails in CCGBank.

    The constructor accepts two call forms:

    * ``CCGBankParseError(message)`` -- only a message is stored and
      ``sentence`` is left empty.
    * ``CCGBankParseError(sentence, message)`` -- both the offending
      sentence and the message are stored.

    Attributes
    ----------
    sentence : str
        The sentence that failed to parse, or ``''`` if not given.
    message : str
        Description of the failure.

    """

    def __init__(self, sentence: str = '', message: str = '') -> None:
        if message:
            self.sentence = sentence
            self.message = message
        else:
            # Single-argument form: the first positional argument is
            # actually the message, and there is no sentence.
            self.sentence = ''
            self.message = sentence

    def __str__(self) -> str:
        if self.sentence:
            return f'Failed to parse {self.sentence!r}: {self.message}.'
        return self.message
class CCGBankParser(CCGParser):
    """A parser for CCGBank trees."""

    # Atomic CCG categories: N/NP/S/PP with an optional lowercase
    # feature (e.g. ``S[dcl]``), or ``conj``, brackets and punctuation.
    ccg_type_regex = re.compile(
        r'((?P<bare_cat>N|NP|S|PP)(\[[a-z]+])?|conj|LRB|RRB|[,.:;])')

    id_regex = re.compile(
        r"""ID=(?P<id>\S+)  # line begins with "ID=<id>"
            .*              # rest of the line is ignored
         """, re.DOTALL | re.VERBOSE)

    tree_regex = re.compile(
        r"""
        \(<                           # begin with literal "(<"
        ((?P<is_leaf> L)|T) \s+       # "L" or "T" depending on node type
        (?P<ccg_str> \S+) \s+         # the CCG category
        (?(is_leaf)                   # if node is a leaf, then the
            (?P<mod_pos> \S+) \s+     # following 4 fields are present
            (?P<orig_pos> \S+) \s+
            (?P<word> \S+) \s+
            (?P<pred_arg_cat> \S+)
        |                             # otherwise, the following 2 fields
            (?P<head> 0|1) \s+        # are present
            (?P<children> \d+)
        )
        >                             # close with ">"
        (?(is_leaf)\)|) \s*           # if node is a leaf, then there is
                                      # a matching ")"
        """, re.VERBOSE)

    # Bracket tokens as they appear in leaf nodes, mapped to the
    # characters they stand for.
    escaped_words = {'-LCB-': '{', '-RCB-': '}', '-LRB-': '(', '-RRB-': ')'}

    def __init__(self,
                 root: StrPathT,
                 verbose: str = VerbosityLevel.SUPPRESS.value) -> None:
        """Initialise a CCGBank parser.

        Parameters
        ----------
        root : str or os.PathLike
            Path to the root of the corpus. The sections must be located
            in `<root>/data/AUTO`.
        verbose : str, default: 'suppress'
            See :py:class:`VerbosityLevel` for options.

        Raises
        ------
        ValueError
            If `verbose` is not a valid verbosity value.

        """
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        self.root = Path(root)
        self.verbose = verbose

    def _resolve_verbose(self, verbose: str | None) -> str:
        """Return the effective verbosity, validating it.

        A value of :py:obj:`None` falls back to the parser's own
        :py:attr:`verbose` attribute.

        Raises
        ------
        ValueError
            If the effective value is not a valid verbosity value.

        """
        if verbose is None:
            verbose = self.verbose
        if not VerbosityLevel.has_value(verbose):
            raise ValueError(f'`{verbose}` is not a valid verbose value for '
                             'CCGBankParser.')
        return verbose

    def section2trees(self,
                      section_id: int,
                      suppress_exceptions: bool = False,
                      verbose: str | None = None
                      ) -> dict[str, CCGTree | None]:
        """Parse a CCGBank section into trees.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : dict
            A dictionary of trees labelled by their ID in CCGBank. If a
            tree fails to parse and exceptions are suppressed, that
            entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return dict(self.section2trees_gen(
            section_id,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose))

    def section2trees_gen(
        self,
        section_id: int,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, CCGTree | None]]:
        """Parse a CCGBank section into trees, given as a generator.

        The generator only reads data when it is accessed, providing the
        user with control over the reading process.

        Each file under ``<root>/data/AUTO/<section>`` is read as
        alternating lines: an ``ID=...`` line followed by one line
        holding the derivation.

        Parameters
        ----------
        section_id : int
            The section to parse.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, tree : tuple of str and CCGTree
            ID in CCGBank and the corresponding tree. If a
            tree fails to parse and exceptions are suppressed, that
            entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        verbose = self._resolve_verbose(verbose)

        path = self.root / 'data' / 'AUTO' / f'{section_id:02}'
        for file in sorted(path.iterdir()):
            with open(file) as f:
                if verbose == VerbosityLevel.TEXT.value:
                    print(f'Parsing `{file}`', file=sys.stderr)
                line_no = 0
                for line in tqdm(
                        f,
                        desc=f'Parsing `{file}`',
                        leave=False,
                        disable=verbose != VerbosityLevel.PROGRESS.value):
                    line_no += 1
                    match = self.id_regex.fullmatch(line)
                    if not match:
                        if not suppress_exceptions:
                            raise CCGBankParseError('Failed to parse ID in '
                                                    f'`{file}` line {line_no}')
                        continue

                    # The derivation is on the line right after the ID;
                    # it is consumed here, so the outer loop never sees
                    # it.
                    line_no += 1
                    tree = None
                    try:
                        derivation = next(f, None)
                        if derivation is None:
                            # A bare `next(f)` at end of file would raise
                            # StopIteration, which surfaces as a
                            # RuntimeError inside a generator.
                            raise CCGBankParseError(
                                'missing derivation after ID line')
                        tree = self.sentence2tree(derivation.strip())
                    except CCGBankParseError as e:
                        if not suppress_exceptions:
                            raise CCGBankParseError(
                                f'Failed to parse tree in `{file}` '
                                f'line {line_no}: {e.message}'
                            ) from e
                    yield match['id'], tree

    def section2diagrams(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> dict[str, Diagram | None]:
        """Parse a CCGBank section into diagrams.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        diagrams : dict
            A dictionary of diagrams labelled by their ID in CCGBank. If
            a diagram fails to draw and exceptions are suppressed, that
            entry is replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        return dict(self.section2diagrams_gen(
            section_id,
            planar=planar,
            suppress_exceptions=suppress_exceptions,
            verbose=verbose))

    def section2diagrams_gen(
        self,
        section_id: int,
        planar: bool = False,
        suppress_exceptions: bool = False,
        verbose: str | None = None
    ) -> Iterator[tuple[str, Diagram | None]]:
        """Parse a CCGBank section into diagrams, given as a generator.

        The generator only reads data when it is accessed, providing the
        user with control over the reading process.

        Parameters
        ----------
        section_id : int
            The section to parse.
        planar : bool, default: False
            Force diagrams to be planar when they contain
            crossed composition.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a diagram.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Yields
        ------
        ID, diagram : tuple of str and Diagram
            ID in CCGBank and the corresponding diagram. If a
            diagram fails to draw and exceptions are suppressed, that
            entry is replaced by :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.

        """
        verbose = self._resolve_verbose(verbose)

        trees = self.section2trees_gen(section_id,
                                       suppress_exceptions,
                                       verbose=verbose)
        for k, tree in trees:
            diagram = None
            if tree is not None:
                try:
                    diagram = tree.to_diagram(planar)
                except Exception:
                    if not suppress_exceptions:
                        # Bare raise keeps the original traceback.
                        raise
            yield k, diagram

    def sentences2trees(self,
                        sentences: SentenceBatchType,
                        tokenised: bool = False,
                        suppress_exceptions: bool = False,
                        verbose: str | None = None
                        ) -> list[CCGTree | None]:
        """Parse CCGBank sentence derivations into CCGTrees.

        Each sentence must be in the format outlined in the CCGBank
        manual section D.2 and not just a list of words.

        Parameters
        ----------
        sentences : list of str
            List of sentences to parse.
        tokenised : bool, default: False
            Whether the sentence has been passed as a list of tokens.
            For CCGBankParser, it should be kept `False`.
        suppress_exceptions : bool, default: False
            Stop exceptions from being raised, instead returning
            :py:obj:`None` for a tree.
        verbose : str, optional
            See :py:class:`VerbosityLevel` for options. If set, takes
            priority over the :py:attr:`verbose` attribute of the
            parser.

        Returns
        -------
        trees : list of CCGTree
            A list of trees. If a tree fails to parse and exceptions are
            suppressed, that entry is :py:obj:`None`.

        Raises
        ------
        CCGBankParseError
            If parsing fails and exceptions are not suppressed.
        ValueError
            If `tokenised` flag is True (not valid for CCGBankParser),
            or if the verbosity value is invalid.

        """
        # Only validated here; this method produces no verbose output.
        self._resolve_verbose(verbose)

        # Checked before the loop so that an empty batch is rejected too.
        if tokenised:
            raise ValueError('`tokenised` set to `True`, but this is not '
                             'a valid value for CCGBankParser.')

        trees: list[CCGTree | None] = []
        for sentence in sentences:
            assert isinstance(sentence, str)
            tree = None
            try:
                tree, pos = CCGBankParser._build_ccgtree(sentence, 0)
                if pos < len(sentence):
                    raise CCGBankParseError(f'extra text from index {pos+1} - '
                                            f'{repr(sentence[pos:])}')
            except Exception as e:
                if not suppress_exceptions:
                    raise CCGBankParseError(sentence, str(e)) from e
                # A tree may already be bound when the trailing-text
                # check fails; a failed parse must be reported as None.
                tree = None
            trees.append(tree)
        return trees

    @staticmethod
    def _build_ccgtree(sentence: str, start: int) -> tuple[CCGTree, int]:
        """Recursively build the tree whose node starts at `start`.

        Parameters
        ----------
        sentence : str
            The full derivation string.
        start : int
            Index in `sentence` at which the node's ``(<`` begins.

        Returns
        -------
        tree, pos : tuple of CCGTree and int
            The parsed subtree and the index of the first character
            after it (including any trailing whitespace).

        Raises
        ------
        CCGBankParseError
            If the text at `start` is not a well-formed node, or an
            internal node is never closed by ``)``.

        """
        tree_match = CCGBankParser.tree_regex.match(sentence, pos=start)
        if not tree_match:
            raise CCGBankParseError(f'malformed tree from index {start+1} - '
                                    f'{repr(sentence[start:])}')

        ccg_str = tree_match['ccg_str']
        if ccg_str == r'((S[b]\NP)/NP)/':  # fix mistake in CCGBank
            ccg_str = r'(S[b]\NP)/NP'
        biclosed_type = str2biclosed(
            ccg_str, str2type=CCGBankParser._parse_atomic_type)

        pos = tree_match.end()
        if tree_match['is_leaf']:
            # The leaf regex has already consumed the closing ")" and
            # any whitespace after it.
            word = tree_match['word']
            word = CCGBankParser.escaped_words.get(word, word)
            return CCGTree(text=word, biclosed_type=biclosed_type), pos

        children = []
        while True:
            if pos >= len(sentence):
                # Without this guard a truncated derivation surfaces as
                # a bare IndexError.
                raise CCGBankParseError(
                    f'unclosed tree from index {start+1} - '
                    f'{repr(sentence[start:])}')
            if sentence[pos] == ')':
                break
            child, pos = CCGBankParser._build_ccgtree(sentence, pos)
            children.append(child)

        rule = CCGRule.infer_rule(
            Ty().tensor(*(child.biclosed_type for child in children)),
            biclosed_type)
        ccg_tree = CCGTree(rule=rule,
                           biclosed_type=biclosed_type,
                           children=children)

        # Step over the closing ")" and any whitespace after it, the
        # same way the leaf regex does with its trailing `\s*`.
        pos += 1
        while pos < len(sentence) and sentence[pos].isspace():
            pos += 1
        return ccg_tree, pos

    @staticmethod
    def _parse_atomic_type(cat: str) -> Ty:
        """Map an atomic CCGBank category string to a `CCGAtomicType`.

        Raises
        ------
        CCGBankParseError
            If `cat` is not matched by :py:attr:`ccg_type_regex`.

        """
        match = CCGBankParser.ccg_type_regex.fullmatch(cat)
        if not match:
            raise CCGBankParseError(f'failed to parse atomic type {repr(cat)}')

        # Drop any feature, e.g. "S[dcl]" -> "S"; categories with no
        # bare form (conj, brackets, punctuation) are kept unchanged.
        cat = match['bare_cat'] or cat
        if cat in ('N', 'NP'):
            return CCGAtomicType.NOUN
        if cat == 'S':
            return CCGAtomicType.SENTENCE
        if cat == 'PP':
            return CCGAtomicType.PREPOSITIONAL_PHRASE
        if cat == 'conj':
            return CCGAtomicType.CONJUNCTION
        # Everything else the regex accepts: LRB, RRB and , . : ;
        return CCGAtomicType.PUNCTUATION