# Copyright 2021-2024 Cambridge Quantum Computing Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Spacy Tokeniser
===============

A tokeniser that wraps SpaCy.

"""

from __future__ import annotations

__all__ = ['SpacyTokeniser']

from collections.abc import Iterable
from typing import TYPE_CHECKING

import spacy
import spacy.lang.en

from lambeq.core.utils import get_spacy_tokeniser
from lambeq.tokeniser import Tokeniser

if TYPE_CHECKING:
    import spacy.cli


class SpacyTokeniser(Tokeniser):
    """Tokeniser class based on SpaCy."""

    def __init__(self) -> None:
        # NOTE: assumed set-up. `tokenise_sentences` passes a `disable`
        # list, so `tokeniser` is taken to be the full spaCy pipeline
        # returned by `get_spacy_tokeniser`; `split_sentences` only
        # needs sentence boundaries, so a blank English pipeline with a
        # rule-based sentencizer suffices there.
        self.tokeniser = get_spacy_tokeniser()
        self.spacy_nlp = spacy.lang.en.English()
        self.spacy_nlp.add_pipe('sentencizer')

    def split_sentences(self, text: str) -> list[str]:
        """Split input text into a list of sentences.

        Parameters
        ----------
        text : str
            A single string that contains one or multiple sentences.

        Returns
        -------
        list of str
            List of sentences, one sentence in each string.

        """
        return [str(sent) for sent in self.spacy_nlp(text).sents]

    def tokenise_sentences(self,
                           sentences: Iterable[str]) -> list[list[str]]:
        """Tokenise a list of sentences.

        Parameters
        ----------
        sentences : list of str
            A list of untokenised sentences.

        Returns
        -------
        list of list of str
            A list of tokenised sentences, where each sentence is a
            list of tokens.

        """
        # Run only the tokenizer: every other pipeline component is
        # disabled for speed.
        disable = ['parser', 'tagger', 'ner', 'lemmatizer']
        tokenised = []
        for s in sentences:
            # Collapse repeated whitespace so it cannot surface as
            # whitespace tokens.
            s_cleaned = ' '.join(s.split())
            tokenised.append(
                [str(t) for t in self.tokeniser(s_cleaned, disable=disable)])
        return tokenised
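
# A minimal usage sketch (illustrative only, not part of the module):
# it shows the intended call pattern for `SpacyTokeniser`, assuming the
# spaCy model fetched by `get_spacy_tokeniser` is available locally.
if __name__ == '__main__':
    tokeniser = SpacyTokeniser()

    text = 'This is a sentence. This is (another) sentence!'
    sentences = tokeniser.split_sentences(text)
    print(sentences)
    # Expected: ['This is a sentence.', 'This is (another) sentence!']

    print(tokeniser.tokenise_sentences(sentences))
    # Expected: [['This', 'is', 'a', 'sentence', '.'],
    #            ['This', 'is', '(', 'another', ')', 'sentence', '!']]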