# Source code for iamsystem.spacy.tokenizer

""" Tokenize keywords and documents using spaCy tokenizer. """


from typing import Sequence

from spacy import Language

from iamsystem.spacy.token import TokenSpacyAdapter
from iamsystem.tokenization.api import ITokenizer
from iamsystem.tokenization.normalize import normalizeFun


class SpacyTokenizer(ITokenizer[TokenSpacyAdapter]):
    """Tokenizer for the iamsystem algorithm backed by a spaCy pipeline."""

    def __init__(self, nlp: Language, norm_fun: normalizeFun):
        """Store the spaCy pipeline and the normalization function.

        :param nlp: the spaCy ``Language`` object used to tokenize text.
        :param norm_fun: function passed to every ``TokenSpacyAdapter``
            built by this tokenizer; it normalizes the ``norm_`` attribute
            of a spaCy token, which is the attribute the iamsystem
            algorithm works with.
        """
        self.nlp = nlp
        self.norm_fun = norm_fun

    def tokenize(self, text: str) -> Sequence[TokenSpacyAdapter]:
        """Split ``text`` into tokens wrapped for iamsystem.

        The matcher uses this method only for its keywords: documents
        reach the custom spaCy component already tokenized.

        :param text: the string to run through the spaCy pipeline.
        :return: the tokens of ``text`` in document order, each wrapped in
            a ``TokenSpacyAdapter`` carrying this tokenizer's ``norm_fun``.
        """
        # Process the text with the pipeline narrowed by
        # select_pipes(enable="tokenizer"); the context manager undoes the
        # selection when the block exits.
        with self.nlp.select_pipes(enable="tokenizer"):
            return [
                TokenSpacyAdapter(spacy_token=spacy_tok, norm_fun=self.norm_fun)
                for spacy_tok in self.nlp(text)
            ]