Source code for iamsystem.spacy.component

""" Custom spaCy components to add iamsystem algorithm in a spaCy pipeline."""

import importlib

from abc import ABC
from abc import abstractmethod
from typing import Any
from typing import Dict
from typing import Iterable
from typing import List

from spacy import Language
from spacy.tokens import Doc
from spacy.tokens import Span

from iamsystem.fuzzy.api import FuzzyAlgo
from iamsystem.keywords.api import IEntity
from iamsystem.keywords.api import IKeyword
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IMatcher
from iamsystem.matcher.matcher import Matcher
from iamsystem.spacy.token import TokenSpacyAdapter
from iamsystem.spacy.tokenizer import SpacyTokenizer
from iamsystem.stopwords.api import IStopwords
from iamsystem.tokenization.api import ITokenizer
from iamsystem.tokenization.normalize import lower_no_accents
from iamsystem.tokenization.normalize import normalizeFun


class BaseCustomComp(ABC):
    """Base class to build a custom spaCy component."""

    def __init__(self, nlp: Language, name: str, attr: str = "iamsystem"):
        """Create a custom spaCy component.

        :param nlp: a spacy Language.
        :param name: the name of this spaCy component.
        :param attr: the attribute that stores iamsystem's annotations
            in a spaCy span instance.
        """

        self.name = name
        self.nlp = nlp
        self.attr = attr
        self._set_extensions()

    @property
    @abstractmethod
    def matcher(self) -> IMatcher[TokenSpacyAdapter]:
        """A :class:`~iamsystem.Matcher`.
        This method is abstract to allow several ways to make a dependency
        injection of a matcher in this class.
        """
        raise NotImplementedError

    def _set_extensions(self) -> None:
        """Add an extension to the spaCy Span class to store iamsystem's
        annotations."""
        if not Span.has_extension(self.attr):
            Span.set_extension(self.attr, default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Function called by spaCy to execute this component."""
        if self.attr not in doc.spans:
            doc.spans[self.attr] = []
        iam_spans = self.process(doc)
        for span in iam_spans:
            doc.spans[self.attr].append(span)
        return doc

    def process(self, doc) -> List[Span]:
        """Annotate a document. Call IAMsystem algorithm."""
        tokens = self.matcher.tokenize(text=doc)
        anns: List[IAnnotation[TokenSpacyAdapter]] = self.matcher.annot_tokens(
            tokens=tokens
        )
        spacy_spans = []
        for ann in anns:
            start_i = ann.start_i
            end_i = ann.end_i
            # 'A label to attach to the span, e.g. for named entities.'
            labels = [str(keyword) for keyword in ann.keywords]
            label = ";".join(labels)
            # 'A knowledge base ID to attach to the span for named entities.'
            entities = [kw for kw in ann.keywords if isinstance(kw, IEntity)]
            if len(entities) != 0:
                kbids = [entity.kb_id for entity in entities]
                kb_id = ";".join(kbids)
                span = Span(
                    doc=doc, start=start_i, end=end_i, label=label, kb_id=kb_id
                )
            else:
                span = Span(doc=doc, start=start_i, end=end_i, label=label)
            span._.set(self.attr, ann)
            spacy_spans.append(span)
        return spacy_spans


[docs]@Language.factory("iamsystem") class IAMsystemSpacy(BaseCustomComp): """A stateful component. 'Component factories are callables that take settings and return a pipeline component function. This is useful if your component is stateful and if you need to customize their creation'. See: https://spacy.io/usage/processing-pipelines#custom-components """
[docs] def __init__( self, nlp: Language, name: str, keywords: Iterable[IKeyword], fuzzy_algos: Iterable[FuzzyAlgo], w: int = 1, remove_nested_annots: bool = True, stopwords: IStopwords[TokenSpacyAdapter] = None, norm_fun: normalizeFun = lower_no_accents, attr: str = "iamsystem", ): """ Create a custom spaCy component. :class:`~iamsystem.Matcher` uses spaCy tokenizer to tokenize the documents and the keywords. :param nlp: a spacy Language. :param name: the name of this spaCy component. :param keywords: a list of :class:`~iamsystem.IKeywords` to detect in a document. :param fuzzy_algos: a list of :class:`~iamsystem.FuzzyAlgo`. :param w: :class:`~iamsystem.Matcher`'s window parameter. :param remove_nested_annots: whether to remove nested annotations. :param stopwords: :class:`~iamsystem.IStopwords` instance. :param norm_fun: a function that normalizes the 'norm\\_' attribute of a spaCy token, attribute used by iamsystem. :param attr: the attribute to store iamsystem's annotation in a spaCy span instance. """ super().__init__( nlp=nlp, name=name, attr=attr, ) tokenizer: ITokenizer[TokenSpacyAdapter] = SpacyTokenizer( nlp=nlp, norm_fun=norm_fun ) self._matcher = Matcher(tokenizer=tokenizer, stopwords=stopwords) self._matcher.remove_nested_annots = remove_nested_annots self._matcher.w = w for algo in fuzzy_algos: self._matcher.add_fuzzy_algo(fuzzy_algo=algo) self._matcher.add_keywords(keywords=keywords)
@property def matcher(self) -> IMatcher[TokenSpacyAdapter]: """A matcher that uses spaCy tokenizer.""" return self._matcher
[docs]@Language.factory("iamsystem_matcher") class IAMsystemBuildSpacy(BaseCustomComp): """A serializable custom component."""
[docs] def __init__( self, nlp: Language, name: str, build_params: Dict[Any, Any], serialized_kw: Dict[Any, Any] = None, attr: str = "iamsystem", norm_fun: normalizeFun = None, ): """ Create a custom spaCy component. :class:`~iamsystem.Matcher` uses spaCy tokenizer to tokenize the documents and the keywords. :param nlp: a spacy Language. :param name: the name of this spaCy component. :param attr: the attribute to store iamsystem's annotation in a spaCy span instance. :param serialized_kw: a way to import serialized keywords. A dictionary containing 3 fields: - 'module': module name of the class to import. ex: 'iamsystem'. - 'class_name': the Keyword class to import. - 'kw': an iterable of dict created with the asdict() function. If None, keywords are expected in 'build_params'. :param norm_fun: a function that normalizes the 'norm\\_' attribute of a spaCy token, attribute used by iamsystem. Default to lower case and remove accents. :param build_params: :py:meth:`~iamsystem.Matcher.build` parameters, the spacy tokenizer will be used whatever the tokenizer value. """ super().__init__( nlp=nlp, name=name, attr=attr, ) if norm_fun is None: norm_fun = lower_no_accents tokenizer: ITokenizer[TokenSpacyAdapter] = SpacyTokenizer( nlp=nlp, norm_fun=norm_fun ) build_params["tokenizer"] = tokenizer # import serializable keywords: if serialized_kw is not None: KeywordClass = getattr( importlib.import_module(serialized_kw["module"]), serialized_kw["class_name"], ) kws = [KeywordClass(**params) for params in serialized_kw["kws"]] build_params["keywords"] = kws self._matcher = Matcher.build(**build_params)
@property def matcher(self) -> IMatcher[TokenSpacyAdapter]: """A matcher that uses spaCy tokenizer.""" return self._matcher