Source code for iamsystem.spacy.component

""" Custom spaCy components to add iamsystem algorithm in a spaCy pipeline."""

import importlib

from abc import ABC
from abc import abstractmethod
from typing import Any
from typing import Dict
from typing import Iterable
from typing import List

from spacy import Language
from spacy.tokens import Doc
from spacy.tokens import Span

from iamsystem.fuzzy.api import FuzzyAlgo
from iamsystem.keywords.api import IEntity
from iamsystem.keywords.api import IKeyword
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IMatcher
from iamsystem.matcher.matcher import Matcher
from iamsystem.spacy.token import TokenSpacyAdapter
from iamsystem.spacy.tokenizer import SpacyTokenizer
from iamsystem.stopwords.api import IStopwords
from iamsystem.tokenization.api import ITokenizer
from iamsystem.tokenization.normalize import lower_no_accents
from iamsystem.tokenization.normalize import normalizeFun


class BaseCustomComp(ABC):
    """Base class to build a custom spaCy component."""

    def __init__(self, nlp: Language, name: str, attr: str = "iamsystem"):
        """Create a custom spaCy component.

        :param nlp: a spacy Language.
        :param name: the name of this spaCy component.
        :param attr: the attribute that stores iamsystem's annotations
            in a spaCy span instance.
        """

        self.name = name
        self.nlp = nlp
        self.attr = attr
        self._set_extensions()

    @property
    @abstractmethod
    def matcher(self) -> IMatcher[TokenSpacyAdapter]:
        """A :class:`~iamsystem.Matcher`.
        This method is abstract to allow several ways to make a dependency
        injection of a matcher in this class.
        """
        raise NotImplementedError

    def _set_extensions(self) -> None:
        """Add an extension to the spaCy Span class to store iamsystem's
        annotations."""
        if not Span.has_extension(self.attr):
            Span.set_extension(self.attr, default=None)

    def __call__(self, doc: Doc) -> Doc:
        """Function called by spaCy to execute this component."""
        spans = self.process(doc)
        if self.attr not in doc.spans:
            doc.spans[self.attr] = spans
        return doc

    def process(self, doc) -> List[Span]:
        """Annotate a document. Call IAMsystem algorithm."""
        tokens = self.matcher.tokenize(text=doc)
        anns: List[IAnnotation[TokenSpacyAdapter]] = self.matcher.annot_tokens(
            tokens=tokens
        )
        spacy_spans = []
        for ann in anns:
            start_i = ann.start_i
            end_i = ann.end_i
            # 'A label to attach to the span, e.g. for named entities.'
            labels = [str(keyword) for keyword in ann.keywords]
            label = ";".join(labels)
            # 'A knowledge base ID to attach to the span for named entities.'
            entities = [kw for kw in ann.keywords if isinstance(kw, IEntity)]
            if len(entities) != 0:
                kbids = [entity.kb_id for entity in entities]
                kb_id = ";".join(kbids)
                span = Span(
                    doc=doc, start=start_i, end=end_i, label=label, kb_id=kb_id
                )
            else:
                span = Span(doc=doc, start=start_i, end=end_i, label=label)
            span._.set(self.attr, ann)
            spacy_spans.append(span)
        return spacy_spans


[docs]@Language.factory("iamsystem")
class IAMsystemSpacy(BaseCustomComp):
    """A stateful component.
    'Component factories are callables that take settings and return a pipeline
    component function. This is useful if your component is stateful and if
    you need to customize their creation'.
    See: https://spacy.io/usage/processing-pipelines#custom-components
    """

[docs]    def __init__(
        self,
        nlp: Language,
        name: str,
        keywords: Iterable[IKeyword],
        fuzzy_algos: Iterable[FuzzyAlgo],
        w: int = 1,
        remove_nested_annots: bool = True,
        stopwords: IStopwords[TokenSpacyAdapter] = None,
        norm_fun: normalizeFun = lower_no_accents,
        attr: str = "iamsystem",
    ):
        """
        Create a custom spaCy component.
        :class:`~iamsystem.Matcher` uses spaCy tokenizer to tokenize the
        documents and the keywords.

        :param nlp: a spacy Language.
        :param name: the name of this spaCy component.
        :param keywords: a list of :class:`~iamsystem.IKeywords` to detect
          in a document.
        :param fuzzy_algos: a list of :class:`~iamsystem.FuzzyAlgo`.
        :param w: :class:`~iamsystem.Matcher`'s window parameter.
        :param remove_nested_annots: whether to remove nested annotations.
        :param stopwords: :class:`~iamsystem.IStopwords` instance.
        :param norm_fun: a function that normalizes the 'norm\\_' attribute
            of a spaCy token, attribute used by iamsystem.
        :param attr: the attribute to store iamsystem's annotation in a spaCy
            span instance.
        """

        super().__init__(
            nlp=nlp,
            name=name,
            attr=attr,
        )
        tokenizer: ITokenizer[TokenSpacyAdapter] = SpacyTokenizer(
            nlp=nlp, norm_fun=norm_fun
        )
        self._matcher = Matcher(tokenizer=tokenizer, stopwords=stopwords)
        self._matcher.remove_nested_annots = remove_nested_annots
        self._matcher.w = w
        for algo in fuzzy_algos:
            self._matcher.add_fuzzy_algo(fuzzy_algo=algo)
        self._matcher.add_keywords(keywords=keywords)

    @property
    def matcher(self) -> IMatcher[TokenSpacyAdapter]:
        """A matcher that uses spaCy tokenizer."""
        return self._matcher


[docs]@Language.factory("iamsystem_matcher")
class IAMsystemBuildSpacy(BaseCustomComp):
    """A serializable custom component."""

[docs]    def __init__(
        self,
        nlp: Language,
        name: str,
        build_params: Dict[Any, Any],
        serialized_kw: Dict[Any, Any] = None,
        attr: str = "iamsystem",
        norm_fun: normalizeFun = None,
    ):
        """
        Create a custom spaCy component.
        :class:`~iamsystem.Matcher` uses spaCy tokenizer to tokenize the
        documents and the keywords.

        :param nlp: a spacy Language.
        :param name: the name of this spaCy component.
        :param attr: the attribute to store iamsystem's annotation in a spaCy
            span instance.
        :param serialized_kw: a way to import serialized keywords.
            A dictionary containing 3 fields:

            - 'module': module name of the class to import. ex: 'iamsystem'.
            - 'class_name': the Keyword class to import.
            - 'kw': an iterable of dict created with the asdict() function.

            If None, keywords are expected in 'build_params'.
        :param norm_fun: a function that normalizes the 'norm\\_' attribute
            of a spaCy token, attribute used by iamsystem. Default to lower
            case and remove accents.
        :param build_params: :py:meth:`~iamsystem.Matcher.build` parameters,
            the spacy tokenizer will be used whatever the tokenizer value.
        """

        super().__init__(
            nlp=nlp,
            name=name,
            attr=attr,
        )
        if norm_fun is None:
            norm_fun = lower_no_accents
        tokenizer: ITokenizer[TokenSpacyAdapter] = SpacyTokenizer(
            nlp=nlp, norm_fun=norm_fun
        )
        build_params["tokenizer"] = tokenizer
        # import serializable keywords:
        if serialized_kw is not None:
            KeywordClass = getattr(
                importlib.import_module(serialized_kw["module"]),
                serialized_kw["class_name"],
            )
            kws = [KeywordClass(**params) for params in serialized_kw["kws"]]
            build_params["keywords"] = kws
        self._matcher = Matcher.build(**build_params)

    @property
    def matcher(self) -> IMatcher[TokenSpacyAdapter]:
        """A matcher that uses spaCy tokenizer."""
        return self._matcher