# Source code for iamsystem.fuzzy.api

"""Fuzzy algorithms abstract base classes."""
import warnings

from abc import ABC
from abc import abstractmethod
from typing import Generic
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple

from typing_extensions import Protocol
from typing_extensions import runtime_checkable

from iamsystem.fuzzy.util import IWords2ignore
from iamsystem.fuzzy.util import SimpleWords2ignore
from iamsystem.matcher.util import StateTransition
from iamsystem.tokenization.api import TokenT


# Synonym type. Ex: ('insuffisance','cardiaque')
SynType = Tuple[str, ...]

# Synonym type with an algorithm. Ex: (('insuffisance','cardiaque'), 'exact')
SynAlgo = Tuple[SynType, str]

# Synonym type with algorithms. Ex: (('insuffisance','cardiaque'), ['exact', ...])
SynAlgos = Tuple[SynType, List[str]]


@runtime_checkable
class ISynsProvider(Protocol[TokenT]):
    """Provides all the synonyms coming from fuzzy algorithms.

    This is a structural interface (a ``Protocol``): any object exposing a
    compatible ``get_synonyms`` method satisfies it. It is decorated with
    ``runtime_checkable`` so it can be used in ``isinstance`` checks.
    """

    @abstractmethod
    def get_synonyms(
        self,
        tokens: Sequence[TokenT],
        token: TokenT,
        transitions: Iterable[StateTransition],
    ) -> List[SynAlgos]:
        """Retrieve the synonyms of a token.

        :param tokens: the sequence of tokens of the document.
            Useful when the fuzzy algorithm needs context, namely the tokens
            around the token of interest given by the 'token' parameter.
        :param token: the token of this sequence for which synonyms
            are expected.
        :param transitions: the state transitions in which the algorithm
            currently is. Useful if the fuzzy algorithm needs to know the
            next or possible transitions.
        :return: 0 to many synonyms, each one paired with a list of
            algorithm strings (SynAlgos type).
        :raises NotImplementedError: always, in this default body;
            implementations are expected to override the method.
        """
        raise NotImplementedError


class FuzzyAlgo(Generic[TokenT], ABC):
    """Abstract base class that every fuzzy algorithm derives from."""

    NO_SYN: Iterable[SynType] = []
    "Default value to return by a fuzzy algorithm if no synonym found."

    def __init__(self, name: str):
        """Build a fuzzy algorithm, i.e. an algorithm allowing a partial
        match between a text token and a keyword token.

        :param name: the name identifying this algorithm.
        """
        self.name = name

    @staticmethod
    def word_to_syn(word: str) -> SynType:
        """Wrap a single word in the expected SynType format.

        :param word: a word synonym produced by the algorithm.
            Ex: word='insuffisance' for token 'ins'.
        :return: a one-element SynType containing this word.
        """
        return (word,)

    @staticmethod
    def words_seq_to_syn(words: Sequence[str]) -> SynType:
        """Convert a sequence of words to the expected SynType format.

        :param words: a sequence of words produced by the algorithm.
            Ex: words=['insuffisance', 'cardiaque'] for the token 'ic'.
        :return: a SynType holding the words in the same order.
        """
        syn: SynType = tuple(words)
        return syn

    @abstractmethod
    def get_synonyms(
        self,
        tokens: Sequence[TokenT],
        token: TokenT,
        transitions: Iterable[StateTransition],
    ) -> List[SynAlgo]:
        """Main API entry point: return every synonym this fuzzy algorithm
        provides for a token.

        :param tokens: the sequence of tokens of the document.
            Useful when the fuzzy algorithm needs context, namely the tokens
            around the token of interest.
        :param token: the token of this sequence for which synonyms
            are expected.
        :param transitions: the state transitions in which the algorithm
            currently is. Useful if the fuzzy algorithm needs to know the
            next or possible transitions.
        :return: 0 to many synonyms (SynAlgo type).
        :raises NotImplementedError: always, in this base implementation;
            subclasses must override the method.
        """
        raise NotImplementedError
class ContextFreeAlgo(FuzzyAlgo[TokenT], ABC):
    """A :class:`~iamsystem.FuzzyAlgo` that ignores the context and
    relies only on the current token."""

    def __init__(self, name: str):
        """Forward the algorithm's name to the parent constructor.

        :param name: algorithm's name.
        """
        super().__init__(name)

    def get_synonyms(
        self,
        tokens: Sequence[TokenT],
        token: TokenT,
        transitions: Iterable[StateTransition],
    ) -> List[SynAlgo]:
        """Delegate to get_syns_of_token and tag each synonym with this
        algorithm's name.

        :param tokens: the tokens of the document (not used here).
        :param token: the token for which synonyms are expected.
        :param transitions: the current state transitions (not used here).
        :return: one (synonym, algorithm name) pair per synonym returned
            by get_syns_of_token.
        """
        tagged: List[SynAlgo] = []
        for syn in self.get_syns_of_token(token=token):
            tagged.append((syn, self.name))
        return tagged

    @abstractmethod
    def get_syns_of_token(self, token: TokenT) -> Iterable[SynType]:
        """Returns synonyms of this token.

        :param token: the token for which synonyms are expected.
        :return: the synonyms; subclasses provide the implementation.
        """
class INormLabelAlgo(Protocol):
    """Interface of a fuzzy algorithm whose only input is a string."""

    # Name of the algorithm.
    name: str

    @abstractmethod
    def get_syns_of_word(self, word: str) -> Iterable[SynType]:
        """Returns the synonyms of this word.

        :param word: the string for which synonyms are expected.
        :return: 0 to many synonyms.
        :raises NotImplementedError: always, in this default body;
            implementations are expected to override the method.
        """
        raise NotImplementedError
class NormLabelAlgo(ContextFreeAlgo[TokenT], INormLabelAlgo, ABC):
    """A :class:`~iamsystem.FuzzyAlgo` whose only input is the normalized
    label of a token.

    These fuzzy algorithms can be put in cache to avoid calling them
    multiple times. See :class:`~iamsystem.CacheFuzzyAlgos`.
    """

    def get_syns_of_token(self, token: TokenT) -> Iterable[SynType]:
        """Delegate to get_syns_of_word with the token's normalized label.

        :param token: the token for which synonyms are expected.
        :return: whatever get_syns_of_word returns for ``token.norm_label``.
        """
        norm_label = token.norm_label
        return self.get_syns_of_word(word=norm_label)

    @abstractmethod
    def get_syns_of_word(self, word: str) -> Iterable[SynType]:
        """Returns synonyms of this word (e.g. the normalized label of a
        token).

        :param word: the string for which synonyms are expected.
        :return: 0 to many synonyms.
        :raises NotImplementedError: always, in this base implementation;
            subclasses must override the method.
        """
        raise NotImplementedError
class StringDistance(NormLabelAlgo, ABC):
    """Class that computes a string distance between a token of a document
    and the keywords' tokens."""

    def __init__(
        self,
        name: str,
        min_nb_char: int,
        words2ignore: Optional[IWords2ignore] = None,
    ):
        """Create a string distance fuzzy algorithm.

        :param name: string distance algorithm name.
        :param min_nb_char: the minimum number of characters a word must
            have in order not to be ignored.
        :param words2ignore: the words for which no string distance must be
            computed. Default to None: a new
            :class:`SimpleWords2ignore` instance is created.
        """
        super().__init__(name)
        self._min_nb_char = min_nb_char
        # Test against None explicitly rather than relying on truthiness:
        # an instance supplied by the caller must be kept even if it
        # happens to evaluate to False (e.g. an empty container-like object).
        self._tokens2ignore: IWords2ignore = (
            SimpleWords2ignore() if words2ignore is None else words2ignore
        )

    @property
    def min_nb_char(self) -> int:
        """The minimum number of characters a word must have not to be
        ignored."""
        return self._min_nb_char

    @min_nb_char.setter
    def min_nb_char(self, value: int) -> None:
        """Set the minimum number of characters a word must have."""
        self._min_nb_char = value

    def _is_a_word_to_ignore(self, word: str) -> bool:
        """Check if this word must be ignored.

        :param word: the word to check.
        :return: True if the word is shorter than ``min_nb_char`` or if it
            is reported as a word to ignore by the words2ignore object.
        """
        return len(
            word
        ) < self._min_nb_char or self._tokens2ignore.is_word_2_ignore(word)

    def add_words_to_ignore(self, words: Iterable[str]) -> None:
        """Add words that the algorithm must ignore: no string distance
        will be computed.

        Deprecated: pass these words in the constructor instead. A warning
        is emitted on each call.

        :param words: the words to ignore.
        """
        # stacklevel=2 attributes the warning to the caller's line rather
        # than to this library line, so users can locate the deprecated call.
        warnings.warn(
            "Deprecated 'add_words_to_ignore': "
            "pass these words in the constructor.",
            stacklevel=2,
        )
        for word in words:
            self._tokens2ignore.add_word(word)