""" Fuzzy algoriths abstract base classes."""
import warnings
from abc import ABC
from abc import abstractmethod
from typing import Generic
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing_extensions import Protocol
from typing_extensions import runtime_checkable
from iamsystem.fuzzy.util import IWords2ignore
from iamsystem.fuzzy.util import SimpleWords2ignore
from iamsystem.matcher.util import StateTransition
from iamsystem.tokenization.api import TokenT
# Synonym type. Ex: ('insuffisance','cardiaque')
SynType = Tuple[str, ...]
# Synonym type with an algorithm. Ex: (('insuffisance','cardiaque'), 'exact')
SynAlgo = Tuple[SynType, str]
# Synonym type with algorithms. Ex: (('insuffisance','cardiaque'), ['exact',...
SynAlgos = Tuple[SynType, List[str]]
@runtime_checkable
class ISynsProvider(Protocol[TokenT]):
"""Provides all the synonyms coming from fuzzy algorithms."""
@abstractmethod
def get_synonyms(
self,
tokens: Sequence[TokenT],
token: TokenT,
transitions: Iterable[StateTransition],
) -> List[SynAlgos]:
"""Retrieve the synonyms of a token.
:param tokens: the sequence of tokens of the document.
Useful when the fuzzy algorithm needs context, namely the tokens
around the token of interest given by 'i' parameter.
:param token: the token of this sequence for which synonyms
are expected.
:param transitions: the state transitions in which the algorithm
currently is. Useful is the fuzzy algorithm needs to know the next
or possible transitions.
:return: 0 to many synonyms.
"""
raise NotImplementedError
[docs]class FuzzyAlgo(Generic[TokenT], ABC):
"""Fuzzy Algorithm base class."""
NO_SYN: Iterable[SynType] = []
"Default value to return by a fuzzy algorithm if no synonym found."
def __init__(self, name: str):
"""Create a fuzzy algorithm to allow a partial match between
a text token and a keyword token.
:param name: algorithm's name.
"""
self.name = name
[docs] @staticmethod
def word_to_syn(word: str) -> SynType:
"""Utility function to transform a string to expected SynType.
:param word: a word synonym produced by the algorithm.
Ex: word='insuffisance' for token 'ins'.
:return: SynType, the expected output format.
"""
return tuple([word])
[docs] @staticmethod
def words_seq_to_syn(words: Sequence[str]) -> SynType:
"""Utility function to transform a sequence of string
to the expected output type.
:param words: a sequence of words produced by the algorithm.
Ex: words=['insuffisance', 'cardiaque'] for the token 'ic'.
:return: SynType, the expected output format.
"""
return tuple(words)
[docs] @abstractmethod
def get_synonyms(
self,
tokens: Sequence[TokenT],
token: TokenT,
transitions: Iterable[StateTransition],
) -> List[SynAlgo]:
"""Main API function to retrieve all synonyms provided by
a fuzzy algorithm.
:param tokens: the sequence of tokens of the document.
Useful when the fuzzy algorithm needs context, namely the tokens
around the token of interest given by 'i' parameter.
:param token: the token of this sequence for which synonyms
are expected.
:param transitions: the state transitions in which the algorithm
currently is. Useful is the fuzzy algorithm needs to know the next
or possible transitions.
:return: 0 to many synonyms (SynAlgo type).
"""
raise NotImplementedError
[docs]class ContextFreeAlgo(FuzzyAlgo[TokenT], ABC):
"""A :class:`~iamsystem.FuzzyAlgo` that doesn't take into account context,
only the current token."""
def __init__(self, name: str):
super().__init__(name)
[docs] def get_synonyms(
self,
tokens: Sequence[TokenT],
token: TokenT,
transitions: Iterable[StateTransition],
) -> List[SynAlgo]:
"""Delegate to get_syns_of_token."""
return [
(syn, self.name) for syn in self.get_syns_of_token(token=token)
]
[docs] @abstractmethod
def get_syns_of_token(self, token: TokenT) -> Iterable[SynType]:
"""Returns synonyms of this token."""
pass
class INormLabelAlgo(Protocol):
"""A fuzzy algorithm that relies only on a string."""
name: str
@abstractmethod
def get_syns_of_word(self, word: str) -> Iterable[SynType]:
"""Returns the synonym of this word"""
raise NotImplementedError
[docs]class NormLabelAlgo(ContextFreeAlgo[TokenT], INormLabelAlgo, ABC):
"""A :class:`~iamsystem.FuzzyAlgo`
that uses only the normalized label of a token.
These fuzzy algorithms can be put in cache to avoid calling
them multiple times.
See :class:`~iamsystem.CacheFuzzyAlgos`.
"""
[docs] def get_syns_of_token(self, token: TokenT) -> Iterable[SynType]:
"""Delegate to get_syns_of_word."""
return self.get_syns_of_word(word=token.norm_label)
[docs] @abstractmethod
def get_syns_of_word(self, word: str) -> Iterable[SynType]:
"""Returns synonyms of this word
(e.g. the normalized label of a token)."""
raise NotImplementedError
class StringDistance(NormLabelAlgo, ABC):
"""Class that computes a string distance between a token of a document
and the keywords' tokens."""
def __init__(
self,
name: str,
min_nb_char: int,
words2ignore: Optional[IWords2ignore] = None,
):
"""Create a string distance fuzzy algorithm.
:param name: string distance algorithm name.
:param min_nb_char: the minimum number of characters a word
must have in order not to be ignored.
"""
super().__init__(name)
self._min_nb_char = min_nb_char
self._tokens2ignore: IWords2ignore = (
words2ignore or SimpleWords2ignore()
)
@property
def min_nb_char(self):
"""The minimum number of characters a word must have
not to be ignored."""
return self._min_nb_char
@min_nb_char.setter
def min_nb_char(self, value: int):
"""Set the minimum number of characters a word must have."""
self._min_nb_char = value
def _is_a_word_to_ignore(self, word: str) -> bool:
"""Check if this word must be ignored."""
return len(
word
) < self._min_nb_char or self._tokens2ignore.is_word_2_ignore(word)
def add_words_to_ignore(self, words: Iterable[str]):
"""Add words that the algorithm must ignore: no string distance
will be computed."""
warnings.warn(
"Deprecated 'add_words_to_ignore': "
"pass these words in the constructor."
)
for word in words:
self._tokens2ignore.add_word(word)