# Source code for iamsystem.fuzzy.spellwise

""" Spellwise library wrapper."""
import warnings

from enum import Enum
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional

from spellwise import CaverphoneOne
from spellwise import CaverphoneTwo
from spellwise import Editex
from spellwise import Levenshtein
from spellwise import Soundex
from spellwise import Typox
from typing_extensions import Protocol
from typing_extensions import TypedDict

from iamsystem.fuzzy.api import FuzzyAlgo
from iamsystem.fuzzy.api import NormLabelAlgo
from iamsystem.fuzzy.api import SynType


class Suggestions(TypedDict):
    """Spellwise algorithm's output.

    Typed shape of a single suggestion entry, i.e. one element of the
    list returned by :py:meth:`ISpellWiseAlgo.get_suggestions`.
    """

    # The suggested word; this is the key read by
    # SpellWiseWrapper.get_syns_of_word to build synonyms.
    word: str
    # NOTE(review): presumably the distance between the query word and
    # this suggestion as computed by the spellwise algorithm - confirm
    # against the spellwise documentation.
    distance: int


class ESpellWiseAlgo(Enum):
    """Enumerated list of spellwise library algorithms.

    The value of each member is the spellwise class itself (not an
    instance); :class:`SpellWiseWrapper` calls ``member.value()`` to
    create the algorithm it wraps.

    See spellwise documentation for more information.
    """

    LEVENSHTEIN = Levenshtein
    SOUNDEX = Soundex
    EDITEX = Editex
    TYPOX = Typox
    CAVERPHONE_1 = CaverphoneOne
    CAVERPHONE_2 = CaverphoneTwo
class ISpellWiseAlgo(Protocol):
    """Spellwise algorithm interface.

    Structural type describing the two methods that
    :class:`SpellWiseWrapper` calls on the wrapped spellwise object.
    """

    def add_words(self, words: List[str]) -> None:
        """Add words to a spellwise algorithm.

        :param words: the words to add.
        :return: None.
        """
        ...

    def get_suggestions(
        self, query_word: str, max_distance: int = 2
    ) -> List[Suggestions]:
        """Returns synonyms.

        :param query_word: the word to find suggestions for.
        :param max_distance: maximum distance
            (see spellwise documentation).
        :return: a list of :class:`Suggestions` entries.
        """
        ...
class SpellWiseWrapper(NormLabelAlgo):
    """A :class:`~iamsystem.FuzzyAlgo` that wraps an algorithm from the
    spellwise library."""

    def __init__(
        self,
        spellwise_algo: ESpellWiseAlgo,
        max_distance: int,
        min_nb_char: int = 5,
        name: Optional[str] = None,
    ) -> None:
        """Create an instance to leverage a spellwise algorithm.

        :param spellwise_algo: A value from :class:`ESpellWiseAlgo`
            enumerated list.
        :param max_distance: maximum edit distance
            (see spellwise documentation).
        :param min_nb_char: the minimum number of characters a word
            must have not to be ignored.
        :param name: a name given to this algorithm.
            Default: spellwise algorithm's name.
        """
        # Fall back on the enum member's name when no name is given.
        algo_name = spellwise_algo.name if name is None else name
        super().__init__(algo_name)
        # A dict used as a set of words: only the keys matter.
        self._tokens2ignore: Dict[str, None] = {}
        # The enum value is the spellwise class; instantiate it here.
        self._suggester: ISpellWiseAlgo = spellwise_algo.value()
        self._max_distance = max_distance
        self._min_nb_char = min_nb_char

    @property
    def max_distance(self) -> int:
        """Maximum edit distance (see spellwise documentation)."""
        return self._max_distance

    @max_distance.setter
    def max_distance(self, value: int) -> None:
        """Set the maximum edit distance."""
        self._max_distance = value

    @property
    def min_nb_char(self) -> int:
        """The minimum number of characters a word must have not to be
        ignored."""
        return self._min_nb_char

    @min_nb_char.setter
    def min_nb_char(self, value: int) -> None:
        """Set the minimum number of characters a word must have."""
        self._min_nb_char = value

    def add_words(self, words: Iterable[str], warn: bool = False) -> None:
        """A list of possible word synonyms, in general all the tokens of
        your keywords.

        An easy way to provide these tokens is to call
        :py:meth:`~iamsystem.Matcher.get_keywords_unigrams` method after
        you added your keywords to the matcher instance.

        Words that are too short (fewer than ``min_nb_char`` characters)
        or that were registered with :py:meth:`add_words_to_ignore` are
        filtered out before being passed to the spellwise algorithm.

        :param words: A list of possible synonyms.
        :param warn: raise a warning if a word added is ignored.
            Default False.
        :return: None.
        """
        # Materialize once: 'words' may be a one-shot iterator and its
        # length is needed to count how many words were filtered out.
        all_words = list(words)
        words_filtered = [
            word
            for word in all_words
            if not self._is_a_word_to_ignore(word=word)
        ]
        n_removed = len(all_words) - len(words_filtered)
        if warn and n_removed != 0:
            # stacklevel=2 attributes the warning to the caller of
            # add_words rather than to this line of the library.
            warnings.warn(
                f"{n_removed} words weren't added to fuzzy algo '{self.name}'"
                f" after filtering",
                stacklevel=2,
            )
        self._suggester.add_words(words=words_filtered)

    def add_words_to_ignore(self, words: Iterable[str]) -> None:
        """Add words that the algorithm will ignore: no string distance
        will be computed.

        :param words: the words to ignore.
        :return: None.
        """
        for word in words:
            self._tokens2ignore[word] = None

    def _is_a_word_to_ignore(self, word: str) -> bool:
        """Check if this word must be ignored right away.

        :param word: the word to check.
        :return: True if the word has fewer than ``min_nb_char``
            characters or is one of the words to ignore.
        """
        return len(word) < self._min_nb_char or word in self._tokens2ignore

    def get_syns_of_word(self, word: str) -> Iterable[SynType]:
        """Returns closest words if the word is not a word to ignore.

        :param word: the word to find synonyms for.
        :return: ``FuzzyAlgo.NO_SYN`` if the word is ignored or if the
            spellwise algorithm has no suggestion, otherwise one synonym
            per suggested word.
        """
        if self._is_a_word_to_ignore(word):
            return FuzzyAlgo.NO_SYN
        suggs: List[Suggestions] = self._suggester.get_suggestions(
            query_word=word, max_distance=self._max_distance
        )
        if not suggs:
            return FuzzyAlgo.NO_SYN
        return [self.word_to_syn(sugg.get("word", "")) for sugg in suggs]