Source code for iamsystem.fuzzy.spellwise

""" Spellwise library wrapper."""
import warnings

from enum import Enum
from typing import Iterable
from typing import List
from typing import Optional
from typing import Union

from spellwise import CaverphoneOne
from spellwise import CaverphoneTwo
from spellwise import Editex
from spellwise import Levenshtein
from spellwise import Soundex
from spellwise import Typox
from typing_extensions import Protocol
from typing_extensions import TypedDict

from iamsystem.fuzzy.api import FuzzyAlgo
from iamsystem.fuzzy.api import StringDistance
from iamsystem.fuzzy.api import SynType
from iamsystem.fuzzy.util import IWords2ignore


class Suggestions(TypedDict):
    """Spellwise algorithm's output.

    Shape of one entry of the list returned by
    ``ISpellWiseAlgo.get_suggestions``.
    """

    # The suggested word; this is the key read by
    # ``SpellWiseWrapper.get_syns_of_word`` to build synonyms.
    word: str
    # Distance reported for this suggestion - presumably between the query
    # word and ``word``; confirm against the spellwise documentation.
    distance: int


class ESpellWiseAlgo(Enum):
    """Spellwise library algorithms that can be wrapped.

    Each member's value is the spellwise class itself (not an instance);
    ``SpellWiseWrapper`` instantiates it with no arguments. Member names are
    also the strings accepted (case-insensitively) by ``SpellWiseWrapper``.
    See spellwise documentation for more information on each algorithm.
    """

    LEVENSHTEIN = Levenshtein
    SOUNDEX = Soundex
    EDITEX = Editex
    TYPOX = Typox
    CAVERPHONE_1 = CaverphoneOne
    CAVERPHONE_2 = CaverphoneTwo
class ISpellWiseAlgo(Protocol):
    """Structural interface of a spellwise algorithm.

    Only the two methods used by ``SpellWiseWrapper`` are declared.
    """

    def add_words(self, words: List[str]) -> None:
        """Register ``words`` with the spellwise algorithm."""
        ...

    def get_suggestions(
        self, query_word: str, max_distance: int = 2
    ) -> List[Suggestions]:
        """Return the suggestions found for ``query_word``.

        :param query_word: the word to look up.
        :param max_distance: distance threshold (see spellwise
            documentation). Default 2.
        :return: a list of :class:`Suggestions` entries.
        """
        ...
class SpellWiseWrapper(StringDistance):
    """A :class:`~iamsystem.FuzzyAlgo` that wraps an algorithm from the
    spellwise library."""

    def __init__(
        self,
        measure: Union[str, ESpellWiseAlgo],
        max_distance: int,
        min_nb_char: int = 5,
        words2ignore: Optional[IWords2ignore] = None,
        name: Optional[str] = None,
    ):
        """Create an instance to take advantage of a spellwise algorithm.

        :param measure: The measure string (a member name of
            :class:`ESpellWiseAlgo`, case-insensitive) or a value selected
            from the :class:`ESpellWiseAlgo` enumerated list.
        :param max_distance: maximum edit distance
            (see spellwise documentation).
        :param min_nb_char: the minimum number of characters a word
            must have in order not to be ignored.
        :param words2ignore: words that must be ignored by the algorithm to
            avoid false positives, for example English vocabulary words.
        :param name: a name given to this algorithm.
            Default: spellwise algorithm's name.
        :raises KeyError: if ``measure`` is a string that is not the name
            of an :class:`ESpellWiseAlgo` member.
        """
        if isinstance(measure, str):
            try:
                measure = ESpellWiseAlgo[measure.upper()]
            except KeyError as err:
                # Same exception type as the plain enum lookup, but tell the
                # caller which names are actually accepted.
                valid = ", ".join(algo.name for algo in ESpellWiseAlgo)
                raise KeyError(
                    f"Unknown spellwise measure '{measure}'. "
                    f"Expected one of: {valid}."
                ) from err
        if name is None:
            name = measure.name
        super().__init__(
            name=name, min_nb_char=min_nb_char, words2ignore=words2ignore
        )
        # The enum value is the spellwise class; instantiate it here.
        self._suggester: ISpellWiseAlgo = measure.value()
        self._max_distance = max_distance

    @property
    def max_distance(self):
        """Maximum edit distance (see spellwise documentation)."""
        return self._max_distance

    @max_distance.setter
    def max_distance(self, value: int):
        """Set the maximum edit distance."""
        self._max_distance = value

    def add_words(self, words: Iterable[str], warn=False) -> None:
        """A list of possible word synonyms, in general all the tokens of
        your keywords.

        An easy way to provide these tokens is to call
        :py:meth:`~iamsystem.Matcher.get_keywords_unigrams` method after you
        added your keywords to the matcher instance. Words shorter than
        ``min_nb_char`` are not passed to the spellwise algorithm.

        :param words: A list of possible synonyms.
        :param warn: raise a warning if a word added is ignored.
            Default False.
        :return: None.
        """
        # Materialize once: ``words`` may be a one-shot iterable and its
        # length is needed to count how many words were dropped.
        words = list(words)
        words_filtered = [
            word for word in words if len(word) >= self.min_nb_char
        ]
        n_removed = len(words) - len(words_filtered)
        if n_removed and warn:
            warnings.warn(
                f"{n_removed} words weren't added to fuzzy algo '{self.name}'"
                f" after filtering",
                # Attribute the warning to the caller of add_words rather
                # than to this line.
                stacklevel=2,
            )
        self._suggester.add_words(words=words_filtered)

    def get_syns_of_word(self, word: str) -> Iterable[SynType]:
        """Compute string distance if it is not a word to be ignored and
        return keywords' unigrams in the maximum distance from that word.

        :param word: the word to find synonyms for.
        :return: ``FuzzyAlgo.NO_SYN`` if the word is ignored or spellwise
            returns no suggestion, otherwise one synonym per suggestion.
        """
        if self._is_a_word_to_ignore(word):
            return FuzzyAlgo.NO_SYN
        suggs: List[Suggestions] = self._suggester.get_suggestions(
            query_word=word, max_distance=self._max_distance
        )
        if not suggs:
            return FuzzyAlgo.NO_SYN
        return [self.word_to_syn(sugg.get("word", "")) for sugg in suggs]