Source code for iamsystem.tokenization.tokenize

""" Tokenizer implementations. """

import re

from typing import Callable
from typing import Iterable
from typing import List
from typing import Sequence

from iamsystem.stopwords.api import IStopwords
from iamsystem.tokenization.api import IOffsets
from iamsystem.tokenization.api import ITokenizer
from iamsystem.tokenization.api import TokenT
from iamsystem.tokenization.normalize import lower_no_accents
from iamsystem.tokenization.normalize import normalizeFun
from iamsystem.tokenization.token import Offsets
from iamsystem.tokenization.token import Token


splitFun = Callable[[str], Iterable[IOffsets]]


[docs]def split_find_iter_closure(pattern: str) -> splitFun: """Build a split function that maps a document to (start, end) tuples. :param pattern: a regex to split sentence characters. :return: a split function. """ pattern = pattern r = re.compile(pattern) def split(text: str) -> Iterable[IOffsets]: """Split the text with a regular expression""" match_iter = r.finditer(text) tokens = map( lambda match: Offsets(start=match.start(), end=match.end()), match_iter, ) return tokens return split
split_alpha_num = split_find_iter_closure(pattern=r"\w+")
[docs]class TokenizerImp(ITokenizer[Token]): """A :class:`~iamsystem.ITokenizer` implementation. Class responsible for the tokenization, normalization of tokens. See also :func:`~iamsystem.french_tokenizer`, :func:`~iamsystem.english_tokenizer`. """
[docs] def __init__(self, split: splitFun, normalize: normalizeFun): """Create a custom tokenizer that splits and normalizes a string. :param split: a function that split a text into (start,end) tuples. This function must return an iterable of :class:`~iamsystem.IOffsets` . See also :func:`~iamsystem.split_find_iter_closure`. :param normalize: a function that normalizes a string. This function must return a string. """ self.split = split self.normalize = normalize
[docs] def tokenize(self, text: str) -> Sequence[Token]: """Split the text into a sequence of :class:`~iamsystem.Token`.""" offsets: Iterable[IOffsets] = self.split(text) tokens: List[Token] = [ Token( start=offset.start, end=offset.end, label=text[offset.start : offset.end], # noqa norm_label=self.normalize( text[offset.start : offset.end] # noqa ), i=i, ) for i, offset in enumerate(offsets) ] return tokens
def remove_stopwords( tokens: Sequence[TokenT], stopwords=IStopwords[TokenT] ) -> Sequence[TokenT]: """Utility function to filter stopwords from a sequence of tokens""" tokens_wo_stop: Sequence[TokenT] = tuple( filter(lambda token: not stopwords.is_token_a_stopword(token), tokens) ) return tokens_wo_stop
[docs]def french_tokenizer() -> TokenizerImp: """An opinionated French tokenizer. | It splits the text by 'word' character. | It normalizes by lowercasing and unicode normalization form. :return: a :class:`~iamsystem.TokenizerImp` implementation. """ return TokenizerImp(split=split_alpha_num, normalize=lower_no_accents)
[docs]def english_tokenizer() -> TokenizerImp: """An opinionated English tokenizer. | It splits the text by 'word' character. | It normalizes by lowercasing. :return: a :class:`~iamsystem.TokenizerImp` implementation. """ return TokenizerImp(split=split_alpha_num, normalize=lambda s: s.lower())
tokenize_fun = Callable[[str], Sequence[TokenT]]
[docs]def tokenize_and_order_decorator(tokenize: tokenize_fun) -> tokenize_fun: """Decorate a tokenize function: the tokens are sorted alphabetically by their label. :param tokenize: a tokenize function to decorate. :return: the decorated tokenize function. """ def tokenize_and_order(text: str) -> List[TokenT]: """tokenize and sort.""" tokens = list(tokenize(text)) tokens.sort(key=lambda x: x.norm_label, reverse=False) return tokens return tokenize_and_order