Source code for iamsystem.tokenization.token

""" Tokenization outputs."""

from iamsystem.tokenization.api import IOffsets
from iamsystem.tokenization.api import IToken


class Offsets(IOffsets):
    """Character span of a token, held as a (start, end) pair of offsets."""

    def __init__(self, start: int, end: int):
        """Record the boundaries of the span.

        :param start: start-offset, the index of the first character.
        :param end: end-offset, the index of the last character **+ 1**,
            i.e. the first character left out of the substring obtained
            when slicing with [start:end].
        """
        # Single tuple assignment; attributes are still set in start, end order.
        self.start, self.end = start, end

    def __str__(self):
        """Render the offsets in a dataclass-like ``Name(field=value)`` form."""
        rendered = f"Offsets(start={self.start}, end={self.end})"
        return rendered


class Token(Offsets, IToken):
    """A token: its label, its normalized label, its offsets and its index."""

    def __init__(
        self, start: int, end: int, label: str, norm_label: str, i: int
    ):
        """Build a token.

        :param start: start-offset, the index of the first character.
        :param end: end-offset, the index of the last character **+ 1**,
            i.e. the first character left out of the substring obtained
            when slicing with [start:end].
        :param label: the label exactly as written in the document/keyword.
        :param norm_label: the normalized label (used by iamsystem's
            algorithm to perform entity linking).
        :param i: the index of the token within the parent document.
        """
        # Offsets are stored by the parent initializer before the
        # token-specific attributes are set.
        super().__init__(start, end)
        self.label, self.norm_label, self.i = label, norm_label, i

    def __str__(self):
        """Render the token in a dataclass-like ``Name(field=value)`` form."""
        rendered = (
            f"Token(label='{self.label}', norm_label='{self.norm_label}',"
            f" start={self.start}, end={self.end}, i={self.i})"
        )
        return rendered