# Source code for iamsystem.tokenization.token

""" Tokenization outputs."""

from iamsystem.tokenization.api import IOffsets
from iamsystem.tokenization.api import IToken



# [docs]
class Offsets(IOffsets):
    """Store the start and end offsets of a token."""

    def __init__(self, start: int, end: int) -> None:
        """Create an offsets pair.

        :param start: start-offset is the index of the first character.
        :param end: end-offset is the index of the last character **+ 1**, that
            is to say the first character to exclude from the returned
            substring when slicing with [start:end]
        """
        self.start = start
        self.end = end

    def __str__(self) -> str:
        """Return a dataclass-style string representation of the offsets."""
        return f"Offsets(start={self.start}, end={self.end})"




# [docs]
class Token(Offsets, IToken):
    """Store the label, normalized label, start and end offsets of a token."""

    def __init__(
        self, start: int, end: int, label: str, norm_label: str, i: int
    ) -> None:
        """Create a token.

        :param start: start-offset is the index of the first character.
        :param end: end-offset is the index of the last character **+ 1**, that
            is to say the first character to exclude from the returned
            substring when slicing with [start:end]
        :param label: the label as it is in the document/keyword.
        :param norm_label: the normalized label (used by iamsystem's algorithm
            to perform entity linking).
        :param i: the index of the token within the parent document.
        """
        # The offsets are stored by the parent initializer.
        super().__init__(start, end)
        self.label = label
        self.norm_label = norm_label
        self.i = i

    def __str__(self) -> str:
        """Return a dataclass-style string representation of the token."""
        return (
            f"Token(label='{self.label}', norm_label='{self.norm_label}',"
            f" start={self.start}, end={self.end}, i={self.i})"
        )