Source code for iamsystem.tokenization.span

""" Classes that store a sequence of tokens. """
from typing import List

from iamsystem.tokenization.api import IOffsets
from iamsystem.tokenization.api import ISpan
from iamsystem.tokenization.api import TokenT
from iamsystem.tokenization.util import concat_tokens_label
from iamsystem.tokenization.util import concat_tokens_norm_label
from iamsystem.tokenization.util import get_span_seq_id
from iamsystem.tokenization.util import offsets_overlap


class Span(ISpan[TokenT], IOffsets):
    """A class that represents a sequence of tokens in a document."""

    def __init__(self, tokens: List[TokenT]):
        """Create a Span.

        :param tokens: an ordered continuous or discontinuous sequence
            of TokenT in a document. It must contain at least one token
            since the first and last tokens are read to set the offsets.
        :raises IndexError: if ``tokens`` is an empty list.
        """
        self._tokens = tokens
        # Attribute docstrings are placed *after* the assignment they
        # describe, which is where documentation tools look for them.
        self.start = self.tokens[0].start
        """The start offset of the first token."""
        self.end = self.tokens[-1].end
        """The end offset of the last token."""

    @property
    def tokens(self) -> List[TokenT]:
        """The tokens of the document that make up this span.

        :return: an ordered sequence of TokenT, a generic type
            that implements :class:`~iamsystem.IToken`.
        """
        return self._tokens

    @property
    def start_i(self):
        """The index of the first token within the parent document."""
        return self.tokens[0].i

    @property
    def end_i(self):
        """The index of the last token within the parent document."""
        return self.tokens[-1].i

    @property
    def tokens_label(self):
        """The concatenation of each token's label."""
        return concat_tokens_label(self._tokens)

    @property
    def tokens_norm_label(self):
        """The concatenation of each token's norm_label."""
        return concat_tokens_norm_label(self._tokens)

    def get_text_substring(self, text: str) -> str:
        """Return the substring of ``text`` between this span's offsets.

        :param text: the string to slice, sliced as
            ``text[self.start:self.end]``.
        :return: the characters from ``start`` (inclusive) to ``end``
            (exclusive).
        """
        return text[self.start : self.end]  # noqa

    def __str__(self) -> str:
        """A dataclass-like string representation of this span."""
        return (
            f"Span(tokens_label='{self.tokens_label}', "
            f"tokens_norm_label='{self.tokens_norm_label}',"
            f"start_i={self.start_i}, end_i={self.end_i}, "
            f"start={self.start}, end={self.end})"
        )


def is_shorter_span_of(a: Span, b: Span) -> bool:
    """Tell whether span ``a`` is a shorter span contained in span ``b``.

    :param a: the candidate shorter span.
    :param b: the candidate longer span.
    :return: False when ``a`` and ``b`` are the same object, when their
        offsets do not overlap, or when they have identical start and end
        offsets; otherwise whether the sequence id of ``a``'s tokens is
        found in the sequence id of ``b``'s tokens.
    """
    # A span is never shorter than itself, and two spans that do not
    # overlap cannot be nested.
    if a is b or not offsets_overlap(a=a, b=b):
        return False
    # Identical offsets give no way to decide which span to drop.
    # Ex: 'IRC' abbreviation is matched to two long forms that have the
    # same offsets.
    identical_offsets = a.start == b.start and a.end == b.end
    if identical_offsets:
        return False
    # The sequence id of b must contain the sequence id of a, for example:
    # 1) left: 'lung cancer' and 'lung'
    # 2) right: 'prostate cancer' and 'cancer'
    # 3) middle: 'prostate cancer undetermined' and 'cancer'
    return get_span_seq_id(a.tokens) in get_span_seq_id(b.tokens)