# Source code for iamsystem.brat.formatter

from enum import Enum
from typing import Tuple

from iamsystem.brat.util import get_brat_format_seq
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IBratFormatter
from iamsystem.tokenization.util import get_text_and_offsets_of_sequences
from iamsystem.tokenization.util import group_continuous_seq
from iamsystem.tokenization.util import remove_trailing_stopwords


class ContSeqFormatter(IBratFormatter):
    """Default Brat Formatter: annotate a document by selecting continuous
    sequences of tokens but ignore stopwords."""

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return tokens' labels and tokens' offsets (merge if continuous).

        :param annot: the annotation to format. Only ``annot.tokens`` are
            grouped; ``annot.stop_tokens`` are not looked at here.
        :return: the (text, offsets) pair built by
            ``get_text_and_offsets_of_sequences`` from the grouped tokens.
        """
        # Group the annotation's tokens into continuous sequences first, then
        # let the shared helper turn those sequences into text and offsets.
        sequences = group_continuous_seq(tokens=annot.tokens)
        return get_text_and_offsets_of_sequences(
            sequences=sequences, annot=annot
        )


class TokenFormatter(IBratFormatter):
    """Annotate a document by creating (start,end) offsets for each token
    (In comparison to ContSeqFormatter, it doesn't merge continuous
    sequences)."""

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return tokens' labels and tokens' offsets (no merging).

        :param annot: the annotation to format; only ``annot.tokens`` is read.
        :return: a tuple (labels, offsets) where labels are the tokens' labels
            joined by a single space and offsets is the value returned by
            ``get_brat_format_seq`` for the tokens.
        """
        # The tokens are handed over as-is: no grouping step is applied here.
        seq_offsets = get_brat_format_seq(offsets_seq=annot.tokens)
        seq_label = " ".join(token.label for token in annot.tokens)
        return seq_label, seq_offsets


class ContSeqStopFormatter(IBratFormatter):
    """A Brat formatter that takes into account stopwords: annotate a document
    by selecting continuous sequences of tokens/stopwords."""

    def __init__(self, remove_trailing_stop=True):
        """Create a brat formatter.

        :param remove_trailing_stop: if True, trailing stopwords in a
            discontinuous sequence will be removed.
            Ex: [['North', 'and'], ['America']] -> [['North'], ['America']]
        """
        self.remove_trailing_stop = remove_trailing_stop

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return text and offsets of continuous token/stopword sequences.

        :param annot: the annotation to format; both ``annot.tokens`` and
            ``annot.stop_tokens`` are used.
        :return: the (text, offsets) pair built by
            ``get_text_and_offsets_of_sequences`` from the grouped sequences.
        """
        # Merge tokens and stopwords and order them by their index ``i`` so
        # that grouping sees them in document order.
        tokens = sorted(
            [*annot.tokens, *annot.stop_tokens], key=lambda token: token.i
        )
        sequences = group_continuous_seq(tokens=tokens)
        if self.remove_trailing_stop:
            # Indices of the stopwords, used by the helper to recognize them.
            stop_i = [stop.i for stop in annot.stop_tokens]
            sequences = remove_trailing_stopwords(
                sequences=sequences, stop_i=stop_i
            )
        return get_text_and_offsets_of_sequences(
            sequences=sequences, annot=annot
        )


class SpanFormatter(IBratFormatter):
    """A simple Brat formatter that only uses start, end offsets
    of an annotation"""

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return text, offsets by start and end offsets of the annotation.

        :param annot: the annotation to format; its ``text``, ``start`` and
            ``end`` attributes are read.
        :return: a tuple (text, offsets) where text is the slice
            ``annot.text[annot.start:annot.end]`` and offsets is the string
            ``"<start> <end>"``.
        """
        seq_label = annot.text[annot.start : annot.end]  # noqa
        seq_offsets = f"{annot.start} {annot.end}"
        return seq_label, seq_offsets


class EBratFormatters(Enum):
    """An enumerated list of available Brat Formatters."""

    # NOTE(review): DEFAULT and CONTINUOUS_SEQ each hold their own
    # ContSeqFormatter instance. Whether Enum treats them as aliases depends
    # on how the formatter compares for equality - confirm which is intended.
    DEFAULT = ContSeqFormatter()
    "Default to CONTINUOUS_SEQ."
    TOKEN = TokenFormatter()
    "A fragment for each token."
    CONTINUOUS_SEQ = ContSeqFormatter()
    "Merge a continuous sequence of tokens but ignore stopwords."
    CONTINUOUS_SEQ_STOP = ContSeqStopFormatter()
    "Merge a continuous sequence of tokens with stopwords."
    SPAN = SpanFormatter()
    "A Brat annotation from first token start-offsets to last token end-offsets."  # noqa