Source code for iamsystem.brat.formatter

from enum import Enum
from typing import Tuple

from iamsystem.brat.util import get_brat_format_seq
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IBratFormatter
from iamsystem.tokenization.util import get_text_and_offsets_of_sequences
from iamsystem.tokenization.util import group_continuous_seq
from iamsystem.tokenization.util import remove_trailing_stopwords


class ContSeqFormatter(IBratFormatter):
    """Default Brat formatter.

    The annotation's tokens are grouped into continuous sequences and the
    text/offsets are produced from those groups. Only ``annot.tokens`` is
    consulted; the annotation's stopwords are not used.
    """

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return the text and the offsets of an annotation.

        :param annot: the annotation to format.
        :return: a ``(text, offsets)`` tuple built from the continuous
            sequences found in ``annot.tokens``.
        """
        return get_text_and_offsets_of_sequences(
            sequences=group_continuous_seq(tokens=annot.tokens),
            annot=annot,
        )
class TokenFormatter(IBratFormatter):
    """Brat formatter that works token by token.

    The offsets are computed from the individual tokens of the annotation.
    Unlike :class:`ContSeqFormatter`, continuous sequences of tokens are not
    merged.
    """

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return the tokens' labels and the tokens' offsets (no merging).

        :param annot: the annotation to format.
        :return: a ``(text, offsets)`` tuple where ``text`` is the labels of
            ``annot.tokens`` joined by a single space and ``offsets`` is the
            Brat-formatted offsets of those same tokens.
        """
        offsets = get_brat_format_seq(offsets_seq=annot.tokens)
        text = " ".join(token.label for token in annot.tokens)
        return text, offsets
class ContSeqStopFormatter(IBratFormatter):
    """Brat formatter that is aware of stopwords.

    Tokens and stopwords of the annotation are considered together when
    looking for continuous sequences, so a stopword located between two
    tokens can join them into a single sequence.
    """

    def __init__(self, remove_trailing_stop: bool = True):
        """Create a brat formatter.

        :param remove_trailing_stop: if True, trailing stopwords in a
            discontinuous sequence will be removed.
            Ex: [['North', 'and'], ['America']] -> [['North'], ['America']]
        """
        self.remove_trailing_stop = remove_trailing_stop

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return the text and the offsets of an annotation.

        :param annot: the annotation to format; both its ``tokens`` and its
            ``stop_tokens`` are used.
        :return: a ``(text, offsets)`` tuple built from the continuous
            sequences of tokens and stopwords.
        """
        # Tokens and stopwords are interleaved by their index ``i`` so that
        # continuity is evaluated over both kinds at once.
        ordered = sorted(
            [*annot.tokens, *annot.stop_tokens], key=lambda tok: tok.i
        )
        sequences = group_continuous_seq(tokens=ordered)
        if self.remove_trailing_stop:
            sequences = remove_trailing_stopwords(
                sequences=sequences,
                stop_i=[stop.i for stop in annot.stop_tokens],
            )
        return get_text_and_offsets_of_sequences(
            sequences=sequences, annot=annot
        )
class SpanFormatter(IBratFormatter):
    """Minimal Brat formatter: a single fragment going from the start offset
    to the end offset of the annotation."""

    def get_text_and_offsets(self, annot: IAnnotation) -> Tuple[str, str]:
        """Return the text and offsets delimited by the annotation's bounds.

        :param annot: the annotation to format.
        :return: a ``(text, offsets)`` tuple where ``text`` is the slice of
            ``annot.text`` between ``annot.start`` and ``annot.end`` and
            ``offsets`` is the string ``"<start> <end>"``.
        """
        start, end = annot.start, annot.end
        return annot.text[start:end], f"{start} {end}"
class EBratFormatters(Enum):
    """An enumerated list of available Brat Formatters.

    The value of each member is a ready-to-use formatter instance created
    with the formatter's default arguments.
    """

    # NOTE(review): DEFAULT and CONTINUOUS_SEQ are each assigned their own
    # ContSeqFormatter() instance. Whether Enum treats one as an alias of the
    # other depends on how formatter instances compare for equality, which is
    # not visible from this file - confirm before relying on member identity.
    DEFAULT = ContSeqFormatter()
    "Default to CONTINUOUS_SEQ."
    TOKEN = TokenFormatter()
    "A fragment for each token."
    CONTINUOUS_SEQ = ContSeqFormatter()
    "Merge a continuous sequence of tokens but ignore stopwords."
    CONTINUOUS_SEQ_STOP = ContSeqStopFormatter()
    "Merge a continuous sequence of tokens with stopwords."
    SPAN = SpanFormatter()
    "A Brat annotation from first token start-offsets to last token end-offsets."  # noqa