# Source code for iamsystem.matcher.annotation

""" Main API output."""
import functools

from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import iamsystem

from iamsystem.brat.formatter import EBratFormatters
from iamsystem.keywords.api import IEntity
from iamsystem.keywords.api import IKeyword
from iamsystem.matcher.api import IAnnotation
from iamsystem.matcher.api import IBratFormatter
from iamsystem.matcher.printannot import PrintAnnot
from iamsystem.matcher.util import StateTransition
from iamsystem.tokenization.api import TokenT
from iamsystem.tokenization.span import Span
from iamsystem.tokenization.span import is_shorter_span_of
from iamsystem.tokenization.util import itoken_to_dict
from iamsystem.tokenization.util import min_start_or_end
from iamsystem.tokenization.util import offsets_overlap
from iamsystem.tokenization.util import replace_offsets_by_new_str
from iamsystem.tree.nodes import INode


class Annotation(Span[TokenT], IAnnotation[TokenT]):
    """Output class of :class:`~iamsystem.Matcher` storing information on the
    detected entities."""

    def __init__(
        self,
        tokens: List[TokenT],
        algos: List[List[str]],
        node: INode,
        stop_tokens: List[TokenT],
        text: Optional[str] = None,
    ):
        """Create an annotation.

        :param tokens: a sequence of TokenT, a generic type that implements
            :class:`~iamsystem.IToken` protocol.
        :param algos: the list of fuzzy algorithms that matched the tokens.
            One to several algorithms per token.
        :param node: a final state of iamsystem algorithm containing the
            keyword that matched this sequence of tokens.
        :param stop_tokens: the list of stopwords tokens of the document.
        :param text: the annotated text/document.
        """
        super().__init__(tokens)
        self._algos = algos
        self._node = node
        self._stop_tokens = stop_tokens
        self._text = text

    @property
    def text(self) -> Optional[str]:
        """Return the annotated text, or None if it was never set."""
        return self._text

    @text.setter
    def text(self, value: str) -> None:
        """Set the annotated text."""
        self._text = value

    @property
    def algos(self) -> List[List[str]]:
        """The fuzzy algorithms that matched each token, one list per
        token."""
        return self._algos

    @property
    def label(self):
        """@Deprecated. An annotation label. Return 'tokens_label' attribute"""
        return self.tokens_label

    @property
    def stop_tokens(self) -> List[TokenT]:
        """The list of stopwords tokens inside the annotation detected by
        the Matcher stopwords instance."""
        # Note that _stop_tokens are stopwords of the document. The reason to
        # filter now and not before is that, when order_tokens = T, stopwords
        # inside an annotation may not have been seen.
        first_i, last_i = self.start_i, self.end_i
        return sorted(
            (
                token
                for token in self._stop_tokens
                if first_i < token.i < last_i
            ),
            key=lambda token: token.i,
        )

    @property
    def keywords(self) -> Sequence[IKeyword]:
        """The linked entities, :class:`~iamsystem.IKeyword` instances that
        matched a document's tokens."""
        return self._node.get_keywords()  # type: ignore

    def get_tokens_algos(self) -> Iterable[Tuple[TokenT, List[str]]]:
        """Get each token and the list of fuzzy algorithms that matched it.

        :return: an iterable of tuples (token0, ['algo1',...]) where token0 is
            a token and ['algo1',...] a list of fuzzy algorithms.
        """
        return zip(self._tokens, self.algos)

    def to_dict(self, text: Optional[str] = None) -> Dict[str, Any]:
        """Return a dictionary representation of this object.

        :param text: the document from which this annotation comes from.
            Default to None. If given, a "substring" key holding
            ``text[self.start : self.end]`` is added.
        :return: A dictionary of relevant attributes.
        """
        dic: Dict[str, Any] = {
            "start": self.start,
            "end": self.end,
            "label": self.label,
            "norm_label": self.tokens_norm_label,
            "tokens": [itoken_to_dict(token) for token in self.tokens],
            "algos": self.algos,
            # Only keywords that are entities carry a knowledge-base id.
            "kb_ids": [
                keyword.kb_id
                for keyword in self.keywords
                if isinstance(keyword, IEntity)
            ],
            "kw_labels": [keyword.label for keyword in self.keywords],
            "version": iamsystem.__annot_version__,
        }
        if text is not None:
            dic["substring"] = text[self.start : self.end]  # noqa
        return dic

    def __str__(self) -> str:
        """Annotation string representation with Brat offsets format."""
        return self.to_string()

    def to_string(self, text: bool = False, debug: bool = False) -> str:
        """Get a default string representation of this object.

        :param text: default to False. If True, add the substring of the
            annotated text, ``self.text[self.start : self.end]``, as an
            extra column. The annotated text must have been set.
        :param debug: default to False. If True, add the sequence of tokens
            and fuzzyalgo names.
        :return: the columns joined by tabulations, with line breaks
            escaped.
        :raise ValueError: if ``text`` is True but no annotated text is
            attached to this annotation.
        """
        # Look the formatter up on the instance's own class so that a
        # formatter installed with ``set_brat_formatter`` on a subclass is
        # honoured. Going through the class (not the instance) avoids binding
        # ``self`` if a plain function was assigned.
        columns = [type(self).annot_to_str(annot=self)]
        if text:
            document = self.text
            if document is None:
                raise ValueError(
                    "Cannot add the text substring: no annotated text is "
                    "set on this annotation."
                )
            columns.append(document[self.start : self.end])  # noqa
        if debug:
            columns.append(self._get_norm_label_algos_str())
        return "\t".join(columns).replace("\n", "\\n")

    def _get_norm_label_algos_str(self) -> str:
        """Get a string representation of tokens and algorithms."""
        return ";".join(
            f"{token.norm_label}({','.join(algos)})"
            for token, algos in self.get_tokens_algos()
        )

    annot_to_str: Callable[[IAnnotation], str] = PrintAnnot().annot_to_str
    " A class function that generates a string representation of an annotation."  # noqa

    @classmethod
    def set_brat_formatter(
        cls, brat_formatter: Union[EBratFormatters, IBratFormatter]
    ) -> None:
        """Change Brat Formatter to change text-span and offsets.

        :param brat_formatter: A Brat formatter to produce
            a different Brat annotation. An :class:`EBratFormatters` member
            is unwrapped to its value. The formatter is forwarded as is to
            ``PrintAnnot``, which is expected to fall back to
            :class:`~iamsystem.ContSeqFormatter` when given None.
        :return: None
        """
        if isinstance(brat_formatter, EBratFormatters):
            brat_formatter = brat_formatter.value
        cls.annot_to_str = PrintAnnot(
            brat_formatter=brat_formatter
        ).annot_to_str


def is_ancestor_annot_of(a: Annotation, b: Annotation) -> bool:
    """Tell whether annotation ``a`` is an ancestor of annotation ``b``.

    :param a: the candidate ancestor annotation.
    :param b: the candidate descendant annotation.
    :return: True only if ``a`` and ``b`` are distinct objects, start at
        the same offset, ``a`` does not end after ``b``, and the node of
        ``a`` is among the ancestors of the node of ``b``.
    """
    # Cheap offset checks first; the node lookup only runs for candidates
    # that are distinct, share their start and do not outreach ``b``.
    plausible = a is not b and a.start == b.start and a.end <= b.end
    if not plausible:
        return False
    return a._node in b._node.get_ancestors()


def sort_annot(annots: List[Annotation]) -> None:
    """Sort the annotations in place.

    The ordering is delegated to the ``min_start_or_end`` comparator
    (documented as 1) start value 2) end value).

    :param annots: the list of annotations to reorder in place.
    :return: None
    """
    comparison_key = functools.cmp_to_key(min_start_or_end)
    annots.sort(key=comparison_key)


def rm_nested_annots(
    annots: List[Annotation], keep_ancestors: bool = False
) -> List[Annotation]:
    """In case of two nested annotations, remove the shorter one.
    For example, if we have "prostate" and "prostate cancer" annotations,
    "prostate" annotation is removed.

    :param annots: a list of annotations, expected to be already sorted by
      start and end values. The list itself is not modified.
    :param keep_ancestors: Default to False. Whether to keep the nested
      annotations that are ancestors and remove only other cases.
    :return: a new, filtered list of annotations, in the input order.
    """
    # Assuming annotations are already sorted by start and end values,
    # an ancestor will always occur before its children. For example,
    # ancestor "insuffisance" will always occur before "insuffisance
    # cardiaque". For each annotation, the loop below scans the annotations
    # to its right and stops at the first one that does not overlap, so the
    # inner loop is normally short.
    ancest_indices = set()
    short_indices = set()
    n_annots = len(annots)
    for i, annot in enumerate(annots):
        # Walk indices rather than slicing annots[i + 1:], which would copy
        # the tail of the list on every outer iteration.
        for y in range(i + 1, n_annots):
            other = annots[y]
            if not offsets_overlap(annot, other):
                break
            if is_shorter_span_of(annot, other):
                short_indices.add(i)
                # because ancestor is a special case of nested annot.
                if is_ancestor_annot_of(annot, other):
                    ancest_indices.add(i)
            if is_shorter_span_of(other, annot):
                short_indices.add(y)
    if keep_ancestors:
        indices_2_remove = short_indices - ancest_indices
    else:
        indices_2_remove = short_indices
    return [
        annot for i, annot in enumerate(annots) if i not in indices_2_remove
    ]


def create_annot(
    last_trans: StateTransition, stop_tokens: List[TokenT]
) -> Annotation:
    """Build an annotation from the last state transition of a match.

    :param last_trans: the last element of the linked list of state
        transitions; its node must be a final state.
    :param stop_tokens: the stopwords tokens handed over to the annotation.
    :return: an annotation holding the tokens and algorithms of every
        transition of the chain, ordered by token index.
    :raise ValueError: if the node of ``last_trans`` is not a final state.
    """
    final_node = last_trans.node
    if not final_node.is_a_final_state():
        raise ValueError("StateTransition's node is not a final state.")
    # Reorder by token index: when tokens were processed in another order
    # (e.g. alphabetically), the chain order differs from the text order,
    # and the final node may no longer belong to the last token.
    ordered_trans = sorted(
        _linkedlist_to_list(last_trans), key=lambda trans: trans.token.i
    )
    matched_tokens = []
    matched_algos = []
    for trans in ordered_trans:
        matched_tokens.append(trans.token)
        matched_algos.append(trans.algos)
    # All the stopwords received are forwarded unfiltered: annotations are
    # created while iterating over the document's tokens, so the stopwords
    # located inside this annotation may not all be known yet.
    return Annotation(
        tokens=matched_tokens,
        algos=matched_algos,
        node=final_node,
        stop_tokens=stop_tokens,
    )


def _linkedlist_to_list(last_el: StateTransition) -> List[StateTransition]:
    """Flatten a chain of state transitions into a list.

    :param last_el: the last transition of the chain.
    :return: a new list of the transitions, from the earliest one that is
        not flagged as first by ``StateTransition.is_first_trans`` up to
        ``last_el`` (always included).
    """
    backward: List[StateTransition] = [last_el]
    cursor = last_el.previous_trans
    # Walk back through the predecessors until the first transition.
    while True:
        if StateTransition.is_first_trans(cursor):
            break
        backward.append(cursor)
        cursor = cursor.previous_trans
    # Collected from last to first; return them in chain order.
    return backward[::-1]


def replace_annots(
    text: str, annots: Sequence[Annotation], new_labels: Sequence[str]
) -> str:
    """Replace each annotation in a document (text parameter) by a new label.
    Warning: an annotation is ignored if overlapped by another one.

    :param text: the document from which the annotations come from.
    :param annots: an ordered sequence of annotation.
    :param new_labels: one new label per annotation, same length as annots
      expected.
    :return: a new document.
    :raise ValueError: if annots and new_labels have different lengths.
    """
    # Check lengths up front: zip would otherwise silently drop the extra
    # annotations or labels.
    if len(annots) != len(new_labels):
        raise ValueError(
            "annots and new_labels parameters don't have the same length."
        )
    return replace_offsets_by_new_str(
        text=text, offsets_new_str=zip(annots, new_labels)
    )