Skip to content

Commit ae592bf

Browse files
authored
Merge pull request #3570 from MattGPT-ai/add-sentence-labeler
Add sentence labeler
2 parents f97264a + 082e845 commit ae592bf

File tree

6 files changed

+438
-40
lines changed

6 files changed

+438
-40
lines changed

flair/class_utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22
import inspect
33
from collections.abc import Iterable
44
from types import ModuleType
5-
from typing import Any, Optional, TypeVar, Union, overload
5+
from typing import Any, Iterable, List, Optional, Protocol, Type, TypeVar, Union, overload
6+
67

78
T = TypeVar("T")
89

910

10-
def get_non_abstract_subclasses(cls: type[T]) -> Iterable[type[T]]:
11+
class StringLike(Protocol):
    """Structural type for any object that can be rendered as a string via ``str()``."""

    def __str__(self) -> str:
        ...
13+
14+
15+
def get_non_abstract_subclasses(cls: Type[T]) -> Iterable[Type[T]]:
1116
for subclass in cls.__subclasses__():
1217
yield from get_non_abstract_subclasses(subclass)
1318
if inspect.isabstract(subclass):

flair/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -565,7 +565,7 @@ def __init__(
565565
head_id: Optional[int] = None,
566566
whitespace_after: int = 1,
567567
start_position: int = 0,
568-
sentence=None,
568+
sentence: Optional["Sentence"] = None,
569569
) -> None:
570570
super().__init__(sentence=sentence)
571571

flair/training_utils.py

Lines changed: 139 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
11
import logging
2+
import pathlib
23
import random
34
from collections import defaultdict
45
from enum import Enum
56
from functools import reduce
67
from math import inf
78
from pathlib import Path
8-
from typing import Literal, Optional, Union
9+
from typing import Literal, NamedTuple, Optional, Union
910

11+
from numpy import ndarray
1012
from scipy.stats import pearsonr, spearmanr
1113
from sklearn.metrics import mean_absolute_error, mean_squared_error
1214
from torch.optim import Optimizer
1315
from torch.utils.data import Dataset
1416

1517
import flair
16-
from flair.data import DT, Dictionary, Sentence, _iter_dataset
18+
from flair.class_utils import StringLike
19+
from flair.data import DT, Dictionary, Sentence, Token, _iter_dataset
1720

1821
EmbeddingStorageMode = Literal["none", "cpu", "gpu"]
19-
log = logging.getLogger("flair")
22+
MinMax = Literal["min", "max"]
23+
logger = logging.getLogger("flair")
2024

2125

2226
class Result:
@@ -33,7 +37,7 @@ def __init__(
3337
self.main_score: float = main_score
3438
self.scores = scores
3539
self.detailed_results: str = detailed_results
36-
self.classification_report = classification_report
40+
self.classification_report = classification_report if classification_report is not None else {}
3741

3842
@property
3943
def loss(self):
@@ -44,13 +48,13 @@ def __str__(self) -> str:
4448

4549

4650
class MetricRegression:
47-
def __init__(self, name) -> None:
51+
def __init__(self, name: str) -> None:
4852
self.name = name
4953

5054
self.true: list[float] = []
5155
self.pred: list[float] = []
5256

53-
def mean_squared_error(self):
57+
def mean_squared_error(self) -> Union[float, ndarray]:
5458
return mean_squared_error(self.true, self.pred)
5559

5660
def mean_absolute_error(self):
@@ -62,22 +66,18 @@ def pearsonr(self):
6266
def spearmanr(self):
6367
return spearmanr(self.true, self.pred)[0]
6468

65-
# dummy return to fulfill trainer.train() needs
66-
def micro_avg_f_score(self):
67-
return self.mean_squared_error()
68-
69-
def to_tsv(self):
69+
def to_tsv(self) -> str:
7070
return f"{self.mean_squared_error()}\t{self.mean_absolute_error()}\t{self.pearsonr()}\t{self.spearmanr()}"
7171

7272
@staticmethod
73-
def tsv_header(prefix=None):
73+
def tsv_header(prefix: StringLike = None) -> str:
7474
if prefix:
7575
return f"{prefix}_MEAN_SQUARED_ERROR\t{prefix}_MEAN_ABSOLUTE_ERROR\t{prefix}_PEARSON\t{prefix}_SPEARMAN"
7676

7777
return "MEAN_SQUARED_ERROR\tMEAN_ABSOLUTE_ERROR\tPEARSON\tSPEARMAN"
7878

7979
@staticmethod
80-
def to_empty_tsv():
80+
def to_empty_tsv() -> str:
8181
return "\t_\t_\t_\t_"
8282

8383
def __str__(self) -> str:
@@ -101,13 +101,13 @@ def __init__(self, directory: Union[str, Path], number_of_weights: int = 10) ->
101101
self.weights_dict: dict[str, dict[int, list[float]]] = defaultdict(lambda: defaultdict(list))
102102
self.number_of_weights = number_of_weights
103103

104-
def extract_weights(self, state_dict, iteration):
104+
def extract_weights(self, state_dict: dict, iteration: int) -> None:
105105
for key in state_dict:
106106
vec = state_dict[key]
107-
# print(vec)
108107
try:
109108
weights_to_watch = min(self.number_of_weights, reduce(lambda x, y: x * y, list(vec.size())))
110-
except Exception:
109+
except Exception as e:
110+
logger.debug(e)
111111
continue
112112

113113
if key not in self.weights_dict:
@@ -195,15 +195,15 @@ class AnnealOnPlateau:
195195
def __init__(
196196
self,
197197
optimizer,
198-
mode="min",
199-
aux_mode="min",
200-
factor=0.1,
201-
patience=10,
202-
initial_extra_patience=0,
203-
verbose=False,
204-
cooldown=0,
205-
min_lr=0,
206-
eps=1e-8,
198+
mode: MinMax = "min",
199+
aux_mode: MinMax = "min",
200+
factor: float = 0.1,
201+
patience: int = 10,
202+
initial_extra_patience: int = 0,
203+
verbose: bool = False,
204+
cooldown: int = 0,
205+
min_lr: float = 0.0,
206+
eps: float = 1e-8,
207207
) -> None:
208208
if factor >= 1.0:
209209
raise ValueError("Factor should be < 1.0.")
@@ -214,6 +214,7 @@ def __init__(
214214
raise TypeError(f"{type(optimizer).__name__} is not an Optimizer")
215215
self.optimizer = optimizer
216216

217+
self.min_lrs: list[float]
217218
if isinstance(min_lr, (list, tuple)):
218219
if len(min_lr) != len(optimizer.param_groups):
219220
raise ValueError(f"expected {len(optimizer.param_groups)} min_lrs, got {len(min_lr)}")
@@ -231,7 +232,7 @@ def __init__(
231232
self.best = None
232233
self.best_aux = None
233234
self.num_bad_epochs = None
234-
self.mode_worse = None # the worse value for the chosen mode
235+
self.mode_worse: Optional[float] = None # the worse value for the chosen mode
235236
self.eps = eps
236237
self.last_epoch = 0
237238
self._init_is_better(mode=mode)
@@ -258,7 +259,7 @@ def step(self, metric, auxiliary_metric=None) -> bool:
258259
if self.mode == "max" and current > self.best:
259260
is_better = True
260261

261-
if current == self.best and auxiliary_metric:
262+
if current == self.best and auxiliary_metric is not None:
262263
current_aux = float(auxiliary_metric)
263264
if self.aux_mode == "min" and current_aux < self.best_aux:
264265
is_better = True
@@ -289,20 +290,20 @@ def step(self, metric, auxiliary_metric=None) -> bool:
289290

290291
return reduce_learning_rate
291292

292-
def _reduce_lr(self, epoch):
293+
def _reduce_lr(self, epoch: int) -> None:
293294
for i, param_group in enumerate(self.optimizer.param_groups):
294295
old_lr = float(param_group["lr"])
295296
new_lr = max(old_lr * self.factor, self.min_lrs[i])
296297
if old_lr - new_lr > self.eps:
297298
param_group["lr"] = new_lr
298299
if self.verbose:
299-
log.info(f" - reducing learning rate of group {epoch} to {new_lr}")
300+
logger.info(f" - reducing learning rate of group {epoch} to {new_lr}")
300301

301302
@property
302303
def in_cooldown(self):
303304
return self.cooldown_counter > 0
304305

305-
def _init_is_better(self, mode):
306+
def _init_is_better(self, mode: MinMax) -> None:
306307
if mode not in {"min", "max"}:
307308
raise ValueError("mode " + mode + " is unknown!")
308309

@@ -313,10 +314,10 @@ def _init_is_better(self, mode):
313314

314315
self.mode = mode
315316

316-
def state_dict(self):
317+
def state_dict(self) -> dict:
317318
return {key: value for key, value in self.__dict__.items() if key != "optimizer"}
318319

319-
def load_state_dict(self, state_dict):
320+
def load_state_dict(self, state_dict: dict) -> None:
320321
self.__dict__.update(state_dict)
321322
self._init_is_better(mode=self.mode)
322323

@@ -350,11 +351,11 @@ def convert_labels_to_one_hot(label_list: list[list[str]], label_dict: Dictionar
350351
return [[1 if label in labels else 0 for label in label_dict.get_items()] for labels in label_list]
351352

352353

353-
def log_line(log):
354+
def log_line(log: logging.Logger) -> None:
354355
log.info("-" * 100, stacklevel=3)
355356

356357

357-
def add_file_handler(log, output_file):
358+
def add_file_handler(log: logging.Logger, output_file: pathlib.Path) -> logging.FileHandler:
358359
init_output_file(output_file.parents[0], output_file.name)
359360
fh = logging.FileHandler(output_file, mode="w", encoding="utf-8")
360361
fh.setLevel(logging.INFO)
@@ -368,11 +369,19 @@ def store_embeddings(
368369
data_points: Union[list[DT], Dataset],
369370
storage_mode: EmbeddingStorageMode,
370371
dynamic_embeddings: Optional[list[str]] = None,
371-
):
372+
) -> None:
373+
"""Stores embeddings of data points in memory or on disk.
374+
375+
Args:
376+
data_points: a DataSet or list of DataPoints for which embeddings should be stored
377+
storage_mode: store in either CPU or GPU memory, or delete them if set to 'none'
378+
dynamic_embeddings: these are always deleted. If not passed, they are identified automatically.
379+
"""
380+
372381
if isinstance(data_points, Dataset):
373382
data_points = list(_iter_dataset(data_points))
374383

375-
# if memory mode option 'none' delete everything
384+
# if storage mode option 'none' delete everything
376385
if storage_mode == "none":
377386
dynamic_embeddings = None
378387

@@ -411,3 +420,97 @@ def identify_dynamic_embeddings(data_points: list[DT]) -> Optional[list[str]]:
411420
if not all_embeddings:
412421
return None
413422
return list(set(dynamic_embeddings))
423+
424+
425+
class TokenEntity(NamedTuple):
    """Entity annotation expressed as a half-open range of token indices.

    Attributes:
        start_token_idx: index of the first token covered by the entity
        end_token_idx: index one past the last covered token
        label: the entity class assigned to the span
        value: surface text of the entity, if known
        score: confidence of the annotation
    """

    start_token_idx: int
    end_token_idx: int
    label: str
    value: str = ""  # text value of the entity
    score: float = 1.0
433+
434+
435+
class CharEntity(NamedTuple):
    """Entity annotation expressed as a half-open range of character offsets.

    Attributes:
        start_char_idx: offset of the first character of the entity
        end_char_idx: offset one past the last character
        label: the entity class assigned to the span
        value: surface text of the entity
        score: confidence of the annotation
    """

    start_char_idx: int
    end_char_idx: int
    label: str
    value: str
    score: float = 1.0
443+
444+
445+
def create_labeled_sentence_from_tokens(
    tokens: Union[list[Token], list[str]], token_entities: list[TokenEntity], type_name: str = "ner"
) -> Sentence:
    """Creates a new Sentence object from a list of tokens or strings and applies entity labels.

    Tokens are recreated with the same text, but not attached to the previous sentence.

    Args:
        tokens: a list of Token objects or strings - only the text is used, not any labels
        token_entities: a list of TokenEntity objects representing entity annotations
        type_name: the type of entity label to apply

    Returns:
        A labeled Sentence object
    """
    # Fix: the original annotation was the one-member `Union[list[Token]]`, and the
    # body crashed on plain strings despite the docstring promising to accept them.
    # Rebuild tokens from text alone so they are detached from any source sentence.
    texts = [token.text if isinstance(token, Token) else str(token) for token in tokens]
    sentence = Sentence(texts, use_tokenizer=True)
    for entity in token_entities:
        # Labels span the half-open token range [start_token_idx, end_token_idx).
        sentence[entity.start_token_idx : entity.end_token_idx].add_label(type_name, entity.label, score=entity.score)
    return sentence
464+
465+
466+
def create_labeled_sentence_from_entity_offsets(
    text: str,
    entities: list[CharEntity],
    token_limit: float = inf,
) -> Sentence:
    """Creates a labeled sentence from a text and a list of entity annotations.

    The function explicitly tokenizes the text and labels separately, ensuring entity labels are
    not partially split across tokens. The sentence is truncated if a token limit is set.

    Args:
        text (str): The full text to be tokenized and labeled.
        entities (list of tuples): Ordered non-overlapping entity annotations with each tuple in the
            format (start_char_index, end_char_index, entity_class, entity_text).
        token_limit: numerical value that determines the maximum token length of the sentence.
            use inf to not perform chunking

    Returns:
        A labeled Sentence object representing the text and entity annotations.
    """
    tokens: list[Token] = []
    current_index = 0
    token_entities: list[TokenEntity] = []

    for entity in entities:
        if current_index < entity.start_char_idx:
            # tokenize and append the plain text between the previous position and this entity
            sentence = Sentence(text[current_index : entity.start_char_idx])
            tokens.extend(sentence)

        # tokenize the entity span and record its token-index range
        start_token_idx = len(tokens)
        entity_sentence = Sentence(text[entity.start_char_idx : entity.end_char_idx])
        end_token_idx = start_token_idx + len(entity_sentence)

        token_entity = TokenEntity(start_token_idx, end_token_idx, entity.label, entity.value, entity.score)
        token_entities.append(token_entity)
        tokens.extend(entity_sentence)

        current_index = entity.end_char_idx

    # add any remaining tokens after the last entity
    if current_index < len(text):
        remaining_sentence = Sentence(text[current_index:])
        tokens.extend(remaining_sentence)

    # Bug fix: the original guard `isinstance(token_limit, int)` silently skipped
    # truncation whenever a float limit (e.g. 100.0) was passed, even though the
    # parameter is typed `float`. Compare numerically instead; `inf < len(tokens)`
    # is always False, so the unlimited default is unaffected.
    if token_limit < len(tokens):
        limit = int(token_limit)
        tokens = tokens[:limit]
        # drop entities that no longer fit entirely inside the truncated sentence
        token_entities = [entity for entity in token_entities if entity.end_token_idx <= limit]

    return create_labeled_sentence_from_tokens(tokens, token_entities)

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ types-Deprecated>=1.2.9.2
1212
types-requests>=2.28.11.17
1313
types-tabulate>=0.9.0.2
1414
pyab3p
15-
transformers!=4.40.1,!=4.40.0
15+
transformers!=4.40.1,!=4.40.0

0 commit comments

Comments
 (0)