
Commit e1e6ca1

- feat: standardise metric and metric_kwargs
- feat: logging
1 parent fd8e9b1 commit e1e6ca1

File tree

3 files changed: +53 -10 lines changed


anyclassifier/annotation/annotator.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -1,13 +1,19 @@
+import sys
 from abc import abstractmethod, ABCMeta
 from typing import Union, Optional
 import re
+from collections import Counter
 from tqdm import tqdm
+import logging
 from llama_cpp import Llama
 from datasets import Dataset  # it is import to load llama_cpp first before datasets to prevent error like https://github.com/abetlen/llama-cpp-python/issues/806
 from huggingface_hub import hf_hub_download
 from anyclassifier.annotation.prompt import AnnotationPrompt
 
 
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
 class AnnotatorBase(metaclass=ABCMeta):
 
     regex_pattern = re.compile(r'Label:\s*(.+)')
@@ -70,6 +76,9 @@ def annotate_dataset(self,
 
         selected_dataset = selected_dataset.add_column("label", label_list)
         selected_dataset = selected_dataset.filter(lambda x: x.get("label") is not None)
+        logging.info(f"""Count of labels
+{Counter(selected_dataset["label"]).most_common(len(self._prompt.label_definition))}
+""")
         return selected_dataset
 
 
```
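The new `logging.info` call reports the class balance of the annotated dataset before it is returned. A minimal sketch of what `Counter.most_common` contributes to that log line (the label values below are made up for illustration, not taken from the repository):

```python
from collections import Counter

# Hypothetical labels, standing in for selected_dataset["label"].
labels = ["positive", "positive", "negative", "positive", "neutral"]

# most_common(n) returns up to n (label, count) pairs, most frequent first,
# which is what gets interpolated into the "Count of labels" message.
print(Counter(labels).most_common(3))
# [('positive', 3), ('negative', 1), ('neutral', 1)]
```

This makes heavily skewed LLM annotations visible early, before a classifier is trained on them.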
anyclassifier/fasttext_wrapper/trainer.py

Lines changed: 23 additions & 6 deletions
```diff
@@ -2,11 +2,11 @@
 from dataclasses import dataclass, asdict
 import warnings
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional, Union, Callable, Any
 import fasttext
 from datasets import Dataset
 from sklearn.preprocessing import LabelEncoder
-from sklearn.metrics import precision_score, recall_score
+import evaluate
 from setfit.trainer import ColumnMappingMixin
 from anyclassifier.fasttext_wrapper.config import FastTextConfig
 from anyclassifier.fasttext_wrapper.model import FastTextForSequenceClassification
@@ -54,6 +54,13 @@ class FastTextTrainer(ColumnMappingMixin):
             The training dataset.
         eval_dataset (`Dataset`, *optional*):
             The evaluation dataset.
+        metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`):
+            The metric to use for evaluation. If a string is provided, we treat it as the metric
+            name and load it with default settings. If a callable is provided, it must take two arguments
+            (`y_pred`, `y_test`) and return a dictionary with metric keys to values.
+        metric_kwargs (`Dict[str, Any]`, *optional*):
+            Keyword arguments passed to the evaluation function if `metric` is an evaluation string like "f1".
+            For example useful for providing an averaging strategy for computing f1 in a multi-label setting.
         column_mapping (`Dict[str, str]`, *optional*):
             A mapping from the column names in the dataset to the column names expected by the model.
             The expected format is a dictionary with the following format:
@@ -66,13 +73,17 @@ def __init__(
         args: FastTextConfig,
         train_dataset: Optional["Dataset"] = None,
         eval_dataset: Optional["Dataset"] = None,
+        metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy",
+        metric_kwargs: Optional[Dict[str, Any]] = None,
         column_mapping: Optional[Dict[str, str]] = None,
     ) -> None:
         if args is not None and not isinstance(args, FastTextTrainingArguments):
             raise ValueError("`args` must be a `FastTextTrainingArguments` instance.")
         self.training_args = asdict(args)
         self.output_dir = self.training_args.pop("output_dir")
         self.data_txt_path = self.training_args.pop("data_txt_path")
+        self.metric = metric
+        self.metric_kwargs = metric_kwargs
         self.column_mapping = column_mapping
         if train_dataset:
             self._validate_column_mapping(train_dataset)
@@ -165,10 +176,16 @@ def evaluate(self, dataset: Dataset) -> Dict[str, float]:
         le.fit(label + label_pred)
         label = le.transform(label)
         label_pred = le.transform(label_pred)
-        return {
-            "precision": precision_score(label, label_pred, average="micro"),
-            "recall": recall_score(label, label_pred, average="micro")
-        }
+
+        metric_kwargs = self.metric_kwargs or {}
+        if isinstance(self.metric, str):
+            metric_fn = evaluate.load(self.metric)
+            results = metric_fn.compute(predictions=y_pred, references=y_test, **metric_kwargs)
+        elif callable(self.metric):
+            results = self.metric(y_pred, y_test, **metric_kwargs)
+        else:
+            raise ValueError("metric must be a string or a callable")
+        return {"metric": results}
 
     def push_to_hub(self, repo_id: str, **kwargs) -> str:
         """Upload model checkpoint to the Hub using `huggingface_hub`.
```

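The rewritten `evaluate` now dispatches on the type of `metric`: a string is loaded through the Hugging Face `evaluate` library with `metric_kwargs` forwarded to `compute`, while a callable is invoked directly with the predictions and references. A standalone sketch of that dispatch, using toy encoded labels that are not from the repository:

```python
import evaluate

def compute_metric(metric, y_pred, y_test, metric_kwargs=None):
    """Illustrative stand-in for the dispatch added to FastTextTrainer.evaluate."""
    metric_kwargs = metric_kwargs or {}
    if isinstance(metric, str):
        metric_fn = evaluate.load(metric)  # e.g. "accuracy", "f1", "precision"
        return metric_fn.compute(predictions=y_pred, references=y_test, **metric_kwargs)
    if callable(metric):
        # A callable must accept (y_pred, y_test) and return a dict of metric values.
        return metric(y_pred, y_test, **metric_kwargs)
    raise ValueError("metric must be a string or a callable")

# String metric, with kwargs forwarded to compute (labels already label-encoded):
print(compute_metric("f1", y_pred=[0, 1, 1, 0], y_test=[0, 1, 0, 0],
                     metric_kwargs={"average": "macro"}))
```

Note that the trainer wraps whatever comes back under a single `"metric"` key, so callers read `results["metric"]` regardless of which branch produced it.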
anyclassifier/train_any.py

Lines changed: 21 additions & 4 deletions
```diff
@@ -1,4 +1,6 @@
-from typing import List, Dict, Union, Literal, Optional
+import sys
+from typing import List, Dict, Union, Literal, Optional, Callable, Any
+import logging
 from datasets import Dataset
 from huggingface_hub import interpreter_login
 from setfit import SetFitModel, TrainingArguments, Trainer as SetFitTrainer
@@ -9,6 +11,9 @@
 )
 
 
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
 def train_anyclassifier(
     instruction: str,
     annotator_model_path: str,
@@ -22,6 +27,8 @@ def train_anyclassifier(
     batch_size: Optional[int] = 16,
     n_record_to_label: int = 100,
     test_size: float = 0.3,
+    metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy",
+    metric_kwargs: Optional[Dict[str, Any]] = None,
     push_dataset_to_hub: bool = False,
     dataset_repo_id: Optional[str] = None,
     is_dataset_private: Optional[bool] = True,
@@ -55,6 +62,13 @@ def train_anyclassifier(
             No of record for LLM to label
         test_size (`float`, *optional*):
             Proportion of labeled data to evaluation
+        metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`):
+            The metric to use for evaluation. If a string is provided, we treat it as the metric
+            name and load it with default settings. If a callable is provided, it must take two arguments
+            (`y_pred`, `y_test`) and return a dictionary with metric keys to values.
+        metric_kwargs (`Dict[str, Any]`, *optional*):
+            Keyword arguments passed to the evaluation function if `metric` is an evaluation string like "f1".
+            For example useful for providing an averaging strategy for computing f1 in a multi-label setting.
         push_dataset_to_hub (`bool`, *optional*):
             Whether to push dataset to huggingface hub for reuse, highly recommended to do so.
         dataset_repo_id (`str`, *optional*):
@@ -92,13 +106,15 @@ def train_anyclassifier(
             args=args,
             train_dataset=label_dataset["train"],
             eval_dataset=label_dataset["test"],
+            metric=metric,
+            metric_kwargs=metric_kwargs,
             column_mapping={**column_mapping, "label": "label"},
         )
 
         # Train and evaluate
         trainer.train()
         metrics = trainer.evaluate(label_dataset["test"])
-        print(metrics)
+        logging.info(metrics)
         return trainer
 
     elif model_type == "setfit":
@@ -120,14 +136,15 @@ def train_anyclassifier(
             args=args,
             train_dataset=label_dataset["train"],
             eval_dataset=label_dataset["test"],
-            metric="accuracy",
+            metric=metric,
+            metric_kwargs=metric_kwargs,
             column_mapping={**column_mapping, "label": "label"},
         )
 
         # Train and evaluate
         trainer.train()
         metrics = trainer.evaluate(label_dataset["test"])
-        print(metrics)
+        logging.info(metrics)
         return trainer
     else:
         raise NotImplementedError("other approach is not implemented yet")
```
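With `metric` and `metric_kwargs` threaded through to both the fastText and SetFit branches, a caller can either name an `evaluate`-style metric or supply a callable that follows the `(y_pred, y_test) -> dict` contract described in the docstring. A small sketch of such a callable; the metric names and the commented call pattern are illustrative, and the other `train_anyclassifier` arguments are omitted:

```python
from sklearn.metrics import f1_score, precision_score, recall_score

def report_metric(y_pred, y_test):
    """Callable metric: takes (y_pred, y_test), returns a dict of metric values."""
    return {
        "f1_macro": f1_score(y_test, y_pred, average="macro"),
        "precision_micro": precision_score(y_test, y_pred, average="micro"),
        "recall_micro": recall_score(y_test, y_pred, average="micro"),
    }

# Passed straight through to the underlying trainer, e.g.:
#   train_anyclassifier(..., metric=report_metric)
# or, for a built-in metric with options:
#   train_anyclassifier(..., metric="f1", metric_kwargs={"average": "macro"})
print(report_metric([0, 1, 1, 0], [0, 1, 0, 0]))
```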
