
Commit 1422dc3

- feat: parse_output only captures predefined label names
- feat: add max_length_for_labeling to handle long documents
- feat: prompt change
- refactor: train_anyclassifier positional args reshuffle
- docs: update README.md

1 parent 312c611 · commit 1422dc3

File tree

6 files changed (+40, −19 lines)


README.md

Lines changed: 8 additions & 4 deletions

@@ -17,15 +17,18 @@ Together let's build more useful models.
 
 ## 🚀 Features
 - One line to build any classifier that you don't have data 🤯
-- Why one line? Because it can easily be used by other LLM as a function call, easily to be integrated with any **agentic flow**
+- Why one line? Not only is it easy for humans to use, it can also be called by another LLM as a function, making it easy to integrate into any **agentic flow**
 - Smoothness integration with transformers, setfit, fasttext and datasets
   - [setfit](https://github.com/huggingface/setfit): for limited data (e.g. 100) 🤗
   - [fastText](https://github.com/facebookresearch/fastText): for blazingly fast inference (1000 docs/s) without GPU ⚡️
   - [transformers](https://github.com/huggingface/transformers): for other usecase
 - Huggingface-like interface for fastText that supports push_to_hub, saving and loading (let's not forget this amazing model before transformers architecture).
 
 ## 🏁 QuickStart in Colab
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LB8PUTT9wM1Qb2cY-6Dx-RNiqmyCvRr1?usp=sharing)
+| Dataset                       | Colab Link |
+|-------------------------------|------------|
+| imdb sentiment classification | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LB8PUTT9wM1Qb2cY-6Dx-RNiqmyCvRr1?usp=sharing) |
+
 
 ## 🔧 Installation
 It is using llama.cpp as backend, and build wheel can take a lot of time (10min+), as such, we also provide an instruction to install with pre-built wheel.

@@ -82,11 +85,11 @@ unlabeled_dataset # a huggingface datasets.Dataset class can be from your local
 # Magic One Line!
 trainer = build_anyclassifier(
     "Classify a text's sentiment.",
-    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),  # as you like
     [
         Label(name='1', desc='positive sentiment'),
         Label(name='0', desc='negative sentiment')
     ],
+    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),  # as you like
     unlabeled_dataset,
     column_mapping={"text": "text"},
     model_type="setfit",  # can be set to fastText

@@ -137,13 +140,14 @@ label_dataset.push_to_hub('user_id/any_data')
 
 ```
 
-See examples:
+See more examples:
 
 | model_type | example | resulting model | dataset |
 |------------|------------------------------------------|----------------------------------------------------------------------------------|------------------------------------------------------------------------------|
 | setfit | [link](examples/train_setfit_model.py) | [link](https://huggingface.co/kenhktsui/anyclassifier_setfit_demo) | [link](https://huggingface.co/datasets/kenhktsui/anyclassifier_dataset_demo) |
 | fasttext | [link](examples/train_fasttext_model.py) | [link](https://huggingface.co/kenhktsui/fasttext_test) (probably needs more labels) | [link](https://huggingface.co/datasets/kenhktsui/anyclassifier_dataset_demo) |
 
+Test accuracy on imdb with SetFit: 90.42%
 
 ## 🗺️ Roadmap
 - High Quality Data:

anyclassifier/annotation/annotator.py

Lines changed: 15 additions & 8 deletions

@@ -1,29 +1,33 @@
 import sys
 from abc import abstractmethod, ABCMeta
-from typing import Union, Optional
+from typing import Union, Optional, List
 import re
 from collections import Counter
 from tqdm import tqdm
 import logging
 from llama_cpp import Llama
 from datasets import Dataset  # it is import to load llama_cpp first before datasets to prevent error like https://github.com/abetlen/llama-cpp-python/issues/806
 from huggingface_hub import hf_hub_download
-from anyclassifier.annotation.prompt import AnnotationPrompt
+from anyclassifier.annotation.prompt import AnnotationPrompt, Label
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
 
 class AnnotatorBase(metaclass=ABCMeta):
+    def __init__(self):
+        self.regex_pattern = None
+
+    def prepare_regex_pattern(self, labels: List[Label]):
+        labels_str = "|".join(l.name for l in labels)
+        self.regex_pattern = re.compile(rf'Label:\s*({labels_str})')
 
-    regex_pattern = re.compile(r'Label:\s*(.+)')
     @abstractmethod
     def annotate(self, text: str) -> str:
         pass
 
-    @classmethod
-    def parse_output(cls, text: str) -> Optional[str]:
-        match = cls.regex_pattern.search(text)
+    def parse_output(self, text: str) -> Optional[str]:
+        match = self.regex_pattern.search(text)
         if match:
             return match.group(1)
         return None

@@ -36,6 +40,8 @@ def __init__(self,
                                               "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),
                  n_gpu_layers: int = -1,
                  n_ctx: int = 2048):
+        super().__init__()
+        self.prepare_regex_pattern(prompt.label_definition)
         self._prompt = prompt
         self._llm = Llama(model_path=model_path,
                           n_gpu_layers=n_gpu_layers,

@@ -59,7 +65,8 @@ def annotate(self, text: str) -> str:
     def annotate_dataset(self,
                          dataset: Union[Dataset],
                          text_col: str = "text",
-                         n_record: int = 1000,
+                         n_record: int = 200,
+                         max_length_for_labeling: int = 1500,
                          shuffle: bool = True
                          ) -> Dataset:
         # shuffle the data to randomise potential bias in data collection process

@@ -70,7 +77,7 @@ def annotate_dataset(self,
 
         label_list = []
         for d in tqdm(selected_dataset, desc="Annotating dataset"):
-            llm_output = self.annotate(d[text_col])
+            llm_output = self.annotate(d[text_col][:max_length_for_labeling])
             label = self.parse_output(llm_output)
             label_list.append(label)
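The key behavioral change above is that the extraction pattern is now built from the predefined label names instead of the old catch-all `Label:\s*(.+)`. A minimal standalone sketch (hypothetical label names, free functions rather than the `AnnotatorBase` class) of what that buys:

```python
import re
from typing import List, Optional

def build_pattern(label_names: List[str]) -> "re.Pattern":
    # Like the diff, names are joined verbatim into an alternation; re.escape
    # would be safer if a label name ever contained regex metacharacters.
    return re.compile(rf"Label:\s*({'|'.join(label_names)})")

def parse_output(pattern: "re.Pattern", text: str) -> Optional[str]:
    # Only a predefined label name is captured; anything else yields None.
    match = pattern.search(text)
    return match.group(1) if match else None

pattern = build_pattern(["1", "0"])
print(parse_output(pattern, "Label: 1 (the review is positive)"))  # 1
print(parse_output(pattern, "Label: positive"))                    # None
```

Under the old `Label:\s*(.+)` pattern, the second call would have returned the free-form string "positive" rather than rejecting it, which is exactly the failure mode the commit closes.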

anyclassifier/annotation/prompt.py

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ def get_prompt(self, text: str):
             [f"Example {i+1}.\nText: {fse.text}\nLabel: {fse.label}" for i, fse in enumerate(self.few_shot_examples)]
         )
         return f"""{self.task_description}
-Here are the label definitions:
+Here are the label names and description:
 {label_defn_str}
 
 Here is the text to be analyzed:
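For context, here is a hedged reconstruction of how that template is assembled. Only the changed wording ("Here are the label names and description:") and the surrounding template fragment come from the diff; the `Label` dataclass and the `label_defn_str` formatting are assumptions for illustration:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class Label:
    name: str  # the name parse_output is allowed to capture
    desc: str  # human-readable description shown to the annotator LLM

def get_prompt(task_description: str, labels: List[Label], text: str) -> str:
    # Assumed formatting: one "name: description" line per label.
    label_defn_str = "\n".join(f"{l.name}: {l.desc}" for l in labels)
    return f"""{task_description}
Here are the label names and description:
{label_defn_str}

Here is the text to be analyzed:
{text}"""

prompt = get_prompt(
    "Classify a text's sentiment.",
    [Label("1", "positive sentiment"), Label("0", "negative sentiment")],
    "I loved this movie.",
)
```

Listing the exact label names in the prompt pairs naturally with the stricter regex in annotator.py: the LLM is told which names are valid, and the parser rejects everything else.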

anyclassifier/train_any.py

Lines changed: 14 additions & 4 deletions

@@ -16,8 +16,8 @@
 
 def train_anyclassifier(
     instruction: str,
-    annotator_model_path: str,
     labels: List[Label],
+    annotator_model_path: str,
     unlabeled_dataset: Dataset,
     column_mapping: Dict[str, str] = {"text": "text"},
     model_type: Literal["setfit", "fasttext", "transformers"] = "setfit",

@@ -26,6 +26,7 @@ def train_anyclassifier(
     num_epochs: Optional[int] = 5,
     batch_size: Optional[int] = 16,
     n_record_to_label: int = 100,
+    max_length_for_labeling: int = 1500,
     test_size: float = 0.3,
     metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy",
     metric_kwargs: Optional[Dict[str, Any]] = None,

@@ -38,10 +39,10 @@ def train_anyclassifier(
     Args:
         instruction (`str`):
             The instruction to LLM annotator
-        annotator_model_path (`str`):
-            The LLM annotator model to be used by llama.cpp
         labels (`List[Label]`):
             The labels including name and desc you want to classify
+        annotator_model_path (`str`):
+            The path of the LLM annotator model to be used by llama.cpp
         unlabeled_dataset ('Dataset'):
             The unlabeled dataset you want to label.
         column_mapping (`Dict[str, str]`, *optional*):

@@ -60,6 +61,11 @@ def train_anyclassifier(
             Batch size to train model
         n_record_to_label (`int`, *optional*):
             No of record for LLM to label
+        max_length_for_labeling (`int`, *optional*):
+            Max length in characters, to avoid exceeding the LLM context length and to speed up annotation. How much
+            truncation affects annotation accuracy depends on factors such as the complexity of the classification
+            and the location of key information. If the same topic is conveyed throughout a document (e.g. sentiment
+            analysis, domain classification), the impact is expected to be low.
         test_size (`float`, *optional*):
             Proportion of labeled data to evaluation
         metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`):

@@ -87,7 +93,11 @@
         few_shot_examples=few_shot_examples
     )
     annotator = LlamaCppAnnotator(prompt, annotator_model_path)
-    label_dataset = annotator.annotate_dataset(unlabeled_dataset, n_record=n_record_to_label)
+    label_dataset = annotator.annotate_dataset(
+        unlabeled_dataset,
+        n_record=n_record_to_label,
+        max_length_for_labeling=max_length_for_labeling
+    )
 
     label_dataset = label_dataset.train_test_split(test_size=test_size)
examples/train_fasttext_model.py

Lines changed: 1 addition & 1 deletion

@@ -14,11 +14,11 @@
 
 trainer = train_anyclassifier(
     "Classify a text's sentiment.",
-    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),
     [
         Label(name='1', desc='positive sentiment'),
         Label(name='0', desc='negative sentiment')
     ],
+    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),
     unlabeled_dataset,
     column_mapping={"text": "text"},
     model_type="fasttext",

examples/train_setfit_model.py

Lines changed: 1 addition & 1 deletion

@@ -14,11 +14,11 @@
 
 trainer = train_anyclassifier(
     "Classify a text's sentiment.",
-    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),
     [
         Label(name='1', desc='positive sentiment'),
         Label(name='0', desc='negative sentiment')
     ],
+    hf_hub_download("lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q8_0.gguf"),
     unlabeled_dataset,
     column_mapping={"text": "text"},
     model_type="setfit",

0 commit comments