Commit d03f487

feat: add the 4-bit quantisation option and remove unnecessary base model copying

1 parent d1ff2fb · commit d03f487

10 files changed: +72 −42 lines

.github/workflows/main.yaml
Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
-          uv sync --group dev --group docs --group vllm
+          uv sync --group dev --group docs
       - name: Check types
         run: |
           uv run mypy app

app/cli/cli.py
Lines changed: 6 additions & 2 deletions

@@ -67,6 +67,7 @@ def serve_model(
     streamable: bool = typer.Option(False, help="Serve the streamable endpoints only"),
     device: Device = typer.Option(Device.DEFAULT.value, help="The device to serve the model on"),
     llm_engine: Optional[LlmEngine] = typer.Option(LlmEngine.CMS.value, help="The engine to use for text generation"),
+    load_in_4bit: Optional[bool] = typer.Option(False, help="Load the model in 4-bit precision, used by 'huggingface_llm' models"),
     debug: Optional[bool] = typer.Option(None, help="Run in the debug mode"),
 ) -> None:
     """
@@ -84,6 +85,7 @@ def serve_model(
         streamable (bool): Serve the streamable endpoints only. Defaults to False.
         device (Device): The device to serve the model on. Defaults to Device.DEFAULT.
         llm_engine (LlmEngine): The inference engine to use. Defaults to LlmEngine.CMS.
+        load_in_4bit (bool): Load the model in 4-bit precision, used by 'huggingface_llm' models. Defaults to False.
         debug (Optional[bool]): Run in debug mode if set to True.
     """

@@ -135,7 +137,7 @@ def serve_model(
     if model_path:
         model_service = model_service_dep()
         model_service.model_name = model_name
-        model_service.init_model()
+        model_service.init_model(load_in_4bit=load_in_4bit)
         cms_globals.model_manager_dep = ModelManagerDep(model_service)
     elif mlflow_model_uri:
         model_service = ModelManager.retrieve_model_service_from_uri(mlflow_model_uri, config, dst_model_path)
@@ -187,6 +189,7 @@ def train_model(
     description: Optional[str] = typer.Option(None, help="The description of the training or change logs"),
     model_name: Optional[str] = typer.Option(None, help="The string representation of the model name"),
     device: Device = typer.Option(Device.DEFAULT.value, help="The device to train the model on"),
+    load_in_4bit: Optional[bool] = typer.Option(False, help="Load the model in 4-bit precision, used by 'huggingface_llm' models"),
     debug: Optional[bool] = typer.Option(None, help="Run in the debug mode"),
 ) -> None:
     """
@@ -206,6 +209,7 @@ def train_model(
         description (Optional[str]): The optional description of the training or change logs.
         model_name (Optional[str]): The optional string representation of the model name.
         device (Device): The device to train the model on. Defaults to Device.DEFAULT.
+        load_in_4bit (bool): Load the model in 4-bit precision, used by 'huggingface_llm' models. Defaults to False.
         debug (Optional[bool]): Run in debug mode if set to True.
     """

@@ -229,7 +233,7 @@ def train_model(
             pass
         model_service = model_service_dep()
         model_service.model_name = model_name if model_name is not None else "CMS model"
-        model_service.init_model()
+        model_service.init_model(load_in_4bit=load_in_4bit)
     elif mlflow_model_uri:
         model_service = ModelManager.retrieve_model_service_from_uri(mlflow_model_uri, config, dst_model_path)
         model_service.model_name = model_name if model_name is not None else "CMS model"
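
Because the option is declared through typer.Option, both serve_model and train_model automatically gain a --load-in-4bit/--no-load-in-4bit flag on the command line. A minimal sketch of that mechanism; the command body below is illustrative, not code from this diff:

    import typer

    app = typer.Typer()

    @app.command()
    def serve_model(
        load_in_4bit: bool = typer.Option(False, help="Load the model in 4-bit precision"),
    ) -> None:
        # Typer derives the flag name from the parameter name.
        typer.echo(f"load_in_4bit={load_in_4bit}")

    if __name__ == "__main__":
        app()  # e.g. `python cli.py --load-in-4bit` prints load_in_4bit=True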

app/model_services/base.py
Lines changed: 5 additions & 1 deletion

@@ -154,10 +154,14 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
         raise NotImplementedError

     @abstractmethod
-    def init_model(self) -> None:
+    def init_model(self, *args: Any, **kwargs: Any) -> None:
         """
         Initialises the model and auxiliary resources.

+        Args:
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+
         Raises:
             NotImplementedError: If the method is not implemented by the subclass.
         """

app/model_services/huggingface_llm_model.py
Lines changed: 9 additions & 3 deletions

@@ -174,8 +174,14 @@ def load_model(
         else:
             raise ConfigurationException(f"Model package archive format is not supported: {model_file_path}")

-    def init_model(self) -> None:
-        """Initialises the HuggingFace model and its tokenizer based on the configuration."""
+    def init_model(self, load_in_4bit: bool = False, *args: Any, **kwargs: Any) -> None:
+        """Initialises the HuggingFace model and its tokenizer based on the configuration.
+
+        Args:
+            load_in_4bit (bool): Whether to load the model in 4-bit precision. Defaults to False.
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+        """

         if all([
             hasattr(self, "_model"),
@@ -185,7 +191,7 @@ def init_model(self) -> None:
         ]):
             logger.warning("Model service is already initialised and can be initialised only once")
         else:
-            self._model, self._tokenizer = self.load_model(self._model_pack_path)
+            self._model, self._tokenizer = self.load_model(self._model_pack_path, load_in_4bit=load_in_4bit)
             if non_default_device_is_available(get_settings().DEVICE):
                 self._model.to(get_settings().DEVICE)
             if self._enable_trainer:
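
The diff forwards load_in_4bit into load_model but that method's body is not shown here. A hedged sketch of what honouring the flag typically looks like with the transformers and bitsandbytes stack; the helper name and quantisation defaults below are assumptions, not code from this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    def load_causal_lm(model_dir: str, load_in_4bit: bool = False):
        quantization_config = None
        if load_in_4bit:
            # NF4 quantisation with bfloat16 compute is the common
            # memory-efficient setup for 4-bit loading.
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        return model, tokenizer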

app/model_services/huggingface_ner_model.py
Lines changed: 7 additions & 2 deletions

@@ -175,8 +175,13 @@ def load_model(model_file_path: str, *args: Tuple, **kwargs: Dict[str, Any]) ->
         else:
             raise ConfigurationException(f"Model package archive format is not supported: {model_file_path}")

-    def init_model(self) -> None:
-        """Initialises the HuggingFace model, its tokenizer and a NER pipeline based on the configuration."""
+    def init_model(self, *args: Any, **kwargs: Any) -> None:
+        """Initialises the HuggingFace model, its tokenizer and a NER pipeline based on the configuration.
+
+        Args:
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+        """

         if all([
             hasattr(self, "_model"),

app/model_services/medcat_model.py
Lines changed: 7 additions & 2 deletions

@@ -119,8 +119,13 @@ def load_model(model_file_path: str, *args: Tuple, **kwargs: Dict[str, Any]) ->
         else:
             raise ConfigurationException("Model package archive format is not supported")

-    def init_model(self) -> None:
-        """Initializes the MedCAT model based on the configuration."""
+    def init_model(self, *args: Any, **kwargs: Any) -> None:
+        """Initializes the MedCAT model based on the configuration.
+
+        Args:
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+        """

         if hasattr(self, "_model") and isinstance(self._model, CAT):
             logger.warning("Model service is already initialised and can be initialised only once")

app/model_services/medcat_model_deid.py
Lines changed: 7 additions & 2 deletions

@@ -178,8 +178,13 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:

         return annotations_list

-    def init_model(self) -> None:
-        """Initializes the MedCAT De-Identification (AnonCAT) model based on the configuration."""
+    def init_model(self, *args: Any, **kwargs: Any) -> None:
+        """Initializes the MedCAT De-Identification (AnonCAT) model based on the configuration.
+
+        Args:
+            *args (Any): Additional positional arguments to be passed to this method.
+            **kwargs (Any): Additional keyword arguments to be passed to this method.
+        """

         if hasattr(self, "_model") and isinstance(self._model, CAT):
             logger.warning("Model service is already initialised and can be initialised only once")

app/model_services/trf_model_deid.py
Lines changed: 1 addition & 1 deletion

@@ -86,7 +86,7 @@ def load_model(
         logger.info("Model loaded from %s", unpacked_model_dir)
         return tokenizer, model

-    def init_model(self) -> None:
+    def init_model(self, *args: Any, **kwargs: Any) -> None:
         if hasattr(self, "_model") and isinstance(self._model, PreTrainedModel):
             logger.warning("Model service is already initialised and can be initialised only once")
         else:

app/trainers/huggingface_llm_trainer.py
Lines changed: 29 additions & 23 deletions

@@ -88,8 +88,11 @@ def __init__(self, model_service: "HuggingFaceLlmModel") -> None:
         self._model_service = model_service
         self._model_name = model_service.model_name
         self._model_pack_path = model_service._model_pack_path
-        self._retrained_models_dir = os.path.join(model_service._model_parent_dir, "retrained",
-                                                  self._model_name.replace(" ", "_"))
+        self._retrained_models_dir = os.path.join(
+            model_service._model_parent_dir,
+            "retrained",
+            self._model_name.replace(" ", "_"),
+        )
         self._model_manager = ModelManager(type(model_service), model_service._config)
         self._max_length = model_service.model.config.max_position_embeddings
         os.makedirs(self._retrained_models_dir, exist_ok=True)
@@ -306,7 +309,7 @@ def run(
             logger.error("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")
             raise ExtraDependencyRequiredException("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")

-        copied_model_pack_path = None
+        trained_model_pack_path = None
         redeploy = self._config.REDEPLOY_TRAINED_MODEL == "true"
         skip_save_model = self._config.SKIP_SAVE_MODEL == "true"
         results_path = os.path.abspath(os.path.join(self._config.TRAINING_CACHE_DIR, "results"))
@@ -319,15 +322,16 @@ def run(

         if not eval_mode:
             try:
-                logger.info("Loading a new model copy for training...")
-                copied_model_pack_path = self._make_model_file_copy(self._model_pack_path, run_id)
-                model, tokenizer = self._model_service.load_model(
-                    copied_model_pack_path,
-                    load_in_4bit=True,  # for memory efficient training
+                logger.info("Loading a PEFT model for training...")
+                model_pack_file_ext = get_model_data_package_extension(self._model_pack_path)
+                trained_model_pack_path = self._model_pack_path.replace(
+                    model_pack_file_ext,
+                    f"_trained_{run_id}{model_pack_file_ext}",
                 )
-                copied_model_directory = os.path.join(
-                    os.path.dirname(copied_model_pack_path),
-                    get_model_data_package_base_name(copied_model_pack_path),
+                model, tokenizer = self._model_service.model, self._model_service.tokenizer
+                trained_model_directory = os.path.join(
+                    os.path.dirname(trained_model_pack_path),
+                    get_model_data_package_base_name(trained_model_pack_path),
                 )

                 if non_default_device_is_available(self._config.DEVICE):
@@ -355,7 +359,7 @@ def run(
                     ],
                 )

-                model = get_peft_model(model, lora_config)
+                peft_model = get_peft_model(model, lora_config)

                 mlflow_logging_callback = MLflowLoggingCallback(self._tracker_client)
                 cancel_event_check_callback = CancelEventCheckCallback(self._cancel_event)
@@ -378,27 +382,26 @@ def run(
                 training_args = GRPOConfig(
                     output_dir=results_path,
                     logging_dir=logs_path,
+                    logging_steps=log_frequency,
                     learning_rate=5e-6,
                     adam_beta1=0.9,
                     adam_beta2=0.99,
                     weight_decay=0.1,
                     warmup_ratio=0.1,
                     lr_scheduler_type="cosine",
                     optim="paged_adamw_8bit",
-                    logging_steps=1,
                     per_device_train_batch_size=6,  # This global batch size must be divisible by the number of generations
                     gradient_accumulation_steps=1,
                     num_generations=6,
                     max_prompt_length=max_prompt_length,
                     max_completion_length=max_seq_length - max_prompt_length,
                     num_train_epochs=training_params["nepochs"],
-                    max_steps=250,
                     save_steps=250,
                     max_grad_norm=0.1,
                     report_to="none",
                 )
                 trainer = GRPOTrainer(
-                    model=model,
+                    model=peft_model,
                     processing_class=tokenizer,
                     reward_funcs=self._get_reward_functions(),
                     args=training_args,
@@ -409,7 +412,7 @@ def run(
             else:
                 raise ConfigurationException(f"Unsupported trainer type: {trainer_type}")

-            self._tracker_client.log_model_config(model.config.to_dict())
+            self._tracker_client.log_model_config({**model.config.to_dict(), **peft_model.peft_config})
             self._tracker_client.log_trainer_version(TrainerBackend.TRANSFORMERS, transformers_version)

             logger.info(f"Performing {trainer_type.upper()} training...")
@@ -422,11 +425,13 @@ def run(
                 model_pack_file_ext = get_model_data_package_extension(self._config.BASE_MODEL_FILE)
                 model_pack_file_name = f"{ModelType.HUGGINGFACE_LLM.value}_{run_id}{model_pack_file_ext}"
                 retrained_model_pack_path = os.path.join(self._retrained_models_dir, model_pack_file_name)
+                model = peft_model.merge_and_unload()
                 model.save_pretrained(
-                    copied_model_directory,
+                    trained_model_directory,
                     safe_serialization=(self._config.TRAINING_SAFE_MODEL_SERIALISATION == "true"),
                 )
-                create_model_data_package(copied_model_directory, retrained_model_pack_path)
+                tokenizer.save_pretrained(trained_model_directory)
+                create_model_data_package(trained_model_directory, retrained_model_pack_path)
                 model_uri = self._tracker_client.save_model(
                     retrained_model_pack_path,
                     self._model_name,
@@ -475,7 +480,7 @@ def run(
             with self._training_lock:
                 self._training_in_progress = False
             self._clean_up_training_cache()
-            self._housekeep_file(copied_model_pack_path)
+            self._housekeep_file(trained_model_pack_path)
             if trainer is not None:
                 del trainer
                 gc.collect()
@@ -505,6 +510,7 @@ def run(
             training_args = GRPOConfig(
                 output_dir=results_path,
                 logging_dir=logs_path,
+                logging_steps=log_frequency,
                 per_device_eval_batch_size=6,
                 num_generations=2,
                 max_prompt_length=max_prompt_length,
@@ -607,19 +613,19 @@ def correctness_reward_func(
             )
             return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

-        def int_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> list[float]:
+        def int_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> List[float]:
             responses = [completion[0]["content"] for completion in completions]
             extracted_responses = [extract_xml_answer(r) for r in responses]
             return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

-        def strict_format_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> list[float]:
+        def strict_format_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> List[float]:
             """Reward function that checks if the completion has a specific format."""
             pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
             responses = [completion[0]["content"] for completion in completions]
             matches = [re.match(pattern, r) for r in responses]
             return [0.5 if match else 0.0 for match in matches]

-        def soft_format_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> list[float]:
+        def soft_format_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> List[float]:
             """Reward function that checks if the completion has a specific format."""
             pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
             responses = [completion[0]["content"] for completion in completions]
@@ -640,7 +646,7 @@ def count_xml(text: str) -> float:
             count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
             return count

-        def xmlcount_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> list[float]:
+        def xmlcount_reward_func(completions: Tuple[Any], **kwargs: Dict[str, Any]) -> List[float]:
             contents = [completion[0]["content"] for completion in completions]
             return [count_xml(c) for c in contents]
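
The net effect of these trainer changes: instead of copying the packed base model and reloading it from disk, the trainer wraps the already-served model in LoRA adapters, trains only the adapters, and merges them back before packaging. A hedged sketch of that flow with the peft library; the function and argument values below are illustrative, not the trainer's actual code:

    from peft import LoraConfig, get_peft_model

    def train_and_merge(model, tokenizer, output_dir: str) -> None:
        """Sketch of the adapter-train-then-merge flow; shapes are assumed."""
        lora_config = LoraConfig(r=16, lora_alpha=32, task_type="CAUSAL_LM")
        peft_model = get_peft_model(model, lora_config)  # only adapter weights train
        # ... run GRPOTrainer(model=peft_model, ...).train() here ...
        merged = peft_model.merge_and_unload()  # fold adapters into the base weights
        merged.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)  # the tokenizer ships with the package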

app/utils.py
Lines changed: 0 additions & 5 deletions

@@ -547,11 +547,6 @@ def unpack_model_data_package(model_data_file_path: str, model_data_folder_path:
     elif model_data_file_path.endswith(".tar.gz"):
         with tarfile.open(model_data_file_path, "r:gz") as f:
             for member in f.getmembers():
-                path_parts = member.name.split(os.sep)
-                stripped_path = os.sep.join(path_parts[1:])
-                if not stripped_path:
-                    continue
-                member.name = stripped_path
                 f.extract(member, path=model_data_folder_path)
         return True
     else:
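
With the stripping logic removed, members of a .tar.gz package are now extracted under their original paths rather than having the leading path component dropped. A small sketch of the new behaviour; the archive and directory names are illustrative:

    import tarfile

    # Previously a member named "model/weights.bin" would land at
    # "unpacked/weights.bin"; now it is extracted to "unpacked/model/weights.bin".
    with tarfile.open("model.tar.gz", "r:gz") as f:
        for member in f.getmembers():
            f.extract(member, path="unpacked")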
