Commit 410e94f

Merge pull request #40 from transformerlab/add/model-checkpoint-management
Add/model checkpoint management
2 parents fb3d106 + f63930a commit 410e94f

3 files changed: +278 -1 lines changed

scripts/examples/test_script.py

Lines changed: 14 additions & 0 deletions
@@ -164,6 +164,19 @@ def train():
     except Exception:
         pass
 
+    # Save the trained model
+    model_dir = os.path.join(training_config["output_dir"], "final_model")
+    os.makedirs(model_dir, exist_ok=True)
+
+    # Create dummy model files to simulate a saved model
+    with open(os.path.join(model_dir, "config.json"), "w") as f:
+        f.write('{"model": "SmolLM-135M-Instruct", "params": 135000000}')
+    with open(os.path.join(model_dir, "pytorch_model.bin"), "w") as f:
+        f.write("dummy binary model data")
+
+    saved_path = lab.save_model(model_dir, name="trained_model")
+    lab.log(f"✅ Model saved to job models directory: {saved_path}")
+
     print("Complete")
 
     # Complete the job in TransformerLab via facade
@@ -176,6 +189,7 @@ def train():
         "output_dir": os.path.join(
            training_config["output_dir"], f"final_model_{lab.job.id}"
         ),
+        "saved_model_path": saved_path,
         "wandb_url": captured_wandb_url,
     }

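For reference, a minimal sketch of how a script could also exercise the optional provenance arguments that the new save_model API accepts. The parent model ID and pipeline tag below are illustrative values, not part of this commit:

saved_path = lab.save_model(
    model_dir,
    name="trained_model",
    parent_model="HuggingFaceTB/SmolLM-135M-Instruct",  # assumed parent model ID
    pipeline_tag="text-generation",  # supplying this skips the HuggingFace lookup
)
lab.log(f"Model saved with provenance hints: {saved_path}")
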
src/lab/lab_facade.py

Lines changed: 135 additions & 1 deletion
@@ -1,13 +1,14 @@
 from __future__ import annotations
 
+import time
 from typing import Optional, Dict, Any
 import os
 import shutil
 
 from .experiment import Experiment
 from .job import Job
 from . import dirs
-
+from .model import Model as ModelService
 
 class Lab:
     """
@@ -177,6 +178,139 @@ def save_checkpoint(self, source_path: str, name: Optional[str] = None) -> str:
 
         return dest
 
+    def save_model(self, source_path: str, name: Optional[str] = None, architecture: Optional[str] = None, pipeline_tag: Optional[str] = None, parent_model: Optional[str] = None) -> str:
+        """
+        Save a model file or directory to the workspace models directory.
+        The model will automatically appear in the Model Zoo's Local Models list.
+
+        Args:
+            source_path: Path to the model file or directory to save
+            name: Optional name for the model. If not provided, uses the source basename.
+                The final model name will be prefixed with the job_id for uniqueness.
+            architecture: Optional architecture string. If not provided, will attempt to
+                detect it from config.json for directory-based models.
+            pipeline_tag: Optional pipeline tag. If not provided and parent_model is given,
+                will attempt to fetch it from the parent model on HuggingFace.
+            parent_model: Optional parent model name/ID for provenance tracking.
+
+        Returns:
+            The destination path on disk.
+        """
+        self._ensure_initialized()
+        if not isinstance(source_path, str) or source_path.strip() == "":
+            raise ValueError("source_path must be a non-empty string")
+        src = os.path.abspath(source_path)
+        if not os.path.exists(src):
+            raise FileNotFoundError(f"Model source does not exist: {src}")
+
+        job_id = self._job.id  # type: ignore[union-attr]
+
+        # Determine base name with job_id prefix for uniqueness
+        if isinstance(name, str) and name.strip() != "":
+            base_name = f"{job_id}_{name}"
+        else:
+            base_name = f"{job_id}_{os.path.basename(src)}"
+
+        # Save to main workspace models directory for Model Zoo visibility
+        models_dir = dirs.get_models_dir()
+        dest = os.path.join(models_dir, base_name)
+
+        # Create parent directories
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
+
+        # Copy file or directory
+        if os.path.isdir(src):
+            if os.path.exists(dest):
+                shutil.rmtree(dest)
+            shutil.copytree(src, dest)
+        else:
+            shutil.copy2(src, dest)
+
+        # Create Model metadata so it appears in Model Zoo
+        try:
+            model_service = ModelService(base_name)
+
+            # Use provided architecture or detect it
+            if architecture is None:
+                architecture = model_service.detect_architecture(dest)
+
+            # Handle pipeline tag logic
+            if pipeline_tag is None and parent_model is not None:
+                # Try to fetch pipeline tag from parent model
+                pipeline_tag = model_service.fetch_pipeline_tag(parent_model)
+
+            # Determine model_filename for single-file models
+            model_filename = "" if os.path.isdir(dest) else os.path.basename(dest)
+
+            # Prepare json_data with basic info
+            json_data = {
+                "job_id": job_id,
+                "description": f"Model generated by job {job_id}",
+            }
+
+            # Add pipeline tag to json_data if provided
+            if pipeline_tag is not None:
+                json_data["pipeline_tag"] = pipeline_tag
+
+            # Use the Model class's generate_model_json method to create metadata
+            model_service.generate_model_json(
+                architecture=architecture,
+                model_filename=model_filename,
+                json_data=json_data,
+            )
+            self.log(f"Model saved to Model Zoo as '{base_name}'")
+        except Exception as e:
+            self.log(f"Warning: Model saved but metadata creation failed: {str(e)}")
+
+        # Create provenance data
+        try:
+            # Create MD5 checksums for all model files
+            md5_objects = model_service.create_md5_checksums(dest)
+
+            # Prepare provenance metadata from job data
+            job_data = self._job.get_job_data()
+
+            provenance_metadata = {
+                "job_id": job_id,
+                "model_name": parent_model or job_data.get("model_name"),
+                "model_architecture": architecture,
+                "input_model": parent_model,
+                "dataset": job_data.get("dataset"),
+                "adaptor_name": job_data.get("adaptor_name", None),
+                "parameters": job_data.get("_config", {}),
+                "start_time": job_data.get("start_time", ""),
+                "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+                "md5_checksums": md5_objects,
+            }
+
+            # Create the _tlab_provenance.json file
+            provenance_file = model_service.create_provenance_file(
+                model_path=dest,
+                model_name=base_name,
+                model_architecture=architecture,
+                md5_objects=md5_objects,
+                provenance_data=provenance_metadata,
+            )
+            self.log(f"Provenance file created at: {provenance_file}")
+        except Exception as e:
+            self.log(f"Warning: Model saved but provenance creation failed: {str(e)}")
+
+        # Track in job_data
+        try:
+            job_data = self._job.get_job_data()
+            model_list = []
+            if isinstance(job_data, dict):
+                existing = job_data.get("models", [])
+                if isinstance(existing, list):
+                    model_list = existing
+            model_list.append(dest)
+            self._job.update_job_data_field("models", model_list)
+        except Exception:
+            pass
+
+        return dest
+
     def error(
         self,
         message: str = "",

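To make the naming and copy behavior concrete, here is a hedged sketch of what save_model produces, assuming an already-initialized Lab facade whose current job id is "42" (an illustrative value):

# Assumes the Lab facade is initialized and the current job id is "42" (illustrative).
dest = lab.save_model("/tmp/run/final_model", name="trained_model")
# dest == os.path.join(dirs.get_models_dir(), "42_trained_model")
# Directory sources are copied with shutil.copytree (an existing destination
# is removed first); single files are copied with shutil.copy2. Metadata and
# provenance failures are logged as warnings and never raise.
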
src/lab/model.py

Lines changed: 129 additions & 0 deletions
@@ -65,6 +65,135 @@ def import_model(self, model_name, model_path):
         """
         self.generate_model_json(model_name, model_path)
 
+    def detect_architecture(self, model_path: str) -> str:
+        """
+        Detect the model architecture from a model directory's config.json.
+
+        Args:
+            model_path: Path to the model directory or file
+
+        Returns:
+            The model architecture (e.g., 'LlamaForCausalLM') or 'Unknown' if not found
+        """
+        architecture = "Unknown"
+
+        if os.path.isdir(model_path):
+            config_path = os.path.join(model_path, "config.json")
+            if os.path.exists(config_path):
+                try:
+                    with open(config_path, "r") as f:
+                        config = json.load(f)
+                        architectures = config.get("architectures", [])
+                        if architectures:
+                            architecture = architectures[0]
+                except Exception:
+                    pass
+
+        return architecture
+
+    def fetch_pipeline_tag(self, parent_model: str) -> str | None:
+        """
+        Fetch the pipeline tag from a parent model on HuggingFace.
+
+        Args:
+            parent_model: The HuggingFace model ID to fetch the pipeline tag from
+
+        Returns:
+            The pipeline tag string if found, None otherwise
+        """
+        try:
+            from huggingface_hub import HfApi
+
+            api = HfApi()
+            model_info = api.model_info(parent_model)
+            return model_info.pipeline_tag
+        except Exception as e:
+            print(f"Could not fetch pipeline tag from parent model '{parent_model}': {type(e).__name__}: {e}")
+            return None
+
+    def create_md5_checksums(self, model_path: str) -> list:
+        """
+        Create MD5 checksums for all files in the model directory.
+
+        Args:
+            model_path: Path to the model directory
+
+        Returns:
+            List of dicts with 'file_path' and 'md5_hash' keys
+        """
+        import hashlib
+
+        def compute_md5(file_path):
+            md5 = hashlib.md5()
+            with open(file_path, "rb") as f:
+                while chunk := f.read(8192):
+                    md5.update(chunk)
+            return md5.hexdigest()
+
+        md5_objects = []
+
+        if not os.path.isdir(model_path):
+            print(f"Model path '{model_path}' is not a directory, skipping MD5 checksum creation")
+            return md5_objects
+
+        for root, _, files in os.walk(model_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                try:
+                    md5_hash = compute_md5(file_path)
+                    md5_objects.append({"file_path": file_path, "md5_hash": md5_hash})
+                except Exception as e:
+                    print(f"Warning: Could not compute MD5 for {file_path}: {str(e)}")
+
+        return md5_objects
+
+    def create_provenance_file(self, model_path: str, model_name: str | None = None, model_architecture: str | None = None,
+                               md5_objects: list | None = None, provenance_data: dict | None = None) -> str:
+        """
+        Create a _tlab_provenance.json file containing model provenance data.
+
+        Args:
+            model_path: Path to the model directory
+            model_name: Name of the model
+            model_architecture: Architecture of the model
+            md5_objects: List of MD5 checksums from create_md5_checksums()
+            provenance_data: Optional dict with additional provenance data. Expected keys include:
+                - job_id: ID of the job that created this model
+                - input_model: Name of the base/parent model used
+                - dataset: Name of the dataset used for training
+                - adaptor_name: Name of the adapter if applicable
+                - parameters: Training configuration parameters
+                - start_time: When training/processing started
+
+        Returns:
+            Path to the created provenance file
+        """
+        import time
+
+        # Start with base provenance data matching the structure from train.py
+        final_provenance = {
+            "model_name": model_name,
+            "model_architecture": model_architecture,
+            "job_id": None,
+            "input_model": None,
+            "dataset": None,
+            "adaptor_name": None,
+            "parameters": None,
+            "start_time": "",
+            "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "md5_checksums": md5_objects,
+        }
+
+        # Merge in any additional provenance data provided
+        if provenance_data and isinstance(provenance_data, dict):
+            final_provenance.update(provenance_data)
+
+        # Write provenance to file
+        provenance_path = os.path.join(model_path, "_tlab_provenance.json")
+        with open(provenance_path, "w") as f:
+            json.dump(final_provenance, f, indent=2)
+
+        return provenance_path
+
     def generate_model_json(
         self,
         architecture: str,

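The helpers on Model can also be composed directly, outside the facade. A minimal sketch, assuming the module is importable as lab.model and using illustrative names and paths throughout:

from lab.model import Model  # assumed import path for src/lab/model.py

model = Model("42_trained_model")  # illustrative model name
model_path = "/workspace/models/42_trained_model"  # assumed location on disk

architecture = model.detect_architecture(model_path)  # "Unknown" without a config.json
md5_objects = model.create_md5_checksums(model_path)  # [] when model_path is a file
provenance_path = model.create_provenance_file(
    model_path=model_path,
    model_name="42_trained_model",
    model_architecture=architecture,
    md5_objects=md5_objects,
    provenance_data={"job_id": "42"},  # merged over the base fields
)
# Network-dependent; returns None on any failure:
pipeline_tag = model.fetch_pipeline_tag("HuggingFaceTB/SmolLM-135M-Instruct")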