
Commit 3aa9f05

Merge pull request #51 from transformerlab/add/train-resume-chceckpoint
Update trl script to be able to resume training from checkpoint
2 parents 08c4636 + bd2c69c commit 3aa9f05

3 files changed: +98 additions, -19 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "transformerlab"
-version = "0.0.43"
+version = "0.0.44"
 description = "Python SDK for Transformer Lab"
 readme = "README.md"
 requires-python = ">=3.10"

scripts/examples/trl_train_script.py

Lines changed: 21 additions & 10 deletions

@@ -5,6 +5,7 @@
 """
 
 import os
+import argparse
 from datetime import datetime
 from time import sleep
 from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments
@@ -122,6 +123,11 @@ def train_with_trl(quick_test=True):
     lab.init()
     lab.set_config(training_config)
 
+    # Check if we should resume from a checkpoint
+    checkpoint = lab.get_checkpoint_to_resume()
+    if checkpoint:
+        lab.log(f"📁 Resuming training from checkpoint: {checkpoint}")
+
     # Log start time
     start_time = datetime.now()
     mode = "Quick test" if quick_test else "Full training"
@@ -162,17 +168,17 @@ def train_with_trl(quick_test=True):
     lab.log("Loading model and tokenizer...")
     try:
         from transformers import AutoTokenizer, AutoModelForCausalLM
-
+
         model_name = training_config["model_name"]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(model_name)
-
+
         # Add pad token if it doesn't exist
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
         lab.log(f"Loaded model: {model_name}")
-
+
     except ImportError:
         lab.log("⚠️ Transformers not available, skipping real training")
         lab.finish("Training skipped - transformers not available")
@@ -207,6 +213,8 @@ def train_with_trl(quick_test=True):
         remove_unused_columns=False,
         push_to_hub=False,
         dataset_text_field="text",  # Move dataset_text_field to SFTConfig
+        resume_from_checkpoint=checkpoint if checkpoint else None,
+        bf16=False,  # Disable bf16 for compatibility with older GPUs
         # Enable automatic checkpoint saving
         save_total_limit=3,  # Keep only the last 3 checkpoints to save disk space
         save_strategy="steps",  # Save checkpoints every save_steps
@@ -440,15 +448,18 @@ def train_with_trl(quick_test=True):
 
 
 if __name__ == "__main__":
-    import sys
 
-    # Check if user wants full training or quick test
-    quick_test = False  # Default to quick test
-    if len(sys.argv) > 1 and sys.argv[1] == "--quick-training":
-        quick_test = True
+    parser = argparse.ArgumentParser(description="Train a model with automatic checkpoint resume support.")
+    parser.add_argument("--quick-training", action="store_true", help="Run in quick test mode")
+
+    args = parser.parse_args()
+
+    quick_test = args.quick_training
+
+    if quick_test:
         print("🚀 Running quick test mode...")
     else:
-        print("🚀 Running full training mode (use --quick-training for quick test)...")
-
+        print("🚀 Running full training mode...")
+
     result = train_with_trl(quick_test=quick_test)
     print("Training result:", result)

src/lab/lab_facade.py

Lines changed: 76 additions & 8 deletions

@@ -91,6 +91,73 @@ def update_progress(self, progress: int) -> None:
         # Check for wandb URL on every progress update
         self._check_and_capture_wandb_url()
 
+    # ------------- checkpoint resume support -------------
+    def get_checkpoint_to_resume(self) -> Optional[str]:
+        """
+        Get the checkpoint path to resume training from.
+
+        This method checks for checkpoint resume information stored in the job data
+        when resuming training from a checkpoint.
+
+        Returns:
+            Optional[str]: The full path to the checkpoint to resume from, or None if no
+                checkpoint resume is requested.
+        """
+        if not self._job:
+            return None
+
+        job_data = self._job.get_job_data()
+        if not job_data:
+            return None
+
+        parent_job_id = job_data.get('parent_job_id')
+        checkpoint_name = job_data.get('resumed_from_checkpoint')
+
+        if not parent_job_id or not checkpoint_name:
+            return None
+
+        # Build the checkpoint path from parent job's checkpoints directory
+        checkpoint_path = self.get_parent_job_checkpoint_path(parent_job_id, checkpoint_name)
+
+        # Verify the checkpoint exists
+        if checkpoint_path and os.path.exists(checkpoint_path):
+            return checkpoint_path
+
+        return None
+
+    def get_parent_job_checkpoint_path(self, parent_job_id: str, checkpoint_name: str) -> Optional[str]:
+        """
+        Get the full path to a checkpoint from a parent job.
+
+        This is a helper method that constructs the path to a specific checkpoint
+        from a parent job's checkpoints directory.
+
+        Args:
+            parent_job_id (str): The ID of the parent job that created the checkpoint
+            checkpoint_name (str): The name of the checkpoint file or directory
+
+        Returns:
+            Optional[str]: The full path to the checkpoint, or None if it doesn't exist
+        """
+        try:
+            checkpoints_dir = dirs.get_job_checkpoints_dir(parent_job_id)
+            checkpoint_path = os.path.join(checkpoints_dir, checkpoint_name)
+
+            # Security check: ensure the checkpoint path is within the checkpoints directory
+            checkpoint_path_normalized = os.path.normpath(checkpoint_path)
+            checkpoints_dir_normalized = os.path.normpath(checkpoints_dir)
+
+            if not checkpoint_path_normalized.startswith(checkpoints_dir_normalized + os.sep):
+                return None
+
+            if os.path.exists(checkpoint_path_normalized):
+                return checkpoint_path_normalized
+
+            return None
+        except Exception as e:
+            print(f"Error getting parent job checkpoint path: {str(e)}")
+            return None
+
     # ------------- completion -------------
     def finish(
         self,
@@ -506,8 +573,8 @@ def save_dataset(self, df, dataset_id: str, additional_metadata: Optional[Dict[s
         try:
             if hasattr(df, "to_pandas") and callable(getattr(df, "to_pandas")):
                 df = df.to_pandas()
-        except Exception:
-            pass
+        except Exception as e:
+            print(f"Warning: Failed to convert dataset to pandas DataFrame: {str(e)}")
 
         # Prepare dataset directory
         dataset_id_safe = dataset_id.strip()
@@ -562,16 +629,17 @@ def save_dataset(self, df, dataset_id: str, additional_metadata: Optional[Dict[s
             )
         except Exception as e:
             # Do not fail the save if metadata write fails; log to job data
+            print(f"Warning: Failed to create dataset metadata: {str(e)}")
             try:
                 self._job.update_job_data_field("dataset_metadata_error", str(e))  # type: ignore[union-attr]
-            except Exception:
-                pass
+            except Exception as e2:
+                print(f"Warning: Failed to log dataset metadata error: {str(e2)}")
 
         # Track dataset on the job for provenance
         try:
             self._job.update_job_data_field("dataset_id", dataset_id_safe)  # type: ignore[union-attr]
-        except Exception:
-            pass
+        except Exception as e:
+            print(f"Warning: Failed to track dataset in job_data: {str(e)}")
 
         self.log(f"Dataset saved to '{output_path}' and registered as generated dataset '{dataset_id_safe}'")
         return output_path
@@ -615,8 +683,8 @@ def save_checkpoint(self, source_path: str, name: Optional[str] = None) -> str:
             ckpt_list.append(dest)
             self._job.update_job_data_field("checkpoints", ckpt_list)
             self._job.update_job_data_field("latest_checkpoint", dest)
-        except Exception:
-            pass
+        except Exception as e:
+            print(f"Warning: Failed to track checkpoint in job_data: {str(e)}")
 
         return dest
 
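
The security check in get_parent_job_checkpoint_path() is the usual normpath-plus-prefix containment test against directory traversal. A standalone sketch under hypothetical paths (dirs.get_job_checkpoints_dir() is replaced by a hard-coded directory and the existence check is dropped so it runs anywhere); only the normpath/startswith logic mirrors the new facade code.

# sketch_checkpoint_containment.py -- illustration only, not part of the commit
import os
from typing import Optional


def resolve_checkpoint(checkpoints_dir: str, checkpoint_name: str) -> Optional[str]:
    """Join and normalize the path, then refuse anything that escapes checkpoints_dir."""
    checkpoint_path = os.path.normpath(os.path.join(checkpoints_dir, checkpoint_name))
    checkpoints_dir = os.path.normpath(checkpoints_dir)

    if not checkpoint_path.startswith(checkpoints_dir + os.sep):
        return None  # e.g. checkpoint_name contained "../" and left the directory

    # The facade additionally requires os.path.exists(); skipped here so the
    # sketch runs without creating any files on disk.
    return checkpoint_path


if __name__ == "__main__":
    base = "/workspace/jobs/41/checkpoints"  # hypothetical checkpoints directory
    print(resolve_checkpoint(base, "checkpoint-500"))     # kept: stays inside the directory
    print(resolve_checkpoint(base, "../../secrets.txt"))  # rejected: traversal escapes, returns None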
