Merge pull request #39 from transformerlab/add/get-checkpoints-artifacts-fn

deep1401 · web-flow · commit 5b27095fa7db · 2025-10-20T12:30:21.000-06:00
Add methods to fetch checkpoint and artifact directories and file paths
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "transformerlab"
-version = "0.0.31"
+version = "0.0.32"
 description = "Python SDK for Transformer Lab"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/scripts/examples/test_script.py b/scripts/examples/test_script.py
@@ -57,7 +57,36 @@ def train():
             lab.update_progress(10 + (i + 1) * 10)
             print(f"Iteration {i + 1}/8")
             
-            # Method 3: Initialize wandb during training (common pattern)
+            # Save fake checkpoint every 2 iterations
+            if (i + 1) % 2 == 0:
+                checkpoint_file = os.path.join(training_config["output_dir"], f"checkpoint_epoch_{i + 1}.txt")
+                with open(checkpoint_file, "w") as f:
+                    f.write(f"Fake checkpoint for epoch {i + 1}\n")
+                    f.write(f"Model state: iteration_{i + 1}\n")
+                    f.write(f"Loss: {0.5 - (i + 1) * 0.05:.3f}\n")
+                    f.write(f"Accuracy: {0.6 + (i + 1) * 0.04:.3f}\n")
+                    f.write(f"Timestamp: {datetime.now()}\n")
+                
+                # Save checkpoint using lab facade
+                saved_checkpoint_path = lab.save_checkpoint(checkpoint_file, f"epoch_{i + 1}_checkpoint.txt")
+                lab.log(f"Saved checkpoint: {saved_checkpoint_path}")
+                
+                # Save some fake artifacts
+                artifact_file = os.path.join(training_config["output_dir"], f"training_metrics_epoch_{i + 1}.json")
+                with open(artifact_file, "w") as f:
+                    f.write('{\n')
+                    f.write(f'  "epoch": {i + 1},\n')
+                    f.write(f'  "loss": {0.5 - (i + 1) * 0.05:.3f},\n')
+                    f.write(f'  "accuracy": {0.6 + (i + 1) * 0.04:.3f},\n')
+                    f.write(f'  "learning_rate": {2e-5},\n')
+                    f.write(f'  "batch_size": {8},\n')
+                    f.write(f'  "timestamp": "{datetime.now().isoformat()}"\n')
+                    f.write('}\n')
+                
+                # Save artifact using lab facade
+                saved_artifact_path = lab.save_artifact(artifact_file, f"metrics_epoch_{i + 1}.json")
+                lab.log(f"Saved artifact: {saved_artifact_path}")
+
             if i == 3:  # Initialize wandb halfway through training
                 try:
                     import wandb
@@ -97,6 +126,30 @@ def train():
         training_duration = end_time - start_time
         lab.log(f"Training completed in {training_duration}")
         
+        # Save final artifacts
+        final_model_file = os.path.join(training_config["output_dir"], "final_model_summary.txt")
+        with open(final_model_file, "w") as f:
+            f.write("Final Model Summary\n")
+            f.write("==================\n")
+            f.write(f"Training Duration: {training_duration}\n")
+            f.write("Final Loss: 0.15\n")
+            f.write("Final Accuracy: 0.92\n")
+            f.write(f"Model: {training_config['model_name']}\n")
+            f.write(f"Dataset: {training_config['dataset']}\n")
+            f.write(f"Completed at: {end_time}\n")
+        
+        # Save final model as artifact
+        final_model_path = lab.save_artifact(final_model_file, "final_model_summary.txt")
+        lab.log(f"Saved final model summary: {final_model_path}")
+        
+        # Save training configuration as artifact
+        config_file = os.path.join(training_config["output_dir"], "training_config.json")
+        import json
+        with open(config_file, "w") as f:
+            json.dump(training_config, f, indent=2)
+        
+        config_artifact_path = lab.save_artifact(config_file, "training_config.json")
+        lab.log(f"Saved training config: {config_artifact_path}")
         # Get the captured wandb URL from job data for reporting
         job_data = lab.job.get_job_data()
         captured_wandb_url = job_data.get("wandb_run_url", "None")
diff --git a/src/lab/job.py b/src/lab/job.py
@@ -235,6 +235,58 @@ def get_next_queued_job(cls):
             return queued_jobs[0][1]
         return None
 
+    def get_checkpoints_dir(self):
+        """Return the checkpoints directory path for this job."""
+        # Path resolution is delegated to the dirs helper, keyed by job id
+        job_id = self.id
+        return dirs.get_job_checkpoints_dir(job_id)
+    
+    def get_artifacts_dir(self):
+        """Return the artifacts directory path for this job."""
+        # Path resolution is delegated to the dirs helper, keyed by job id
+        job_id = self.id
+        return dirs.get_job_artifacts_dir(job_id)
+    
+    def get_checkpoint_paths(self):
+        """
+        Get the sorted list of checkpoint file paths for this job.
+
+        Scans the job's checkpoints directory and returns the full paths of
+        the files directly inside it (subdirectories are skipped).
+        Returns an empty list if the directory is missing or the scan fails.
+        """
+        try:
+            checkpoints_dir = self.get_checkpoints_dir()
+            if not os.path.exists(checkpoints_dir):
+                return []
+            # Only direct children are considered; the scan is not recursive
+            names = os.listdir(checkpoints_dir)
+            candidates = (os.path.join(checkpoints_dir, name) for name in names)
+            return sorted(path for path in candidates if os.path.isfile(path))
+        except Exception:
+            # Best-effort lookup: callers get an empty list, never an error
+            return []
+    
+    
+    def get_artifact_paths(self):
+        """
+        Get the sorted list of artifact file paths for this job.
+
+        Scans the job's artifacts directory and returns the full paths of
+        the files directly inside it (subdirectories are skipped).
+        Returns an empty list if the directory is missing or the scan fails.
+        """
+        try:
+            artifacts_dir = self.get_artifacts_dir()
+            if not os.path.exists(artifacts_dir):
+                return []
+            names = os.listdir(artifacts_dir)
+            candidates = (os.path.join(artifacts_dir, name) for name in names)
+            return sorted(path for path in candidates if os.path.isfile(path))
+        except Exception:
+            # Best-effort lookup: callers get an empty list, never an error
+            return []
+
     def delete(self):
         """
         Mark this job as deleted.
diff --git a/src/lab/lab_facade.py b/src/lab/lab_facade.py
@@ -293,6 +293,34 @@ def job(self) -> Job:
         self._ensure_initialized()
         return self._job  # type: ignore[return-value]
 
+    def get_checkpoints_dir(self) -> str:
+        """Return the checkpoints directory path for the current job."""
+        # _ensure_initialized() runs before self._job is touched
+        self._ensure_initialized()
+        current_job = self._job
+        return current_job.get_checkpoints_dir()  # type: ignore[union-attr]
+    
+    def get_artifacts_dir(self) -> str:
+        """Return the artifacts directory path for the current job."""
+        # _ensure_initialized() runs before self._job is touched
+        self._ensure_initialized()
+        current_job = self._job
+        return current_job.get_artifacts_dir()  # type: ignore[union-attr]
+    
+    def get_checkpoint_paths(self) -> list[str]:
+        """Return the checkpoint file paths reported by the current job."""
+        # _ensure_initialized() runs before self._job is touched
+        self._ensure_initialized()
+        current_job = self._job
+        return current_job.get_checkpoint_paths()  # type: ignore[union-attr]
+    
+    def get_artifact_paths(self) -> list[str]:
+        """Return the artifact file paths reported by the current job."""
+        # _ensure_initialized() runs before self._job is touched
+        self._ensure_initialized()
+        current_job = self._job
+        return current_job.get_artifact_paths()  # type: ignore[union-attr]
+
     @property
     def experiment(self) -> Experiment:
         self._ensure_initialized()