@@ -57,7 +57,36 @@ def train():
         lab.update_progress(10 + (i + 1) * 10)
         print(f"Iteration {i + 1}/8")
 
-        # Method 3: Initialize wandb during training (common pattern)
+        # Save fake checkpoint every 2 iterations
+        if (i + 1) % 2 == 0:
+            checkpoint_file = os.path.join(training_config["output_dir"], f"checkpoint_epoch_{i + 1}.txt")
+            with open(checkpoint_file, "w") as f:
+                f.write(f"Fake checkpoint for epoch {i + 1}\n")
+                f.write(f"Model state: iteration_{i + 1}\n")
+                f.write(f"Loss: {0.5 - (i + 1) * 0.05:.3f}\n")
+                f.write(f"Accuracy: {0.6 + (i + 1) * 0.04:.3f}\n")
+                f.write(f"Timestamp: {datetime.now()}\n")
+
+            # Save checkpoint using lab facade
+            saved_checkpoint_path = lab.save_checkpoint(checkpoint_file, f"epoch_{i + 1}_checkpoint.txt")
+            lab.log(f"Saved checkpoint: {saved_checkpoint_path}")
+
+            # Save some fake artifacts
+            artifact_file = os.path.join(training_config["output_dir"], f"training_metrics_epoch_{i + 1}.json")
+            with open(artifact_file, "w") as f:
+                f.write('{\n')
+                f.write(f'  "epoch": {i + 1},\n')
+                f.write(f'  "loss": {0.5 - (i + 1) * 0.05:.3f},\n')
+                f.write(f'  "accuracy": {0.6 + (i + 1) * 0.04:.3f},\n')
+                f.write(f'  "learning_rate": {2e-5},\n')
+                f.write(f'  "batch_size": {8},\n')
+                f.write(f'  "timestamp": "{datetime.now().isoformat()}"\n')
+                f.write('}\n')
+
+            # Save artifact using lab facade
+            saved_artifact_path = lab.save_artifact(artifact_file, f"metrics_epoch_{i + 1}.json")
+            lab.log(f"Saved artifact: {saved_artifact_path}")
+
         if i == 3:  # Initialize wandb halfway through training
             try:
                 import wandb
@@ -97,6 +126,30 @@ def train():
     training_duration = end_time - start_time
     lab.log(f"Training completed in {training_duration}")
 
+    # Save final artifacts
+    final_model_file = os.path.join(training_config["output_dir"], "final_model_summary.txt")
+    with open(final_model_file, "w") as f:
+        f.write("Final Model Summary\n")
+        f.write("==================\n")
+        f.write(f"Training Duration: {training_duration}\n")
+        f.write("Final Loss: 0.15\n")
+        f.write("Final Accuracy: 0.92\n")
+        f.write(f"Model: {training_config['model_name']}\n")
+        f.write(f"Dataset: {training_config['dataset']}\n")
+        f.write(f"Completed at: {end_time}\n")
+
+    # Save final model as artifact
+    final_model_path = lab.save_artifact(final_model_file, "final_model_summary.txt")
+    lab.log(f"Saved final model summary: {final_model_path}")
+
+    # Save training configuration as artifact
+    config_file = os.path.join(training_config["output_dir"], "training_config.json")
+    import json
+    with open(config_file, "w") as f:
+        json.dump(training_config, f, indent=2)
+
+    config_artifact_path = lab.save_artifact(config_file, "training_config.json")
+    lab.log(f"Saved training config: {config_artifact_path}")
     # Get the captured wandb URL from job data for reporting
     job_data = lab.job.get_job_data()
     captured_wandb_url = job_data.get("wandb_run_url", "None")
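
For anyone reading the diff without the rest of the repo: the script leans on a `lab` facade (`log`, `update_progress`, `save_checkpoint`, `save_artifact`, `lab.job.get_job_data`). The sketch below is a minimal local stub of that interface for dry-running `train()` outside the platform; the class names `FakeLab` and `_FakeJob`, the `./lab_workspace` layout, and the method bodies are assumptions, not the project's actual implementation.

```python
# Hypothetical stand-in for the lab facade referenced in train().
# Everything here (FakeLab, _FakeJob, the workspace layout) is assumed
# for local testing only; the real facade is provided by the project.
import os
import shutil
from datetime import datetime


class _FakeJob:
    """Minimal job-data holder mirroring lab.job.get_job_data()."""

    def __init__(self):
        self.data = {}

    def get_job_data(self):
        return self.data


class FakeLab:
    def __init__(self, workspace="./lab_workspace"):
        self.workspace = workspace
        self.job = _FakeJob()
        os.makedirs(workspace, exist_ok=True)

    def log(self, message):
        # Timestamped stand-in for the facade's logger.
        print(f"[{datetime.now().isoformat()}] {message}")

    def update_progress(self, percent):
        print(f"progress: {percent}%")

    def _store(self, subdir, src_path, name):
        # Copy the file into the workspace and return the stored path,
        # matching how the training loop logs the returned location.
        dest = os.path.join(self.workspace, subdir, name)
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        shutil.copy2(src_path, dest)
        return dest

    def save_checkpoint(self, src_path, name):
        return self._store("checkpoints", src_path, name)

    def save_artifact(self, src_path, name):
        return self._store("artifacts", src_path, name)
```

Pointing `lab` at an instance of this stub reproduces the copy-and-log pattern the loop above relies on for checkpoints and artifacts.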