Update README with all new features (logging, resume, Inspect AI, mock mode)

haasonsaas · haasonsaas · commit 804148b88b8f · 2025-10-01T18:53:30.000-07:00
diff --git a/checkpoint_manager.py b/checkpoint_manager.py
@@ -43,7 +43,7 @@ def save_run_state(
             "round_idx": round_idx,
             "global_step": global_step,
             "learning_rate": learning_rate,
-            "checkpoint_uri": checkpoint_uri,
+            "checkpoint_uri": str(checkpoint_uri),
             "config": config,
             "timestamp": datetime.now().isoformat(),
         }
diff --git a/mock_checkpoints/round_3.json b/mock_checkpoints/round_3.json
@@ -0,0 +1 @@
+{"name": "round_3", "mock": true}
diff --git a/runs/20251001_185248/metrics.jsonl b/runs/20251001_185248/metrics.jsonl
@@ -0,0 +1,20 @@
+{"timestamp": "2025-10-01T18:52:48.424598", "run_id": "20251001_185248", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-41/test_early_stopping_on_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 5, "lr_decay": 0.8, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
+{"timestamp": "2025-10-01T18:52:48.425049", "run_id": "20251001_185248", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503094000'>"}
+{"timestamp": "2025-10-01T18:52:48.425252", "run_id": "20251001_185248", "event": "evaluation", "round": 1, "score": 0.85, "threshold": 0.8, "passed": true, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.426888", "run_id": "20251001_185248", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-41/test_full_rounds_below_thresho0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.9, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
+{"timestamp": "2025-10-01T18:52:48.427227", "run_id": "20251001_185248", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503098368'>"}
+{"timestamp": "2025-10-01T18:52:48.427394", "run_id": "20251001_185248", "event": "evaluation", "round": 1, "score": 0.7, "threshold": 0.9, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.427519", "run_id": "20251001_185248", "event": "checkpoint", "round": 2, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503098368'>"}
+{"timestamp": "2025-10-01T18:52:48.427655", "run_id": "20251001_185248", "event": "evaluation", "round": 2, "score": 0.7, "threshold": 0.9, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.427770", "run_id": "20251001_185248", "event": "checkpoint", "round": 3, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503098368'>"}
+{"timestamp": "2025-10-01T18:52:48.427898", "run_id": "20251001_185248", "event": "evaluation", "round": 3, "score": 0.7, "threshold": 0.9, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.430702", "run_id": "20251001_185248", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-41/test_evalops_integration_calle0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 0.0001, "eval_threshold": 0.8, "max_rounds": 1, "lr_decay": 0.8, "evalops_enabled": true, "evalops_test_suite_id": "suite-123", "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
+{"timestamp": "2025-10-01T18:52:48.431038", "run_id": "20251001_185248", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4498415232'>"}
+{"timestamp": "2025-10-01T18:52:48.431243", "run_id": "20251001_185248", "event": "evaluation", "round": 1, "score": 0.9, "threshold": 0.8, "passed": true, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.432839", "run_id": "20251001_185248", "event": "config", "base_model": "test-model", "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-41/test_lr_decay_across_rounds0/train.jsonl", "eval_tasks": [], "renderer_name": "default", "learning_rate": 1.0, "eval_threshold": 0.99, "max_rounds": 3, "lr_decay": 0.5, "evalops_enabled": false, "evalops_test_suite_id": null, "evalops_api_url": null, "steps_per_round": 1, "batch_size": 8, "max_seq_length": 2048, "lora_rank": 16, "warmup_steps": 0, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
+{"timestamp": "2025-10-01T18:52:48.433169", "run_id": "20251001_185248", "event": "checkpoint", "round": 1, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503095008'>"}
+{"timestamp": "2025-10-01T18:52:48.433329", "run_id": "20251001_185248", "event": "evaluation", "round": 1, "score": 0.7, "threshold": 0.99, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.433460", "run_id": "20251001_185248", "event": "checkpoint", "round": 2, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503095008'>"}
+{"timestamp": "2025-10-01T18:52:48.433680", "run_id": "20251001_185248", "event": "evaluation", "round": 2, "score": 0.7, "threshold": 0.99, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:48.433796", "run_id": "20251001_185248", "event": "checkpoint", "round": 3, "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503095008'>"}
+{"timestamp": "2025-10-01T18:52:48.433937", "run_id": "20251001_185248", "event": "evaluation", "round": 3, "score": 0.7, "threshold": 0.99, "passed": false, "metrics": {}}
diff --git a/runs/20251001_185248/run_state.json b/runs/20251001_185248/run_state.json
@@ -0,0 +1,28 @@
+{
+  "round_idx": 3,
+  "global_step": 3,
+  "learning_rate": 0.25,
+  "checkpoint_uri": "<MagicMock name='mock.create_lora_training_client().save_weights_for_sampler().result().path' id='4503095008'>",
+  "config": {
+    "base_model": "test-model",
+    "train_file": "/private/var/folders/1s/sm2bn6kd64gdnv757t9qshcm0000gn/T/pytest-of-jonathanhaas/pytest-41/test_lr_decay_across_rounds0/train.jsonl",
+    "eval_tasks": [],
+    "renderer_name": "default",
+    "learning_rate": 1.0,
+    "eval_threshold": 0.99,
+    "max_rounds": 3,
+    "lr_decay": 0.5,
+    "evalops_enabled": false,
+    "evalops_test_suite_id": null,
+    "evalops_api_url": null,
+    "steps_per_round": 1,
+    "batch_size": 8,
+    "max_seq_length": 2048,
+    "lora_rank": 16,
+    "warmup_steps": 0,
+    "max_steps": 1000,
+    "min_lr": 1e-06,
+    "use_recommended_lr": false
+  },
+  "timestamp": "2025-10-01T18:52:48.433866"
+}
diff --git a/runs/20251001_185258/metrics.jsonl b/runs/20251001_185258/metrics.jsonl
@@ -0,0 +1,7 @@
+{"timestamp": "2025-10-01T18:52:58.032742", "run_id": "20251001_185258", "event": "config", "base_model": "meta-llama/Llama-3.1-8B-Instruct", "train_file": "demo_data.jsonl", "eval_tasks": ["simple_qa"], "renderer_name": "llama3", "learning_rate": 0.0003, "eval_threshold": 0.75, "max_rounds": 3, "lr_decay": 0.6, "evalops_enabled": false, "evalops_test_suite_id": "", "evalops_api_url": null, "steps_per_round": 5, "batch_size": 4, "max_seq_length": 512, "lora_rank": 16, "warmup_steps": 100, "max_steps": 1000, "min_lr": 1e-06, "use_recommended_lr": false}
+{"timestamp": "2025-10-01T18:52:58.144341", "run_id": "20251001_185258", "event": "checkpoint", "round": 1, "checkpoint_uri": "mock://checkpoint/round_1"}
+{"timestamp": "2025-10-01T18:52:58.144892", "run_id": "20251001_185258", "event": "evaluation", "round": 1, "score": 0.4, "threshold": 0.75, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:58.256871", "run_id": "20251001_185258", "event": "checkpoint", "round": 2, "checkpoint_uri": "mock://checkpoint/round_2"}
+{"timestamp": "2025-10-01T18:52:58.257349", "run_id": "20251001_185258", "event": "evaluation", "round": 2, "score": 0.6, "threshold": 0.75, "passed": false, "metrics": {}}
+{"timestamp": "2025-10-01T18:52:58.368808", "run_id": "20251001_185258", "event": "checkpoint", "round": 3, "checkpoint_uri": "mock://checkpoint/round_3"}
+{"timestamp": "2025-10-01T18:52:58.370565", "run_id": "20251001_185258", "event": "evaluation", "round": 3, "score": 1.0, "threshold": 0.75, "passed": true, "metrics": {}}
diff --git a/runs/20251001_185258/run_state.json b/runs/20251001_185258/run_state.json
@@ -0,0 +1,30 @@
+{
+  "round_idx": 3,
+  "global_step": 15,
+  "learning_rate": 0.00010799999999999998,
+  "checkpoint_uri": "mock://checkpoint/round_3",
+  "config": {
+    "base_model": "meta-llama/Llama-3.1-8B-Instruct",
+    "train_file": "demo_data.jsonl",
+    "eval_tasks": [
+      "simple_qa"
+    ],
+    "renderer_name": "llama3",
+    "learning_rate": 0.0003,
+    "eval_threshold": 0.75,
+    "max_rounds": 3,
+    "lr_decay": 0.6,
+    "evalops_enabled": false,
+    "evalops_test_suite_id": "",
+    "evalops_api_url": null,
+    "steps_per_round": 5,
+    "batch_size": 4,
+    "max_seq_length": 512,
+    "lora_rank": 16,
+    "warmup_steps": 100,
+    "max_steps": 1000,
+    "min_lr": 1e-06,
+    "use_recommended_lr": false
+  },
+  "timestamp": "2025-10-01T18:52:58.369674"
+}

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ def save_run_state(`
`43`	`43`	`"round_idx": round_idx,`
`44`	`44`	`"global_step": global_step,`
`45`	`45`	`"learning_rate": learning_rate,`
`46`		`- "checkpoint_uri": checkpoint_uri,`
	`46`	`+ "checkpoint_uri": str(checkpoint_uri),`
`47`	`47`	`"config": config,`
`48`	`48`	`"timestamp": datetime.now().isoformat(),`
`49`	`49`	`}`