
Commit 82a43ff

[refactor] step zero definition (PrimeIntellect-ai#528)
* start step at 0 which makes async level 0 sync
* fix: seed shouldnt have -1
* train step 0 start for trainer
* keep async level around
* DEBUG
* fix: +1
* +1
* Revert "DEBUG"
  This reverts commit 6ba48b4b3f3958a741b8ad13b5dd2279e0ea31aa.
* docs
1 parent 2ffcb6f commit 82a43ff

File tree

4 files changed: +24 / -13 lines changed
README.md

Lines changed: 11 additions & 0 deletions
@@ -220,6 +220,17 @@ To run fast tests, use the inverse of the `slow` marker:
 uv run pytest -v -m "not slow"
 ```
 
+### Checkpoint, rollout, step numbering and async level
+At each step `n`, all artifacts (e.g., checkpoint, rollout, gradient) are tagged with the same step number.
+- Step 0:
+  - Uses checkpoint 0 on rollout 0 to compute gradient 0.
+  - Then computes checkpoint 1 as: `ckpt 1 = ckpt 0 - grad 0`
+
+In general, the model used for generating rollouts at step `n` is from `ckpt[n - async_level]`.
+
+- When async_level = 0, the rollout and gradient are based on the same model version.
+  This is equivalent to synchronous on-policy training.
+
 ## Citation
 
 *TBD*
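The numbering rule added to the README is easiest to see in a tiny sketch. The code below is not repo code: `ckpt_for_rollout` is a hypothetical helper that returns which checkpoint serves the rollouts at a given step, assuming (as described above and in the orchestrator) that the checkpoint index is clamped at 0 during the first steps.

```python
# Hypothetical helper illustrating the step/checkpoint relationship described above;
# not part of the zeroband codebase.

def ckpt_for_rollout(step: int, async_level: int) -> int:
    """Checkpoint version used to generate rollouts at `step`.

    The orchestrator never waits for a checkpoint newer than step - async_level,
    and checkpoint 0 (the initial weights) covers the warm-up steps.
    """
    return max(step - async_level, 0)


if __name__ == "__main__":
    for async_level in (0, 1, 2):
        used = [ckpt_for_rollout(step, async_level) for step in range(5)]
        print(f"async_level={async_level}: steps 0-4 roll out from checkpoints {used}")
    # async_level=0 -> [0, 1, 2, 3, 4]: rollout and gradient share one model version (on-policy)
    # async_level=1 -> [0, 0, 1, 2, 3]: rollouts may lag the trainer by one checkpoint
```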

src/zeroband/training/ckpt.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 
 @dataclass
 class TrainingProgress:
-    step: int = 1
+    step: int = 0
     total_tokens: int = 0
     total_samples: int = 0
 
src/zeroband/training/orchestrator/orchestrator.py

Lines changed: 8 additions & 9 deletions
@@ -106,19 +106,19 @@ async def orchestrate(config: OrchestratorConfig, setup_queue: Queue | None = No
     total_tokens, total_samples = 0, 0
     ckpt_step = 0
     last_eval_step = -1
-    epoch = 0
+    epoch = -1
 
-    for step in range(1, int(max_steps) + 1):
+    for step in range(int(max_steps)):
         # Check if we need to start a new epoch
-        epoch_step = (step - 1) % steps_per_epoch
+        epoch_step = step % steps_per_epoch
         if epoch_step == 0:
             epoch += 1
             logger.info(f"Starting epoch {epoch}")
             # Reshuffle dataset at the beginning of each epoch
-            dataset = dataset.shuffle(seed=(config.seed or 0) + epoch - 1)
+            dataset = dataset.shuffle(seed=(config.seed or 0) + epoch)
 
         logger.debug(
-            f"Orchestrator step {step} (epoch: {epoch}, epoch_step: {epoch_step + 1}/{steps_per_epoch}, checkpoint step: {ckpt_step})"
+            f"Orchestrator step {step} (epoch: {epoch}, epoch_step: {epoch_step}/{steps_per_epoch}, checkpoint step: {ckpt_step})"
         )
         step_start_time = time.time()
 

@@ -131,15 +131,14 @@ async def orchestrate(config: OrchestratorConfig, setup_queue: Queue | None = No
         batch_messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
 
         # Optionally, wait for the next checkpoint to be available
-        async_level = step - 1 - ckpt_step  # How many steps training ahead
         wait_for_weight_ckpt_time, reload_weights_time = 0, 0
-        if async_level > config.async_level:
-            ckpt_step = step - 1 - config.async_level
+        if step - ckpt_step > config.async_level:
             logger.debug(
-                f"Hit async barrier because step {step} is {async_level} (>{config.async_level}) steps ahead of checkpoint step {ckpt_step}."
+                f"Hit async barrier because step {step} is {step - ckpt_step} (>{config.async_level}) steps ahead of checkpoint step {ckpt_step}."
             )
 
             # Wait for the checkpoint to be available
+            ckpt_step = step - config.async_level
             logger.debug(f"Waiting for weight checkpoint for step {ckpt_step}")
             wait_for_weight_ckpt_start_time = time.time()
             wait_for_weight_checkpoint(config.weights.path, ckpt_step)
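To make the new barrier condition concrete, here is a small standalone simulation (a hypothetical `simulate_orchestrator` helper, not repo code) of how `ckpt_step` advances under the 0-based loop: the orchestrator only blocks when `step - ckpt_step > async_level`, and then waits for checkpoint `step - async_level`.

```python
# Standalone simulation of the async barrier shown in the diff above.
# Assumed/simplified: the real orchestrator also reshuffles data and reloads weights.

def simulate_orchestrator(max_steps: int, async_level: int) -> list[int]:
    """Return the checkpoint step used for rollouts at each orchestrator step."""
    ckpt_step = 0
    used = []
    for step in range(max_steps):
        if step - ckpt_step > async_level:
            # Hit the async barrier: wait until checkpoint `step - async_level` exists.
            ckpt_step = step - async_level
        used.append(ckpt_step)
    return used


if __name__ == "__main__":
    print(simulate_orchestrator(5, async_level=0))  # [0, 1, 2, 3, 4] -> fully synchronous, on-policy
    print(simulate_orchestrator(5, async_level=2))  # [0, 0, 0, 1, 2] -> trainer may run ahead by 2
```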

src/zeroband/training/train.py

Lines changed: 4 additions & 3 deletions
@@ -101,6 +101,7 @@ def train(config: TrainingConfig):
         logger.info(f"Initializing shardcast from {envs.SHARDCAST_OUTPUT_DIR}")
         shardcast.initialize(
             envs.SHARDCAST_OUTPUT_DIR,
+            # +1 to ensure to not delete current checkpoint when async_level=0
             max_distribution_folders=config.async_level + 1,
         )
 

@@ -171,7 +172,7 @@ def train(config: TrainingConfig):
         if config.recompute_logprobs:
             logger.debug("Recomputing logprobs")
             compute_logprobs_start_time = time.time()
-            og_infer_step = progress.step - 1 - config.async_level  # -1 because we haven't updated the model yet
+            og_infer_step = progress.step - config.async_level
             infer_step = max(og_infer_step, 0)
 
             # Wake up the logprob model from CPU

@@ -268,7 +269,7 @@ def train(config: TrainingConfig):
         optimizer.zero_grad()
 
         # Save the weight checkpoint
-        step_path = Path(config.weights.path) / f"step_{progress.step}"
+        step_path = Path(config.weights.path) / f"step_{progress.step + 1}"
         save_weights_start_time = time.time()
         model_path = save_weight_checkpoint(model, tokenizer, step_path, async_save=config.weights.save_async)
         active_weight_checkpoint_paths.append(step_path)

@@ -310,7 +311,7 @@ def train(config: TrainingConfig):
         if config.recompute_logprobs:
             logger.debug("Offloading updated model to CPU")
             reshard_module(logprob_model)
-            tensor_offloaded_repository[progress.step] = copy_model_to_cpu(model)
+            tensor_offloaded_repository[progress.step + 1] = copy_model_to_cpu(model)
 
         # Compute step metrics
         num_local_tokens = micro_batch_size * seq_len * num_micro_batches
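A rough sketch of how the trainer-side renumbering fits together, using hypothetical helpers (`weight_ckpt_path`, `retained_ckpt_steps`) and an assumed directory layout: trainer step `n` consumes rollouts tagged `n` and writes its updated weights as `step_{n + 1}`, and shardcast keeps `async_level + 1` folders so the checkpoint still being served is not deleted when `async_level = 0`.

```python
from pathlib import Path

# Sketch of the trainer-side numbering after this commit (assumed layout, not repo code):
# step n trains on checkpoint n and then writes checkpoint n + 1.

def weight_ckpt_path(weights_dir: str, trainer_step: int) -> Path:
    """Directory where the weights produced by `trainer_step` are saved."""
    return Path(weights_dir) / f"step_{trainer_step + 1}"


def retained_ckpt_steps(latest_saved: int, async_level: int) -> list[int]:
    """Checkpoint steps kept when max_distribution_folders = async_level + 1."""
    keep = async_level + 1
    return list(range(max(latest_saved - keep + 1, 0), latest_saved + 1))


if __name__ == "__main__":
    print(weight_ckpt_path("/ckpts", trainer_step=0))           # /ckpts/step_1
    print(retained_ckpt_steps(latest_saved=3, async_level=0))   # [3]: the +1 keeps the checkpoint in use
    print(retained_ckpt_steps(latest_saved=3, async_level=2))   # [1, 2, 3]
```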
