Skip to content

Commit b164f58

Browse files
polinabinder1 authored and camirr-nv committed
fixing the ESM2 checkpointing issue (#842)
### Description

This addresses: #757

### Type of changes

In the original code the optimizer was not saved in the checkpoint, but it is expected in the megatron strategy. Saving the optimizer is added to the checkpoint callback and the test has been updated. Changes were made to have the checkpointing callback in the nemo logger. This is the same as the training path in sub-packages/bionemo-llm/src/bionemo/llm/train.py.

---------

Signed-off-by: Polina Binder <pbinder@nvidia.com>
Signed-off-by: polinabinder1 <pbinder@nvidia.com>
Signed-off-by: Ubuntu <camirr@nvidia.com>
1 parent cab1f65 commit b164f58

File tree

2 files changed

+23
-53
lines changed

2 files changed

+23
-53
lines changed

sub-packages/bionemo-esm2/src/bionemo/esm2/scripts/train_esm2.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,11 @@ def main(
338338
# Enables the .nemo file-like checkpointing where all IOMixins are under SerDe
339339
filename="{epoch}-{step}-{consumed_samples}",
340340
# Including step and consumed_samples in the checkpoint filename prevents duplicate filenames and bugs related to this.
341+
# Save both the weights and the optimizer state.
342+
save_weights_only=False,
343+
save_optim_on_train_end=True,
341344
)
345+
342346
callbacks.append(checkpoint_callback)
343347

344348
auto_resume = resume.AutoResume(

sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_train_esm2.py

Lines changed: 19 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def get_command_line_args(arg_name, arg_value) -> str:
184184
arg_str = f"--{arg_name.replace('_', '-')}={arg_value}"
185185
return arg_str
186186

187-
cmd = "train_esm2 " + " ".join(get_command_line_args(arg_name, arg_value) for arg_name, arg_value in args.items())
187+
cmd = f"train_esm2 {' '.join(get_command_line_args(arg_name, arg_value) for arg_name, arg_value in args.items())}"
188188
return cmd
189189

190190

@@ -332,10 +332,6 @@ def test_main_runs(tmp_path, dummy_protein_dataset, dummy_parquet_train_val_inpu
332332

333333

334334
@pytest.mark.slow
335-
@pytest.mark.xfail(
336-
reason="ESM2 training fails to resume from checkpoints. "
337-
"Issue: https://github.com/NVIDIA/bionemo-framework/issues/757"
338-
)
339335
def test_main_stop_at_num_steps_and_continue(tmp_path, dummy_protein_dataset, dummy_parquet_train_val_inputs):
340336
max_steps_first_run = 4
341337
max_steps_second_run = 6
@@ -361,11 +357,9 @@ def test_main_stop_at_num_steps_and_continue(tmp_path, dummy_protein_dataset, du
361357
)
362358

363359
# The first training command to finish at max_steps_first_run
364-
stdout_first_run, stderr_first_run, returncode_first_run = run_command_in_subprocess(
365-
command=command_first_run, path=str(tmp_path)
366-
)
367-
368-
assert returncode_first_run == 0, "Command failed."
360+
# stdout_first_run, stderr_first_run, returncode_first_run
361+
#
362+
stdout_first_run = run_command_in_subprocess(command=command_first_run, path=str(tmp_path))
369363

370364
assert f"Training epoch 0, iteration 0/{max_steps_first_run - 1}" in stdout_first_run
371365
# Extract and validate global steps
@@ -403,53 +397,25 @@ def test_main_stop_at_num_steps_and_continue(tmp_path, dummy_protein_dataset, du
403397
wandb_project=None,
404398
experiment_name=experiment_name,
405399
)
406-
stdout_second_run, stderr_second_run, returncode_second_run = run_command_in_subprocess(
407-
command=command_second_run, path=str(tmp_path)
408-
)
400+
stdout_second_run = run_command_in_subprocess(command=command_second_run, path=str(tmp_path))
409401

410-
# Verify that the command failed with a non-zero exit code
411-
# This assertion will fail if the resume functionality gets fixed, prompting us to update the test
412-
assert returncode_second_run != 0, (
413-
"Resuming training passed. "
414-
"The resume functionality works and this test needs to be updated. "
415-
"Update issue https://github.com/NVIDIA/bionemo-framework/issues/757"
416-
)
402+
# Verify the model can continue training from the checkpoint without errors
417403

418-
# Verify the error message contains the expected exception type and error message
419-
# The test fails if we get a different error than expected, which would require investigation
420-
assert (
421-
"megatron.core.dist_checkpointing.core.CheckpointingException" in stderr_second_run
422-
and "Cannot find global shape metadata for N-D flattened tensor ShardedTensor" in stderr_second_run
423-
), f"ESM2 training resuming failed due to an unexpected error.\nActual stderr: {stderr_second_run}..."
424-
425-
# Output the error for logging purposes
426-
pytest.fail(
427-
"Detected expected failure with megatron.core.dist_checkpointing.core.CheckpointingException as anticipated. "
428-
"This is a known issue tracked in: https://github.com/NVIDIA/bionemo-framework/issues/757"
429-
)
404+
global_steps_second_run = extract_global_steps_from_log(stdout_second_run)
430405

431-
### TODO: The following section should be enabled when issue #757 is resolved ###
432-
# Once the issue is fixed, we'll need to:
433-
# 1. Remove the assertion expecting a non-zero return code
434-
# 2. Replace with below assertions that verify successful resumption
435-
# 3. Check for specific markers in the output that indicate successful state restoration
436-
# 4. Verify the model can continue training from the checkpoint without errors
406+
assert global_steps_second_run[0] == max_steps_first_run
407+
assert global_steps_second_run[-1] == max_steps_second_run - 1
408+
assert len(global_steps_second_run) == max_steps_second_run - max_steps_first_run
437409

438-
# global_steps_second_run = extract_global_steps_from_log(stdout_second_run)
439-
#
440-
# assert global_steps_second_run[0] == max_steps_first_run
441-
# assert global_steps_second_run[-1] == max_steps_second_run - 1
442-
# assert len(global_steps_second_run) == max_steps_second_run - max_steps_first_run
443-
#
444-
# expected_checkpoint_second_run_suffix = f"step={max_steps_second_run - 1}"
445-
# matching_subfolders = [
446-
# p
447-
# for p in checkpoints_dir.iterdir()
448-
# if p.is_dir() and (expected_checkpoint_second_run_suffix in p.name and "last" in p.name)
449-
# ]
450-
# assert matching_subfolders, (
451-
# f"No checkpoint subfolder ending with '{expected_checkpoint_second_run_suffix}' found in {checkpoints_dir}."
452-
# )
410+
expected_checkpoint_second_run_suffix = f"step={max_steps_second_run - 1}"
411+
matching_subfolders = [
412+
p
413+
for p in checkpoints_dir.iterdir()
414+
if p.is_dir() and (expected_checkpoint_second_run_suffix in p.name and "last" in p.name)
415+
]
416+
assert matching_subfolders, (
417+
f"No checkpoint subfolder ending with '{expected_checkpoint_second_run_suffix}' found in {checkpoints_dir}."
418+
)
453419

454420

455421
@pytest.mark.parametrize("limit_val_batches", [0.0, 1.0, 4, None])

0 commit comments

Comments (0)