Commit 6ab0afc

farhadrgh, dorotat-nv, cspades, trvachov, and nvdreidenbach authored
Update EVO2 tests according to Hyena arch changes (#798)
### Description

NVIDIA-NeMo/NeMo#12856 introduces code reduction and perf improvements, including standardizing input/output shapes for Hyena operators and consequently reducing rearrangement overhead. This PR updates the EVO2 tests to comply with those changes.

### Type of changes

<!-- Mark the relevant option with an [x] -->

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### CI Pipeline Configuration

Configure CI behavior by applying the relevant labels:

- [SKIP_CI](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#skip_ci) - Skip all continuous integration tests
- [INCLUDE_NOTEBOOKS_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_notebooks_tests) - Execute notebook validation tests in pytest
- [INCLUDE_SLOW_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_slow_tests) - Execute tests labelled as slow in pytest for extensive testing

> [!NOTE]
> By default, the notebook validation tests are skipped unless explicitly enabled.

#### Authorizing CI Runs

We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources.

* If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123).
* If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit.
### Usage

<!--- How does a user interact with the changed code -->

```python
TODO: Add code snippet
```

### Pre-submit Checklist

<!--- Ensure all items are completed before submitting -->

- [ ] I have tested these changes locally
- [ ] I have updated the documentation accordingly
- [ ] I have added/updated tests as needed
- [ ] All existing tests pass successfully

---------

Signed-off-by: Farhad Ramezanghorbani <farhadr@nvidia.com>
Signed-off-by: Cory Ye <cye@nvidia.com>
Signed-off-by: cspades <cory0ye@gmail.com>
Signed-off-by: Timur Rvachov <trvachov@nvidia.com>
Signed-off-by: Danny <dreidenbach@nvidia.com>
Signed-off-by: Cory Ye <44509866+cspades@users.noreply.github.com>
Signed-off-by: nvdreidenbach <97637601+nvdreidenbach@users.noreply.github.com>
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Polina Binder <pbinder@nvidia.com>
Signed-off-by: polinabinder1 <pbinder@nvidia.com>
Signed-off-by: dorotat <dorotat@nvidia.com>
Signed-off-by: Truong Nguyen <tgnguyen@nvidia.com>
Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
Signed-off-by: Timur Rvachov <120140748+trvachov@users.noreply.github.com>
Signed-off-by: Steven <skothenhill@nvidia.com>
Co-authored-by: Dorota Toczydlowska <115542912+dorotat-nv@users.noreply.github.com>
Co-authored-by: Cory Ye <44509866+cspades@users.noreply.github.com>
Co-authored-by: Timur Rvachov <120140748+trvachov@users.noreply.github.com>
Co-authored-by: nvdreidenbach <97637601+nvdreidenbach@users.noreply.github.com>
Co-authored-by: Steven Kothen-Hill <148821680+skothenhill-nv@users.noreply.github.com>
Co-authored-by: Peter St. John <pstjohn@nvidia.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: polinabinder1 <pbinder@nvidia.com>
Co-authored-by: Truong Nguyen <tgnguyen@nvidia.com>
Co-authored-by: jomitchellnv <148147880+jomitchellnv@users.noreply.github.com>
Co-authored-by: lvojtku <lvojtku@nvidia.com>
1 parent 3936231 commit 6ab0afc

File tree

11 files changed

+62
-61
lines changed


3rdparty/Megatron-LM

Submodule Megatron-LM updated 969 files

3rdparty/NeMo

Submodule NeMo updated from b685967 to 42d2b55

sub-packages/bionemo-amplify/tests/bionemo/amplify/test_hf_rotary.py

Lines changed: 15 additions & 3 deletions

```diff
@@ -14,7 +14,7 @@
 # limitations under the License.

 import torch
-from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
+from megatron.core.models.common.embeddings.rope_utils import _apply_rotary_pos_emb_bshd
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from transformers import AutoConfig

@@ -47,8 +47,20 @@ def test_rope_embeddings():
         seq_len_interpolation_factor=nemo_config.seq_len_interpolation_factor,
     )
     rotary_pos_emb = rotary_pos_layer(q.shape[1])
-    q_post_nemo = apply_rotary_pos_emb(q.transpose(0, 1).cuda(), rotary_pos_emb.cuda(), config=nemo_config).cpu()
-    k_post_nemo = apply_rotary_pos_emb(k.transpose(0, 1).cuda(), rotary_pos_emb.cuda(), config=nemo_config).cpu()
+    # Note: Use the backend implementation of the RoPE to avoid
+    # getting or instantiating a CP process group.
+    q_post_nemo = _apply_rotary_pos_emb_bshd(
+        q.transpose(0, 1).cuda(),
+        rotary_pos_emb.cuda(),
+        rotary_interleaved=nemo_config.rotary_interleaved,
+        multi_latent_attention=nemo_config.multi_latent_attention,
+    ).cpu()
+    k_post_nemo = _apply_rotary_pos_emb_bshd(
+        k.transpose(0, 1).cuda(),
+        rotary_pos_emb.cuda(),
+        rotary_interleaved=nemo_config.rotary_interleaved,
+        multi_latent_attention=nemo_config.multi_latent_attention,
+    ).cpu()

     torch.testing.assert_close(q_post, q_post_nemo.transpose(0, 1))
     torch.testing.assert_close(k_post, k_post_nemo.transpose(0, 1))
```
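For intuition, the rotation that `_apply_rotary_pos_emb_bshd` performs boils down to an elementwise 2D rotation of channel pairs by a position-dependent angle. A minimal pure-Python sketch of that core operation (the helper name here is illustrative, not part of Megatron's API):

```python
import math


def rope_rotate_pair(x0: float, x1: float, theta: float) -> tuple:
    """Rotate one (x0, x1) channel pair by angle theta.

    This is the per-pair rotation at the heart of rotary position
    embeddings in the [seq, batch, head, dim] ("bshd") layout.
    """
    c, s = math.cos(theta), math.sin(theta)
    # Standard 2D rotation matrix applied to the pair.
    return (x0 * c - x1 * s, x0 * s + x1 * c)
```

Rotating the unit vector (1, 0) by 90 degrees yields (approximately) (0, 1), which is a quick sanity check on the convention.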

sub-packages/bionemo-esm2/tests/bionemo/esm2/scripts/test_train_esm2.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -327,7 +327,7 @@ def test_main_runs(tmp_path, dummy_protein_dataset, dummy_parquet_train_val_inpu
     event_files = list(log_dir.rglob("events.out.tfevents*"))
     assert event_files, f"No TensorBoard event files found under {log_dir}"
     assert "val_ppl" in trainer.logged_metrics  # validation logging on by default
-    assert "tflops_per_sec_per_gpu" in trainer.logged_metrics  # ensuring that tflops logger can be added
+    assert "TFLOPS_per_GPU" in trainer.logged_metrics  # ensuring that tflops logger can be added
     assert "train_step_timing in s" in trainer.logged_metrics
```
sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -157,7 +157,9 @@ def predict_step(self, batch, batch_idx: Optional[int] = None) -> Tensor:
             return forward_out
         # Reminder: the model's predictions for input i land at output i+1. To get everything to align, we prepend the
         # EOS token to the input sequences and take the outputs for all but the first token.
-        forward_out_tp_gathered = _gather_along_last_dim(forward_out)
+        forward_out_tp_gathered = _gather_along_last_dim(
+            forward_out, group=parallel_state.get_tensor_model_parallel_group()
+        )
         # else:
         #     forward_out_tp_gathered = _collect_into_dim(forward_out, dim=-1)
         forward_out_gathered = _gather_along_cp_dim(forward_out_tp_gathered)
```
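Conceptually, `_gather_along_last_dim` concatenates each tensor-parallel rank's output shard along the last dimension (e.g. vocab-parallel logits). A single-process, list-based sketch of that behavior, assuming each shard is a list of rows (the function below is an illustration, not the Megatron implementation, which uses `torch.distributed` collectives):

```python
def gather_along_last_dim(shards):
    """Concatenate per-rank output shards along the last axis.

    `shards` is a list with one entry per (hypothetical) TP rank; each
    entry is a list of rows. The result has the same rows, with each
    row formed by concatenating that row's slice from every rank.
    """
    n_rows = len(shards[0])
    # For each row index, join that row's pieces from all ranks in rank order.
    return [sum((shard[r] for shard in shards), []) for r in range(n_rows)]
```

With two ranks each holding half of a row's columns, the gather reassembles the full row in rank order.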

sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_train.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -146,7 +146,7 @@ def test_train_evo2_stops(tmp_path):
     )

     assert "reduced_train_loss" in trainer.logged_metrics  # validation logging on by default
-    assert "tflops_per_sec_per_gpu" in trainer.logged_metrics  # ensuring that tflops logger can be added
+    assert "TFLOPS_per_GPU" in trainer.logged_metrics  # ensuring that tflops logger can be added
     assert "train_step_timing in s" in trainer.logged_metrics
```

sub-packages/bionemo-evo2/tests/bionemo/evo2/test_hyena_operators.py

Lines changed: 15 additions & 17 deletions

```diff
@@ -68,14 +68,14 @@ def test_gpu_forward(self, operator: ParallelHyenaOperator):
         g = operator.num_groups
         dg = operator.group_dim

-        x1 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        x2 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        v = torch.ones((batch_size, seq_len, g, dg), device=device)
+        x1 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        x2 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        v = torch.ones((batch_size, (g * dg), seq_len), device=device)

         output = operator(x1, x2, v)
         assert output.shape[0] == batch_size
-        assert output.shape[1] == seq_len
-        assert output.shape[2] == operator.hidden_size
+        assert output.shape[1] == operator.hidden_size
+        assert output.shape[2] == seq_len


 class TestParallelShortHyenaOperator:
@@ -89,7 +89,6 @@ def operator(self, transformer_config: TransformerConfig, hyena_config: HyenaCon
             init_method="small_init",
             short_conv_class=ParallelCausalDepthwiseConv1d,
             use_fast_causal_conv=False,
-            is_mlp=False,
             local_init=False,
             use_conv_bias=False,
         )
@@ -109,14 +108,14 @@ def test_gpu_forward(self, operator: ParallelShortHyenaOperator):
         g = operator.num_groups
         dg = operator.group_dim

-        x1 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        x2 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        v = torch.ones((batch_size, seq_len, g, dg), device=device)
+        x1 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        x2 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        v = torch.ones((batch_size, (g * dg), seq_len), device=device)

         output = operator(x1, x2, v)
         assert output.shape[0] == batch_size
-        assert output.shape[1] == seq_len
-        assert output.shape[2] == operator.hidden_size
+        assert output.shape[1] == operator.hidden_size
+        assert output.shape[2] == seq_len


 class TestParallelShortHyenaOperatorWithConvBias:
@@ -130,7 +129,6 @@ def operator(self, transformer_config: TransformerConfig, hyena_config: HyenaCon
             init_method="small_init",
             short_conv_class=ParallelCausalDepthwiseConv1d,
             use_fast_causal_conv=False,
-            is_mlp=False,
             local_init=False,
             use_conv_bias=True,
         )
@@ -150,14 +148,14 @@ def test_gpu_forward(self, operator: ParallelShortHyenaOperator):
         g = operator.num_groups
         dg = operator.group_dim

-        x1 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        x2 = torch.ones((batch_size, seq_len, g, dg), device=device)
-        v = torch.ones((batch_size, seq_len, g, dg), device=device)
+        x1 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        x2 = torch.ones((batch_size, (g * dg), seq_len), device=device)
+        v = torch.ones((batch_size, (g * dg), seq_len), device=device)

         output = operator(x1, x2, v)
         assert output.shape[0] == batch_size
-        assert output.shape[1] == seq_len
-        assert output.shape[2] == operator.hidden_size
+        assert output.shape[1] == operator.hidden_size
+        assert output.shape[2] == seq_len


 class TestParallelCausalDepthwiseConv1d:
```
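The shape change in these tests is the core of the Hyena standardization: operator inputs and outputs move from sequence-first `(B, L, G, DG)` / `(B, L, hidden)` to channels-first `(B, G*DG, L)` / `(B, hidden, L)`, which avoids repeated rearranges around the convolution kernels. A hypothetical helper summarizing the new convention, assuming `hidden_size == num_groups * group_dim` as these tests do:

```python
def hyena_operator_io_shapes(batch_size, seq_len, num_groups, group_dim):
    """Illustrative helper (not part of the codebase): return the
    channels-first input and output shapes the updated Hyena operators
    standardize on, assuming hidden == num_groups * group_dim.
    """
    hidden = num_groups * group_dim
    # Inputs x1, x2, v and the output now all share (B, channels, L) layout.
    in_shape = (batch_size, hidden, seq_len)
    out_shape = (batch_size, hidden, seq_len)
    return in_shape, out_shape
```

For example, a batch of 2 sequences of length 16 with 4 groups of dimension 8 gives `(2, 32, 16)` for both input and output, matching the assertions in the updated tests.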

sub-packages/bionemo-geneformer/src/bionemo/geneformer/model/finetune_token_regressor.py

Lines changed: 2 additions & 15 deletions

```diff
@@ -24,10 +24,7 @@
 from nemo.collections.llm.peft.lora import LoRA, LoRALinear
 from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import ParallelLinearAdapter
 from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group
-from nemo.lightning.megatron_parallel import (
-    masked_token_loss,
-    masked_token_loss_context_parallel,
-)
+from nemo.lightning.megatron_parallel import masked_token_loss
 from torch import Tensor, nn

 from bionemo.llm.model.biobert.model import BioBertConfig, BioBertOutput, MegatronBioBertModel
@@ -102,17 +99,7 @@ def forward(
         # TODO(@jstjohn) also handle different output keys, like the sequence loss.

         cp_size = parallel_state.get_context_parallel_world_size()
-        if cp_size == 1:
-            # reduce the loss across the micro batch
-            loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"])
-        else:
-            # reduce the loss across the micro batch.
-            # TODO(@jomitchell): Figure out who defines "num_valid_tokens_in_ub" in the batch and document/understand this.
-            # This has something to do with context parallel, and there is probably a megatron or nemo function that adds this and
-            # other necessary keys to the batch. Thanks!
-            loss_for_microbatch = masked_token_loss_context_parallel(
-                unreduced_token_loss, batch["loss_mask"], batch["num_valid_tokens_in_ub"]
-            )
+        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size)

         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         # reducing the loss across the data parallel group.
```
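Both this file and bionemo-llm's `loss.py` now call a single `masked_token_loss(unreduced_token_loss, loss_mask, cp_size)` instead of branching on the context-parallel size. A single-process, scalar sketch of the assumed contract (the real NeMo implementation operates on tensors and additionally SUM-reduces across the context-parallel group when `cp_size > 1`, which is omitted here):

```python
def masked_token_loss_sketch(unreduced_token_loss, loss_mask, cp_size=1):
    """Sketch of the unified loss entry point assumed by this PR.

    Sums per-token losses over non-masked positions. No mean
    normalization is performed; with cp_size > 1 the real function
    also reduces the sum across the context-parallel group.
    """
    return sum(loss * mask for loss, mask in zip(unreduced_token_loss, loss_mask))
```

With losses `[1.0, 2.0, 3.0]` and mask `[1, 0, 1]`, only the first and third tokens contribute, giving a sum of `4.0`.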

sub-packages/bionemo-llm/pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ dependencies = [
     # external
     'lightning>=2.2.1',
     'megatron-core',
-    'nemo_toolkit[nlp]>=2.2.1',
+    'nemo_toolkit[nlp,eval]>=2.2.1',
     'nemo-run',
     'hatchling',
 ]
```

sub-packages/bionemo-llm/src/bionemo/llm/model/loss.py

Lines changed: 21 additions & 19 deletions

```diff
@@ -22,7 +22,6 @@
 from nemo.lightning.megatron_parallel import (
     MegatronLossReduction,
     masked_token_loss,
-    masked_token_loss_context_parallel,
 )
 from torch import Tensor

@@ -179,24 +178,17 @@ def forward(

         # TODO(@jstjohn) also handle different output keys, like the sequence loss.

-        # compute loss
+        # Compute loss over "valid" tokens in the microbatch, i.e. the non-masked tokens.
+        # The loss is not normalized, only potentially reduced via torch.distributed.ReduceOp.SUM
+        # across the context parallel process group, so you need to divide by the number
+        # of non-masked tokens (loss_mask.sum()) to compute the mean reduced loss per token.
         cp_size = parallel_state.get_context_parallel_world_size()
-        if cp_size == 1:
-            # reduce the loss across the micro batch per valid token
-            loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"])
-        else:
-            # reduce the loss across the micro batch per valid token.
-            # TODO(@jomitchell): Figure out who defines "num_valid_tokens_in_ub" in the batch and document/understand this.
-            # This has something to do with context parallel, and there is probably a megatron or nemo function that adds this and
-            # other necessary keys to the batch. Thanks!
-            loss_for_microbatch = masked_token_loss_context_parallel(
-                unreduced_token_loss, batch["loss_mask"], batch["num_valid_tokens_in_ub"]
-            )
+        loss_for_microbatch = masked_token_loss(unreduced_token_loss, batch["loss_mask"], cp_size=cp_size)
+        num_valid_tokens_in_microbatch = batch["loss_mask"].sum()

         # If we do not drop the last partial batch of validation, we need to do fancy reduction handling to support
         # reducing the loss across the data parallel group.
         if self.validation_step and not self.val_drop_last:
-            num_valid_tokens_in_microbatch = batch["loss_mask"].sum()
             if loss_for_microbatch.isnan():
                 # TODO(@jomitchell): Add a unit test for this. This is the case where there are no valid tokens in the microbatch for the loss
                 # to be computed over, so we expect a NaN loss (divide by zero for a mean) but we make this an expected and non-breaking case,
@@ -205,9 +197,8 @@ def forward(
                     raise ValueError("Got NaN loss with non-empty input")
                 loss_sum_for_microbatch = torch.zeros_like(num_valid_tokens_in_microbatch)
             else:
-                loss_sum_for_microbatch = (
-                    num_valid_tokens_in_microbatch * loss_for_microbatch
-                )  # sum over all valid tokens
+                # The reduced loss is already the sum of all losses from masked_token_loss().
+                loss_sum_for_microbatch = loss_for_microbatch

             # In this case we need to store the loss sum as well as the number of valid tokens in the microbatch.
             loss_sum_and_microbatch_size_all_gpu = torch.cat(
@@ -216,17 +207,28 @@ def forward(
                     Tensor([num_valid_tokens_in_microbatch]).cuda().clone().detach(),
                 ]
             )
+
+            # Reduce the loss sum across the data parallel group to get the total loss
+            # for all data parallel / distributed microbatches.
             torch.distributed.all_reduce(
                 loss_sum_and_microbatch_size_all_gpu,
                 group=parallel_state.get_data_parallel_group(),
                 op=torch.distributed.ReduceOp.SUM,
             )
+
+            # Return the loss tensor multiplied by the context parallel size,
+            # and the data & context parallel reduced loss sum.
             return loss_for_microbatch * cp_size, {
                 "loss_sum_and_microbatch_size": loss_sum_and_microbatch_size_all_gpu
             }

-        # average the losses across the data parallel group, but also return the unreduced loss
-        reduced_loss = average_losses_across_data_parallel_group([loss_for_microbatch])
+        # Return the loss tensor multiplied by the context parallel size, as well as
+        # the data-parallel averaged loss, i.e. the loss divided by the DP size.
+        # Normalize the loss by the number of "valid" tokens, because masked_token_loss
+        # no longer does this normalization, and BioNeMo losses expect this normalization.
+        reduced_loss = (
+            average_losses_across_data_parallel_group([loss_for_microbatch]) / num_valid_tokens_in_microbatch
+        )
         return loss_for_microbatch * cp_size, {"avg": reduced_loss}
```
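Because `masked_token_loss` now returns an unnormalized sum, the caller recovers the mean loss per valid token by dividing by `loss_mask.sum()`, as the new comments in the diff describe. A scalar, single-process sketch of that normalization (illustrative names, not the BioNeMo API):

```python
def mean_loss_per_valid_token(unreduced_token_loss, loss_mask):
    """Divide the masked loss sum by the number of non-masked tokens.

    Mirrors the normalization the updated loss.py applies after
    masked_token_loss(), which no longer normalizes internally.
    """
    loss_sum = sum(loss * mask for loss, mask in zip(unreduced_token_loss, loss_mask))
    num_valid = sum(loss_mask)
    # Division by zero here corresponds to the all-masked microbatch
    # case the real code treats as an expected NaN.
    return loss_sum / num_valid
```

With losses `[1.0, 2.0, 3.0]` and mask `[1, 0, 1]`, the sum over valid tokens is `4.0` and the mean per valid token is `2.0`.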
