Add subquadratic-ops support (#1043)

farhadrgh · web-flow · commit 555ee5f3cfb4 · 2025-08-15T17:58:03.000Z
### Description

Replaces the Evo2 training option `--use-b2b-causal-conv1d` with `--use-subquadratic_ops` (config key `use_b2b_causal_conv1d` becomes `use_subquadratic_ops`), updates the `partial-conv` and `perf` Evo2 pretraining benchmarks to pass the new flag, switches the `partial-conv` benchmark wheel from `cuhyena==v0.2.3+cuda12.9` to `subquadratic-ops`, and bumps the `3rdparty/NeMo` submodule.

### Type of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### CI Pipeline Configuration

Configure CI behavior by applying the relevant labels:

- [SKIP_CI](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#skip_ci) - Skip all continuous integration tests
- [INCLUDE_NOTEBOOKS_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_notebooks_tests) - Execute notebook validation tests in pytest
- [INCLUDE_SLOW_TESTS](https://github.com/NVIDIA/bionemo-framework/blob/main/docs/docs/user-guide/contributing/contributing.md#include_slow_tests) - Execute tests labelled as slow in pytest for extensive testing

> [!NOTE]
> By default, the notebooks validation tests are skipped unless explicitly enabled.

#### Authorizing CI Runs

We use [copy-pr-bot](https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/#automation) to manage authorization of CI runs on NVIDIA's compute resources.

- If a pull request is opened by a trusted user and contains only trusted changes, the pull request's code will automatically be copied to a pull-request/ prefixed branch in the source repository (e.g. pull-request/123)
- If a pull request is opened by an untrusted user or contains untrusted changes, an NVIDIA org member must leave an `/ok to test` comment on the pull request to trigger CI. This will need to be done for each new commit.

### Usage

```python
# TODO: Add code snippet
```

### Pre-submit Checklist

- [ ] I have tested these changes locally
- [ ] I have updated the documentation accordingly
- [ ] I have added/updated tests as needed
- [ ] All existing tests pass successfully

---------

Signed-off-by: Farhad Ramezanghorbani <farhadr@nvidia.com>
diff --git a/3rdparty/NeMo b/3rdparty/NeMo
@@ -1 +1 @@
-Subproject commit ee0be114bd75c91c37bd4885451afef8b254729b
+Subproject commit f4f22a26bb3d08f087e50843e148a07c4c3f2472
diff --git a/ci/benchmarks/partial-conv/evo2_pretrain.yaml b/ci/benchmarks/partial-conv/evo2_pretrain.yaml
@@ -16,7 +16,7 @@ script_args:
   workspace: /workspace/bionemo2
   data_path: /data/evo2
   artefacts_url: https://__token__:${JET_GITLAB_TOKEN}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
-  file_name_wheel: cuhyena==v0.2.3+cuda12.9
+  file_name_wheel: subquadratic-ops
   model: evo2
   variant: train
   config_name: 1b
@@ -55,7 +55,7 @@ script: |-
   --enable-preemption \
   --ckpt-async-save \
   --use-megatron-comm-overlap-llama3-8k \
-  --use-b2b-causal-conv1d \
+  --use-subquadratic_ops \
   --overlap-grad-reduce \
   --clip-grad=${clip_grad} \
   --eod-pad-in-loss-mask \
@@ -78,7 +78,7 @@ script: |-
   --limit-val-batches=20 \
   --log-every-n-steps=50 \
   --val-check-interval=500 \
-  --use-b2b-causal-conv1d \
+  --use-subquadratic_ops \
   --create-tflops-callback \
   --create-tensorboard-logger \
   --result-dir=${tensorboard_dir} \
diff --git a/ci/benchmarks/perf/evo2_pretrain.yaml b/ci/benchmarks/perf/evo2_pretrain.yaml
@@ -76,7 +76,7 @@ script: |-
   --limit-val-batches=20 \
   --log-every-n-steps=50 \
   --val-check-interval=200 \
-  --use-b2b-causal-conv1d \
+  --use-subquadratic_ops \
   --create-tflops-callback \
   --create-tensorboard-logger \
   --result-dir=${tensorboard_dir} \
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py
@@ -462,9 +462,9 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         help="Dropout probability for the attention layers.",
     )
     parser.add_argument(
-        "--use-b2b-causal-conv1d",
+        "--use-subquadratic_ops",
         action="store_true",
-        help="Use back-to-back causal convolution CUDA kernel for hyena short conv layers for improved performance.",
+        help="Use subquadratic_ops for improved performance.",
     )
     parser.add_argument(
         "--save-top-k",
@@ -597,8 +597,8 @@ def train(args: argparse.Namespace) -> nl.Trainer:
         config_modifiers_init["ffn_hidden_size"] = args.ffn_hidden_size
     if args.use_targeted_variance_loss:
         config_modifiers_init["use_targeted_variance_loss"] = True
-    if args.use_b2b_causal_conv1d:
-        config_modifiers_init["use_b2b_causal_conv1d"] = True
+    if args.use_subquadratic_ops:
+        config_modifiers_init["use_subquadratic_ops"] = True
     if args.hybrid_override_pattern:
         config_modifiers_init["hybrid_override_pattern"] = args.hybrid_override_pattern
     if args.num_layers: