@@ -968,6 +968,100 @@ def __iter__(self):
     )
 
 
+def test_token_packing_dataset_padding_split_remaining_capacity_below_divisor():
+    """Test that split mode handles remaining capacity below pad_sequences_to_be_divisible_by.
+
+    When the remaining batch capacity (after rounding down to the pad divisor) is 0,
+    the current batch must be yielded and the sample starts a new batch. Without this
+    guard, _split_sample_by_num_tokens would be called with tokens_available=0 and crash.
+
+    max=12, pad=8, split=True:
+    - s1: raw=5, padded=8. current=8 < 12. Append.
+    - s2: raw=3, padded=8. current=8+8=16 > 12.
+      tokens_in_batch=8, tokens_available=12-8=4, rounded to (4//8)*8=0 → yield [s1], fresh batch.
+    - s3: raw=4, padded=8. current=8+8=16 > 12. Same: yield [s2], fresh batch.
+    """
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            yield {"input_ids": list(range(5))}  # padded to 8
+            yield {"input_ids": list(range(3))}  # padded to 8
+            yield {"input_ids": list(range(4))}  # padded to 8
+
+    dataset = MockDataset()
+    token_packing_dataset = TokenPackingDataset(
+        dataset,
+        max_tokens_per_batch=12,
+        pad_sequences_to_be_divisible_by=8,
+        split_samples=True,
+        drop_last=False,
+    )
+    batches = list(token_packing_dataset)
+
+    # Each sample pads to 8; only one fits per batch (8 < 12, but 8+8=16 > 12,
+    # and remaining capacity 4 rounds down to 0 with pad=8).
+    assert len(batches) == 3
+    assert [len(s["input_ids"]) for s in batches[0]] == [5]
+    assert [len(s["input_ids"]) for s in batches[1]] == [3]
+    assert [len(s["input_ids"]) for s in batches[2]] == [4]
+
+
+def test_token_packing_dataset_padding_no_split_yields_before_overflow():
+    """Test that non-split mode correctly yields the batch before a padded sample overflows.
+
+    max=12, pad=8, split=False:
+    - s1: raw=5, padded=8. current=8 < 12. Append.
+    - s2: raw=3, padded=8. current=8+8=16 > 12. Yield [s1], start fresh with s2.
+    - s3: raw=4, padded=8. current=8+8=16 > 12. Yield [s2], start fresh with s3.
+    """
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            yield {"input_ids": list(range(5))}  # padded to 8
+            yield {"input_ids": list(range(3))}  # padded to 8
+            yield {"input_ids": list(range(4))}  # padded to 8
+
+    dataset = MockDataset()
+    token_packing_dataset = TokenPackingDataset(
+        dataset,
+        max_tokens_per_batch=12,
+        pad_sequences_to_be_divisible_by=8,
+        split_samples=False,
+        drop_last=False,
+    )
+    batches = list(token_packing_dataset)
+
+    # Each sample pads to 8; only one fits per batch (8 < 12, but 8+8=16 > 12).
+    assert len(batches) == 3
+    assert [len(s["input_ids"]) for s in batches[0]] == [5]
+    assert [len(s["input_ids"]) for s in batches[1]] == [3]
+    assert [len(s["input_ids"]) for s in batches[2]] == [4]
+
+
+def test_token_packing_dataset_oversized_sample_raises():
+    """Test that a sample exceeding max_tokens_per_batch raises a ValueError.
+
+    Users should set truncation or a maximum length in their tokenizer/dataset to ensure
+    all samples fit within max_tokens_per_batch.
+    """
+
+    class MockDataset(torch.utils.data.IterableDataset):
+        def __iter__(self):
+            yield {"input_ids": list(range(5))}  # fits
+            yield {"input_ids": list(range(25))}  # exceeds max of 10
+
+    dataset = MockDataset()
+    token_packing_dataset = TokenPackingDataset(
+        dataset,
+        max_tokens_per_batch=10,
+        split_samples=False,
+        drop_last=False,
+    )
+
+    with pytest.raises(ValueError, match="Sample length.*exceeds max_tokens_per_batch"):
+        list(token_packing_dataset)
+
+
 def test_token_packing_dataset_with_padding_split_drop_last_false(tokenizer):
     """Test that with drop_last=False, all batches except the last have exactly max_tokens."""
     pad_divisor = 4