@@ -156,8 +156,11 @@ def __call__(self, features, return_tensors=None):
         sequence processing capabilities. When pad_to_multiple_of is used, an additional
         mock sequence is appended to reach the desired total length.
         """
+        if return_tensors is not None and return_tensors != "pt":
+            raise NotImplementedError(f"Only return_tensors='pt' is supported, got '{return_tensors}'")
+
         # Perform the masking with the BSHD collator.
-        bshd_batch = self.collator(features)
+        bshd_batch = self.collator(features, return_tensors=return_tensors)
 
         # Create the flattened batch to get the cu_seq_lens_q and cu_seq_lens_k values.
         packed_batch = _pt_flatten_collate(features, return_position_ids=self.return_position_ids)
@@ -279,33 +282,48 @@ def __iter__(self):
         samples = []
         current_length = 0
         for sample in iter(self.dataset):
-            current_length += self._padded_len(len(sample["input_ids"]))
+            sample_length = len(sample["input_ids"])
+            if sample_length > self.max_tokens_per_batch:
+                raise ValueError(
+                    f"TokenPackingDataset: Sample length ({sample_length}) exceeds max_tokens_per_batch "
+                    f"({self.max_tokens_per_batch}). Set truncation or a maximum length in your tokenizer or dataset to "
+                    "ensure all samples fit within max_tokens_per_batch."
+                )
+
+            current_length += self._padded_len(sample_length)
             if current_length == self.max_tokens_per_batch:
                 yield [*samples, sample]
                 samples = []
                 current_length = 0
 
             elif current_length > self.max_tokens_per_batch:
                 if not self.split_samples:
-                    # If we are not splitting samples, we can just yield the current batch (before this sample) and
-                    # start a new one.
-                    yield samples
+                    # Yield the current batch (before this sample) and start a new one with this sample.
+                    if samples:
+                        yield samples
                     samples = [sample]
-
+                    current_length = self._padded_len(sample_length)
                 else:
-                    # Calculate how many padded tokens are already in the batch
-                    tokens_in_batch = current_length - self._padded_len(len(sample["input_ids"]))
+                    # Calculate how many padded tokens are already in the batch.
+                    tokens_in_batch = current_length - self._padded_len(sample_length)
                     # Calculate how many tokens we can fit from this sample, ensuring the
                     # padded length doesn't exceed the remaining capacity.
                     tokens_available = self.max_tokens_per_batch - tokens_in_batch
                     if self.pad_sequences_to_be_divisible_by is not None:
                         d = self.pad_sequences_to_be_divisible_by
                         tokens_available = (tokens_available // d) * d
-                    first_part, remaining_part = _split_sample_by_num_tokens(sample, tokens_available)
-                    yield [*samples, first_part]
-                    samples = [remaining_part]
-
-                    current_length = self._padded_len(len(samples[0]["input_ids"]))
+                    if tokens_available <= 0:
+                        # Remaining capacity is less than pad_sequences_to_be_divisible_by;
+                        # we can't fit any tokens from this sample. Yield the current batch and start fresh.
+                        if samples:
+                            yield samples
+                        samples = [sample]
+                        current_length = self._padded_len(sample_length)
+                    else:
+                        first_part, remaining_part = _split_sample_by_num_tokens(sample, tokens_available)
+                        yield [*samples, first_part]
+                        samples = [remaining_part]
+                        current_length = self._padded_len(len(samples[0]["input_ids"]))
             else:
                 samples.append(sample)
 
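To make the packing behavior concrete, here is a self-contained toy that mirrors the patched loop on plain lists. `padded_len`, `split_by_num_tokens`, and `pack` are hypothetical stand-ins for `_padded_len`, `_split_sample_by_num_tokens`, and `TokenPackingDataset.__iter__`; the final flush of a partial batch is an assumption, since the diff does not show how the loop tail is handled:

```python
def padded_len(n, multiple=None):
    # Round n up to a multiple of `multiple` (mirrors pad_sequences_to_be_divisible_by).
    return n if multiple is None else -(-n // multiple) * multiple


def split_by_num_tokens(sample, num_tokens):
    # Stand-in for _split_sample_by_num_tokens, operating on plain lists.
    return sample[:num_tokens], sample[num_tokens:]


def pack(dataset, max_tokens, multiple=None, split_samples=True):
    samples, current_length = [], 0
    for sample in dataset:
        sample_length = len(sample)
        if sample_length > max_tokens:
            raise ValueError(f"Sample length ({sample_length}) exceeds max_tokens ({max_tokens}).")
        current_length += padded_len(sample_length, multiple)
        if current_length == max_tokens:
            yield [*samples, sample]
            samples, current_length = [], 0
        elif current_length > max_tokens:
            if not split_samples:
                if samples:
                    yield samples
                samples, current_length = [sample], padded_len(sample_length, multiple)
            else:
                tokens_available = max_tokens - (current_length - padded_len(sample_length, multiple))
                if multiple is not None:
                    tokens_available = (tokens_available // multiple) * multiple
                if tokens_available <= 0:
                    if samples:
                        yield samples
                    samples, current_length = [sample], padded_len(sample_length, multiple)
                else:
                    first_part, remaining_part = split_by_num_tokens(sample, tokens_available)
                    yield [*samples, first_part]
                    samples = [remaining_part]
                    current_length = padded_len(len(remaining_part), multiple)
        else:
            samples.append(sample)
    if samples:  # assumption: flush the final partial batch
        yield samples


# The 5-token sample overflows the 8-token budget and is split at the boundary.
print(list(pack([[1] * 6, [2] * 5, [3] * 4], max_tokens=8)))
# [[[1, 1, 1, 1, 1, 1], [2, 2]], [[2, 2, 2], [3, 3, 3, 3]]]
```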