
Commit 89cbcff

feat: improve import time (#1076)
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
1 parent 992a8e0 commit 89cbcff

7 files changed: +308 −195 lines changed

nemo_automodel/__init__.py

Lines changed: 25 additions & 143 deletions
@@ -11,166 +11,48 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import functools
-import importlib
-import inspect
-import logging
-
-from torch.utils.data import _utils as torch_data_utils
-
-# Monkey patch pin_memory to optionally accept a device argument.
-# The device argument was removed in some newer torch versions but we
-# need it for compatibility with torchdata.
-_original_pin_memory_loop = torch_data_utils.pin_memory._pin_memory_loop
-_original_pin_memory = torch_data_utils.pin_memory.pin_memory
-_original_pin_memory_sig = inspect.signature(_original_pin_memory)
-
-if "device" not in _original_pin_memory_sig.parameters:
-
-    @functools.wraps(_original_pin_memory)
-    def _patched_pin_memory(data, device=None):
-        """Patched pin_memory that accepts an optional device argument."""
-        return _original_pin_memory(data)
-
-    @functools.wraps(_original_pin_memory_loop)
-    def _pin_memory_loop(in_queue, out_queue, device_id, done_event, device):
-        """Patched _pin_memory_loop to accept a device argument."""
-        return _original_pin_memory_loop(in_queue, out_queue, device_id, done_event)
-
-    torch_data_utils.pin_memory.pin_memory = _patched_pin_memory
-    torch_data_utils.pin_memory._pin_memory_loop = _pin_memory_loop
-
-
-# Monkey patch DeviceMesh to fix corner case in mesh slicing
-# Fixes issue where _dim_group_names is accessed without checking if rank is in mesh
-# Based on https://github.com/pytorch/pytorch/pull/169454/files
-try:
-    import torch as _torch
-
-    # Only apply the patch for the specific PyTorch version with the regression
-    # TODO: Remove this once bump up to a newer PyTorch version with the fix
-    if "2.10.0" in _torch.__version__ and "nv25.11" in _torch.__version__:
-        from torch.distributed._mesh_layout import _MeshLayout
-        from torch.distributed.device_mesh import _MeshEnv
-
-        _original_get_slice_mesh_layout = _MeshEnv._get_slice_mesh_layout
-
-        def _patched_get_slice_mesh_layout(self, device_mesh, mesh_dim_names):
-            """
-            Patched _get_slice_mesh_layout based on PyTorch PR #169454.
-            This fixes:
-            1. _dim_group_names access (commit f6c8092)
-            2. Regression in mesh slicing with size-1 dims (PR #169454 / Issue #169381)
-            """
-            # 1. First, build the layout manually to bypass the legacy 'stride < pre_stride' check
-            slice_from_root = device_mesh == self.get_root_mesh(device_mesh)
-            flatten_name_to_root_layout = (
-                {key: mesh._layout for key, mesh in self.root_to_flatten_mapping.setdefault(device_mesh, {}).items()}
-                if slice_from_root
-                else {}
-            )
-
-            mesh_dim_names_list = getattr(device_mesh, "mesh_dim_names", [])
-            valid_mesh_dim_names = [*mesh_dim_names_list, *flatten_name_to_root_layout]
-            if not all(name in valid_mesh_dim_names for name in mesh_dim_names):
-                raise KeyError(f"Invalid mesh_dim_names {mesh_dim_names}. Valid: {valid_mesh_dim_names}")
-
-            layout_sliced = []
-            for name in mesh_dim_names:
-                if name in mesh_dim_names_list:
-                    layout_sliced.append(device_mesh._layout[mesh_dim_names_list.index(name)])
-                elif name in flatten_name_to_root_layout:
-                    layout_sliced.append(flatten_name_to_root_layout[name])
-
-            sliced_sizes = tuple(layout.sizes for layout in layout_sliced)
-            sliced_strides = tuple(layout.strides for layout in layout_sliced)
-
-            # Bypass the 'stride < pre_stride' check that exists in the original
-            # and create the MeshLayout directly.
-            slice_mesh_layout = _MeshLayout(sliced_sizes, sliced_strides)
-
-            if not slice_mesh_layout.check_non_overlap():
-                raise RuntimeError(f"Slicing overlapping dim_names {mesh_dim_names} is not allowed.")
-
-            # 2. Replicate the _dim_group_names fix (commit f6c8092)
-            # We need to return an object that HAS _dim_group_names if the rank is in the mesh
-            if hasattr(device_mesh, "_dim_group_names") and len(device_mesh._dim_group_names) > 0:
-                slice_dim_group_name = []
-                submesh_dim_names = mesh_dim_names if isinstance(mesh_dim_names, tuple) else (mesh_dim_names,)
-                for name in submesh_dim_names:
-                    if name in mesh_dim_names_list:
-                        slice_dim_group_name.append(device_mesh._dim_group_names[mesh_dim_names_list.index(name)])
-                    elif hasattr(device_mesh, "_flatten_mapping") and name in device_mesh._flatten_mapping:
-                        flatten_mesh = device_mesh._flatten_mapping[name]
-                        slice_dim_group_name.append(
-                            flatten_mesh._dim_group_names[flatten_mesh.mesh_dim_names.index(name)]
-                        )
-
-                # Attach the group names to the layout object so the caller can use them
-                object.__setattr__(slice_mesh_layout, "_dim_group_names", slice_dim_group_name)
-
-            return slice_mesh_layout
-
-        # Apply the patch
-        _MeshEnv._get_slice_mesh_layout = _patched_get_slice_mesh_layout
-        logging.getLogger(__name__).debug(f"Applied DeviceMesh fix for PyTorch {_torch.__version__}")
-
-except (ImportError, AttributeError) as e:
-    logging.getLogger(__name__).debug(f"Could not apply DeviceMesh patch: {e}")
-    pass
 
+import importlib
 
 from .package_info import __package_name__, __version__
 
-__all__ = [
-    "recipes",
-    "shared",
-    "components",
-    "__version__",
-    "__package_name__",
-]
+# Keep the base package import lightweight.
+# Heavy dependencies (e.g., torch/transformers) are intentionally imported lazily
+# via __getattr__ so importing tokenizers doesn't pull in the full training stack.
+
+_SUBMODULES = {"recipes", "shared", "components"}
 
-# Promote NeMoAutoModelForCausalLM, AutoModelForImageTextToText into the top level
-# to enable: `from nemo_automodel import NeMoAutoModelForCausalLM`
-try:
-    # adjust this import path if your class lives somewhere else
-    from nemo_automodel._transformers.auto_model import (
-        NeMoAutoModelForCausalLM,
-        NeMoAutoModelForImageTextToText,
-        NeMoAutoModelForSequenceClassification,
-        NeMoAutoModelForTextToWaveform,
-    ) # noqa: I001
-    from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
+_LAZY_ATTRS: dict[str, tuple[str, str]] = {
+    "NeMoAutoModelForCausalLM": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForCausalLM"),
+    "NeMoAutoModelForImageTextToText": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForImageTextToText"),
+    "NeMoAutoModelForSequenceClassification": (
+        "nemo_automodel._transformers.auto_model",
+        "NeMoAutoModelForSequenceClassification",
+    ),
+    "NeMoAutoModelForTextToWaveform": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForTextToWaveform"),
+    "NeMoAutoTokenizer": ("nemo_automodel._transformers.auto_tokenizer", "NeMoAutoTokenizer"),
+}
 
-    globals()["NeMoAutoModelForCausalLM"] = NeMoAutoModelForCausalLM
-    globals()["NeMoAutoModelForImageTextToText"] = NeMoAutoModelForImageTextToText
-    globals()["NeMoAutoModelForSequenceClassification"] = NeMoAutoModelForSequenceClassification
-    globals()["NeMoAutoModelForTextToWaveform"] = NeMoAutoModelForTextToWaveform
-    globals()["NeMoAutoTokenizer"] = NeMoAutoTokenizer
-    __all__.append("NeMoAutoModelForCausalLM")
-    __all__.append("NeMoAutoModelForImageTextToText")
-    __all__.append("NeMoAutoModelForSequenceClassification")
-    __all__.append("NeMoAutoModelForTextToWaveform")
-    __all__.append("NeMoAutoTokenizer")
-except:
-    # optional dependency might be missing,
-    # leave the name off the module namespace so other imports still work
-    pass
+__all__ = sorted([*_SUBMODULES, "__version__", "__package_name__", *_LAZY_ATTRS.keys()])
 
 
 def __getattr__(name: str):
     """
-    Lazily import and cache submodules listed in __all__ when accessed.
+    Lazily import and cache selected submodules / exported symbols when accessed.
 
     Raises:
         AttributeError if the name isn’t in __all__.
     """
-    if name in __all__:
-        # import submodule on first access
+    if name in _SUBMODULES:
         module = importlib.import_module(f"{__name__}.{name}")
-        # cache it in globals() so future lookups do not re-import
         globals()[name] = module
         return module
+    if name in _LAZY_ATTRS:
+        module_name, attr_name = _LAZY_ATTRS[name]
+        module = importlib.import_module(module_name)
+        attr = getattr(module, attr_name)
+        globals()[name] = attr
+        return attr
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
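The rewritten __init__.py relies on module-level __getattr__ (PEP 562): `import nemo_automodel` now binds only the package metadata and two small lookup tables, and the heavy torch/transformers stack is pulled in the first time a lazy name is touched. A quick sanity check of that behaviour, as a sketch (the expectation that torch stays out of sys.modules until attribute access is an assumption based on this diff, not something verified here):

import sys

import nemo_automodel  # cheap: no torch/transformers imported yet

print("torch" in sys.modules)  # expected: False

# First access goes through nemo_automodel.__getattr__, imports
# nemo_automodel._transformers.auto_model, and caches the class in globals().
_ = nemo_automodel.NeMoAutoModelForCausalLM

print("torch" in sys.modules)  # expected: True

Before/after import cost can be compared with `python -X importtime -c "import nemo_automodel"`.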

nemo_automodel/_transformers/__init__.py

Lines changed: 28 additions & 7 deletions
@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 
-from nemo_automodel._transformers.auto_model import (
-    NeMoAutoModelForCausalLM,
-    NeMoAutoModelForImageTextToText,
-    NeMoAutoModelForSequenceClassification,
-    NeMoAutoModelForTextToWaveform,
-)
-from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
+# Keep this package lightweight: importing `nemo_automodel._transformers.*` should not
+# automatically pull in torch + all model code unless a specific symbol is accessed.
+
+_LAZY_ATTRS: dict[str, tuple[str, str]] = {
+    "NeMoAutoModelForCausalLM": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForCausalLM"),
+    "NeMoAutoModelForImageTextToText": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForImageTextToText"),
+    "NeMoAutoModelForSequenceClassification": (
+        "nemo_automodel._transformers.auto_model",
+        "NeMoAutoModelForSequenceClassification",
+    ),
+    "NeMoAutoModelForTextToWaveform": ("nemo_automodel._transformers.auto_model", "NeMoAutoModelForTextToWaveform"),
+    "NeMoAutoTokenizer": ("nemo_automodel._transformers.auto_tokenizer", "NeMoAutoTokenizer"),
+}
 
 __all__ = [
     "NeMoAutoModelForCausalLM",
@@ -28,3 +35,17 @@
     "NeMoAutoModelForTextToWaveform",
     "NeMoAutoTokenizer",
 ]
+
+
+def __getattr__(name: str):
+    if name in _LAZY_ATTRS:
+        module_name, attr_name = _LAZY_ATTRS[name]
+        module = importlib.import_module(module_name)
+        attr = getattr(module, attr_name)
+        globals()[name] = attr
+        return attr
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return sorted(__all__)
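The same lazy-attribute table is repeated in this subpackage so that `from nemo_automodel._transformers import NeMoAutoModelForCausalLM` keeps working: a from-import falls back to the module's __getattr__ when the name is not already bound, and __dir__ keeps dir()/tab completion aligned with __all__ even though nothing is imported eagerly. A small illustration based on the code in this diff (the printed list is simply sorted(__all__)):

import nemo_automodel._transformers as tf

print(dir(tf))  # the five NeMo* names from __all__, none of them imported yet

from nemo_automodel._transformers import NeMoAutoTokenizer  # resolved via __getattr__ on first use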

nemo_automodel/_transformers/auto_model.py

Lines changed: 4 additions & 0 deletions
@@ -23,6 +23,10 @@
 
 import torch
 from torch.nn.attention import SDPBackend, sdpa_kernel
+
+from nemo_automodel.shared.torch_patches import apply_torch_patches
+
+apply_torch_patches()
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
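Moving the monkey patches out of the package __init__ and calling apply_torch_patches() here means they run only when model code is actually imported, not on `import nemo_automodel`. The new nemo_automodel/shared/torch_patches.py is one of the seven changed files but is not shown in this excerpt; below is a plausible sketch of it, assembled from the code removed from nemo_automodel/__init__.py above. The module path and function name come from the import in this diff, while the body and the run-once guard are assumptions.

# Hypothetical sketch of nemo_automodel/shared/torch_patches.py -- not the actual file from this commit.
# Patch bodies are copied from the code removed from nemo_automodel/__init__.py; the run-once guard is assumed.
import functools
import inspect

_PATCHES_APPLIED = False


def apply_torch_patches() -> None:
    """Apply torch compatibility patches once, the first time model code is imported."""
    global _PATCHES_APPLIED
    if _PATCHES_APPLIED:
        return
    _PATCHES_APPLIED = True

    from torch.utils.data import _utils as torch_data_utils

    original_pin_memory = torch_data_utils.pin_memory.pin_memory
    original_pin_memory_loop = torch_data_utils.pin_memory._pin_memory_loop

    # Re-add the optional `device` argument that newer torch removed but torchdata expects.
    if "device" not in inspect.signature(original_pin_memory).parameters:

        @functools.wraps(original_pin_memory)
        def _patched_pin_memory(data, device=None):
            return original_pin_memory(data)

        @functools.wraps(original_pin_memory_loop)
        def _pin_memory_loop(in_queue, out_queue, device_id, done_event, device):
            return original_pin_memory_loop(in_queue, out_queue, device_id, done_event)

        torch_data_utils.pin_memory.pin_memory = _patched_pin_memory
        torch_data_utils.pin_memory._pin_memory_loop = _pin_memory_loop

    # The DeviceMesh slicing fix removed from __init__.py would be applied here as well,
    # behind the same "2.10.0"/"nv25.11" version check on torch.__version__.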

nemo_automodel/_transformers/auto_tokenizer.py

Lines changed: 39 additions & 19 deletions
@@ -15,11 +15,6 @@
 import logging
 from typing import Callable, Optional, Type, Union
 
-from transformers import AutoConfig, AutoTokenizer
-
-from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
-from nemo_automodel._transformers.tokenization.registry import TokenizerRegistry
-
 logger = logging.getLogger(__name__)
 
 
@@ -35,14 +30,24 @@ def _get_model_type(pretrained_model_name_or_path: str, trust_remote_code: bool
         The model_type string, or None if it cannot be determined
     """
     try:
+        from transformers import AutoConfig
+
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
         return getattr(config, "model_type", None)
     except Exception as e:
         logger.debug(f"Could not load config to determine model type: {e}")
         return None
 
 
-class NeMoAutoTokenizer(AutoTokenizer):
+def _get_tokenizer_registry():
+    # Import lazily to avoid pulling in optional/custom backends (and transformers)
+    # when users only do `from nemo_automodel import NeMoAutoTokenizer`.
+    from nemo_automodel._transformers.tokenization.registry import TokenizerRegistry
+
+    return TokenizerRegistry
+
+
+class NeMoAutoTokenizer:
     """
     Auto tokenizer class that dispatches to appropriate tokenizer implementations.
 
@@ -62,13 +67,7 @@ class NeMoAutoTokenizer(AutoTokenizer):
     """
 
     # Make registry accessible at class level
-    _registry = TokenizerRegistry
-
-    def __init__(self):
-        raise EnvironmentError(
-            f"{self.__class__.__name__} is designed to be instantiated using the "
-            f"`{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` method."
-        )
+    _registry = None
 
     @classmethod
     def register(cls, model_type: str, tokenizer_cls: Union[Type, Callable]) -> None:
@@ -79,7 +78,7 @@ def register(cls, model_type: str, tokenizer_cls: Union[Type, Callable]) -> None
             model_type: The model type string (e.g., "mistral", "llama")
             tokenizer_cls: The tokenizer class or factory function
         """
-        cls._registry.register(model_type, tokenizer_cls)
+        _get_tokenizer_registry().register(model_type, tokenizer_cls)
 
     @classmethod
     def from_pretrained(
@@ -106,19 +105,26 @@
         """
         # If force_hf, just use the base HF AutoTokenizer
        if force_hf:
-            return super().from_pretrained(
+            from transformers import AutoTokenizer
+
+            return AutoTokenizer.from_pretrained(
                 pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
             )
 
         # Try to determine model type from config
         model_type = _get_model_type(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
 
-        if model_type and cls._registry.has_custom_tokenizer(model_type):
-            tokenizer_cls = cls._registry.get_tokenizer_cls(model_type)
-            logger.info(f"Using custom tokenizer {tokenizer_cls.__name__} for model type '{model_type}'")
-            return tokenizer_cls.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        registry = _get_tokenizer_registry()
+
+        if not force_default and model_type:
+            tokenizer_cls = registry.get_custom_tokenizer_cls(model_type)
+            if tokenizer_cls is not None:
+                logger.info(f"Using custom tokenizer {tokenizer_cls.__name__} for model type '{model_type}'")
+                return tokenizer_cls.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         # Fall back to default BOS/EOS enforced tokenizer
+        from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
+
         return NeMoAutoTokenizerWithBosEosEnforced.from_pretrained(
             pretrained_model_name_or_path, *args, trust_remote_code=trust_remote_code, **kwargs
         )
@@ -129,3 +135,17 @@
     "NeMoAutoTokenizerWithBosEosEnforced",
     "TokenizerRegistry",
 ]
+
+
+def __getattr__(name: str):
+    if name == "TokenizerRegistry":
+        return _get_tokenizer_registry()
+    if name == "NeMoAutoTokenizerWithBosEosEnforced":
+        from nemo_automodel._transformers.tokenization.nemo_auto_tokenizer import NeMoAutoTokenizerWithBosEosEnforced
+
+        return NeMoAutoTokenizerWithBosEosEnforced
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    return sorted(__all__)
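With these changes, NeMoAutoTokenizer no longer subclasses transformers.AutoTokenizer and only imports transformers inside the code paths that need it. The dispatch order in from_pretrained is: force_hf uses the plain HF AutoTokenizer; otherwise the detected model_type is looked up in the lazily imported registry unless force_default is set; otherwise it falls back to NeMoAutoTokenizerWithBosEosEnforced. A usage sketch (the model id is a placeholder, and force_default is assumed to be a keyword of from_pretrained since it is referenced in the body but its full signature is not shown in this hunk):

from nemo_automodel import NeMoAutoTokenizer  # transformers is not imported until from_pretrained runs

# Default path: detect model_type from the config, prefer a registered custom tokenizer,
# otherwise fall back to the BOS/EOS-enforced wrapper.
tok = NeMoAutoTokenizer.from_pretrained("org/model-name")  # placeholder model id

# Escape hatches visible in this diff:
hf_tok = NeMoAutoTokenizer.from_pretrained("org/model-name", force_hf=True)        # plain HF AutoTokenizer
default = NeMoAutoTokenizer.from_pretrained("org/model-name", force_default=True)  # skip the custom registry

# Custom tokenizers are attached per model_type through the lazily imported registry:
# NeMoAutoTokenizer.register("my_model_type", MyCustomTokenizer)  # MyCustomTokenizer is hypothetical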
