
Commit 7e16b15

kevalmorabia97 authored and yeyu-nvidia committed
0.23.1 Release - fix for torch 2.6
1 parent 0dfb7c1 commit 7e16b15
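
For context on the fix: PyTorch 2.6 flipped the default of torch.load's weights_only argument from False to True, which restricts unpickling to tensors and a small allow-list of builtin types. Checkpoints carrying arbitrary Python objects, such as the modelopt state dicts touched throughout this commit, then fail to load unless weights_only=False is passed explicitly. Below is a minimal sketch of the failure mode; the TrainingMeta class and the state.pt path are hypothetical stand-ins, not part of this commit.

import torch

class TrainingMeta:
    """Hypothetical class standing in for the non-tensor objects inside a modelopt state."""

    def __init__(self, mode: str):
        self.mode = mode

torch.save({"meta": TrainingMeta("quantize")}, "state.pt")

# On torch >= 2.6 the default weights_only=True rejects custom classes:
# torch.load("state.pt")  # raises pickle.UnpicklingError on torch >= 2.6

# Opting out restores the pre-2.6 behavior; only safe for trusted checkpoints.
state = torch.load("state.pt", weights_only=False)
print(state["meta"].mode)  # quantize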

19 files changed (+36, -23 lines)

.vscode/settings.json

Lines changed: 4 additions & 1 deletion
@@ -3,7 +3,7 @@
   "editor.rulers": [
     100,
     120
-  ], // 100 for black auto-formatter, 120 for hard limit in ruff
+  ], // 100 for ruff auto-formatter, 120 for hard limit in ruff
   "[python]": {
     "editor.defaultFormatter": "charliermarsh.ruff",
     "editor.formatOnSave": true,
@@ -40,4 +40,7 @@
     "--no-cov",
   ],
   "evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
+  "python.analysis.extraPaths": [
+    "./tests/" // add tests to python path just like pytest does in pyproject.toml
+  ]
 }

examples/llm_distill/README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ In this tutorial, we demonstrate how to use Model Optimizer to perform teacher-s
 
 Checkout the stand-along distillation example in the [NVIDIA NeMo repository](https://github.com/NVIDIA/NeMo/tree/main/examples/nlp/language_modeling/megatron_gpt_distillation.py).
 
-You can also look at the tutorial notebooks [here](https://github.com/NVIDIA/NeMo/tree/main/tutorials/llm/llama-3/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Llama 3.1 8B step-by-step in NeMo framework.
+You can also look at the tutorial notebooks [here](https://github.com/NVIDIA/NeMo/tree/main/tutorials/llm/llama/pruning-distillation) which showcase the usage of Minitron pruning followed by distillation for Llama 3.1 8B step-by-step in NeMo framework.
 
 ## Knowledge Distillation (KD) for HuggingFace Models
 

examples/llm_distill/main.py

Lines changed: 1 addition & 1 deletion
@@ -197,7 +197,7 @@ def train():
         if not os.path.isfile(modelopt_state_path):
             raise FileNotFoundError("`modelopt_state.pt` not found with checkpoint.")
         logger.info(f"Loading modelopt state from {modelopt_state_path}")
-        modelopt_state = torch.load(modelopt_state_path)
+        modelopt_state = torch.load(modelopt_state_path, weights_only=False)
         mto.restore_from_modelopt_state(model, modelopt_state)
 
     logger.info("Beginning training...")

examples/llm_qat/README.md

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ torch.save(mto.modelopt_state(model), "modelopt_quantizer_states.pt")
 
 # To resume training from a checkpoint or load the final QAT model for evaluation,
 # load the quantizer states before loading the model weights
-# mto.restore_from_modelopt_state(model, torch.load("modelopt_quantizer_states.pt"))
+# mto.restore_from_modelopt_state(model, torch.load("modelopt_quantizer_states.pt", weights_only=False))
 # After loading the quantizer states, load the model weights
 # model.load_state_dict(state_dict_from_last_checkpoint)
 
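
Combined with the save call shown in the hunk context above, the resume flow this README describes looks roughly like the sketch below; `model` is assumed to be an already-quantized model and `state_dict_from_last_checkpoint` to come from the surrounding training script.

import modelopt.torch.opt as mto
import torch

# Save the quantizer states once after quantization (from the hunk context):
torch.save(mto.modelopt_state(model), "modelopt_quantizer_states.pt")

# To resume training or evaluate: restore quantizer states first. On torch >= 2.6,
# weights_only=False is required because the state contains non-tensor objects.
mto.restore_from_modelopt_state(
    model, torch.load("modelopt_quantizer_states.pt", weights_only=False)
)

# Only after the quantizer states are restored, load the model weights.
model.load_state_dict(state_dict_from_last_checkpoint)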

examples/llm_qat/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 flash-attn
 sentencepiece>=0.2.0
 tensorboardX
-torch>=2.1.0

examples/pruning/README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Model Optimizer can be used in one of the following complementary pruning modes
 
 ## Documentation
 
-Checkout the [Quick Start: Pruning](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/4_pruning.html) and the detailed [Optimization Guide](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_pruning.html) in the Model Optimizer documentation for more information on how to use the above pruning algorithms in Model Optimizer.
+Checkout the [Quick Start: Pruning](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/5_pruning.html) and the detailed [Optimization Guide](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/2_pruning.html) in the Model Optimizer documentation for more information on how to use the above pruning algorithms in Model Optimizer.
 
 ## Algorithms
 

examples/speculative_decoding/requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -3,4 +3,3 @@ openai
 py7zr
 sentencepiece>=0.2.0
 tensorboardX
-torch>=2.1.0

modelopt/torch/export/distribute.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def read_configs_and_weights_from_rank(
             raise ValueError("NFSWorkspace is not initialized!")
         state_path = self._get_state_path(target_rank)
         if state_path.exists():
-            state = torch.load(state_path, map_location="cpu")
+            state = torch.load(state_path, map_location="cpu", weights_only=False)
             return state["config"], state["weight"]
         else:
             return None, None

modelopt/torch/export/layer_utils.py

Lines changed: 2 additions & 2 deletions
@@ -822,8 +822,8 @@ def _split_gate_from_fc(decoder_type, module, fc_name, fc_layer):
         # for Int8 SQ case, we split the weight scaling factor into two parts.
         weight_scaling_factors = torch.chunk(weight_scaling_factor, 2, dim=0)
 
-    config.fc = build_linear_config(fc_linear)
-    config.gate = build_linear_config(fc_linear)
+    config.fc = build_linear_config(fc_linear, LINEAR_COLUMN)
+    config.gate = build_linear_config(fc_linear, LINEAR_COLUMN)
     config.fc.weight = weights[0]
     config.gate.weight = weights[1]
     if weight_scaling_factors is not None:

modelopt/torch/opt/conversion.py

Lines changed: 2 additions & 1 deletion
@@ -488,7 +488,7 @@ def restore_from_modelopt_state(model: ModelLike, modelopt_state: dict[str, Any]
 
         # Restore the previously saved modelopt state followed by model weights
         mto.restore_from_modelopt_state(
-            model, torch.load("modelopt_state.pt")
+            model, torch.load("modelopt_state.pt", weights_only=False)
         ) # Restore modelopt state
         model.load_state_dict(torch.load("model_weights.pt"), ...) # Load the model weights
 
@@ -561,6 +561,7 @@ def restore(model: ModelLike, f: Union[str, os.PathLike, BinaryIO], **kwargs) ->
 
     # load checkpoint
    kwargs.setdefault("map_location", "cpu")
+    kwargs.setdefault("weights_only", False)
     objs = torch.load(f, **kwargs)
 
     # restore model architecture
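
The second hunk uses kwargs.setdefault so that restore keeps loading checkpoints the way older torch versions did by default, while callers can still override either option. A self-contained sketch of that pattern follows, assuming a hypothetical load_checkpoint helper rather than the library's actual function.

import os
from typing import Any, BinaryIO, Union

import torch

def load_checkpoint(f: Union[str, os.PathLike, BinaryIO], **kwargs) -> Any:
    # Mirror the defaults set in modelopt.torch.opt.conversion.restore: load to
    # CPU and allow full unpickling, unless the caller overrides either option.
    kwargs.setdefault("map_location", "cpu")
    kwargs.setdefault("weights_only", False)
    return torch.load(f, **kwargs)

# Callers can still opt back into the stricter torch >= 2.6 default:
# objs = load_checkpoint("checkpoint.pt", weights_only=True)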
