NVIDIA · shuyixiong · Jan 6, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
@@ -1113,6 +1113,26 @@ def post_load_weights(self):
         )
         return self.backend.post_load_weights()
 
+    def process_weights_after_loading(self):
+        """
+        Process weights after loading by delegating to ``self.backend`` and returning its result.
+        Asserts that the backend defines process_weights_after_loading() before calling it.
+        """
+        assert hasattr(self.backend, "process_weights_after_loading"), (
+            f"Backend {self.backend.__class__.__name__} must implement process_weights_after_loading()"
+        )
+        return self.backend.process_weights_after_loading()
+
+    def pre_reload_weights(self):
+        """
+        Run the pre-reload step by delegating to ``self.backend`` and returning its result.
+        Asserts that the backend defines pre_reload_weights() before calling it.
+        """
+        assert hasattr(self.backend, "pre_reload_weights"), (
+            f"Backend {self.backend.__class__.__name__} must implement pre_reload_weights()"
+        )
+        return self.backend.pre_reload_weights()
+
     # ========== Communication and Quantization Properties ==========
 
     @property

@@ -1,3 +1,4 @@
+import inspect
 import os
 from functools import cached_property
 from typing import Dict, List, Optional, Tuple, Union
@@ -862,16 +863,22 @@ def load_weights(self,
         assert len(weights) == 1
         weights = weights[0]
 
-        if not isinstance(self.quant_method, UnquantizedFusedMoEMethod):
-            assert not allow_partial_loading, "Partial loading is not supported for quantized MoE now"
-            self.quant_method.load_weights(self, weights,
-                                           self.weight_loading_mode)
-        else:
-            self.quant_method.load_weights(
-                self,
-                weights,
-                self.weight_loading_mode,
-                allow_partial_loading=allow_partial_loading)
+        kargs = {}
+        if "allow_partial_loading" in inspect.getfullargspec(
+                self.quant_method.load_weights).args:
+            kargs["allow_partial_loading"] = allow_partial_loading
+        self.quant_method.load_weights(self, weights, self.weight_loading_mode,
+                                       **kargs)
 
     def post_load_weights(self):
         self.quant_method.post_load_weights(self)
+
+    def process_weights_after_loading(self):  # optional hook: silently skipped when quant_method lacks it
+        if hasattr(self.quant_method, 'process_weights_after_loading'):
+            self.quant_method.process_weights_after_loading(self)
+
+    def pre_reload_weights(self):  # required hook: asserts quant_method defines it, then passes this module to it
+        assert hasattr(
+            self.quant_method, 'pre_reload_weights'
+        ), "pre_reload_weights is not supported for this quant method"
+        self.quant_method.pre_reload_weights(self)
@@ -1401,3 +1401,13 @@ def load_weights(self,
 
     def post_load_weights(self):
         self.quant_method.post_load_weights(self)
+
+    def process_weights_after_loading(self):  # optional hook: silently skipped when quant_method lacks it
+        if hasattr(self.quant_method, 'process_weights_after_loading'):
+            self.quant_method.process_weights_after_loading(self)
+
+    def pre_reload_weights(self):  # required hook: asserts quant_method defines it, then passes this module to it
+        assert hasattr(
+            self.quant_method, 'pre_reload_weights'
+        ), "pre_reload_weights is not supported for this quant method"
+        self.quant_method.pre_reload_weights(self)
@@ -1,3 +1,4 @@
+import inspect
 import os
 from functools import cached_property
 from typing import Dict, List, Optional, Union
@@ -21,9 +22,9 @@
 # isort: off
 from .quantization import (
     DeepSeekFP8BlockScalesFusedMoEMethod, NVFP4TRTLLMGenFusedMoEBaseMethod,
-    NVFP4TRTLLMGenFusedMoEMethod, UnquantizedFusedMoEMethod,
-    W4A8MXFP4FP8TRTLLMGenFusedMoEMethod, W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod,
-    W4A8NVFP4FP8TRTLLMGenFusedMoEMethod, W4A16MXFP4TRTLLMGenFusedMoEMethod)
+    NVFP4TRTLLMGenFusedMoEMethod, W4A8MXFP4FP8TRTLLMGenFusedMoEMethod,
+    W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod, W4A8NVFP4FP8TRTLLMGenFusedMoEMethod,
+    W4A16MXFP4TRTLLMGenFusedMoEMethod)
 # isort: on
 from .routing import BaseMoeRoutingMethod, DeepSeekV3MoeRoutingMethod
 
@@ -273,20 +274,26 @@ def load_weights(self,
         assert len(weights) == 1
         weights = weights[0]
 
-        if not isinstance(self.quant_method, UnquantizedFusedMoEMethod):
-            assert not allow_partial_loading, "Partial loading is not supported for quantized MoE now"
-            self.quant_method.load_weights(self, weights,
-                                           self.weight_loading_mode)
-        else:
-            self.quant_method.load_weights(
-                self,
-                weights,
-                self.weight_loading_mode,
-                allow_partial_loading=allow_partial_loading)
+        kargs = {}
+        if "allow_partial_loading" in inspect.getfullargspec(
+                self.quant_method.load_weights).args:
+            kargs["allow_partial_loading"] = allow_partial_loading
+        self.quant_method.load_weights(self, weights, self.weight_loading_mode,
+                                       **kargs)
 
     def post_load_weights(self):
         self.quant_method.post_load_weights(self)
 
+    def process_weights_after_loading(self):  # optional hook: silently skipped when quant_method lacks it
+        if hasattr(self.quant_method, 'process_weights_after_loading'):
+            self.quant_method.process_weights_after_loading(self)
+
+    def pre_reload_weights(self):  # required hook: asserts quant_method defines it, then passes this module to it
+        assert hasattr(
+            self.quant_method, 'pre_reload_weights'
+        ), "pre_reload_weights is not supported for this quant method"
+        self.quant_method.pre_reload_weights(self)
+
     def quantize_input(self, x, post_quant_comm: bool = True):
         """Quantize inputs prior to post-communication (alltoall/allgather) or before MoE computation.
 

@@ -1,3 +1,4 @@
+import inspect
 import os
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -949,20 +950,26 @@ def load_weights(self,
         assert len(weights) == 1
         weights = weights[0]
 
-        if not isinstance(self.quant_method, UnquantizedFusedMoEMethod):
-            assert not allow_partial_loading, "Partial loading is not supported for quantized MoE now"
-            self.quant_method.load_weights(self, weights,
-                                           self.weight_loading_mode)
-        else:
-            self.quant_method.load_weights(
-                self,
-                weights,
-                self.weight_loading_mode,
-                allow_partial_loading=allow_partial_loading)
+        kargs = {}
+        if "allow_partial_loading" in inspect.getfullargspec(
+                self.quant_method.load_weights).args:
+            kargs["allow_partial_loading"] = allow_partial_loading
+        self.quant_method.load_weights(self, weights, self.weight_loading_mode,
+                                       **kargs)
 
     def post_load_weights(self):
         self.quant_method.post_load_weights(self)
 
+    def process_weights_after_loading(self):  # optional hook: silently skipped when quant_method lacks it
+        if hasattr(self.quant_method, 'process_weights_after_loading'):
+            self.quant_method.process_weights_after_loading(self)
+
+    def pre_reload_weights(self):  # required hook: asserts quant_method defines it, then passes this module to it
+        assert hasattr(
+            self.quant_method, 'pre_reload_weights'
+        ), "pre_reload_weights is not supported for this quant method"
+        self.quant_method.pre_reload_weights(self)
+
     def forward_fake(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],

@@ -512,12 +512,20 @@ def create_weights(self):
         raise NotImplementedError
 
     @abstractmethod
-    def load_weights(self, weights: List[Dict]):
+    def load_weights(self,
+                     weights: List[Dict],
+                     allow_partial_loading: bool = False):  # NOTE(review): overriding subclasses presumably need to accept this keyword too - confirm
         raise NotImplementedError
 
+    def process_weights_after_loading(self):
+        pass  # default no-op; not marked abstract, so overriding is optional
+
     def post_load_weights(self):
         pass
 
+    def pre_reload_weights(self):
+        pass  # default no-op; not marked abstract, so overriding is optional
+
     @abstractmethod
     def quantize_input(
         self,