@@ -1264,7 +1264,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
12641264 enable_round_tuning = False ,
12651265 enable_torch_compile = self .enable_torch_compile ,
12661266 disable_opt_rtn = disable_opt_rtn ,
1267- enable_rtn = self .iters == 0 ,
12681267 )
12691268 m = m .unwrapper ({})
12701269 except torch .OutOfMemoryError :
@@ -1280,7 +1279,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
12801279 enable_norm_bias_tuning = False ,
12811280 enable_round_tuning = False ,
12821281 enable_torch_compile = self .enable_torch_compile ,
1283- enable_rtn = self .iters == 0 ,
12841282 )
12851283 m = m .unwrapper ({})
12861284 except Exception as e :
@@ -1944,7 +1942,6 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
19441942 enable_torch_compile = self .enable_torch_compile ,
19451943 device = self .device ,
19461944 disable_opt_rtn = self .disable_opt_rtn ,
1947- enable_rtn = self .iters == 0 ,
19481945 )
19491946 new_layer = wrapper_layer .unwrapper ({})
19501947 set_module (self .model , layer_name , new_layer )
@@ -2713,7 +2710,6 @@ def _quantize_layer(
27132710 enable_minmax_tuning = self .enable_minmax_tuning ,
27142711 enable_torch_compile = self .enable_torch_compile ,
27152712 device = device ,
2716- enable_rtn = self .iters == 0 ,
27172713 ).to (device )
27182714 round_params = []
27192715 minmax_params = []
@@ -3025,7 +3021,6 @@ def _quantize_block(
30253021 self .enable_norm_bias_tuning ,
30263022 enable_torch_compile = self .enable_torch_compile ,
30273023 device = device ,
3028- enable_rtn = self .iters == 0 ,
30293024 )
30303025 # Call this before quantization and after applying the block wrapper.
30313026 if is_nv_fp (self .data_type ): # enable qkv and moe structure global_scale fuse.
0 commit comments