intel
diff --git a/auto_round/auto_scheme/delta_loss.py b/auto_round/auto_scheme/delta_loss.py
Lines changed: 4 additions & 4 deletions
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
Lines changed: 0 additions & 5 deletions
@@ -33,8 +33,8 @@
 )
 from auto_round.calib_dataset import get_dataloader
 from auto_round.data_type.gguf import (
-    quant_tensor_gguf_opt_rtn_asym_dq,
-    quant_tensor_gguf_opt_rtn_sym_dq,
+    quant_tensor_gguf_asym_dq,
+    quant_tensor_gguf_sym_dq,
     search_gguf_scale_min_asym,
     search_gguf_scale_min_sym,
 )
@@ -260,7 +260,7 @@ def _init_scale(self):
             scale, wmin, d_scale, d_wmin = search_gguf_scale_min_asym(tensor, bits, scale_dtype, imatrix)
             tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len)
 
-            qdq_w, _, _ = quant_tensor_gguf_opt_rtn_asym_dq(
+            qdq_w, _, _ = quant_tensor_gguf_asym_dq(
                 tensor=tensor,
                 bits=bits,
                 scale_dtype=scale_dtype,
@@ -275,7 +275,7 @@ def _init_scale(self):
             tensor, orig_shape, pad_len = reshape_pad_tensor_by_group_size(tensor, group_size)
             scale, d_scale = search_gguf_scale_min_sym(tensor, bits, imatrix, scale_dtype, split_num=1)
             tensor = revert_tensor_by_pad(tensor, orig_shape=orig_shape, pad_len=pad_len)
-            qdq_w, _, _ = quant_tensor_gguf_opt_rtn_sym_dq(
+            qdq_w, _, _ = quant_tensor_gguf_sym_dq(
                 tensor=tensor, bits=bits, scale_dtype=scale_dtype, imatrix=imatrix, scale=scale, d_scale=d_scale
             )
         else:
 
@@ -1264,7 +1264,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
                     enable_round_tuning=False,
                     enable_torch_compile=self.enable_torch_compile,
                     disable_opt_rtn=disable_opt_rtn,
-                    enable_rtn=self.iters == 0,
                 )
                 m = m.unwrapper({})
             except torch.OutOfMemoryError:
@@ -1280,7 +1279,6 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
                         enable_norm_bias_tuning=False,
                         enable_round_tuning=False,
                         enable_torch_compile=self.enable_torch_compile,
-                        enable_rtn=self.iters == 0,
                     )
                     m = m.unwrapper({})
                 except Exception as e:
@@ -1944,7 +1942,6 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
                     enable_torch_compile=self.enable_torch_compile,
                     device=self.device,
                     disable_opt_rtn=self.disable_opt_rtn,
-                    enable_rtn=self.iters == 0,
                 )
                 new_layer = wrapper_layer.unwrapper({})
                 set_module(self.model, layer_name, new_layer)
@@ -2713,7 +2710,6 @@ def _quantize_layer(
             enable_minmax_tuning=self.enable_minmax_tuning,
             enable_torch_compile=self.enable_torch_compile,
             device=device,
-            enable_rtn=self.iters == 0,
         ).to(device)
         round_params = []
         minmax_params = []
@@ -3025,7 +3021,6 @@ def _quantize_block(
             self.enable_norm_bias_tuning,
             enable_torch_compile=self.enable_torch_compile,
             device=device,
-            enable_rtn=self.iters == 0,
         )
         # Call this before quantization and after applying the block wrapper.
         if is_nv_fp(self.data_type):  # enable qkv and moe structure global_scale fuse.