diff --git a/README.md b/README.md index ddaecc53a..480e72dbe 100644 --- a/README.md +++ b/README.md @@ -323,7 +323,7 @@ The support for Gaudi device is limited. from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 844f366bb..59ccb9ea2 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -701,7 +701,7 @@ def tune(args): logger.error("Cannot find correct gguf file for evaluation, please check.") sys.exit(-1) model = AutoModelForCausalLM.from_pretrained( - eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + eval_folder, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file) diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md index ca9adb93f..d9de06d61 100644 --- a/auto_round/compressors/diffusion/README.md +++ b/auto_round/compressors/diffusion/README.md @@ -15,7 +15,7 @@ from diffusers import AutoPipelineForText2Image # Load the model model_name = "black-forest-labs/FLUX.1-dev" -pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) +pipe = AutoPipelineForText2Image.from_pretrained(model_name, dtype=torch.bfloat16) # Quantize the model autoround = AutoRound( diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 71a5c1402..8fd43319a 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -174,7 +174,7 @@ def eval(args): " but may affect accuracy." 
) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() st = time.time() @@ -252,7 +252,7 @@ def eval_task_by_task( ) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() parallelism = False diff --git a/auto_round/experimental/kv_cache.py b/auto_round/experimental/kv_cache.py index 8a49f3072..e67c6138a 100644 --- a/auto_round/experimental/kv_cache.py +++ b/auto_round/experimental/kv_cache.py @@ -263,7 +263,7 @@ def prep_attention_module_for_calibration(module: torch.nn.Module): def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch.dtype: valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"] - valid_torch_dtype = { + valid_dtype = { "float16": torch.float16, "bfloat16": torch.bfloat16, "fp8": torch.float8_e4m3fn, @@ -272,13 +272,13 @@ def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch "float": torch.float32, # Alias for float32 } if static_kv_dtype in valid_dtype_name_lst: - new_dtype = valid_torch_dtype[static_kv_dtype] - elif static_kv_dtype in valid_torch_dtype.values(): + new_dtype = valid_dtype[static_kv_dtype] + elif static_kv_dtype in valid_dtype.values(): new_dtype = static_kv_dtype else: raise ValueError( f"Invalid static kv dtype: {static_kv_dtype}. " - f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}." + f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_dtype.values()))}." ) return new_dtype diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 8b8a618e2..42b4644c3 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -105,10 +105,10 @@ def pack_layer(layer_name, model, data_type, device=None): weight = layer.weight weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size) act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None - torch_dtype = torch.float8_e4m3fn + dtype = torch.float8_e4m3fn if "fp8_e5m2" in data_type: - torch_dtype = torch.float8_e5m2 - info = torch.finfo(torch_dtype) + dtype = torch.float8_e5m2 + info = torch.finfo(dtype) if zp is not None: if isinstance(zp, torch.Tensor): zp = zp.to(packing_device) @@ -117,7 +117,7 @@ def pack_layer(layer_name, model, data_type, device=None): q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) q_weight = torch.clamp(q_weight, info.min, info.max) - q_weight = q_weight.to(torch_dtype) + q_weight = q_weight.to(dtype) if type(layer) == torch.nn.Linear: in_features = layer.in_features out_features = layer.out_features diff --git a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py index c7c327f1b..c1ddff569 100644 --- a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py +++ b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py @@ -172,7 +172,7 @@ def __init__( # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - # NOTE: 
can't use field "torch_dtype" in config.json, because some finetunes lie. + # NOTE: can't use field "dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py index 1b0b48b35..79101cd6f 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py @@ -73,10 +73,10 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device: weight = layer.weight weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size) act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None - torch_dtype = torch.float8_e4m3fn + dtype = torch.float8_e4m3fn if "fp8_e5m2" in data_type: - torch_dtype = torch.float8_e5m2 - info = torch.finfo(torch_dtype) + dtype = torch.float8_e5m2 + info = torch.finfo(dtype) if zp is not None: if isinstance(zp, torch.Tensor): zp = zp.to(packing_device) @@ -85,7 +85,7 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device: q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) q_weight = torch.clamp(q_weight, info.min, info.max) - q_weight = q_weight.to(torch_dtype) + q_weight = q_weight.to(dtype) if type(layer) == torch.nn.Linear: in_features = layer.in_features out_features = layer.out_features diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 78e2f43e6..9237a72ff 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -58,7 +58,7 @@ def save_model( if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")): with open(config_path, "r") as file: data = json.load(file) - data["torch_dtype"] = str(dtype).split(".")[-1] + data["dtype"] = str(dtype).split(".")[-1] with open(config_path, "w") as file: json.dump(data, file, indent=2) config_file = "quantization_config.json" diff --git a/auto_round/inference/auto_quantizer.py b/auto_round/inference/auto_quantizer.py index 33ab74d8d..dca1e3370 100644 --- a/auto_round/inference/auto_quantizer.py +++ b/auto_round/inference/auto_quantizer.py @@ -329,10 +329,10 @@ def validate_environment(self, *args, **kwargs): "auto-round` or install from source" ) - def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": - if torch_dtype is None: - torch_dtype = torch.bfloat16 - return torch_dtype + def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype": + if dtype is None: + dtype = torch.bfloat16 + return dtype def post_init_model(self, model): """Post-initialization that require device information, for example buffers initialization on device. 
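As an aside to the gpt_oss change just below, which drops the legacy `torch_dtype` fallback when reading the model config: older checkpoints may still carry `torch_dtype` in their config.json. A minimal compatibility sketch (hypothetical helper, not part of this patch) that accepts either field could look like this:

```python
import torch

def resolve_config_dtype(config, default=torch.float32):
    # Prefer the new `dtype` field, falling back to the legacy `torch_dtype`
    # written by older transformers versions (illustrative helper only).
    dtype_str = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)
    if dtype_str is None:
        return default
    # Accept a torch.dtype, "bfloat16", or "torch.bfloat16" and map it to torch.<name>.
    return getattr(torch, str(dtype_str).split(".")[-1], default)
```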
diff --git a/auto_round/modelling/gpt_oss.py b/auto_round/modelling/gpt_oss.py index 78f73075c..b5236c604 100644 --- a/auto_round/modelling/gpt_oss.py +++ b/auto_round/modelling/gpt_oss.py @@ -62,7 +62,7 @@ def __init__(self, config: GptOssConfig, original: GptOssMLP): super().__init__() hidden_size = config.hidden_size intermediate_size = config.intermediate_size - dtype_str = getattr(config, "torch_dtype", None) or getattr(config, "dtype", None) + dtype_str = getattr(config, "dtype", None) dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32 top_k = config.num_experts_per_tok self.hidden_size = hidden_size diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 7a1c66de8..11a4663fd 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -195,9 +195,9 @@ def llm_load_model( ) device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower())) @@ -210,7 +210,7 @@ def llm_load_model( if _use_hpu_compile_mode(): model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, attn_implementation="eager", trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, @@ -219,7 +219,7 @@ def llm_load_model( try: model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, ) @@ -228,7 +228,7 @@ def llm_load_model( orig_func = set_fake_cuda_device_capability() model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, ) @@ -241,7 +241,7 @@ def llm_load_model( logger.warning(f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry.") model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=False, device_map="auto" if use_auto_mapping else None, ) @@ -256,7 +256,7 @@ def llm_load_model( def mllm_load_model( pretrained_model_name_or_path, device="cpu", - torch_dtype="auto", + dtype="auto", use_auto_mapping=True, trust_remote_code=True, model_dtype=None, @@ -268,9 +268,9 @@ def mllm_load_model( from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 if os.path.isdir(pretrained_model_name_or_path): config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) else: @@ -306,7 +306,7 @@ def mllm_load_model( model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) else: @@ -318,7 +318,7 @@ def mllm_load_model( pretrained_model_name_or_path, model_base=None, model_name=pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, ) else: if architectures.endswith("Model") and hasattr( @@ -333,7 +333,7 @@ def mllm_load_model( model = 
cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) except ValueError as e: @@ -342,7 +342,7 @@ def mllm_load_model( model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) torch.cuda.get_device_capability = orig_func @@ -383,7 +383,7 @@ def mllm_load_model( def diffusion_load_model( pretrained_model_name_or_path: str, device: Union[str, torch.device] = "cpu", - torch_dtype: Union[str, torch.dtype] = "auto", + dtype: Union[str, torch.dtype] = "auto", use_auto_mapping: bool = False, trust_remote_code: bool = True, model_dtype: str = None, @@ -393,15 +393,13 @@ def diffusion_load_model( from auto_round.utils.device import get_device_and_parallelism device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 pipelines = LazyImport("diffusers.pipelines") - pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( - pretrained_model_name_or_path, torch_dtype=torch_dtype - ) + pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, dtype=dtype) pipe = _to_model_dtype(pipe, model_dtype) model = pipe.transformer return pipe, model.to(device) diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 6efbc85e7..c7786cb5e 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -489,7 +489,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "opensourcerelease/DeepSeek-R1-bf16" tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, dtype="auto") block = model.model.layers device_map = {} @@ -599,7 +599,7 @@ Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX) from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -615,7 +615,7 @@ Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -630,7 +630,7 @@ Supports 2, 3, 4, and 8 bits. 
We recommend using GPTQModel for 4 and 8 bits infe from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -670,7 +670,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," @@ -701,7 +701,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index dfc387dee..0ff975460 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -24,7 +24,7 @@ class TestAutoRoundAct(unittest.TestCase): def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -35,7 +35,7 @@ def tearDownClass(self): def test_mx_fp4(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -54,7 +54,7 @@ def test_mx_fp4(self): def test_wint4fp8_dynamic(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size = 4, 128 autoround = AutoRound( @@ -93,7 +93,7 @@ def test_wfp8afp8_static(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, 
dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, @@ -114,7 +114,7 @@ def test_wfp8afp8_static(self): self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..cb0e86135 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -24,7 +24,7 @@ class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 3adfd9f47..206a3b0ab 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -29,7 +29,7 @@ class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @@ -103,7 +103,7 @@ def test_consecutive_quant(self): autoround.quantize() model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True + "/tf_dataset/auto_round/models/microsoft/phi-2", dtype="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained( "/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True @@ -237,7 +237,7 @@ def test_disable_quanted_input(self): def test_enable_norm_bias_tuning_qwen3(self): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, @@ -337,7 +337,7 @@ def test_auto_device_map(self): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" + model_name, dtype="auto", trust_remote_code=True, device_map="auto" ) autoround = AutoRound( model, @@ -386,7 +386,7 @@ def test_fp32(self): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - 
model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" + model_name, dtype=torch.float32, trust_remote_code=True, device_map="auto" ) autoround = AutoRound( model, @@ -417,7 +417,7 @@ def test_tensor_reshape(self): def test_rtn(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -426,7 +426,7 @@ def test_rtn(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( self.save_folder, - torch_dtype=torch.float16, + dtype=torch.float16, device_map="auto", ) @@ -457,7 +457,7 @@ def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" + model_name, dtype=torch.float32, trust_remote_code=True, device_map="auto" ) layer_config = { "model.decoder.layers.0.self_attn.q_proj": {"bits": 16}, @@ -500,7 +500,7 @@ def test_not_convert_modules(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( - model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 + model_name, quantization_config=quantization_config, device_map="cpu", dtype=torch.float16 ) self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) @@ -545,7 +545,7 @@ def test_not_convert_modules(self): def test_fallback_layers_regex_awq(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -581,7 +581,7 @@ def test_fallback_layers_regex_awq(self): def test_fallback_layers_regex_gptq(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -617,7 +617,7 @@ def test_fallback_layers_regex_gptq(self): def test_fallback_layers_regex_round(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { 
r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -653,7 +653,7 @@ def test_fallback_layers_regex_round(self): def test_fallback_layers_regex_exception(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} with self.assertRaises(ValueError): @@ -674,7 +674,7 @@ def test_fallback_layers_regex_exception(self): # model_name = "Qwen/Qwen3-0.6B-FP8" # ar = AutoRound(model=model_name, iters=0) # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.save_folder, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) # text = "There is a girl who likes adventure," # inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 97211ade4..43b300cf8 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -38,7 +38,7 @@ def tearDownClass(self): def test_default_acc(self): model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True inp = torch.ones([1, 10], dtype=torch.long) @@ -57,7 +57,7 @@ def test_default_acc(self): out0 = model(inp) print(f"out0 = {float(out0[0][0][0][0])}") - model_tmp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) + model_tmp = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32, trust_remote_code=True) autoround_1 = AutoRound( model_tmp, tokenizer, diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 501caee25..3d3af1e25 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -160,7 +160,7 @@ def test_multimodal_quant(self): assert len(block_names_wo_vision) != (block_names_with_vision) def test_block_name_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) from auto_round.utils import get_block_names llm_block_names = get_block_names(self.model) diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 689cc705c..3834c7c44 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -39,7 +39,7 @@ def setUpClass(self): jsonl_file.write("\n") model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, 
dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) def test_json(self): @@ -72,7 +72,7 @@ def test_jsonl(self): def test_apply_chat_template(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" bits, group_size, sym = 4, 128, True diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..ca74e6b4f 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -33,7 +33,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model, diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index ea484316b..a906a59e2 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -37,7 +37,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -218,7 +218,7 @@ def test_static_afp8_export(self, static_kv_dtype): from safetensors import safe_open model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, @@ -247,7 +247,7 @@ def test_static_afp8_export(self, static_kv_dtype): model = transformers.AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", low_cpu_mem_usage=True, trust_remote_code=True, ) @@ -277,7 +277,7 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) shutil.rmtree(quantized_model_path, ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..c9a60f617 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -23,7 +23,7 @@ class TestAutoRoundFormatGeneration(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = 
AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @@ -63,7 +63,7 @@ def test_4bits_sym(self): assert "!!!" not in res model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map="cpu", quantization_config=quantization_config, torch_dtype=torch.float16 + quantized_model_path, device_map="cpu", quantization_config=quantization_config, dtype=torch.float16 ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -74,7 +74,7 @@ def test_4bits_sym(self): def test_autoround_sym(self): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 308425cd1..981f3415e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -123,7 +123,7 @@ def test_func(self): shutil.rmtree("./saved", ignore_errors=True) # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -148,7 +148,7 @@ def test_func(self): # # def test_q5_k(self): # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -172,7 +172,7 @@ def test_func(self): # def test_q6_k(self): # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -196,7 +196,7 @@ def test_func(self): def test_gguf_baseline(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, @@ -219,7 +219,7 @@ def test_gguf_baseline(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -243,7 +243,7 @@ def test_gguf_baseline(self): def test_q4_k_m(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { "lm_head": { @@ -282,7 +282,7 @@ def test_q4_k_m(self): self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..963ec1036 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -15,7 +15,7 @@ class TestLLMC(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..dcd71457a 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -43,7 +43,7 @@ def test_load_gptq_no_dummy_gidx_model(self): with self.assertRaises(NotImplementedError) as cm: model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -54,7 +54,7 @@ def test_load_awq(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..50da50305 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -40,7 +40,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = ".saved/" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -238,7 +238,7 @@ def test_mixed_MXFP_autoround_format_loading(self): autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..de49781ea 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -175,7 +175,7 @@ def test_str_input(self): model = 
Qwen2VLForConditionalGeneration.from_pretrained( quantized_model_path, - torch_dtype="float16", + dtype="float16", device_map="auto", ) processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -232,7 +232,7 @@ def test_qwen2_5(self): from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", dtype="auto", device_map="auto") image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" processor = AutoProcessor.from_pretrained("./saved") messages = [ diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 4fcd25135..23aaffdf1 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -37,7 +37,7 @@ class TestAutoRoundFP(unittest.TestCase): def setUpClass(self): model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -308,7 +308,7 @@ def test_qwen_moe_quant_infer(self): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..5528a209f 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -63,7 +63,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", ) model.eval() assert has_module( diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index 557bf1f38..7a334254e 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -63,7 +63,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -81,7 +81,7 @@ def test_torch_4bits_asym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -92,7 +92,7 @@ def test_torch_4bits_asym(self): 
torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -104,7 +104,7 @@ def test_torch_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -122,7 +122,7 @@ def test_torch_4bits_sym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 8bb23f6aa..977277aa8 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -65,7 +65,7 @@ def model_infer(self, model, tokenizer): @require_greater_than_051 def test_3bits_autoround(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=3) quantized_model_path = self.save_dir @@ -73,7 +73,7 @@ def test_3bits_autoround(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -85,7 +85,7 @@ def test_3bits_autoround(self): @require_greater_than_051 def test_3bits_asym_autoround(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) bits, sym = 3, False autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) @@ -107,7 +107,7 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() @@ -124,7 +124,7 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): model_name = "/models/opt-125m" - model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=2, group_size=64) autoround.quantize() diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..1f33960f7 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -82,7 +82,7 @@ def tearDownClass(self): @require_package_version_ut("transformers", "<4.57.0") def test_autoround_asym(self): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, False autoround = AutoRound( @@ -112,7 +112,7 @@ def test_autoround_asym(self): @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -128,7 +128,7 @@ def test_mixed_precision(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -140,7 +140,7 @@ def test_mixed_precision(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -157,7 +157,7 @@ def test_awq_backend(self): quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -168,7 +168,7 @@ def test_awq_backend(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -180,7 +180,7 @@ def test_tritonv2_bf16(self): model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.bfloat16, 
device_map="auto", quantization_config=quantization_config + model_name, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -190,7 +190,7 @@ def test_tritonv2_bf16(self): @require_ipex def test_autoround_gptq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -245,7 +245,7 @@ def test_autoround_gptq_sym_format(self): @require_ipex @require_package_version_ut("transformers", "<4.57.0") def test_autoround_awq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -271,7 +271,7 @@ def test_autoround_awq_sym_format(self): assert "!!!" not in res model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.bfloat16 + quantized_model_path, device_map="cpu", trust_remote_code=True, dtype=torch.bfloat16 ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -285,7 +285,7 @@ def test_autoround_awq_sym_format(self): @require_greater_than_050 def test_autoround_sym(self): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( @@ -319,7 +319,7 @@ def test_load_gptq_model_3bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="auto", quantization_config=quantization_config, diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 70366cf05..a8b728c64 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -100,7 +100,7 @@ def test_multi_card_1(self): from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..76eb06d23 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -30,7 +30,7 @@ def setUpClass(self): jsonl_file.write("\n") model_name = "facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + 
self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) def test_combine_dataset(self): diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..f795b81f5 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -35,7 +35,7 @@ def tearDownClass(self): @require_gptqmodel def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 5c12e0557..0a08cf0cf 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -65,7 +65,7 @@ def tearDownClass(self): @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -76,7 +76,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -87,7 +87,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -100,7 +100,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): @require_autogptq def test_gptq_exllamav2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -118,7 +118,7 @@ def test_gptq_exllamav2_4bits_sym(self): quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -133,7 +133,7 @@ def test_gptq_exllamav2_4bits_sym(self): def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = 
AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -152,7 +152,7 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index d6d6c1f93..62b1dae52 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -35,7 +35,7 @@ def tearDownClass(self): @require_optimum def test_autogptq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -66,7 +66,7 @@ def test_autogptq_format(self): @require_optimum def test_autogptq_format_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} for n, m in model.named_modules(): @@ -103,7 +103,7 @@ def test_autogptq_format_fp_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autogptq_format_qsave_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} for n, m in model.named_modules(): @@ -154,7 +154,7 @@ def test_autogptq_format_qsave_fp_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -187,7 +187,7 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -221,7 +221,7 @@ def test_autoawq_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def 
test_autoawq_format_fp_qsave_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, @@ -262,7 +262,7 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_3bit_asym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, False autoround = AutoRound( @@ -291,7 +291,7 @@ def test_autoround_3bit_asym_torch_format(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_3bit_sym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, True autoround = AutoRound( diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 43e745c4e..e8ede8801 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -26,7 +26,7 @@ def test_small_model_rtn_generation(self): model_name = "/models/Qwen3-0.6B-FP8" ar = AutoRound(model=model_name, iters=0) ar.quantize_and_save(output_dir=self.save_dir) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.save_dir, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -44,7 +44,7 @@ def test_gguf_imatrix(self): # output = llm("There is a girl who likes adventure,", max_tokens=32) # print(output) # shutil.rmtree("./saved", ignore_errors=True) - # model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.save_dir, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # text = "There is a girl who likes adventure," # inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index 0193b5c14..a8e0415c9 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -38,14 +38,14 @@ def check_block_names(self, block_names, prefixs=[], n_layers=[]): def test_glm4(self): model_name = "/models/glm-4-9b-chat" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["transformer.encoder.layers"], [40]) assert is_pure_text_model(model), "Expected model to be pure text model" def test_opt_125m(self): model_name = "/models/opt-125m" - model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.decoder.layers"], [12]) @@ -53,56 +53,56 @@ def test_opt_125m(self): def test_Qwen(self): model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) assert is_pure_text_model(model) def test_phi4(self): model_name = "/models/phi-4" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [40]) assert is_pure_text_model(model) def test_llama3(self): model_name = "/models/Meta-Llama-3.1-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_mixtral(self): model_name = "/models/Mixtral-8x7B-Instruct-v0.1" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_falcon(self): model_name = "/models/Falcon3-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) assert is_pure_text_model(model) def test_orca(self): model_name = "/models/Orca-2-7b" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_OLMo(self): model_name = "/models/OLMo-2-1124-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_Qwen2VL(self): model_name = "/models/Qwen2-VL-2B-Instruct" - model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [28]) @@ -112,7 +112,7 @@ def test_Qwen2VL(self): def 
test_Llama32(self): model_name = "/models/Llama-3.2-11B-Vision-Instruct" - model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [40]) @@ -131,7 +131,7 @@ def test_Llama32(self): def test_SmolVLM(self): model_name = "/models/SmolVLM-Instruct" - model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.text_model.layers"], [24]) @@ -141,7 +141,7 @@ def test_SmolVLM(self): def test_glm_4v(self): model_name = "/models/glm-4v-9b" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["transformer.encoder.layers"], [40]) @@ -153,7 +153,7 @@ def test_glm_4v(self): def test_gemma3(self): model_name = "/models/gemma-3-12b-it" - model = Gemma3ForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Gemma3ForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [48]) @@ -165,7 +165,7 @@ def test_gemma3(self): def test_Mistral3(self): model_name = "/models/Mistral-Small-3.1-24B-Instruct-2503" - model = Mistral3ForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Mistral3ForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [40]) @@ -177,7 +177,7 @@ def test_Mistral3(self): def test_Molmo(self): model_name = "/models/Molmo-7B-D-0924" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.transformer.blocks"], [28]) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 681387285..11ebc1275 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -41,7 +41,7 @@ def tearDownClass(self): @require_optimum def test_backend(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -69,7 +69,7 @@ def test_backend(self): @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") 
tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -87,7 +87,7 @@ def test_backend_awq(self): @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -112,7 +112,7 @@ def test_fp_layers(self): @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -135,7 +135,7 @@ def test_fp_layers_awq(self): @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") def test_undivided_group_size_tuning(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) @@ -144,7 +144,7 @@ def test_undivided_group_size_tuning(self): @require_gptqmodel def test_adam(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -165,7 +165,7 @@ def test_autoround_asym(self): ##need to install false print("skip autoround asym test, as autoround is not installed from source") return model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False) autoround.quantize() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 26d3ddca2..331ad4188 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -26,7 +26,7 @@ class TestAutoRoundMarlinBackend(unittest.TestCase): def test_marlin_group_size(self): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -44,7 +44,7 @@ def test_marlin_group_size(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - 
self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -55,7 +55,7 @@ def test_marlin_group_size(self): for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -73,7 +73,7 @@ def test_marlin_group_size(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -120,7 +120,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_marlin_4bits_sym_with_zp_m_1(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -138,7 +138,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -149,7 +149,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -161,7 +161,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): shutil.rmtree("./saved", ignore_errors=True) # def test_marlin_4bits_sym(self): - # model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # bits, group_size, sym = 4, 128, True # autoround = AutoRound( @@ -180,7 +180,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( # self.save_folder, - # torch_dtype=torch.float16, + # dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) @@ -194,7 +194,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # # model = AutoModelForCausalLM.from_pretrained( # self.save_folder, - # torch_dtype=torch.bfloat16, + # dtype=torch.bfloat16, # device_map="auto", # 
quantization_config=quantization_config # ) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..575b3f16d 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -34,7 +34,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): self.model_name = "/models/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -234,7 +234,7 @@ def test_mixed_MXFP_autoround_format_loading(self): autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index f2f1685be..c1e826865 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -42,7 +42,7 @@ def tearDownClass(self): @require_gptqmodel def test_device_map_str(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = ".*q_proj:0,.*k_proj:cuda:0,v_proj:1,.*up_proj:1" autoround = AutoRound(model, tokenizer, device_map=device_map) @@ -59,7 +59,7 @@ def test_device_map_str(self): @multi_card def test_layer_norm(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {"norm": "cuda:1"} autoround = AutoRound( @@ -70,7 +70,7 @@ def test_layer_norm(self): @multi_card def test_rms_norm(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {"norm": "cuda:1"} autoround = AutoRound( @@ -81,7 +81,7 @@ def test_rms_norm(self): @multi_card def test_act_quantization(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( @@ -92,7 +92,7 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} @@ -111,7 +111,7 @@ def 
test_lm_head(self): @multi_card def test_device_map(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) @@ -170,7 +170,7 @@ def test_device_map(self): "cuda", "auto", ]: - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map=tmp_device_map) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map=tmp_device_map) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -214,7 +214,7 @@ def test_device_map_dict(self): device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound( model, @@ -314,7 +314,7 @@ def test_device_map_for_triton(self): "auto", ]: model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", device_map=tmp_device_map, quantization_config=quantization_config + model_name, dtype="auto", device_map=tmp_device_map, quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..07a9c2408 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -32,7 +32,7 @@ def test_e2e_quant_and_infer(scheme): model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", - torch_dtype="auto", + dtype="auto", trust_remote_code=True, ) @@ -52,7 +52,7 @@ def test_e2e_quant_and_infer(scheme): # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", ) model.eval() assert has_module(model, QMODULE_MAPPING[scheme]), f"Expected {QMODULE_MAPPING[scheme].__name__} in the model." 
diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 48dd27d9b..82c731765 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -157,7 +157,7 @@ def test_qwen_moe_quant_infer(self): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..9e6127582 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -52,7 +52,7 @@ def test_load_gptq_model_8bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -66,7 +66,7 @@ def test_load_gptq_model_2bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -76,7 +76,7 @@ def test_load_gptq_model_2bits(self): @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -95,7 +95,7 @@ def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained( self.save_folder, - torch_dtype=torch.float16, + dtype=torch.float16, device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -109,7 +109,7 @@ def test_mixed_precision(self): @require_gptqmodel def test_autoround_sym(self): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index fe036361f..0685cf5c9 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -39,7 +39,7 @@ def tearDownClass(self): # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor # model = Qwen2VLForConditionalGeneration.from_pretrained( # quantized_model_path, - # torch_dtype="float16", + # dtype="float16", # device_map=f"cuda:{self.device}", # ) # processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -99,7 +99,7 @@ def test_phi3(self): quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, - torch_dtype="float16", + dtype="float16", ) processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True, num_crops=4) @@ -146,7 +146,7 @@ def test_phi3_vision_awq(self): 
quantized_model_path = os.path.join(self.save_dir, "Phi-3.5-vision-instruct-w4g128") res = os.system(f"cp /models/Phi-3.5-vision-instruct/*.py {quantized_model_path}") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, torch_dtype="auto" + quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, dtype="auto" ) assert "WQLinear_GEMM" in str( type(model.model.vision_embed_tokens.img_processor.vision_model.encoder.layers[0].mlp.fc1) @@ -200,7 +200,7 @@ class DataArgs: quantized_model_path, model_base=None, model_name=quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map=f"cuda:{self.device}", ) image_url = "http://images.cocodataset.org/train2017/000000116003.jpg" @@ -232,7 +232,7 @@ class DataArgs: # quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128") # model = MllamaForConditionalGeneration.from_pretrained( # quantized_model_path, - # torch_dtype="float16", + # dtype="float16", # device_map=f"cuda:{self.device}", # ) # processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -277,7 +277,7 @@ def test_cogvlm(self): model = ( AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="float16", + dtype="float16", trust_remote_code=True, device_map=DEVICE, ) @@ -343,7 +343,7 @@ def test_deepseek_vl2(self): quantized_model_path, trust_remote_code=True, device_map=f"cuda:{self.device}", - torch_dtype="float16", + dtype="float16", ) vl_gpt = vl_gpt.eval() diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..f50eb4cb4 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -64,7 +64,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -82,7 +82,7 @@ def test_torch_4bits_asym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -93,7 +93,7 @@ def test_torch_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -105,7 +105,7 @@ def test_torch_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -123,7 +123,7 
@@ def test_torch_4bits_sym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..d0c16acc4 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -60,7 +60,7 @@ def setUpClass(cls): torch.cuda.synchronize() cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) cls.quantized_model = AutoModelForCausalLM.from_pretrained( - cls.model_name, device_map=cls.device_map, torch_dtype=torch.float16 + cls.model_name, device_map=cls.device_map, dtype=torch.float16 ) def tearDown(self): @@ -90,7 +90,7 @@ def test_quantized_model_bf16(self): quantization_config = AutoRoundConfig(backend="triton") quantized_model = AutoModelForCausalLM.from_pretrained( self.model_name, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, device_map=self.device_map, quantization_config=quantization_config, ) @@ -105,7 +105,7 @@ def test_quantized_model_on_cpu(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt") - quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") + quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -121,7 +121,7 @@ def test_save_pretrained(self): quantized_model = AutoModelForCausalLM.from_pretrained( self.model_name, device_map=self.device_map, - torch_dtype=torch.float16, + dtype=torch.float16, quantization_config=quantization_config, ) @@ -140,7 +140,7 @@ def test_quantized_model_multi_gpu(self): """ quantization_config = AutoRoundConfig(backend="triton") quantized_model = AutoModelForCausalLM.from_pretrained( - self.model_name, device_map="auto", quantization_config=quantization_config, torch_dtype="auto" + self.model_name, device_map="auto", quantization_config=quantization_config, dtype="auto" ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) @@ -155,7 +155,7 @@ def test_convert_from_gptq(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cuda", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -173,7 +173,7 @@ def test_convert_from_awq_cpu(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -186,7 +186,7 @@ def test_mixed_bits(self): Simple test that checks if auto-round work properly with mixed bits """ model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 8}, @@ -199,7 +199,7 @@ def test_mixed_bits(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) with tempfile.TemporaryDirectory() as tmpdirname: autoround.quantize_and_save(output_dir=tmpdirname) - model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda") + model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=torch.float16, device_map="cuda") text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..1902128c0 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -60,7 +60,7 @@ def tearDownClass(self): @require_greater_than_050 def test_tritonv2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -78,7 +78,7 @@ def test_tritonv2_4bits_asym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -89,7 +89,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -102,7 +102,7 @@ def test_tritonv2_4bits_asym(self): @require_greater_than_050 def test_tritonv2_2bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 2, 32, False autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) @@ -111,7 +111,7 @@ def test_tritonv2_2bits_asym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -122,7 +122,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, 
device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -135,7 +135,7 @@ def test_tritonv2_2bits_asym(self): @require_greater_than_050 def test_tritonv2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -153,7 +153,7 @@ def test_tritonv2_4bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -164,7 +164,7 @@ def test_tritonv2_4bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -178,7 +178,7 @@ def test_tritonv2_4bits_sym(self): @require_greater_than_050 def test_tritonv2_8bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 256, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1) @@ -187,7 +187,7 @@ def test_tritonv2_8bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -198,7 +198,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -211,7 +211,7 @@ def test_tritonv2_8bits_sym(self): @require_greater_than_050 def test_tritonv2_2bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 2, 64, True autoround = AutoRound( @@ -226,7 +226,7 @@ def test_tritonv2_2bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, 
dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -237,7 +237,7 @@ def test_tritonv2_2bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index eee7c055a..f3822ff28 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -49,7 +49,7 @@ def qwen_inference(self, quantized_model_dir): processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( quantized_model_dir, - torch_dtype="float16", + dtype="float16", device_map="auto", ##revision="df7f44c" ##AutoGPTQ format ) diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..2a95e1eed 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -11,7 +11,7 @@ def run_opt_125m_on_hpu(): from auto_round import AutoRound model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -58,7 +58,7 @@ def test_w4a8(data_type): model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", attn_implementation="eager", trust_remote_code=True, ) diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..57d032bf3 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -32,7 +32,7 @@ def is_hpex_available(): # @classmethod # def setUpClass(self): # model_name = "facebook/opt-125m" -# self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) +# self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # self.llm_dataloader = LLMDataLoader() diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..997b2f69c 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -35,7 +35,7 @@ def tearDownClass(self): def test_gptq_format(self): model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" + model_name, dtype="auto", trust_remote_code=True, device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -67,9 +67,7 @@ def test_gptq_format(self): def test_awq_format(self): model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu" - ) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True, device_map="xpu") tokenizer = AutoTokenizer.from_pretrained(model_name, 
trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound(