diff --git a/README.md b/README.md index ddaecc53a..480e72dbe 100644 --- a/README.md +++ b/README.md @@ -323,7 +323,7 @@ The support for Gaudi device is limited. from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 844f366bb..59ccb9ea2 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -701,7 +701,7 @@ def tune(args): logger.error("Cannot find correct gguf file for evaluation, please check.") sys.exit(-1) model = AutoModelForCausalLM.from_pretrained( - eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + eval_folder, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file) diff --git a/auto_round/compressors/diffusion/README.md b/auto_round/compressors/diffusion/README.md index ca9adb93f..d9de06d61 100644 --- a/auto_round/compressors/diffusion/README.md +++ b/auto_round/compressors/diffusion/README.md @@ -15,7 +15,7 @@ from diffusers import AutoPipelineForText2Image # Load the model model_name = "black-forest-labs/FLUX.1-dev" -pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) +pipe = AutoPipelineForText2Image.from_pretrained(model_name, dtype=torch.bfloat16) # Quantize the model autoround = AutoRound( diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 71a5c1402..8fd43319a 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -174,7 +174,7 @@ def eval(args): " but may affect accuracy." 
) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() st = time.time() @@ -252,7 +252,7 @@ def eval_task_by_task( ) model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype ) model.eval() parallelism = False diff --git a/auto_round/experimental/kv_cache.py b/auto_round/experimental/kv_cache.py index 8a49f3072..e67c6138a 100644 --- a/auto_round/experimental/kv_cache.py +++ b/auto_round/experimental/kv_cache.py @@ -263,7 +263,7 @@ def prep_attention_module_for_calibration(module: torch.nn.Module): def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch.dtype: valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"] - valid_torch_dtype = { + valid_dtype = { "float16": torch.float16, "bfloat16": torch.bfloat16, "fp8": torch.float8_e4m3fn, @@ -272,13 +272,13 @@ def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch "float": torch.float32, # Alias for float32 } if static_kv_dtype in valid_dtype_name_lst: - new_dtype = valid_torch_dtype[static_kv_dtype] - elif static_kv_dtype in valid_torch_dtype.values(): + new_dtype = valid_dtype[static_kv_dtype] + elif static_kv_dtype in valid_dtype.values(): new_dtype = static_kv_dtype else: raise ValueError( f"Invalid static kv dtype: {static_kv_dtype}. " - f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}." + f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_dtype.values()))}." ) return new_dtype diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 8b8a618e2..42b4644c3 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -105,10 +105,10 @@ def pack_layer(layer_name, model, data_type, device=None): weight = layer.weight weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size) act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None - torch_dtype = torch.float8_e4m3fn + dtype = torch.float8_e4m3fn if "fp8_e5m2" in data_type: - torch_dtype = torch.float8_e5m2 - info = torch.finfo(torch_dtype) + dtype = torch.float8_e5m2 + info = torch.finfo(dtype) if zp is not None: if isinstance(zp, torch.Tensor): zp = zp.to(packing_device) @@ -117,7 +117,7 @@ def pack_layer(layer_name, model, data_type, device=None): q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) q_weight = torch.clamp(q_weight, info.min, info.max) - q_weight = q_weight.to(torch_dtype) + q_weight = q_weight.to(dtype) if type(layer) == torch.nn.Linear: in_features = layer.in_features out_features = layer.out_features diff --git a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py index c7c327f1b..c1ddff569 100644 --- a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py +++ b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py @@ -172,7 +172,7 @@ def __init__( # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - # NOTE: 
can't use field "torch_dtype" in config.json, because some finetunes lie. + # NOTE: can't use field "dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})") diff --git a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py index 1b0b48b35..79101cd6f 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_static_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_static_fp.py @@ -73,10 +73,10 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device: weight = layer.weight weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size) act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None - torch_dtype = torch.float8_e4m3fn + dtype = torch.float8_e4m3fn if "fp8_e5m2" in data_type: - torch_dtype = torch.float8_e5m2 - info = torch.finfo(torch_dtype) + dtype = torch.float8_e5m2 + info = torch.finfo(dtype) if zp is not None: if isinstance(zp, torch.Tensor): zp = zp.to(packing_device) @@ -85,7 +85,7 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device: q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) q_weight = torch.clamp(q_weight, info.min, info.max) - q_weight = q_weight.to(torch_dtype) + q_weight = q_weight.to(dtype) if type(layer) == torch.nn.Linear: in_features = layer.in_features out_features = layer.out_features diff --git a/auto_round/export/utils.py b/auto_round/export/utils.py index 78e2f43e6..9237a72ff 100644 --- a/auto_round/export/utils.py +++ b/auto_round/export/utils.py @@ -58,7 +58,7 @@ def save_model( if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")): with open(config_path, "r") as file: data = json.load(file) - data["torch_dtype"] = str(dtype).split(".")[-1] + data["dtype"] = str(dtype).split(".")[-1] with open(config_path, "w") as file: json.dump(data, file, indent=2) config_file = "quantization_config.json" diff --git a/auto_round/inference/auto_quantizer.py b/auto_round/inference/auto_quantizer.py index 33ab74d8d..dca1e3370 100644 --- a/auto_round/inference/auto_quantizer.py +++ b/auto_round/inference/auto_quantizer.py @@ -329,10 +329,10 @@ def validate_environment(self, *args, **kwargs): "auto-round` or install from source" ) - def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": - if torch_dtype is None: - torch_dtype = torch.bfloat16 - return torch_dtype + def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype": + if dtype is None: + dtype = torch.bfloat16 + return dtype def post_init_model(self, model): """Post-initialization that require device information, for example buffers initialization on device. 
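As an aside to the gpt_oss change just below, which drops the legacy `torch_dtype` fallback when reading the model config: older checkpoints may still carry `torch_dtype` in their config.json. A minimal compatibility sketch (hypothetical helper, not part of this patch) that accepts either field could look like this:

```python
import torch

def resolve_config_dtype(config, default=torch.float32):
    # Prefer the new `dtype` field, falling back to the legacy `torch_dtype`
    # written by older transformers versions (illustrative helper only).
    dtype_str = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)
    if dtype_str is None:
        return default
    # Accept a torch.dtype, "bfloat16", or "torch.bfloat16" and map it to torch.<name>.
    return getattr(torch, str(dtype_str).split(".")[-1], default)
```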
diff --git a/auto_round/modelling/gpt_oss.py b/auto_round/modelling/gpt_oss.py index 78f73075c..b5236c604 100644 --- a/auto_round/modelling/gpt_oss.py +++ b/auto_round/modelling/gpt_oss.py @@ -62,7 +62,7 @@ def __init__(self, config: GptOssConfig, original: GptOssMLP): super().__init__() hidden_size = config.hidden_size intermediate_size = config.intermediate_size - dtype_str = getattr(config, "torch_dtype", None) or getattr(config, "dtype", None) + dtype_str = getattr(config, "dtype", None) dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32 top_k = config.num_experts_per_tok self.hidden_size = hidden_size diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index 7a1c66de8..11a4663fd 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -195,9 +195,9 @@ def llm_load_model( ) device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower())) @@ -210,7 +210,7 @@ def llm_load_model( if _use_hpu_compile_mode(): model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, attn_implementation="eager", trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, @@ -219,7 +219,7 @@ def llm_load_model( try: model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, ) @@ -228,7 +228,7 @@ def llm_load_model( orig_func = set_fake_cuda_device_capability() model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=trust_remote_code, device_map="auto" if use_auto_mapping else None, ) @@ -241,7 +241,7 @@ def llm_load_model( logger.warning(f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry.") model = model_cls.from_pretrained( pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, trust_remote_code=False, device_map="auto" if use_auto_mapping else None, ) @@ -256,7 +256,7 @@ def llm_load_model( def mllm_load_model( pretrained_model_name_or_path, device="cpu", - torch_dtype="auto", + dtype="auto", use_auto_mapping=True, trust_remote_code=True, model_dtype=None, @@ -268,9 +268,9 @@ def mllm_load_model( from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 if os.path.isdir(pretrained_model_name_or_path): config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) else: @@ -306,7 +306,7 @@ def mllm_load_model( model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) else: @@ -318,7 +318,7 @@ def mllm_load_model( pretrained_model_name_or_path, model_base=None, model_name=pretrained_model_name_or_path, - torch_dtype=torch_dtype, + dtype=dtype, ) else: if architectures.endswith("Model") and hasattr( @@ -333,7 +333,7 @@ def mllm_load_model( model = 
cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) except ValueError as e: @@ -342,7 +342,7 @@ def mllm_load_model( model = cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, - torch_dtype=torch_dtype, + dtype=dtype, device_map="auto" if use_auto_mapping else None, ) torch.cuda.get_device_capability = orig_func @@ -383,7 +383,7 @@ def mllm_load_model( def diffusion_load_model( pretrained_model_name_or_path: str, device: Union[str, torch.device] = "cpu", - torch_dtype: Union[str, torch.dtype] = "auto", + dtype: Union[str, torch.dtype] = "auto", use_auto_mapping: bool = False, trust_remote_code: bool = True, model_dtype: str = None, @@ -393,15 +393,13 @@ def diffusion_load_model( from auto_round.utils.device import get_device_and_parallelism device_str, use_auto_mapping = get_device_and_parallelism(device) - torch_dtype = "auto" + dtype = "auto" if device_str is not None and "hpu" in device_str: - torch_dtype = torch.bfloat16 + dtype = torch.bfloat16 pipelines = LazyImport("diffusers.pipelines") - pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( - pretrained_model_name_or_path, torch_dtype=torch_dtype - ) + pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, dtype=dtype) pipe = _to_model_dtype(pipe, model_dtype) model = pipe.transformer return pipe, model.to(device) diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 6efbc85e7..c7786cb5e 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -489,7 +489,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "opensourcerelease/DeepSeek-R1-bf16" tokenizer = AutoTokenizer.from_pretrained(model_name) -model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, dtype="auto") block = model.model.layers device_map = {} @@ -599,7 +599,7 @@ Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX) from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -615,7 +615,7 @@ Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -630,7 +630,7 @@ Supports 2, 3, 4, and 8 bits. 
We recommend using GPTQModel for 4 and 8 bits infe from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" -model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -670,7 +670,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="ipex") model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," @@ -701,7 +701,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig model_name = "ybelkada/opt-125m-gptq-4bit" quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) text = "There is a girl who likes adventure," diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index dfc387dee..0ff975460 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -24,7 +24,7 @@ class TestAutoRoundAct(unittest.TestCase): def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -35,7 +35,7 @@ def tearDownClass(self): def test_mx_fp4(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -54,7 +54,7 @@ def test_mx_fp4(self): def test_wint4fp8_dynamic(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size = 4, 128 autoround = AutoRound( @@ -93,7 +93,7 @@ def test_wfp8afp8_static(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, 
dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, @@ -114,7 +114,7 @@ def test_wfp8afp8_static(self): self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..cb0e86135 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -24,7 +24,7 @@ class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 3adfd9f47..206a3b0ab 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -29,7 +29,7 @@ class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @@ -103,7 +103,7 @@ def test_consecutive_quant(self): autoround.quantize() model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True + "/tf_dataset/auto_round/models/microsoft/phi-2", dtype="auto", trust_remote_code=True ) tokenizer = AutoTokenizer.from_pretrained( "/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True @@ -237,7 +237,7 @@ def test_disable_quanted_input(self): def test_enable_norm_bias_tuning_qwen3(self): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( model, @@ -337,7 +337,7 @@ def test_auto_device_map(self): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" + model_name, dtype="auto", trust_remote_code=True, device_map="auto" ) autoround = AutoRound( model, @@ -386,7 +386,7 @@ def test_fp32(self): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - 
model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" + model_name, dtype=torch.float32, trust_remote_code=True, device_map="auto" ) autoround = AutoRound( model, @@ -417,7 +417,7 @@ def test_tensor_reshape(self): def test_rtn(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -426,7 +426,7 @@ def test_rtn(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( self.save_folder, - torch_dtype=torch.float16, + dtype=torch.float16, device_map="auto", ) @@ -457,7 +457,7 @@ def test_fallback_layers(self): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" + model_name, dtype=torch.float32, trust_remote_code=True, device_map="auto" ) layer_config = { "model.decoder.layers.0.self_attn.q_proj": {"bits": 16}, @@ -500,7 +500,7 @@ def test_not_convert_modules(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( - model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 + model_name, quantization_config=quantization_config, device_map="cpu", dtype=torch.float16 ) self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) @@ -545,7 +545,7 @@ def test_not_convert_modules(self): def test_fallback_layers_regex_awq(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -581,7 +581,7 @@ def test_fallback_layers_regex_awq(self): def test_fallback_layers_regex_gptq(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -617,7 +617,7 @@ def test_fallback_layers_regex_gptq(self): def test_fallback_layers_regex_round(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { 
r"model\.decoder\.layers\.(?:[0-9]|1[0-1])\.self_attn\.q_proj": {"bits": 16}, @@ -653,7 +653,7 @@ def test_fallback_layers_regex_round(self): def test_fallback_layers_regex_exception(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} with self.assertRaises(ValueError): @@ -674,7 +674,7 @@ def test_fallback_layers_regex_exception(self): # model_name = "Qwen/Qwen3-0.6B-FP8" # ar = AutoRound(model=model_name, iters=0) # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.save_folder, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) # text = "There is a girl who likes adventure," # inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 97211ade4..43b300cf8 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -38,7 +38,7 @@ def tearDownClass(self): def test_default_acc(self): model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True inp = torch.ones([1, 10], dtype=torch.long) @@ -57,7 +57,7 @@ def test_default_acc(self): out0 = model(inp) print(f"out0 = {float(out0[0][0][0][0])}") - model_tmp = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) + model_tmp = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float32, trust_remote_code=True) autoround_1 = AutoRound( model_tmp, tokenizer, diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 501caee25..3d3af1e25 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -160,7 +160,7 @@ def test_multimodal_quant(self): assert len(block_names_wo_vision) != (block_names_with_vision) def test_block_name_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) from auto_round.utils import get_block_names llm_block_names = get_block_names(self.model) diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 689cc705c..3834c7c44 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -39,7 +39,7 @@ def setUpClass(self): jsonl_file.write("\n") model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, 
dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) def test_json(self): @@ -72,7 +72,7 @@ def test_jsonl(self): def test_apply_chat_template(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" bits, group_size, sym = 4, 128, True diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..ca74e6b4f 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -33,7 +33,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model, diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index ea484316b..a906a59e2 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -37,7 +37,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -218,7 +218,7 @@ def test_static_afp8_export(self, static_kv_dtype): from safetensors import safe_open model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, @@ -247,7 +247,7 @@ def test_static_afp8_export(self, static_kv_dtype): model = transformers.AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", low_cpu_mem_usage=True, trust_remote_code=True, ) @@ -277,7 +277,7 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) shutil.rmtree(quantized_model_path, ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..c9a60f617 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -23,7 +23,7 @@ class TestAutoRoundFormatGeneration(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = 
AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @@ -63,7 +63,7 @@ def test_4bits_sym(self): assert "!!!" not in res model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map="cpu", quantization_config=quantization_config, torch_dtype=torch.float16 + quantized_model_path, device_map="cpu", quantization_config=quantization_config, dtype=torch.float16 ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -74,7 +74,7 @@ def test_4bits_sym(self): def test_autoround_sym(self): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 308425cd1..981f3415e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -123,7 +123,7 @@ def test_func(self): shutil.rmtree("./saved", ignore_errors=True) # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -148,7 +148,7 @@ def test_func(self): # # def test_q5_k(self): # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -172,7 +172,7 @@ def test_func(self): # def test_q6_k(self): # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -196,7 +196,7 @@ def test_func(self): def test_gguf_baseline(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound( model, self.tokenizer, @@ -219,7 +219,7 @@ def test_gguf_baseline(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # autoround = AutoRound( # model, # self.tokenizer, @@ -243,7 +243,7 @@ def test_gguf_baseline(self): def test_q4_k_m(self): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { "lm_head": { @@ -282,7 +282,7 @@ def test_q4_k_m(self): self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..963ec1036 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -15,7 +15,7 @@ class TestLLMC(unittest.TestCase): @classmethod def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..dcd71457a 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -43,7 +43,7 @@ def test_load_gptq_no_dummy_gidx_model(self): with self.assertRaises(NotImplementedError) as cm: model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -54,7 +54,7 @@ def test_load_awq(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..50da50305 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -40,7 +40,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = ".saved/" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -238,7 +238,7 @@ def test_mixed_MXFP_autoround_format_loading(self): autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..de49781ea 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -175,7 +175,7 @@ def test_str_input(self): model = 
Qwen2VLForConditionalGeneration.from_pretrained( quantized_model_path, - torch_dtype="float16", + dtype="float16", device_map="auto", ) processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -232,7 +232,7 @@ def test_qwen2_5(self): from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2_5_VLForConditionalGeneration - model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", torch_dtype="auto", device_map="auto") + model = Qwen2_5_VLForConditionalGeneration.from_pretrained("./saved", dtype="auto", device_map="auto") image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" processor = AutoProcessor.from_pretrained("./saved") messages = [ diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 4fcd25135..23aaffdf1 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -37,7 +37,7 @@ class TestAutoRoundFP(unittest.TestCase): def setUpClass(self): model_name = "facebook/opt-125m" # /tf_dataset/auto_round/models/ self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -308,7 +308,7 @@ def test_qwen_moe_quant_infer(self): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, dtype="auto", device_map="cpu") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..5528a209f 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -63,7 +63,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", ) model.eval() assert has_module( diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index 557bf1f38..7a334254e 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -63,7 +63,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -81,7 +81,7 @@ def test_torch_4bits_asym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -92,7 +92,7 @@ def test_torch_4bits_asym(self): 
torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -104,7 +104,7 @@ def test_torch_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -122,7 +122,7 @@ def test_torch_4bits_sym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 8bb23f6aa..977277aa8 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -65,7 +65,7 @@ def model_infer(self, model, tokenizer): @require_greater_than_051 def test_3bits_autoround(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=3) quantized_model_path = self.save_dir @@ -73,7 +73,7 @@ def test_3bits_autoround(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -85,7 +85,7 @@ def test_3bits_autoround(self): @require_greater_than_051 def test_3bits_asym_autoround(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) bits, sym = 3, False autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) @@ -107,7 +107,7 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() @@ -124,7 +124,7 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): model_name = "/models/opt-125m" - model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=2, group_size=64) autoround.quantize() diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..1f33960f7 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -82,7 +82,7 @@ def tearDownClass(self): @require_package_version_ut("transformers", "<4.57.0") def test_autoround_asym(self): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, False autoround = AutoRound( @@ -112,7 +112,7 @@ def test_autoround_asym(self): @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -128,7 +128,7 @@ def test_mixed_precision(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -140,7 +140,7 @@ def test_mixed_precision(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -157,7 +157,7 @@ def test_awq_backend(self): quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -168,7 +168,7 @@ def test_awq_backend(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -180,7 +180,7 @@ def test_tritonv2_bf16(self): model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.bfloat16, 
device_map="auto", quantization_config=quantization_config + model_name, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -190,7 +190,7 @@ def test_tritonv2_bf16(self): @require_ipex def test_autoround_gptq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -245,7 +245,7 @@ def test_autoround_gptq_sym_format(self): @require_ipex @require_package_version_ut("transformers", "<4.57.0") def test_autoround_awq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -271,7 +271,7 @@ def test_autoround_awq_sym_format(self): assert "!!!" not in res model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map="cpu", trust_remote_code=True, torch_dtype=torch.bfloat16 + quantized_model_path, device_map="cpu", trust_remote_code=True, dtype=torch.bfloat16 ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," @@ -285,7 +285,7 @@ def test_autoround_awq_sym_format(self): @require_greater_than_050 def test_autoround_sym(self): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( @@ -319,7 +319,7 @@ def test_load_gptq_model_3bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="auto", quantization_config=quantization_config, diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 70366cf05..a8b728c64 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -100,7 +100,7 @@ def test_multi_card_1(self): from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..76eb06d23 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -30,7 +30,7 @@ def setUpClass(self): jsonl_file.write("\n") model_name = "facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + 
self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) def test_combine_dataset(self): diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..f795b81f5 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -35,7 +35,7 @@ def tearDownClass(self): @require_gptqmodel def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 5c12e0557..0a08cf0cf 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -65,7 +65,7 @@ def tearDownClass(self): @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -76,7 +76,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -87,7 +87,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -100,7 +100,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): @require_autogptq def test_gptq_exllamav2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -118,7 +118,7 @@ def test_gptq_exllamav2_4bits_sym(self): quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -133,7 +133,7 @@ def test_gptq_exllamav2_4bits_sym(self): def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = 
AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -152,7 +152,7 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index d6d6c1f93..62b1dae52 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -35,7 +35,7 @@ def tearDownClass(self): @require_optimum def test_autogptq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -66,7 +66,7 @@ def test_autogptq_format(self): @require_optimum def test_autogptq_format_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} for n, m in model.named_modules(): @@ -103,7 +103,7 @@ def test_autogptq_format_fp_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autogptq_format_qsave_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} for n, m in model.named_modules(): @@ -154,7 +154,7 @@ def test_autogptq_format_qsave_fp_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -187,7 +187,7 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -221,7 +221,7 @@ def test_autoawq_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def 
test_autoawq_format_fp_qsave_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, @@ -262,7 +262,7 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_3bit_asym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, False autoround = AutoRound( @@ -291,7 +291,7 @@ def test_autoround_3bit_asym_torch_format(self): shutil.rmtree("./saved", ignore_errors=True) def test_autoround_3bit_sym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, True autoround = AutoRound( diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 43e745c4e..e8ede8801 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -26,7 +26,7 @@ def test_small_model_rtn_generation(self): model_name = "/models/Qwen3-0.6B-FP8" ar = AutoRound(model=model_name, iters=0) ar.quantize_and_save(output_dir=self.save_dir) - model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.save_dir, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -44,7 +44,7 @@ def test_gguf_imatrix(self): # output = llm("There is a girl who likes adventure,", max_tokens=32) # print(output) # shutil.rmtree("./saved", ignore_errors=True) - # model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.save_dir, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # text = "There is a girl who likes adventure," # inputs = tokenizer(text, return_tensors="pt").to(model.device) diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index 0193b5c14..a8e0415c9 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -38,14 +38,14 @@ def check_block_names(self, block_names, prefixs=[], n_layers=[]): def test_glm4(self): model_name = "/models/glm-4-9b-chat" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["transformer.encoder.layers"], [40]) assert is_pure_text_model(model), "Expected model to be pure text model" def test_opt_125m(self): model_name = "/models/opt-125m" - model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.decoder.layers"], [12]) @@ -53,56 +53,56 @@ def test_opt_125m(self): def test_Qwen(self): model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) assert is_pure_text_model(model) def test_phi4(self): model_name = "/models/phi-4" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [40]) assert is_pure_text_model(model) def test_llama3(self): model_name = "/models/Meta-Llama-3.1-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_mixtral(self): model_name = "/models/Mixtral-8x7B-Instruct-v0.1" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_falcon(self): model_name = "/models/Falcon3-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) assert is_pure_text_model(model) def test_orca(self): model_name = "/models/Orca-2-7b" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_OLMo(self): model_name = "/models/OLMo-2-1124-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) assert is_pure_text_model(model) def test_Qwen2VL(self): model_name = "/models/Qwen2-VL-2B-Instruct" - model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [28]) @@ -112,7 +112,7 @@ def test_Qwen2VL(self): def 
test_Llama32(self): model_name = "/models/Llama-3.2-11B-Vision-Instruct" - model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [40]) @@ -131,7 +131,7 @@ def test_Llama32(self): def test_SmolVLM(self): model_name = "/models/SmolVLM-Instruct" - model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForVision2Seq.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.text_model.layers"], [24]) @@ -141,7 +141,7 @@ def test_SmolVLM(self): def test_glm_4v(self): model_name = "/models/glm-4v-9b" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["transformer.encoder.layers"], [40]) @@ -153,7 +153,7 @@ def test_glm_4v(self): def test_gemma3(self): model_name = "/models/gemma-3-12b-it" - model = Gemma3ForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Gemma3ForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [48]) @@ -165,7 +165,7 @@ def test_gemma3(self): def test_Mistral3(self): model_name = "/models/Mistral-Small-3.1-24B-Instruct-2503" - model = Mistral3ForConditionalGeneration.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = Mistral3ForConditionalGeneration.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.language_model.layers"], [40]) @@ -177,7 +177,7 @@ def test_Mistral3(self): def test_Molmo(self): model_name = "/models/Molmo-7B-D-0924" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.transformer.blocks"], [28]) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 681387285..11ebc1275 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -41,7 +41,7 @@ def tearDownClass(self): @require_optimum def test_backend(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -69,7 +69,7 @@ def test_backend(self): @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") 
tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -87,7 +87,7 @@ def test_backend_awq(self): @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -112,7 +112,7 @@ def test_fp_layers(self): @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -135,7 +135,7 @@ def test_fp_layers_awq(self): @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") def test_undivided_group_size_tuning(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) @@ -144,7 +144,7 @@ def test_undivided_group_size_tuning(self): @require_gptqmodel def test_adam(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) autoround.quantize() @@ -165,7 +165,7 @@ def test_autoround_asym(self): ##need to install false print("skip autoround asym test, as autoround is not installed from source") return model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False) autoround.quantize() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 26d3ddca2..331ad4188 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -26,7 +26,7 @@ class TestAutoRoundMarlinBackend(unittest.TestCase): def test_marlin_group_size(self): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -44,7 +44,7 @@ def test_marlin_group_size(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - 
self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -55,7 +55,7 @@ def test_marlin_group_size(self): for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( @@ -73,7 +73,7 @@ def test_marlin_group_size(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -120,7 +120,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_marlin_4bits_sym_with_zp_m_1(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -138,7 +138,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -149,7 +149,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -161,7 +161,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): shutil.rmtree("./saved", ignore_errors=True) # def test_marlin_4bits_sym(self): - # model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + # model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) # tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # bits, group_size, sym = 4, 128, True # autoround = AutoRound( @@ -180,7 +180,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( # self.save_folder, - # torch_dtype=torch.float16, + # dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) @@ -194,7 +194,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # # model = AutoModelForCausalLM.from_pretrained( # self.save_folder, - # torch_dtype=torch.bfloat16, + # dtype=torch.bfloat16, # device_map="auto", # 
quantization_config=quantization_config # ) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..575b3f16d 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -34,7 +34,7 @@ class TestAutoRound(unittest.TestCase): def setUpClass(self): self.model_name = "/models/opt-125m" self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.llm_dataloader = LLMDataLoader() @@ -234,7 +234,7 @@ def test_mixed_MXFP_autoround_format_loading(self): autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="auto_round") model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index f2f1685be..c1e826865 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -42,7 +42,7 @@ def tearDownClass(self): @require_gptqmodel def test_device_map_str(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = ".*q_proj:0,.*k_proj:cuda:0,v_proj:1,.*up_proj:1" autoround = AutoRound(model, tokenizer, device_map=device_map) @@ -59,7 +59,7 @@ def test_device_map_str(self): @multi_card def test_layer_norm(self): model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {"norm": "cuda:1"} autoround = AutoRound( @@ -70,7 +70,7 @@ def test_layer_norm(self): @multi_card def test_rms_norm(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {"norm": "cuda:1"} autoround = AutoRound( @@ -81,7 +81,7 @@ def test_rms_norm(self): @multi_card def test_act_quantization(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( @@ -92,7 +92,7 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} @@ -111,7 +111,7 @@ def 
test_lm_head(self): @multi_card def test_device_map(self): model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained(model_name) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) @@ -170,7 +170,7 @@ def test_device_map(self): "cuda", "auto", ]: - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map=tmp_device_map) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", device_map=tmp_device_map) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -214,7 +214,7 @@ def test_device_map_dict(self): device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") + model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound( model, @@ -314,7 +314,7 @@ def test_device_map_for_triton(self): "auto", ]: model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", device_map=tmp_device_map, quantization_config=quantization_config + model_name, dtype="auto", device_map=tmp_device_map, quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..07a9c2408 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -32,7 +32,7 @@ def test_e2e_quant_and_infer(scheme): model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", - torch_dtype="auto", + dtype="auto", trust_remote_code=True, ) @@ -52,7 +52,7 @@ def test_e2e_quant_and_infer(scheme): # Perform inference with the quantized model model = AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="auto", + dtype="auto", ) model.eval() assert has_module(model, QMODULE_MAPPING[scheme]), f"Expected {QMODULE_MAPPING[scheme].__name__} in the model." 
diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 48dd27d9b..82c731765 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -157,7 +157,7 @@ def test_qwen_moe_quant_infer(self): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..9e6127582 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -52,7 +52,7 @@ def test_load_gptq_model_8bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -66,7 +66,7 @@ def test_load_gptq_model_2bits(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", trust_remote_code=True, device_map="cpu", quantization_config=quantization_config, @@ -76,7 +76,7 @@ def test_load_gptq_model_2bits(self): @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -95,7 +95,7 @@ def test_mixed_precision(self): model = AutoModelForCausalLM.from_pretrained( self.save_folder, - torch_dtype=torch.float16, + dtype=torch.float16, device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -109,7 +109,7 @@ def test_mixed_precision(self): @require_gptqmodel def test_autoround_sym(self): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index fe036361f..0685cf5c9 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -39,7 +39,7 @@ def tearDownClass(self): # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor # model = Qwen2VLForConditionalGeneration.from_pretrained( # quantized_model_path, - # torch_dtype="float16", + # dtype="float16", # device_map=f"cuda:{self.device}", # ) # processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -99,7 +99,7 @@ def test_phi3(self): quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, - torch_dtype="float16", + dtype="float16", ) processor = AutoProcessor.from_pretrained(quantized_model_path, trust_remote_code=True, num_crops=4) @@ -146,7 +146,7 @@ def test_phi3_vision_awq(self): 
quantized_model_path = os.path.join(self.save_dir, "Phi-3.5-vision-instruct-w4g128") res = os.system(f"cp /models/Phi-3.5-vision-instruct/*.py {quantized_model_path}") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, torch_dtype="auto" + quantized_model_path, device_map=f"cuda:{self.device}", trust_remote_code=True, dtype="auto" ) assert "WQLinear_GEMM" in str( type(model.model.vision_embed_tokens.img_processor.vision_model.encoder.layers[0].mlp.fc1) @@ -200,7 +200,7 @@ class DataArgs: quantized_model_path, model_base=None, model_name=quantized_model_path, - torch_dtype="auto", + dtype="auto", device_map=f"cuda:{self.device}", ) image_url = "http://images.cocodataset.org/train2017/000000116003.jpg" @@ -232,7 +232,7 @@ class DataArgs: # quantized_model_path = os.path.join(self.save_dir, "Llama-3.2-11B-Vision-Instruct-w4g128") # model = MllamaForConditionalGeneration.from_pretrained( # quantized_model_path, - # torch_dtype="float16", + # dtype="float16", # device_map=f"cuda:{self.device}", # ) # processor = AutoProcessor.from_pretrained(quantized_model_path) @@ -277,7 +277,7 @@ def test_cogvlm(self): model = ( AutoModelForCausalLM.from_pretrained( quantized_model_path, - torch_dtype="float16", + dtype="float16", trust_remote_code=True, device_map=DEVICE, ) @@ -343,7 +343,7 @@ def test_deepseek_vl2(self): quantized_model_path, trust_remote_code=True, device_map=f"cuda:{self.device}", - torch_dtype="float16", + dtype="float16", ) vl_gpt = vl_gpt.eval() diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..f50eb4cb4 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -64,7 +64,7 @@ def tearDownClass(self): shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -82,7 +82,7 @@ def test_torch_4bits_asym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -93,7 +93,7 @@ def test_torch_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -105,7 +105,7 @@ def test_torch_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -123,7 +123,7 
@@ def test_torch_4bits_sym(self): quantization_config = AutoRoundConfig(backend="torch") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..d0c16acc4 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -60,7 +60,7 @@ def setUpClass(cls): torch.cuda.synchronize() cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) cls.quantized_model = AutoModelForCausalLM.from_pretrained( - cls.model_name, device_map=cls.device_map, torch_dtype=torch.float16 + cls.model_name, device_map=cls.device_map, dtype=torch.float16 ) def tearDown(self): @@ -90,7 +90,7 @@ def test_quantized_model_bf16(self): quantization_config = AutoRoundConfig(backend="triton") quantized_model = AutoModelForCausalLM.from_pretrained( self.model_name, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, device_map=self.device_map, quantization_config=quantization_config, ) @@ -105,7 +105,7 @@ def test_quantized_model_on_cpu(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt") - quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") + quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) @@ -121,7 +121,7 @@ def test_save_pretrained(self): quantized_model = AutoModelForCausalLM.from_pretrained( self.model_name, device_map=self.device_map, - torch_dtype=torch.float16, + dtype=torch.float16, quantization_config=quantization_config, ) @@ -140,7 +140,7 @@ def test_quantized_model_multi_gpu(self): """ quantization_config = AutoRoundConfig(backend="triton") quantized_model = AutoModelForCausalLM.from_pretrained( - self.model_name, device_map="auto", quantization_config=quantization_config, torch_dtype="auto" + self.model_name, device_map="auto", quantization_config=quantization_config, dtype="auto" ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) @@ -155,7 +155,7 @@ def test_convert_from_gptq(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cuda", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cuda", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -173,7 +173,7 @@ def test_convert_from_awq_cpu(self): quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( - model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto" + model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -186,7 +186,7 @@ def test_mixed_bits(self): Simple test that checks if auto-round work properly with mixed bits """ model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") + model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 8}, @@ -199,7 +199,7 @@ def test_mixed_bits(self): autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) with tempfile.TemporaryDirectory() as tmpdirname: autoround.quantize_and_save(output_dir=tmpdirname) - model = AutoModelForCausalLM.from_pretrained(tmpdirname, torch_dtype=torch.float16, device_map="cuda") + model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=torch.float16, device_map="cuda") text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..1902128c0 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -60,7 +60,7 @@ def tearDownClass(self): @require_greater_than_050 def test_tritonv2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( @@ -78,7 +78,7 @@ def test_tritonv2_4bits_asym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -89,7 +89,7 @@ def test_tritonv2_4bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -102,7 +102,7 @@ def test_tritonv2_4bits_asym(self): @require_greater_than_050 def test_tritonv2_2bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 2, 32, False autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) @@ -111,7 +111,7 @@ def test_tritonv2_2bits_asym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -122,7 +122,7 @@ def test_tritonv2_2bits_asym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, 
device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -135,7 +135,7 @@ def test_tritonv2_2bits_asym(self): @require_greater_than_050 def test_tritonv2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -153,7 +153,7 @@ def test_tritonv2_4bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -164,7 +164,7 @@ def test_tritonv2_4bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -178,7 +178,7 @@ def test_tritonv2_4bits_sym(self): @require_greater_than_050 def test_tritonv2_8bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 256, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, nsamples=1, iters=1) @@ -187,7 +187,7 @@ def test_tritonv2_8bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -198,7 +198,7 @@ def test_tritonv2_8bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -211,7 +211,7 @@ def test_tritonv2_8bits_sym(self): @require_greater_than_050 def test_tritonv2_2bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 2, 64, True autoround = AutoRound( @@ -226,7 +226,7 @@ def test_tritonv2_2bits_sym(self): quantization_config = AutoRoundConfig(backend="tritonv2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_folder, 
dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) @@ -237,7 +237,7 @@ def test_tritonv2_2bits_sym(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_folder, dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index eee7c055a..f3822ff28 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -49,7 +49,7 @@ def qwen_inference(self, quantized_model_dir): processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( quantized_model_dir, - torch_dtype="float16", + dtype="float16", device_map="auto", ##revision="df7f44c" ##AutoGPTQ format ) diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..2a95e1eed 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -11,7 +11,7 @@ def run_opt_125m_on_hpu(): from auto_round import AutoRound model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -58,7 +58,7 @@ def test_w4a8(data_type): model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype="auto", + dtype="auto", attn_implementation="eager", trust_remote_code=True, ) diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..57d032bf3 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -32,7 +32,7 @@ def is_hpex_available(): # @classmethod # def setUpClass(self): # model_name = "facebook/opt-125m" -# self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) +# self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # self.llm_dataloader = LLMDataLoader() diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..997b2f69c 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -35,7 +35,7 @@ def tearDownClass(self): def test_gptq_format(self): model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" + model_name, dtype="auto", trust_remote_code=True, device_map="auto" ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -67,9 +67,7 @@ def test_gptq_format(self): def test_awq_format(self): model_name = "facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu" - ) + model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True, device_map="xpu") tokenizer = AutoTokenizer.from_pretrained(model_name, 
trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound(