@@ -47,11 +47,11 @@ class BackendInfo:

    Attributes:
        device: A list of strings representing the devices the backend supports
-            (e.g., 'cuda', 'cpu').
+            (e.g., 'cpu', 'xpu', 'cuda').
        sym: A list of booleans indicating whether the backend supports symmetric
            quantization for weights (True if symmetric, False if not).
        packing_format: A list of strings representing the packing formats used by the backend
-            (e.g., 'triton', 'qbits').
+            (e.g., 'ark', 'triton').
        bits: A list of integers specifying the bit-widths supported by the backend
            for weight quantization (e.g., [2, 4, 8]).
        group_size: An optional list of integers specifying the group sizes supported
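
As a reading aid, here is a minimal sketch of how the list-valued fields documented above could be used to test whether a backend entry covers a requested configuration. This is illustrative only, not the library's actual matching logic, and `backend_supports` is a hypothetical helper:

    def backend_supports(info, device: str, bits: int, sym: bool) -> bool:
        """Check a BackendInfo-style record against the requested settings.

        `info.device`, `info.bits`, and `info.sym` are the list-valued
        attributes described in the docstring above.
        """
        return device in info.device and bits in info.bits and sym in info.sym
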
@@ -430,51 +430,51 @@ def fp8_static_scheme_checker(
    requirements=["autoawq", "transformers<4.57.0"],
)

-BackendInfos["qbits"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
    packing_format=GPTQ_FORMAT_NO_ZP,
    bits=[2, 4, 8],
    group_size=None,
-    priority=1,
+    priority=0,
    checkers=[],
-    alias=["itrex", "qbits"],
-    compute_dtype=["float16", "bfloat16"],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
    data_type=["int"],
    act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
)

-BackendInfos["qbits_zp"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel_zp"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
    packing_format=GPTQ_FORMAT,
    bits=[2, 4, 8],
    group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
    data_type=["int"],
    act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
)

-
-BackendInfos["qbits_awq"] = BackendInfo(
+BackendInfos["auto_round_kernel_awq"] = BackendInfo(
    device=["cpu"],
    sym=[True, False],
    packing_format=AWQ_FORMAT,
    bits=[2, 4, 8],
    group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
    data_type=["int"],
    act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
)
+
BackendInfos["ipex_gptq"] = BackendInfo(
    device=["cpu", "xpu"],
    sym=[True, False],
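
With this change the three rewritten entries register under the alias "ark" instead of "itrex"/"qbits". A hedged sketch of resolving that alias follows; it is illustrative only, `backends_for_alias` is a hypothetical helper, and the import path `auto_round.inference.backend` is an assumption since the file path is not shown in this hunk:

    from auto_round.inference.backend import BackendInfos  # assumed module path

    def backends_for_alias(alias: str) -> list[str]:
        """Return the names of every registered backend carrying the given alias."""
        return [name for name, info in BackendInfos.items() if alias in (info.alias or [])]

    # Expected to include the auto_round_kernel, auto_round_kernel_zp and
    # auto_round_kernel_awq entries registered above.
    print(backends_for_alias("ark"))
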
@@ -601,12 +601,12 @@ def dynamic_import_inference_linear(backend, config):
    """Dynamically imports and returns the appropriate QuantLinear class based on the given backend.

    This function dynamically loads the correct `QuantLinear` class based on the backend and quantization
-    configuration (e.g., qbits, marlin, hpu, gptq, awq, auto_round). It imports specific modules or raises
+    configuration (e.g., ark, marlin, hpu, gptq, awq). It imports specific modules or raises
    errors if the required packages are not installed or the environment is not set up.

    Args:
        backend (str):
-            The backend to be used for quantization (e.g., 'qbits', 'marlin', 'hpu', 'gptq', 'awq', 'auto_round').
+            The backend to be used for quantization (e.g., 'ark', 'marlin', 'hpu', 'gptq', 'awq').
        config (QuantizationScheme):
            The quantization configuration containing parameters like bits, group_size, and sym.

@@ -616,7 +616,7 @@ def dynamic_import_inference_linear(backend, config):

    Raises:
        ImportError:
-            If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq).
+            If required modules are missing for a backend (e.g., ark, GPTQ, auto_awq).
    """
    bits, group_size, sym = config["bits"], config["group_size"], config["sym"]

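
A hedged usage sketch of the dispatcher described by this docstring. The plain-dict config and the import path are assumptions; in the library the caller passes a QuantizationScheme, which the function reads the same way via `config["bits"]`:

    from auto_round.inference.backend import dynamic_import_inference_linear  # assumed path

    config = {"bits": 4, "group_size": 128, "sym": True}
    # "ark" should route to the new auto_round_kernel branch added in the next hunk.
    QuantLinear = dynamic_import_inference_linear("ark", config)
    print(QuantLinear)
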
@@ -629,26 +629,22 @@ def dynamic_import_inference_linear(backend, config):
    if "torch_nvfp4" in backend:
        return ar_qmodules.NVFP4QuantLinear

-    if "qbits" in backend:
+    if "auto_round_kernel" in backend or "ark" in backend:
        try:
-            from intel_extension_for_transformers import qbits  # pylint: disable=E0401
+            import auto_round_kernel as ark  # pylint: disable=E0401
        except Exception as e:
            raise ImportError(
-                "Please install Intel Extension for Transformers via 'pip install "
-                "intel-extension-for-transformers' to inference on X86 CPU"
+                "Please install auto_round_kernel to run inference on CPU/XPU"
            )
+        import auto_round_extension.kernel.qlinear as qlinear
+
        if "zp" in backend:
-            import auto_round_extension.qbits.qlinear_qbits_gptq as qlinear_qbits_gptq
-
-            return qlinear_qbits_gptq.QuantLinear
+            return qlinear.QuantLinearGPTQ
        elif "awq" in backend:
-            import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq
-
-            return qlinear_qbits_awq.QuantLinear
+            return qlinear.QuantLinearAWQ
        else:  # auto_round must be at the end
-            import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits_autoround
+            return qlinear.QuantLinear

-            return qlinear_qbits_autoround.QuantLinear
    if "ipex_gptq" in backend:
        from auto_round_extension.ipex.qlinear_ipex_gptq import QuantLinear
