@@ -104,14 +104,30 @@ def _verify_tokenizer_mode(self) -> None:
104
104
105
105
def _verify_quantization(self) -> None:
    """Normalize and validate the configured quantization method.

    ``self.quantization`` is lower-cased when set. If ``self.hf_config``
    carries a ``quantization_config``, its ``quant_method`` entry is
    lower-cased and reconciled with the argument: an unset
    ``self.quantization`` adopts the value from the HF config, while a
    conflicting one is rejected. The resulting method, if any, must be one
    of the supported methods; a warning is logged when quantization is
    enabled.

    Raises:
        ValueError: If the method in the HF config differs from the
            ``quantization`` argument, or the resulting method is not
            supported.
    """
    supported_quantization = ["awq", "squeezellm"]
    if self.quantization is not None:
        self.quantization = self.quantization.lower()

    # Parse quantization method from the HF model config, if available.
    hf_quant_config = getattr(self.hf_config, "quantization_config", None)
    if hf_quant_config is not None:
        # NOTE(review): a quantization_config without a "quant_method" key
        # raises KeyError here -- confirm that this is the intended failure.
        hf_quant_method = str(hf_quant_config["quant_method"]).lower()
        if self.quantization is None:
            # No explicit argument: trust the checkpoint's own config.
            self.quantization = hf_quant_method
        elif self.quantization != hf_quant_method:
            raise ValueError(
                "Quantization method specified in the model config "
                f"({hf_quant_method}) does not match the quantization "
                "method specified in the `quantization` argument "
                f"({self.quantization}).")

    # Nothing left to validate for non-quantized models.
    if self.quantization is None:
        return

    if self.quantization not in supported_quantization:
        raise ValueError(
            f"Unknown quantization method: {self.quantization}. Must "
            f"be one of {supported_quantization}.")

    logger.warning(
        "%s quantization is not fully optimized yet. The speed can be "
        "slower than non-quantized models.", self.quantization)
115
131
116
132
def verify_with_parallel_config (
117
133
self ,
0 commit comments