@@ -292,7 +292,6 @@ def __init__(
                 "Please use more GPUs by setting `--device 0,1,2,3` or just place the model on CPU."
             )
         check_and_mark_fp8_model(model)
-        model = _handle_moe_model(model)
         self.model = model.eval()
         self.tokenizer = tokenizer
         self.shared_cache_keys = get_shared_keys(self.model)
@@ -351,7 +350,6 @@ def __init__(
                 "AutoRound does not support parameters on meta device. "
                 "Please use more GPUs by setting `--device_map 0,1,2,3` or just place the model on CPU."
             )
-        model = _handle_moe_model(model)
         self.model = model.eval()
         self.tokenizer = tokenizer
         self.shared_cache_keys = get_shared_keys(self.model)
@@ -1081,7 +1079,8 @@ def _quantize_embedding_layer(self):
             except RuntimeError as e:
                 cuda_error_msg = traceback.format_exc()
                 try:
-                    logger.info("out of VRAM, falling back to CPU")
+                    logger.error(cuda_error_msg)
+                    logger.warning("falling back to CPU")
                     weight, scale, zp = quant_func(
                         module.weight.to("cpu"),
                         **{
@@ -1090,7 +1089,6 @@ def _quantize_embedding_layer(self):
                         },
                     )
                 except Exception as e:
-                    logger.error(cuda_error_msg)
                     raise

             # Overwrite the module's weights with the quantized version
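This hunk and the ones that follow apply the same change to every CUDA fallback path: the captured traceback is logged with `logger.error` as soon as the fallback begins, instead of only when the CPU retry itself fails. Below is a minimal sketch of that pattern; the `quant_func` signature, the standard-library logger, and the wrapper function are assumptions for illustration, not the library's actual code.

```python
import logging
import traceback

import torch

logger = logging.getLogger(__name__)  # stand-in for auto-round's own logger


def quantize_with_cpu_fallback(module: torch.nn.Module, quant_func, **quant_kwargs):
    """Try quantization on the current device; on RuntimeError (e.g. CUDA OOM),
    log the original traceback first, then retry the same call on CPU."""
    try:
        return quant_func(module.weight, **quant_kwargs)
    except RuntimeError:
        cuda_error_msg = traceback.format_exc()
        try:
            logger.error(cuda_error_msg)  # surface the CUDA error immediately
            logger.warning("falling back to CPU")
            return quant_func(module.weight.to("cpu"), **quant_kwargs)
        except Exception:
            raise  # CPU retry failed too; propagate the secondary error
```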
@@ -1232,6 +1230,7 @@ def get_imatrix_hook(module, input, output):
         except RuntimeError as e:
             cuda_error_msg = traceback.format_exc()
             try:
+                logger.error(cuda_error_msg)
                 # Final fallback: warn and use CPU-only quantization
                 logger.warning(
                     "Fallback to CPU. "
@@ -1249,7 +1248,6 @@ def get_imatrix_hook(module, input, output):
                 self._quantize_via_rtn_blockwise(all_to_quantized_module_names)
                 self.device = orig_device
             except Exception as e:
-                logger.error(cuda_error_msg)
                 raise
         finally:
             # Always remove hooks
@@ -1394,7 +1392,8 @@ def _quantize_layer_via_rtn(self, name: str) -> None:
             cuda_error_msg = traceback.format_exc()
             m = m.orig_layer if hasattr(m, "orig_layer") else m
             try:
-                logger.warning("Out of VRAM, falling back to CPU.")
+                logger.error(cuda_error_msg)
+                logger.warning("falling back to CPU.")
                 m.to("cpu")
                 m = WrapperLinear(
                     m,
@@ -1404,7 +1403,6 @@ def _quantize_layer_via_rtn(self, name: str) -> None:
                 )
                 m = m.unwrapper({})
             except Exception as e:
-                logger.error(cuda_error_msg)
                 raise

         # Step 3: Optional immediate packing/export
@@ -1645,6 +1643,10 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
         for n, m in self.model.named_modules():
             m.tmp_name = n
         self._check_compatibility()
+        formats = self.formats if hasattr(self, "formats") else None
+        # Restructure the model here in quantize(), where the requested formats are known and can be checked;
+        # converting the MoE structure earlier could prevent the GGUF format from being exported correctly.
+        self.model = _handle_moe_model(self.model, formats=formats)
         self.has_qlayer_outside_block = self._set_layerwise_config(self.layer_config)
         if not hasattr(self, "formats"):
             logger.warning("this API is deprecated, please use `quantize_and_save` instead")
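The last hunk moves `_handle_moe_model` out of `__init__` (first two hunks) and into `quantize()`, where the requested export formats are available and can be passed along. A hedged sketch of why threading `formats` matters is below; only the `formats=` keyword comes from the diff, while the GGUF check and the conversion body are illustrative placeholders, not auto-round's implementation.

```python
from typing import Optional

import torch


def _handle_moe_model(model: torch.nn.Module, formats: Optional[list] = None) -> torch.nn.Module:
    """Illustrative stub: restructure MoE experts for quantization, but leave the
    model untouched when a GGUF export is requested so the exporter still sees
    the original layout (assumed behavior, not the real implementation)."""
    if formats is not None and any("gguf" in str(fmt).lower() for fmt in formats):
        return model  # keep the original MoE structure for GGUF export
    # ... placeholder for the actual expert-to-linear conversion ...
    return model
```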