
Commit b9500dc

[OpenVINO] Add workaround logic for default int4 quantization of openai/gpt-oss-20b model (#1490)
* Add a workaround for GPT-OSS quantization
* Fix tests
* Add exception
* Address review comments
* Add config save/load tests
1 parent fb8fd98 commit b9500dc

File tree: 6 files changed, 157 additions, 32 deletions

optimum/exporters/openvino/__main__.py

Lines changed: 10 additions & 1 deletion
@@ -520,9 +520,18 @@ class StoreAttr(object):
                     "Quantization of the weights requires nncf, please install it with `pip install nncf`"
                 )
 
+            from optimum.intel.openvino.configuration import _GPTOSSQuantizationConfig
             from optimum.intel.openvino.quantization import _weight_only_quantization
 
-            _weight_only_quantization(submodel, quantization_config)
+            if isinstance(quantization_config, _GPTOSSQuantizationConfig):
+                # A workaround for GPT-OSS model is required to run quantization twice, this way it is possible to
+                # selectively quantize some weights to 4 bits and some to 8 bits.
+                _weight_only_quantization(submodel, quantization_config.quantization_config1)
+                _weight_only_quantization(
+                    submodel, quantization_config.quantization_config2, verify_not_optimized=False
+                )
+            else:
+                _weight_only_quantization(submodel, quantization_config)
             compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
             save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
             del submodel
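
For context, a minimal sketch (not part of this commit) of the same two-pass idea applied outside the export pipeline. It assumes `ov_model` is an already loaded openvino.Model, and the two configs mirror the GPT-OSS defaults added in configuration.py below:

    from optimum.intel import OVWeightQuantizationConfig
    from optimum.intel.openvino.quantization import _weight_only_quantization

    # Pass 1: int4 for most weights; attention and router weights are skipped via the ignored scope.
    int4_config = OVWeightQuantizationConfig(
        bits=4,
        sym=True,
        group_size=32,
        ignored_scope={"patterns": [".*self_attn.*", ".*router.*"]},
    )
    _weight_only_quantization(ov_model, int4_config)  # ov_model: assumed openvino.Model

    # Pass 2: the weights left untouched in pass 1 are now compressed to int8.
    # verify_not_optimized=False because the model already contains compressed weights from pass 1.
    int8_config = OVWeightQuantizationConfig(bits=8, sym=False)
    _weight_only_quantization(ov_model, int8_config, verify_not_optimized=False)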

optimum/intel/openvino/configuration.py

Lines changed: 63 additions & 6 deletions
@@ -316,13 +316,34 @@ class OVQuantizationMethod(str, Enum):
         },
     },
     "openai/gpt-oss-20b": {
-        "bits": 4,
-        "sym": True,
-        "group_size": 32,
-        "ignored_scope": {
-            "patterns": [".*self_attn.*", ".*router.*"],
+        "quantization_config1": {
+            "bits": 4,
+            "sym": True,
+            "group_size": 32,
+            # With ignored scope below we keep some weights in their original precision during the first quantization
+            # run and then quantize them to int8 in the second run.
+            "ignored_scope": {"patterns": [".*self_attn.*", ".*router.*"]},
+        },
+        "quantization_config2": {
+            "bits": 8,
+            "sym": False,
+            "weight_only": True,
+        },
+    },
+    "openai/gpt-oss-120b": {
+        "quantization_config1": {
+            "bits": 4,
+            "sym": True,
+            "group_size": 32,
+            # With ignored scope below we keep some weights in their original precision during the first quantization
+            # run and then quantize them to int8 in the second run.
+            "ignored_scope": {"patterns": [".*self_attn.*", ".*router.*"]},
+        },
+        "quantization_config2": {
+            "bits": 8,
+            "sym": False,
+            "weight_only": True,
         },
-        "backup_precision": "none",
     },
 }

@@ -1149,6 +1170,8 @@ def _get_dtype(quantization_config):
     elif isinstance(quantization_config, OVPipelineQuantizationConfig):
         dtypes = [OVConfig._get_dtype(config) for config in quantization_config.quantization_configs.values()]
         dtype = "_".join(dtypes)
+    elif isinstance(quantization_config, _GPTOSSQuantizationConfig):
+        dtype = "int4_int8"
     else:
         raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
     return dtype

@@ -1366,6 +1389,36 @@ def post_init(self):
             submodel_config.post_init()
 
 
+class _GPTOSSQuantizationConfig(QuantizationConfigMixin):
+    def __init__(
+        self,
+        quantization_config1: Union[Dict, OVWeightQuantizationConfig],
+        quantization_config2: Union[Dict, OVWeightQuantizationConfig],
+        **kwargs,
+    ):
+        """
+        Configuration class for GPT-OSS quantization.
+
+        # TODO (nikita.savelyevv): Introduce OVSequentialQuantizationConfig to support this.
+        """
+
+        if isinstance(quantization_config1, dict):
+            quantization_config1 = OVWeightQuantizationConfig.from_dict(quantization_config1)
+        self.quantization_config1 = quantization_config1
+        self.quantization_config1.post_init()
+
+        if isinstance(quantization_config2, dict):
+            quantization_config2 = OVWeightQuantizationConfig.from_dict(quantization_config2)
+        self.quantization_config2 = quantization_config2
+        self.quantization_config2.post_init()
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = super().to_dict()
+        result["quantization_config1"] = self.quantization_config1.to_dict()
+        result["quantization_config2"] = self.quantization_config2.to_dict()
+        return result
+
+
 def _quantization_config_from_dict(config_dict: Dict[str, Any]) -> OVQuantizationConfigBase:
     """
     Helper function to create a quantization config from a dictionary.

@@ -1379,6 +1432,10 @@ def _quantization_config_from_dict(config_dict: Dict[str, Any]) -> OVQuantizationConfigBase:
     if "quantization_configs" in config_dict:
         return OVPipelineQuantizationConfig.from_dict(config_dict)
 
+    # Check for GPT-OSS quantization config
+    if "quantization_config1" in config_dict and "quantization_config2" in config_dict:
+        return _GPTOSSQuantizationConfig.from_dict(config_dict)
+
     # Either OVWeightQuantizationConfig or OVQuantizationConfig
     # Try to detect the type of config based on the keys present in the dictionary
     wq_args = set(inspect.getfullargspec(OVWeightQuantizationConfig.__init__).args)
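
A hedged usage sketch (not part of the diff) of the new private config and its dict round-trip, mirroring the save/load tests added in tests/openvino/test_quantization.py below:

    from optimum.intel import OVWeightQuantizationConfig
    from optimum.intel.openvino.configuration import (
        _GPTOSSQuantizationConfig,
        _quantization_config_from_dict,
    )

    config = _GPTOSSQuantizationConfig(
        quantization_config1=OVWeightQuantizationConfig(bits=4, sym=True, group_size=32),
        quantization_config2=OVWeightQuantizationConfig(bits=8, sym=False),
    )

    # to_dict() serializes both sub-configs under dedicated keys ...
    config_dict = config.to_dict()
    assert "quantization_config1" in config_dict and "quantization_config2" in config_dict

    # ... and the presence of both keys is what routes deserialization back to
    # _GPTOSSQuantizationConfig instead of a weight-only or pipeline config.
    restored = _quantization_config_from_dict(config_dict)
    assert isinstance(restored, _GPTOSSQuantizationConfig)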

optimum/intel/openvino/modeling_decoder.py

Lines changed: 8 additions & 1 deletion
@@ -876,7 +876,14 @@ def _from_pretrained(
         init_cls = cls
 
         if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}:
-            default_config = get_default_quantization_config(config.name_or_path, weight_format="int4")
+            if config.name_or_path in ["openai/gpt-oss-20b", "openai/gpt-oss-120b"]:
+                raise NotImplementedError(
+                    "Quantization with the default 4-bit config is not supported through Python API for openai/gpt-oss-20b model. "
+                    "Please export the model via optimum-cli with `--weight-format int4` argument. This way the "
+                    "recommended quantization config will be used."
+                )
+            else:
+                default_config = get_default_quantization_config(config.name_or_path, weight_format="int4")
             quantization_config = cls._prepare_quantization_config(
                 default_config or _DEFAULT_4BIT_WQ_CONFIG, load_in_8bit
             )
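
For illustration, a sketch (not from the commit) of the kind of call that now hits the new guard, together with the CLI path the error message recommends:

    from optimum.intel import OVModelForCausalLM

    try:
        # Passing the bare {"bits": 4} dict requests the model's default 4-bit config.
        model = OVModelForCausalLM.from_pretrained(
            "openai/gpt-oss-20b", export=True, quantization_config={"bits": 4}
        )
    except NotImplementedError as err:
        # Recommended path instead (per the error message):
        #   optimum-cli export openvino --model openai/gpt-oss-20b --weight-format int4 <output_dir>
        print(err)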

optimum/intel/openvino/quantization.py

Lines changed: 3 additions & 1 deletion
@@ -1561,9 +1561,11 @@ def _weight_only_quantization(
     model: openvino.Model,
     quantization_config: Union[OVWeightQuantizationConfig, Dict],
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
+    verify_not_optimized: bool = True,
     **kwargs,
 ) -> openvino.Model:
-    _verify_not_optimized(model)
+    if verify_not_optimized:
+        _verify_not_optimized(model)
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
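
A short sketch of the new keyword's effect (assumes `ov_model` is an openvino.Model that has already gone through a first compression pass, such as the GPT-OSS int4 pass above):

    from optimum.intel.openvino.quantization import _weight_only_quantization

    # Default behaviour (verify_not_optimized=True) raises if the model already looks optimized:
    #   _weight_only_quantization(ov_model, {"bits": 8})
    # Second-pass behaviour used by the GPT-OSS workaround skips that check:
    _weight_only_quantization(ov_model, {"bits": 8, "sym": False}, verify_not_optimized=False)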

tests/openvino/test_exporters_cli.py

Lines changed: 58 additions & 23 deletions
@@ -1108,26 +1108,47 @@ def test_exporters_cli_full_quantization(
             expected_fake_nodes_per_model,
         )
 
-    @parameterized.expand(
-        [
-            (
-                "falcon-40b",
-                "bigscience/bloomz-560m",
-                AutoModelForCausalLM,
-                OVModelForCausalLM,
-                "--task text-generation-with-past --weight-format int4",
-                _DEFAULT_4BIT_WQ_CONFIGS,
-            ),
-            (
-                "clip",
-                "hf-tiny-model-private/tiny-random-CLIPModel",
-                AutoModelForZeroShotImageClassification,
-                OVModelForZeroShotImageClassification,
-                "--task zero-shot-image-classification --quant-mode int8",
-                _DEFAULT_INT8_FQ_CONFIGS,
-            ),
-        ]
-    )
+    DEFAULT_CONFIG_TEST_CONFIGURATIONS = [
+        (
+            "falcon-40b",
+            "bigscience/bloomz-560m",
+            AutoModelForCausalLM,
+            OVModelForCausalLM,
+            "--task text-generation-with-past --weight-format int4",
+            _DEFAULT_4BIT_WQ_CONFIGS,
+            {"model": {"int8": 6, "int4": 6}},
+            {"model": 0},
+        ),
+        (
+            "clip",
+            "hf-tiny-model-private/tiny-random-CLIPModel",
+            AutoModelForZeroShotImageClassification,
+            OVModelForZeroShotImageClassification,
+            "--task zero-shot-image-classification --quant-mode int8",
+            _DEFAULT_INT8_FQ_CONFIGS,
+            {"model": {"int8": 65}},
+            {"model": 65},
+        ),
+        (
+            "gpt_oss_mxfp4",
+            "openai/gpt-oss-20b",
+            AutoModelForCausalLM,
+            OVModelForCausalLM,
+            "--task text-generation-with-past --weight-format int4",
+            _DEFAULT_4BIT_WQ_CONFIGS,
+            {"model": {"int8": 22, "int4": 4}},
+            {"model": 0},
+        ),
+    ]
+
+    # filter models type depending on min max transformers version
+    SUPPORTED_DEFAULT_CONFIG_TEST_CONFIGURATIONS = [
+        config
+        for config in DEFAULT_CONFIG_TEST_CONFIGURATIONS
+        if TEST_NAME_TO_MODEL_TYPE.get(config[0], config[0]) in get_supported_model_for_library("transformers")
+    ]
+
+    @parameterized.expand(SUPPORTED_DEFAULT_CONFIG_TEST_CONFIGURATIONS)
     def test_exporters_cli_with_default_config(
         self,
         model_name,

@@ -1136,6 +1157,8 @@ def test_exporters_cli_with_default_config(
         ov_model_cls,
         options,
         default_configs_collection,
+        expected_num_weight_nodes_per_model,
+        expected_fake_nodes_per_model,
     ):
         with TemporaryDirectory() as tmpdir:
             pt_model = auto_model_cls.from_pretrained(MODEL_NAMES[model_name])

@@ -1167,15 +1190,26 @@ def test_exporters_cli_with_default_config(
             )
 
             model = ov_model_cls.from_pretrained(tmpdir)
+
+            check_compression_state_per_model(
+                self,
+                model.ov_submodels,
+                expected_num_weight_nodes_per_model,
+                expected_fake_nodes_per_model,
+            )
+
             rt_info = model.model.get_rt_info()
             nncf_info = rt_info["nncf"]
             model_quantization_config = nncf_info["weight_compression" if is_weight_compression else "quantization"]
 
             default_config = {**default_configs_collection[model_id]}
-            default_config.pop("dataset", None)
+            if "quantization_config2" in default_config:
+                # For GPT-OSS use the second config as reference
+                default_config = default_config["quantization_config2"]
+            dataset = default_config.pop("dataset", None)
+            default_config.pop("weight_only", None)
             if is_weight_compression:
                 bits = default_config.pop("bits", None)
-                self.assertEqual(bits, 4)
                 sym = default_config.pop("sym", False)
                 default_config["mode"] = f"int{bits}_{'sym' if sym else 'asym'}"
                 quant_method = default_config.pop("quant_method", None)

@@ -1184,7 +1218,8 @@ def test_exporters_cli_with_default_config(
                 advanced_parameters = eval(model_quantization_config["advanced_parameters"].value)
                 model_quantization_config["statistics_path"] = Mock()
                 model_quantization_config["statistics_path"].value = advanced_parameters["statistics_path"]
-                default_config["statistics_path"] = f"{tmpdir}/statistics"
+                if dataset is not None:
+                    default_config["statistics_path"] = f"{tmpdir}/statistics"
             else:
                 dtype = default_config.pop("dtype", None)
                 self.assertEqual(dtype, "int8")
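
For reference, a sketch of the CLI invocation the new gpt_oss_mxfp4 case corresponds to. The test itself exports a tiny checkpoint via MODEL_NAMES[model_name]; the repo id and output directory below are illustrative:

    import subprocess

    subprocess.run(
        "optimum-cli export openvino --model openai/gpt-oss-20b "
        "--task text-generation-with-past --weight-format int4 gpt_oss_20b_int4_ov",
        shell=True,
        check=True,
    )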

tests/openvino/test_quantization.py

Lines changed: 15 additions & 0 deletions
@@ -78,6 +78,7 @@
     _DEFAULT_4BIT_WQ_CONFIGS,
     _DEFAULT_4BIT_WQ_CONFIG,
     _quantization_config_from_dict,
+    _GPTOSSQuantizationConfig,
 )
 from optimum.intel.openvino.modeling_visual_language import _OVNanoLlavaForCausalLM
 from optimum.intel.openvino.utils import TemporaryDirectory

@@ -1899,6 +1900,12 @@ class OVQuantizationConfigTest(unittest.TestCase):
                 advanced_parameters=nncf.AdvancedCompressionParameters(),
             ),
         ),
+        (
+            _GPTOSSQuantizationConfig(
+                quantization_config1=OVWeightQuantizationConfig(bits=4, group_size=16),
+                quantization_config2=OVWeightQuantizationConfig(bits=8),
+            ),
+        ),
     )
 
     QUANTIZATION_CONFIG_DICTS = (

@@ -1988,6 +1995,14 @@ class OVQuantizationConfigTest(unittest.TestCase):
             OVPipelineQuantizationConfig,
             None,
         ),
+        (
+            dict(
+                quantization_config1=dict(bits=4, group_size=16),
+                quantization_config2=dict(bits=8, weight_only=True),
+            ),
+            _GPTOSSQuantizationConfig,
+            None,
+        ),
     )
 
     QUANTIZATION_CONFIGS_WITH_KWARGS = (
