
Commit b7e0a09

Fix export models for image generation (#3508) (#3514)
Fixes an issue introduced with d981ff0: when performing quantization after switching from int8 to int4 in export_model.py, the export failed with "Channel size 64 should be divisible by size of group 128." Cherry-pick of c55551763d02825829337b62c2dcef9339706f79. Ticket: CVS-169363
1 parent 71296e8 commit b7e0a09
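For context on the error quoted above: group-wise weight quantization splits each output channel into groups of `--group-size` elements, so the group size has to divide the channel size evenly. A minimal sketch of that constraint, with hypothetical names rather than optimum-intel internals:

```python
# Sketch of the group-wise quantization constraint behind the error
# "Channel size 64 should be divisible by size of group 128".
def validate_group_size(channel_size: int, group_size: int) -> None:
    # By optimum-cli convention, --group-size -1 means per-channel
    # quantization, which fits any channel size.
    if group_size == -1:
        return
    if channel_size % group_size != 0:
        raise ValueError(
            f"Channel size {channel_size} should be divisible "
            f"by size of group {group_size}")

validate_group_size(64, 64)        # passes: the value the README below suggests
try:
    validate_group_size(64, 128)   # the failure reported in this ticket
except ValueError as err:
    print(err)  # Channel size 64 should be divisible by size of group 128
```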

2 files changed: 16 additions & 11 deletions


demos/common/export_models/export_model.py

Lines changed: 11 additions & 9 deletions
@@ -30,6 +30,7 @@ def add_common_arguments(parser):
     parser.add_argument('--overwrite_models', default=False, action='store_true', help='Overwrite the model if it already exists in the models repository', dest='overwrite_models')
     parser.add_argument('--target_device', default="CPU", help='CPU, GPU, NPU or HETERO, default is CPU', dest='target_device')
     parser.add_argument('--ov_cache_dir', default=None, help='Folder path for compilation cache to speedup initialization time', dest='ov_cache_dir')
+    parser.add_argument('--extra_quantization_params', required=False, help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')
 
 parser = argparse.ArgumentParser(description='Export Hugging face models to OVMS models repository including all configuration for deployments')
 
@@ -38,7 +39,6 @@ def add_common_arguments(parser):
 add_common_arguments(parser_text)
 parser_text.add_argument('--pipeline_type', default=None, choices=["LM", "LM_CB", "VLM", "VLM_CB", "AUTO"], help='Type of the pipeline to be used. AUTO is used by default', dest='pipeline_type')
 parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
-parser_text.add_argument('--extra_quantization_params', help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')
 parser_text.add_argument('--enable_prefix_caching', action='store_true', help='This algorithm is used to cache the prompt tokens.', dest='enable_prefix_caching')
 parser_text.add_argument('--disable_dynamic_split_fuse', action='store_false', help='The maximum number of tokens that can be batched together.', dest='dynamic_split_fuse')
 parser_text.add_argument('--max_num_batched_tokens', default=None, help='empty or integer. The maximum number of tokens that can be batched together.', dest='max_num_batched_tokens')
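The two hunks above relocate `--extra_quantization_params` from the `text_generation` subparser into `add_common_arguments`, so every task subcommand (text generation, embeddings, rerank, image generation) now accepts it. A minimal sketch of that argparse pattern, with the surrounding scaffolding simplified:

```python
import argparse

def add_common_arguments(parser):
    # The flag lives here now, so each subcommand below inherits it.
    parser.add_argument('--extra_quantization_params', required=False,
                        dest='extra_quantization_params')

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='task')
for task in ('text_generation', 'embeddings', 'rerank', 'image_generation'):
    add_common_arguments(subparsers.add_parser(task))

args = parser.parse_args(
    ['image_generation', '--extra_quantization_params', '--group-size 64'])
print(args.extra_quantization_params)  # --group-size 64
```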
@@ -379,11 +379,9 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         if precision != 'int4':
             print("NPU target device requires int4 precision. Changing to int4")
             precision = 'int4'
-        if task_parameters['extra_quantization_params'] is None:
+        if task_parameters['extra_quantization_params'] == "":
             print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
             task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
-    if task_parameters['extra_quantization_params'] is None:
-        task_parameters['extra_quantization_params'] = ""
     optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
     if os.system(optimum_command):
         raise ValueError("Failed to export llm model", source_model)
@@ -478,7 +476,7 @@ def export_embeddings_model(model_repository_path, source_model, model_name, pre
     embeddings_path = os.path.join(model_repository_path, model_name,'embeddings', version)
     print("Exporting embeddings model to ",embeddings_path)
     if not os.path.isdir(embeddings_path) or args['overwrite_models']:
-        optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task feature-extraction --weight-format {} --trust-remote-code --library sentence_transformers {}".format(source_model, precision, tmpdirname)
+        optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task feature-extraction --weight-format {} {} --trust-remote-code --library sentence_transformers {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname)
         if os.system(optimum_command):
             raise ValueError("Failed to export embeddings model", source_model)
         set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json')
@@ -517,7 +515,7 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
     destination_path = os.path.join(model_repository_path, model_name)
     print("Exporting embeddings model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
-        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} --trust-remote-code --library sentence_transformers {}".format(source_model, precision, destination_path)
+        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code --library sentence_transformers {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
         if os.system(optimum_command):
             raise ValueError("Failed to export embeddings model", source_model)
     if truncate:
@@ -539,7 +537,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
     destination_path = os.path.join(model_repository_path, model_name)
     print("Exporting rerank model to ",destination_path)
     if not os.path.isdir(destination_path) or args['overwrite_models']:
-        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
+        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
         if os.system(optimum_command):
             raise ValueError("Failed to export rerank model", source_model)
     print("Exporting tokenizer to ", destination_path)
@@ -565,7 +563,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi
     embeddings_path = os.path.join(model_repository_path, model_name, 'rerank', version)
     print("Exporting rerank model to ",embeddings_path)
     if not os.path.isdir(embeddings_path) or args['overwrite_models']:
-        optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} --trust-remote-code {}".format(source_model, precision, tmpdirname)
+        optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname)
         if os.system(optimum_command):
             raise ValueError("Failed to export rerank model", source_model)
         set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json')
@@ -601,7 +599,8 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
     if os.path.isfile(model_index_path):
         print("Model index file already exists. Skipping conversion, re-generating graph only.")
     else:
-        optimum_command = "optimum-cli export openvino --model {} --weight-format {} {}".format(source_model, precision, target_path)
+        optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
+        print(f'optimum cli command: {optimum_command}')
         if os.system(optimum_command):
             raise ValueError("Failed to export image generation model", source_model)
 
@@ -653,6 +652,8 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
 template_parameters = {k: v for k, v in args.items() if k not in ['model_repository_path', 'source_model', 'model_name', 'precision', 'version', 'config_file_path', 'overwrite_models']}
 print("template params:", template_parameters)
 
+if template_parameters['extra_quantization_params'] is None:
+    template_parameters['extra_quantization_params'] = ""
 if args['task'] == 'text_generation':
     export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])
 
@@ -677,5 +678,6 @@ def export_image_generation_model(model_repository_path, source_model, model_nam
     'max_num_images_per_prompt',
     'default_num_inference_steps',
     'max_num_inference_steps',
+    'extra_quantization_params'
     ]}
 export_image_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['num_streams'])
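With the normalization done once in the main block, every exporter can splice `extra_quantization_params` into its `optimum-cli` string unconditionally: an omitted flag contributes an empty field, which the shell ignores. A small sketch of the `.format` pattern the hunks above apply, with example values:

```python
# Example values; the .format template mirrors the image-generation hunk.
source_model = "black-forest-labs/FLUX.1-schnell"
precision = "int4"
extra_quantization_params = "--group-size 64"  # "" when the flag is omitted
target_path = "models/FLUX.1-schnell"

optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(
    source_model, precision, extra_quantization_params, target_path)
print(f"optimum cli command: {optimum_command}")
# optimum cli command: optimum-cli export openvino --model
#   black-forest-labs/FLUX.1-schnell --weight-format int4 --group-size 64
#   models/FLUX.1-schnell
```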

demos/image_generation/README.md

Lines changed: 5 additions & 2 deletions
@@ -127,13 +127,16 @@ Run `export_model.py` script to download and quantize the model:
 
 > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub.
 
+> **Note:** The `--extra_quantization_params` parameter is used to pass additional parameters to the optimum-cli. It may be required to set the `--group-size` parameter when quantizing the model when encountering errors like: `Channel size 64 should be divisible by size of group 128.`
+
 ### Export model for CPU
 ```console
 python export_model.py image_generation \
 --source_model black-forest-labs/FLUX.1-schnell \
 --weight-format int4 \
 --config_file_path models/config.json \
 --model_repository_path models \
+--extra_quantization_params "--group-size 64" \
 --overwrite_models
 ```
 
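On picking the value in `--extra_quantization_params "--group-size 64"` above: the group size must be `-1` (per-channel) or a divisor of the channel sizes of the layers being quantized, so for the channel size 64 reported in the error any divisor of 64 works. A hypothetical helper, illustrative only:

```python
# Hypothetical helper: list group sizes compatible with one channel size.
# Note a chosen value must divide every quantized layer's channel size,
# not just the one named in the error message.
def valid_group_sizes(channel_size: int) -> list[int]:
    return [g for g in range(1, channel_size + 1) if channel_size % g == 0]

print(valid_group_sizes(64))  # [1, 2, 4, 8, 16, 32, 64] -> the README picks 64
```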

@@ -145,6 +148,7 @@ python export_model.py image_generation \
 --target_device GPU \
 --config_file_path models/config.json \
 --model_repository_path models \
+--extra_quantization_params "--group-size 64" \
 --overwrite_models
 ```
 

@@ -209,7 +213,7 @@ In case you want to use GPU device to run the generation, add extra docker param
 to `docker run` command, use the image with GPU support. Export the models with precision matching the GPU capacity and adjust pipeline configuration.
 It can be applied using the commands below:
 ```bash
-docker run -d --rm -p 8000:8000 -v $(pwd)/models:/workspace:ro \
+docker run -d --rm -p 8000:8000 -v $(pwd)/models:/models:ro \
 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
 openvino/model_server:2025.2-gpu \
 --rest_port 8000 \
@@ -233,7 +237,6 @@ ovms --rest_port 8000 ^
 
 ::::
 
-
 ## Readiness Check
 
 Wait for the model to load. You can check the status with a simple command:
