Commit c80f736

Add MAX_PROMPT_LEN in export models script (openvinotoolkit#3216)
1 parent 381f1ce commit c80f736

File tree: 4 files changed, +43 -10 lines changed

demos/common/export_models/export_model.py
demos/llm_npu/README.md
demos/vlm_npu/README.md
docs/llm/reference.md

demos/common/export_models/export_model.py

Lines changed: 33 additions & 8 deletions
@@ -31,14 +31,14 @@ def add_common_arguments(parser):
     parser.add_argument('--weight-format', default='int8', help='precision of the exported model', dest='precision')
     parser.add_argument('--config_file_path', default='config.json', help='path to the config file', dest='config_file_path')
     parser.add_argument('--overwrite_models', default=False, action='store_true', help='Overwrite the model if it already exists in the models repository', dest='overwrite_models')
-    parser.add_argument('--target_device', default="CPU", help='CPU or GPU, default is CPU', dest='target_device')
+    parser.add_argument('--target_device', default="CPU", help='CPU, GPU, NPU or HETERO, default is CPU', dest='target_device')

 parser = argparse.ArgumentParser(description='Export Hugging face models to OVMS models repository including all configuration for deployments')

 subparsers = parser.add_subparsers(help='subcommand help', required=True, dest='task')
 parser_text = subparsers.add_parser('text_generation', help='export model for chat and completion endpoints')
 add_common_arguments(parser_text)
-parser_text.add_argument('--pipeline_type', default=None, help='Type of the pipeline to be used. Can be either TEXT_CB or VLM_CB. When undefined, it will be autodetected', dest='pipeline_type')
+parser_text.add_argument('--pipeline_type', default=None, choices=["LM", "LM_CB", "VLM", "VLM_CB", "AUTO"], help='Type of the pipeline to be used. AUTO is used by default', dest='pipeline_type')
 parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
 parser_text.add_argument('--extra_quantization_params', help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')
 parser_text.add_argument('--enable_prefix_caching', action='store_true', help='This algorithm is used to cache the prompt tokens.', dest='enable_prefix_caching')
@@ -50,6 +50,8 @@ def add_common_arguments(parser):
     'Using this option will create configuration for speculative decoding', dest='draft_source_model')
 parser_text.add_argument('--draft_model_name', required=False, default=None, help='Draft model name that should be used in the deployment. '
     'Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified.', dest='draft_model_name')
+parser_text.add_argument('--max_prompt_len', required=False, type=int, default=None, help='Sets NPU specific property for maximum number of tokens in the prompt. '
+    'Not effective if target device is not NPU', dest='max_prompt_len')

 parser_embeddings = subparsers.add_parser('embeddings', help='export model for embeddings endpoint')
 add_common_arguments(parser_embeddings)
@@ -148,7 +150,7 @@ def add_common_arguments(parser):
 {%- if pipeline_type %}
 pipeline_type: {{pipeline_type}},{% endif %}
 models_path: "{{model_path}}",
-plugin_config: '{ {% if kv_cache_precision %}"KV_CACHE_PRECISION": "{{kv_cache_precision}}"{% endif %}}',
+plugin_config: '{{plugin_config}}',
 enable_prefix_caching: {% if not enable_prefix_caching %}false{% else %} true{% endif%},
 cache_size: {{cache_size|default("10", true)}},
 {%- if max_num_batched_tokens %}
@@ -267,6 +269,7 @@ def add_servable_to_config(config_path, mediapipe_name, base_path):

 def export_text_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
     model_path = "./"
+    ### Export model
     if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')):
         print("OV model is source folder. Skipping conversion.")
         model_path = source_model
@@ -286,7 +289,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
             optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
             if os.system(optimum_command):
                 raise ValueError("Failed to export llm model", source_model)
-    ### Speculative decoding specific
+    ### Export draft model for speculative decoding
     draft_source_model = task_parameters.get("draft_source_model", None)
     draft_model_dir_name = None
     if draft_source_model:
@@ -295,12 +298,35 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         if os.path.isfile(os.path.join(draft_llm_model_path, 'openvino_model.xml')):
             print("OV model is source folder. Skipping conversion.")
         else: # assume HF model name or local pytorch model folder
-            print("Exporting LLM model to ", draft_llm_model_path)
+            print("Exporting draft LLM model to ", draft_llm_model_path)
             if not os.path.isdir(draft_llm_model_path) or args['overwrite_models']:
                 optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(draft_source_model, precision, draft_llm_model_path)
                 if os.system(optimum_command):
                     raise ValueError("Failed to export llm model", source_model)
-    ###
+
+    ### Prepare plugin config string for jinja rendering
+    plugin_config = {}
+    if task_parameters['kv_cache_precision'] is not None:
+        plugin_config['KV_CACHE_PRECISION'] = task_parameters['kv_cache_precision']
+    if task_parameters['max_prompt_len'] is not None:
+        if task_parameters['target_device'] != 'NPU':
+            raise ValueError("max_prompt_len is only supported for NPU target device")
+        if task_parameters['max_prompt_len'] <= 0:
+            raise ValueError("max_prompt_len should be a positive integer")
+        plugin_config['MAX_PROMPT_LEN'] = task_parameters['max_prompt_len']
+
+    # Additional plugin properties for HETERO
+    if "HETERO" in task_parameters['target_device']:
+        if task_parameters['pipeline_type'] is None:
+            raise ValueError("pipeline_type should be specified for HETERO target device. It should be set to either LM or VLM")
+        if task_parameters['pipeline_type'] not in ["LM", "VLM"]:
+            raise ValueError("pipeline_type should be either LM or VLM for HETERO target device")
+        plugin_config['MODEL_DISTRIBUTION_POLICY'] = 'PIPELINE_PARALLEL'
+    ###
+
+    plugin_config_str = json.dumps(plugin_config)
+    task_parameters['plugin_config'] = plugin_config_str
+
     os.makedirs(os.path.join(model_repository_path, model_name), exist_ok=True)
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
     graph_content = gtemplate.render(tokenizer_model="{}_tokenizer_model".format(model_name), embeddings_model="{}_embeddings_model".format(model_name),
@@ -309,8 +335,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         f.write(graph_content)
     print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
     add_servable_to_config(config_file_path, model_name, os.path.relpath( os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
-
-
+
 def export_embeddings_model(model_repository_path, source_model, model_name, precision, task_parameters, version, config_file_path, truncate=True):
     if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')):
         print("OV model is source folder. Skipping conversion.")

demos/llm_npu/README.md

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ models
 └── tokenizer.json
 ```

-The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [LLM calculator documentation](../../docs/llm/reference.md) to learn more about configuration options.
+The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments.
+Note that by default, NPU limits the prompt length to 1024 tokens. You can modify that limit by using the `--max_prompt_len` parameter.
+Run the script with the `--help` argument to check available parameters and see the [LLM calculator documentation](../../docs/llm/reference.md) to learn more about configuration options.

 ## Server Deployment
demos/vlm_npu/README.md

Lines changed: 3 additions & 0 deletions
@@ -40,6 +40,9 @@ Run `export_model.py` script to download and quantize the model:
 ```console
 python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models
 ```
+
+Note that by default, NPU limits the prompt length (which for VLMs also includes image tokens) to 1024 tokens. You can modify that limit by using the `--max_prompt_len` parameter.
+
 > **Note:** You can change the model used in the demo out of any topology [tested](https://github.com/openvinotoolkit/openvino.genai/blob/master/SUPPORTED_MODELS.md#visual-language-models) with OpenVINO.

 You should have a model folder like below:
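
For example, the export command above could be extended with the new flag to allow longer prompts; the value 2048 below is only an illustration, not a recommended setting:

```console
python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --max_prompt_len 2048 --config_file_path models/config.json --model_repository_path models --overwrite_models
```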

docs/llm/reference.md

Lines changed: 4 additions & 1 deletion
@@ -200,7 +200,6 @@ Errors during configuration files processing (access issue, corrupted file, inco
 There are several known limitations which are expected to be addressed in the coming releases:

 - Metrics related to text generation are not exposed via `metrics` endpoint. Key metrics from LLM calculators are included in the server logs with information about active requests, scheduled for text generation and KV Cache usage. It is possible to track in the metrics the number of active generation requests using metric called `ovms_current_graphs`. Also tracking statistics for request and responses is possible. [Learn more](../metrics.md)
-- Multi modal models are not supported yet. Images can't be sent now as the context.
 - `logprobs` parameter is not supported currently in streaming mode. It includes only a single logprob and do not include values for input tokens.
 - Server logs might sporadically include a message "PCRE2 substitution failed with error code -55" - this message can be safely ignored. It will be removed in next version.
@@ -210,10 +209,14 @@ Some servable types introduce additional limitations:
 - `finish_reason` not supported (always set to `stop`),
 - `logprobs` not supported,
 - sequential request processing (only one request is handled at a time)
+- only a single response can be returned. Parameter `n` is not supported.
+- **[NPU only]** beam_search algorithm is not supported with NPU. Greedy search and multinomial algorithms are supported.
+- **[NPU only]** models must be exported with INT4 precision and `--sym --ratio 1.0 --group-size -1` params. This is enforced in the export_model.py script when the target_device is NPU.

 ### Visual Language servable limitations
 - works only on `/chat/completions` endpoint,
 - `image_url` input supports only base64 encoded image, not an actual URL
+- **[NPU only]** requests MUST include one and only one image in the messages context. Other requests will be rejected

 ## References
 - [Chat Completions API](../model_server_rest_api_chat.md)
