Commit c80f736

Add MAX_PROMPT_LEN in export models script (openvinotoolkit#3216)
1 parent 381f1ce commit c80f736

File tree: 4 files changed, +43 -10 lines changed

demos/common/export_models/export_model.py
demos/llm_npu/README.md
demos/vlm_npu/README.md
docs/llm/reference.md

demos/common/export_models/export_model.py

Lines changed: 33 additions & 8 deletions
@@ -31,14 +31,14 @@ def add_common_arguments(parser):
     parser.add_argument('--weight-format', default='int8', help='precision of the exported model', dest='precision')
     parser.add_argument('--config_file_path', default='config.json', help='path to the config file', dest='config_file_path')
     parser.add_argument('--overwrite_models', default=False, action='store_true', help='Overwrite the model if it already exists in the models repository', dest='overwrite_models')
-    parser.add_argument('--target_device', default="CPU", help='CPU or GPU, default is CPU', dest='target_device')
+    parser.add_argument('--target_device', default="CPU", help='CPU, GPU, NPU or HETERO, default is CPU', dest='target_device')

 parser = argparse.ArgumentParser(description='Export Hugging face models to OVMS models repository including all configuration for deployments')

 subparsers = parser.add_subparsers(help='subcommand help', required=True, dest='task')
 parser_text = subparsers.add_parser('text_generation', help='export model for chat and completion endpoints')
 add_common_arguments(parser_text)
-parser_text.add_argument('--pipeline_type', default=None, help='Type of the pipeline to be used. Can be either TEXT_CB or VLM_CB. When undefined, it will be autodetected', dest='pipeline_type')
+parser_text.add_argument('--pipeline_type', default=None, choices=["LM", "LM_CB", "VLM", "VLM_CB", "AUTO"], help='Type of the pipeline to be used. AUTO is used by default', dest='pipeline_type')
 parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
 parser_text.add_argument('--extra_quantization_params', help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')
 parser_text.add_argument('--enable_prefix_caching', action='store_true', help='This algorithm is used to cache the prompt tokens.', dest='enable_prefix_caching')
@@ -50,6 +50,8 @@ def add_common_arguments(parser):
     'Using this option will create configuration for speculative decoding', dest='draft_source_model')
 parser_text.add_argument('--draft_model_name', required=False, default=None, help='Draft model name that should be used in the deployment. '
     'Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified.', dest='draft_model_name')
+parser_text.add_argument('--max_prompt_len', required=False, type=int, default=None, help='Sets NPU specific property for maximum number of tokens in the prompt. '
+    'Not effective if target device is not NPU', dest='max_prompt_len')

 parser_embeddings = subparsers.add_parser('embeddings', help='export model for embeddings endpoint')
 add_common_arguments(parser_embeddings)
@@ -148,7 +150,7 @@ def add_common_arguments(parser):
 {%- if pipeline_type %}
 pipeline_type: {{pipeline_type}},{% endif %}
 models_path: "{{model_path}}",
-plugin_config: '{ {% if kv_cache_precision %}"KV_CACHE_PRECISION": "{{kv_cache_precision}}"{% endif %}}',
+plugin_config: '{{plugin_config}}',
 enable_prefix_caching: {% if not enable_prefix_caching %}false{% else %} true{% endif%},
 cache_size: {{cache_size|default("10", true)}},
 {%- if max_num_batched_tokens %}
@@ -267,6 +269,7 @@ def add_servable_to_config(config_path, mediapipe_name, base_path):

 def export_text_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
     model_path = "./"
+    ### Export model
     if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')):
         print("OV model is source folder. Skipping conversion.")
         model_path = source_model
@@ -286,7 +289,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
             optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
             if os.system(optimum_command):
                 raise ValueError("Failed to export llm model", source_model)
-    ### Speculative decoding specific
+    ### Export draft model for speculative decoding
     draft_source_model = task_parameters.get("draft_source_model", None)
     draft_model_dir_name = None
     if draft_source_model:
@@ -295,12 +298,35 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         if os.path.isfile(os.path.join(draft_llm_model_path, 'openvino_model.xml')):
             print("OV model is source folder. Skipping conversion.")
         else: # assume HF model name or local pytorch model folder
-            print("Exporting LLM model to ", draft_llm_model_path)
+            print("Exporting draft LLM model to ", draft_llm_model_path)
             if not os.path.isdir(draft_llm_model_path) or args['overwrite_models']:
                 optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(draft_source_model, precision, draft_llm_model_path)
                 if os.system(optimum_command):
                     raise ValueError("Failed to export llm model", source_model)
-    ###
+
+    ### Prepare plugin config string for jinja rendering
+    plugin_config = {}
+    if task_parameters['kv_cache_precision'] is not None:
+        plugin_config['KV_CACHE_PRECISION'] = task_parameters['kv_cache_precision']
+    if task_parameters['max_prompt_len'] is not None:
+        if task_parameters['target_device'] != 'NPU':
+            raise ValueError("max_prompt_len is only supported for NPU target device")
+        if task_parameters['max_prompt_len'] <= 0:
+            raise ValueError("max_prompt_len should be a positive integer")
+        plugin_config['MAX_PROMPT_LEN'] = task_parameters['max_prompt_len']
+
+    # Additional plugin properties for HETERO
+    if "HETERO" in task_parameters['target_device']:
+        if task_parameters['pipeline_type'] is None:
+            raise ValueError("pipeline_type should be specified for HETERO target device. It should be set to either LM or VLM")
+        if task_parameters['pipeline_type'] not in ["LM", "VLM"]:
+            raise ValueError("pipeline_type should be either LM or VLM for HETERO target device")
+        plugin_config['MODEL_DISTRIBUTION_POLICY'] = 'PIPELINE_PARALLEL'
+    ###
+
+    plugin_config_str = json.dumps(plugin_config)
+    task_parameters['plugin_config'] = plugin_config_str
+
     os.makedirs(os.path.join(model_repository_path, model_name), exist_ok=True)
     gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
     graph_content = gtemplate.render(tokenizer_model="{}_tokenizer_model".format(model_name), embeddings_model="{}_embeddings_model".format(model_name),
@@ -309,8 +335,7 @@ def export_text_generation_model(model_repository_path, source_model, model_name
         f.write(graph_content)
     print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
     add_servable_to_config(config_file_path, model_name, os.path.relpath( os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))
-
-
+
 def export_embeddings_model(model_repository_path, source_model, model_name, precision, task_parameters, version, config_file_path, truncate=True):
     if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')):
         print("OV model is source folder. Skipping conversion.")

demos/llm_npu/README.md

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ models
 └── tokenizer.json
 ```

-The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments. Run the script with `--help` argument to check available parameters and see the [LLM calculator documentation](../../docs/llm/reference.md) to learn more about configuration options.
+The default configuration should work in most cases but the parameters can be tuned via `export_model.py` script arguments.
+Note that by default, NPU limits the prompt length to 1024 tokens. You can modify that limit by using the `--max_prompt_len` parameter.
+Run the script with the `--help` argument to check available parameters and see the [LLM calculator documentation](../../docs/llm/reference.md) to learn more about configuration options.

 ## Server Deployment
demos/vlm_npu/README.md

Lines changed: 3 additions & 0 deletions
@@ -40,6 +40,9 @@ Run `export_model.py` script to download and quantize the model:
 ```console
 python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --config_file_path models/config.json --model_repository_path models --overwrite_models
 ```
+
+Note that by default, NPU limits the prompt length (which for VLMs also includes image tokens) to 1024 tokens. You can modify that limit by using the `--max_prompt_len` parameter.
+
 > **Note:** You can change the model used in the demo out of any topology [tested](https://github.com/openvinotoolkit/openvino.genai/blob/master/SUPPORTED_MODELS.md#visual-language-models) with OpenVINO.

 You should have a model folder like below:
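
For example, the export command above could be extended with the new flag to allow longer prompts; the value 2048 below is only an illustration, not a recommended setting:

```console
python export_model.py text_generation --source_model microsoft/Phi-3.5-vision-instruct --target_device NPU --max_prompt_len 2048 --config_file_path models/config.json --model_repository_path models --overwrite_models
```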

docs/llm/reference.md

Lines changed: 4 additions & 1 deletion
@@ -200,7 +200,6 @@ Errors during configuration files processing (access issue, corrupted file, inco
 There are several known limitations which are expected to be addressed in the coming releases:

 - Metrics related to text generation are not exposed via `metrics` endpoint. Key metrics from LLM calculators are included in the server logs with information about active requests, scheduled for text generation and KV Cache usage. It is possible to track in the metrics the number of active generation requests using metric called `ovms_current_graphs`. Also tracking statistics for request and responses is possible. [Learn more](../metrics.md)
-- Multi modal models are not supported yet. Images can't be sent now as the context.
 - `logprobs` parameter is not supported currently in streaming mode. It includes only a single logprob and do not include values for input tokens.
 - Server logs might sporadically include a message "PCRE2 substitution failed with error code -55" - this message can be safely ignored. It will be removed in next version.
@@ -210,10 +209,14 @@ Some servable types introduce additional limitations:
 - `finish_reason` not supported (always set to `stop`),
 - `logprobs` not supported,
 - sequential request processing (only one request is handled at a time)
+- only a single response can be returned. Parameter `n` is not supported.
+- **[NPU only]** beam_search algorithm is not supported with NPU. Greedy search and multinomial algorithms are supported.
+- **[NPU only]** models must be exported with INT4 precision and `--sym --ratio 1.0 --group-size -1` params. This is enforced in the export_model.py script when the target_device is NPU.

 ### Visual Language servable limitations
 - works only on `/chat/completions` endpoint,
 - `image_url` input supports only base64 encoded image, not an actual URL
+- **[NPU only]** requests MUST include one and only one image in the messages context. Other requests will be rejected

 ## References
 - [Chat Completions API](../model_server_rest_api_chat.md)
