
Commit a0ffb9b

dkalinowski authored and RH-steve-grubb committed
Update code generation demo (openvinotoolkit#3270)
CVS-165597
1 parent 9eafec4 commit a0ffb9b

File tree: 3 files changed (+42, -11 lines)

demos/c_api_minimal_app/Makefile

Lines changed: 2 additions & 2 deletions
@@ -25,13 +25,13 @@ BASE_OS ?= ubuntu24
 
 ifeq ($(BASE_OS),ubuntu24)
 BASE_OS_TAG_UBUNTU ?= 24.04
-PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24.tar.gz"
+PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24.tar.gz"
 BASE_IMAGE ?= ubuntu:$(BASE_OS_TAG_UBUNTU)
 DIST_OS=ubuntu
 endif
 ifeq ($(BASE_OS),redhat)
 BASE_OS_TAG_REDHAT ?= 9.5
-PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat.tar.gz"
+PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat.tar.gz"
 BASE_IMAGE ?= registry.access.redhat.com/ubi9/ubi:$(BASE_OS_TAG_REDHAT)
 DIST_OS=redhat
 endif
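
Both `PACKAGE_URL` values move from the v2025.0 release assets to v2025.1. Note the Ubuntu variable uses GNU Make's conditional `?=` assignment while the Red Hat one uses plain `=`: with `?=` an environment variable takes precedence, and a command-line assignment overrides either form. A minimal sketch of building against the updated package, assuming the demo is driven through this Makefile with the `BASE_OS` switch shown above:

```bash
# Build with the default 2025.1 package URL from the Makefile:
make BASE_OS=ubuntu24

# A command-line assignment overrides both = and ?= variables;
# the mirror URL below is illustrative, not a real release asset:
make BASE_OS=ubuntu24 \
  PACKAGE_URL="https://example.com/mirror/ovms_ubuntu24.tar.gz"
```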

demos/code_local_assistant/README.md

Lines changed: 33 additions & 2 deletions
@@ -21,21 +21,41 @@ mkdir models
 > **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub.
 
 Export `codellama/CodeLlama-7b-Instruct-hf`:
+
+::::{tab-set}
+:::{tab-item} Intel GPU
 ```console
-python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
+python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --cache_size 1 --overwrite_models
 ```
+:::
 
-> **Note:** Use `--target_device GPU` for Intel GPU or omit this parameter to run on Intel CPU
+:::{tab-item} Intel NPU
+```console
+python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
+```
+:::
+::::
 
 ## Prepare Code Completion Model
 For this task we need smaller, lighter model that will produce code quicker than chat task.
 Since we do not want to wait for the code to appear, we need to use smaller model. It should be responsive enough to generate multi-line blocks of code ahead of time as we type.
 Code completion works in non-streaming, unary mode. Do not use instruct model, there is no chat involved in the process.
 
 Export `Qwen/Qwen2.5-Coder-1.5B`:
+
+::::{tab-set}
+:::{tab-item} Intel GPU
+```console
+python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --cache_size 1 --overwrite_models
+```
+:::
+
+:::{tab-item} Intel NPU
 ```console
 python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
 ```
+:::
+::::
 
 Examine that workspace is set up properly `models/config_all.json`:
 ```
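
The new tab-set splits each export by target device: the GPU commands add `--cache_size 1` (presumably sizing the KV cache) while the NPU commands omit it. A hypothetical wrapper that runs both exports for one device, using only the flags that appear in the diff above (`export_all.sh` and its argument handling are illustrative):

```bash
#!/usr/bin/env bash
# Usage: ./export_all.sh GPU   (or: ./export_all.sh NPU)
DEVICE="${1:-GPU}"
EXTRA_ARGS=""
# --cache_size 1 appears only in the GPU commands above:
[ "$DEVICE" = "GPU" ] && EXTRA_ARGS="--cache_size 1"
for MODEL in codellama/CodeLlama-7b-Instruct-hf Qwen/Qwen2.5-Coder-1.5B; do
  python export_model.py text_generation \
    --source_model "$MODEL" \
    --weight-format int4 \
    --config_file_path models/config_all.json \
    --model_repository_path models \
    --target_device "$DEVICE" $EXTRA_ARGS \
    --overwrite_models
done
```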
@@ -105,10 +125,21 @@ ovms --rest_port 8000 --config_path ./models/config_all.json
 ```
 
 ### Linux: via Docker
+::::{tab-set}
+:::{tab-item} Intel GPU
+```bash
+docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
+-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.1 --rest_port 8000 --config_path /workspace/models/config_all.json
+```
+:::
+
+:::{tab-item} Intel NPU
 ```bash
 docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
 -p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.1 --rest_port 8000 --config_path /workspace/models/config_all.json
 ```
+:::
+::::
 
 ## Set Up Visual Studio Code
 
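
The device mapping is the only difference between the two Docker tabs: Intel GPU passes `--device /dev/dri`, Intel NPU passes `--device /dev/accel`, and both keep the render-group lookup. Once either container is up, a quick smoke test, assuming the port mapping above and the model server's standard config-status endpoint:

```bash
# List the running model server container:
docker ps --filter ancestor=openvino/model_server:2025.1

# Ask the server which models it loaded from config_all.json:
curl -s http://localhost:8000/v1/config
```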

docs/deploying_server_baremetal.md

Lines changed: 7 additions & 7 deletions
@@ -8,12 +8,12 @@ To deploy Model Server on baremetal, use pre-compiled binaries for Ubuntu22, Ubu
 :sync: ubuntu-22-04
 Download precompiled package (without python support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu22.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22.tar.gz
 tar -xzvf ovms_ubuntu22.tar.gz
 ```
 or precompiled package (with python and LLM support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu22_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22_python_on.tar.gz
 tar -xzvf ovms_ubuntu22_python_on.tar.gz
 ```
 Install required libraries:
@@ -36,12 +36,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
 :sync: ubuntu-24-04
 Download precompiled package (without python support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24.tar.gz
 tar -xzvf ovms_ubuntu24.tar.gz
 ```
 or precompiled package (with python and LLM support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24_python_on.tar.gz
 tar -xzvf ovms_ubuntu24_python_on.tar.gz
 ```
 Install required libraries:
@@ -64,12 +64,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
 :sync: rhel-9.5
 Download precompiled package (without python support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat.tar.gz
 tar -xzvf ovms_redhat.tar.gz
 ```
 or precompiled package (with python and LLM support):
 ```{code} sh
-wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat_python_on.tar.gz
+wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat_python_on.tar.gz
 tar -xzvf ovms_redhat_python_on.tar.gz
 ```
 Install required libraries:
@@ -95,7 +95,7 @@ Make sure you have [Microsoft Visual C++ Redistributable](https://aka.ms/vs/17/r
 Download and unpack model server archive for Windows:
 
 ```bat
-curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_windows.zip -o ovms.zip
+curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_windows.zip -o ovms.zip
 tar -xf ovms.zip
 ```
 
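
Every download link in this guide moves from v2025.0 to v2025.1 with no other changes. A minimal post-download sanity check for one of the Linux archives, assuming the layout this guide describes elsewhere (binary under `ovms/bin`, libraries under `ovms/lib`):

```bash
# Unpack, then confirm the fetched package reports the expected version:
tar -xzvf ovms_ubuntu24.tar.gz
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${PWD}/ovms/lib"
"${PWD}/ovms/bin/ovms" --version
```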
