Skip to content

Commit c488503

Browse files
Merge pull request #68 from RH-steve-grubb/add-tokenizer
Add tokenizer & upstream patch
2 parents 09b03d4 + a0ffb9b commit c488503

File tree

4 files changed

+56
-12
lines changed

4 files changed

+56
-12
lines changed

Dockerfile.redhat

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,16 @@ ARG ov_tokenizers_branch=master
227227
RUN git clone https://github.com/openvinotoolkit/openvino_tokenizers.git /openvino_tokenizers && cd /openvino_tokenizers && git checkout $ov_tokenizers_branch && git submodule update --init --recursive
228228
WORKDIR /openvino_tokenizers/build
229229
RUN cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_VERBOSE_MAKEFILE="${VERBOSE_LOGS}" -DCMAKE_CXX_FLAGS=" ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" && cmake --build . --parallel $JOBS ; cp /openvino_tokenizers/build/src/lib*.so /opt/intel/openvino/runtime/lib/intel64/
230+
WORKDIR /openvino_tokenizers/
231+
# Install the openvino_tokenizers python bindings and use a symlink to point
232+
# to the shared object in its final location.
233+
RUN if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
234+
mkdir -p /opt/intel/openvino/python/openvino_tokenizers/lib ; \
235+
cp -r python/* /opt/intel/openvino/python/ ; \
236+
cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \
237+
mkdir -p /opt/intel/openvino/python/openvino_tokenizers-2025.1.dist-info ; \
238+
echo $'Metadata-Version: 1.0\nName: openvino-tokenizers\nVersion: 2025.1\nRequires-Python: >=3.9\nRequires-Dist: openvino~=2025.1.0' > /opt/intel/openvino/python/openvino_tokenizers-2025.1.dist-info/METADATA ; \
239+
ln -s /ovms/lib/libopenvino_tokenizers.so /opt/intel/openvino/python/openvino_tokenizers/lib/libopenvino_tokenizers.so ; fi
230240

231241
# Build OpenVINO Model Server
232242
WORKDIR /ovms
@@ -346,7 +356,9 @@ ARG FUZZER_BUILD=0
346356
ARG debug_bazel_flags="--strip=always --config=mp_on_py_on --//:distro=redhat"
347357
COPY --from=capi-build /ovms_release/lib/libovms_shared.so /ovms_release/lib/
348358
COPY create_package.sh /
349-
RUN ./create_package.sh
359+
RUN ./create_package.sh ; if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
360+
echo $'#!/bin/bash\npython3 -m openvino_tokenizers.cli "$@"' > /ovms_release/bin/convert_tokenizer ; \
361+
chmod +x /ovms_release/bin/convert_tokenizer ; fi
350362

351363
# hadolint ignore=DL3059
352364
RUN chown -R ovms:ovms /ovms_release
@@ -395,6 +407,7 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do
395407
useradd --home-dir /home/ovms --create-home --uid 5000 --gid 5000 --groups 39,44 --shell /bin/bash --skel /dev/null ovms
396408

397409
ENV LD_LIBRARY_PATH=/ovms/lib
410+
ENV PATH="$PATH:/ovms/bin"
398411

399412
COPY --from=pkg /ovms_release /ovms
400413
COPY --from=pkg /usr/local/lib/python3.*/site-packages/jinja2 /ovms/python_deps/jinja2

demos/c_api_minimal_app/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@ BASE_OS ?= ubuntu24
2525

2626
ifeq ($(BASE_OS),ubuntu24)
2727
BASE_OS_TAG_UBUNTU ?= 24.04
28-
PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24.tar.gz"
28+
PACKAGE_URL ?="https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24.tar.gz"
2929
BASE_IMAGE ?= ubuntu:$(BASE_OS_TAG_UBUNTU)
3030
DIST_OS=ubuntu
3131
endif
3232
ifeq ($(BASE_OS),redhat)
3333
BASE_OS_TAG_REDHAT ?= 9.5
34-
PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat.tar.gz"
34+
PACKAGE_URL ="https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat.tar.gz"
3535
BASE_IMAGE ?= registry.access.redhat.com/ubi9/ubi:$(BASE_OS_TAG_REDHAT)
3636
DIST_OS=redhat
3737
endif

demos/code_local_assistant/README.md

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,41 @@ mkdir models
2121
> **Note:** The users in China need to set environment variable HF_ENDPOINT="https://hf-mirror.com" before running the export script to connect to the HF Hub.
2222
2323
Export `codellama/CodeLlama-7b-Instruct-hf`:
24+
25+
::::{tab-set}
26+
:::{tab-item} Intel GPU
2427
```console
25-
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
28+
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --cache_size 1 --overwrite_models
2629
```
30+
:::
2731

28-
> **Note:** Use `--target_device GPU` for Intel GPU or omit this parameter to run on Intel CPU
32+
:::{tab-item} Intel NPU
33+
```console
34+
python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
35+
```
36+
:::
37+
::::
2938

3039
## Prepare Code Completion Model
3140
For this task we need a smaller, lighter model that will produce code quicker than the chat task.
3241
Since we do not want to wait for the code to appear, we need to use a smaller model. It should be responsive enough to generate multi-line blocks of code ahead of time as we type.
3342
Code completion works in non-streaming, unary mode. Do not use instruct model, there is no chat involved in the process.
3443

3544
Export `Qwen/Qwen2.5-Coder-1.5B`:
45+
46+
::::{tab-set}
47+
:::{tab-item} Intel GPU
48+
```console
49+
python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device GPU --cache_size 1 --overwrite_models
50+
```
51+
:::
52+
53+
:::{tab-item} Intel NPU
3654
```console
3755
python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
3856
```
57+
:::
58+
::::
3959

4060
Examine that workspace is set up properly `models/config_all.json`:
4161
```
@@ -105,10 +125,21 @@ ovms --rest_port 8000 --config_path ./models/config_all.json
105125
```
106126

107127
### Linux: via Docker
128+
::::{tab-set}
129+
:::{tab-item} Intel GPU
130+
```bash
131+
docker run -d --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
132+
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.1 --rest_port 8000 --config_path /workspace/models/config_all.json
133+
```
134+
:::
135+
136+
:::{tab-item} Intel NPU
108137
```bash
109138
docker run -d --rm --device /dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
110139
-p 8000:8000 -v $(pwd)/:/workspace/ openvino/model_server:2025.1 --rest_port 8000 --config_path /workspace/models/config_all.json
111140
```
141+
:::
142+
::::
112143

113144
## Set Up Visual Studio Code
114145

docs/deploying_server_baremetal.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ To deploy Model Server on baremetal, use pre-compiled binaries for Ubuntu22, Ubu
88
:sync: ubuntu-22-04
99
Download precompiled package (without python support):
1010
```{code} sh
11-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu22.tar.gz
11+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22.tar.gz
1212
tar -xzvf ovms_ubuntu22.tar.gz
1313
```
1414
or precompiled package (with python and LLM support):
1515
```{code} sh
16-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu22_python_on.tar.gz
16+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22_python_on.tar.gz
1717
tar -xzvf ovms_ubuntu22_python_on.tar.gz
1818
```
1919
Install required libraries:
@@ -36,12 +36,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
3636
:sync: ubuntu-24-04
3737
Download precompiled package (without python support):
3838
```{code} sh
39-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24.tar.gz
39+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24.tar.gz
4040
tar -xzvf ovms_ubuntu24.tar.gz
4141
```
4242
or precompiled package (with python and LLM support):
4343
```{code} sh
44-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_ubuntu24_python_on.tar.gz
44+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24_python_on.tar.gz
4545
tar -xzvf ovms_ubuntu24_python_on.tar.gz
4646
```
4747
Install required libraries:
@@ -64,12 +64,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
6464
:sync: rhel-9.5
6565
Download precompiled package (without python support):
6666
```{code} sh
67-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat.tar.gz
67+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat.tar.gz
6868
tar -xzvf ovms_redhat.tar.gz
6969
```
7070
or precompiled package (with python and LLM support):
7171
```{code} sh
72-
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_redhat_python_on.tar.gz
72+
wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat_python_on.tar.gz
7373
tar -xzvf ovms_redhat_python_on.tar.gz
7474
```
7575
Install required libraries:
@@ -95,7 +95,7 @@ Make sure you have [Microsoft Visual C++ Redistributable](https://aka.ms/vs/17/r
9595
Download and unpack model server archive for Windows:
9696

9797
```bat
98-
curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.0/ovms_windows.zip -o ovms.zip
98+
curl -L https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_windows.zip -o ovms.zip
9999
tar -xf ovms.zip
100100
```
101101

0 commit comments

Comments
 (0)