"""Command-line helpers: clone a Git repository and run an ``optimum-cli`` OpenVINO export."""
import logging
import subprocess  # nosec - disable B404:import-subprocess check
import sys
import os
from pathlib import Path
import platform


def clone_repo(repo_url: str, revision: str = None, add_to_sys_path: bool = True) -> Path:
    """Clone ``repo_url`` into the current working directory unless it is already there.

    The target directory is the last path component of the URL with a trailing
    ``.git`` suffix removed (the directory name ``git clone`` itself picks).

    Args:
        repo_url: URL handed to ``git clone`` unchanged.
        revision: Optional commit/branch/tag to check out. It is only checked
            out right after a fresh clone; an existing directory is left as is.
        add_to_sys_path: When true, the resolved repository path is inserted at
            the front of ``sys.path`` (once) so its modules can be imported.

    Returns:
        The relative path of the repository directory.

    Raises:
        subprocess.CalledProcessError: ``git clone`` or ``git checkout`` failed.
        OSError: the ``git`` executable could not be started.
    """
    # Strip only a *trailing* ".git" and tolerate a trailing slash; replacing
    # every ".git" occurrence would corrupt names such as "site.github.io".
    repo_name = repo_url.rstrip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]
    repo_path = Path(repo_name)

    if not repo_path.exists():
        try:
            subprocess.run(["git", "clone", repo_url], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            # OSError (e.g. git not installed) has no ``stderr`` attribute, so
            # fall back to the exception itself instead of masking the error.
            details = getattr(exc, "stderr", None) or exc
            print(f"Failed to clone the repository: {details}")
            raise

        if revision:
            subprocess.run(["git", "checkout", revision], cwd=str(repo_path), check=True)

    if add_to_sys_path:
        resolved_path = str(repo_path.resolve())
        if resolved_path not in sys.path:
            sys.path.insert(0, resolved_path)

    return repo_path


def optimum_cli(model_id, output_dir, show_command=True, additional_args: dict[str, str] = None, debug_logs=False):
    """Run ``optimum-cli export openvino --model <model_id> <output_dir>``.

    Args:
        model_id: Model identifier or path passed to ``--model``.
        output_dir: Export destination passed as the positional argument.
        show_command: When true, render the command with IPython's Markdown
            display before running it.
        additional_args: Extra options; each key becomes ``--<key>`` and a
            truthy value is appended after it. Values are split on whitespace,
            so one entry may expand to several command-line tokens.
        debug_logs: When true, ``TRANSFORMERS_VERBOSITY`` is set to ``debug``
            for the duration of the call and restored afterwards.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero
            status; its captured stdout/stderr are logged before re-raising.
    """
    # Keep model_id / output_dir as single arguments so paths containing
    # spaces are not torn apart, which splitting a joined string would do.
    command = ["optimum-cli", "export", "openvino", "--model", str(model_id), str(output_dir)]
    for arg, value in (additional_args or {}).items():
        command.append(f"--{arg}")
        if value:
            command.extend(str(value).split())

    if show_command:
        from IPython.display import Markdown, display

        export_command = " ".join(command)
        display(Markdown("**Export command:**"))
        display(Markdown(f"`{export_command}`"))

    previous_verbosity = os.environ.get("TRANSFORMERS_VERBOSITY")
    if debug_logs:
        os.environ["TRANSFORMERS_VERBOSITY"] = "debug"

    try:
        subprocess.run(command, shell=(platform.system() == "Windows"), check=True, capture_output=True)
    except subprocess.CalledProcessError as exc:
        logger = logging.getLogger()
        logger.error("Command failed with exit code %s", exc.returncode)

        if exc.stdout:
            logger.error("STDOUT:\n%s", exc.stdout.decode(errors="replace"))

        if exc.stderr:
            logger.error("STDERR:\n%s", exc.stderr.decode(errors="replace"))

        raise
    finally:
        # Undo the temporary override on every exit path, including the case
        # where the variable was not set before the call.
        if debug_logs:
            if previous_verbosity is None:
                os.environ.pop("TRANSFORMERS_VERBOSITY", None)
            else:
                os.environ["TRANSFORMERS_VERBOSITY"] = previous_verbosity
It excels in vertical domains such as education and finance, accurately recognizing professional terminology and regional accents.\n", + "\n", + "**Model architecture** -- FunASR Nano (~800M parameters) is a **multimodal audio-language model** with four components:\n", + "\n", + "| Component | Role |\n", + "|---|---|\n", + "| **Audio Frontend (WavFrontend)** | Extracts Fbank features from raw waveform (mel-frequency filterbank) |\n", + "| **Audio Encoder** | Converts audio features into audio embeddings |\n", + "| **Text Embeddings** | Standard token embeddings for the LLM vocabulary |\n", + "| **Language Model (Qwen3-0.6B)** | Generates transcription from merged audio + text embeddings |\n", + "\n", + "The pipeline works as: **Audio -> Frontend -> Encoder -> Embeddings merge with text prompt -> LLM -> Transcribed text**\n", + "\n", + "In this tutorial we demonstrate how to convert, run, and optimize FunASR Nano using **OpenVINO** and discuss **OpenVINO GenAI** integration.\n", + "\n", + "#### Table of contents:\n", + "\n", + "- [1. Environment Setup](#1.-Environment-Setup)\n", + "- [2. Hugging Face Authentication](#2.-Hugging-Face-Authentication)\n", + "- [3. Model Download and Analysis](#3.-Model-Download-and-Analysis)\n", + "- [4. Conversion to OpenVINO IR](#4.-Conversion-to-OpenVINO-IR)\n", + "- [5. OpenVINO Runtime Inference](#5.-OpenVINO-Runtime-Inference)\n", + "- [6. Multi-Device Inference (CPU / GPU / NPU)](#6.-Multi-Device-Inference-(CPU-/-GPU-/-NPU))\n", + " - [6.1 CPU Inference](#6.1-CPU-Inference)\n", + " - [6.2 GPU Inference](#6.2-GPU-Inference)\n", + " - [6.3 NPU Inference](#6.3-NPU-Inference)\n", + "- [7. OpenVINO GenAI Integration](#7.-OpenVINO-GenAI-Integration)\n", + "- [8. Interactive Demo](#8.-Interactive-Demo)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Environment Setup\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Install all required dependencies: OpenVINO, OpenVINO GenAI, PyTorch, FunASR, and audio processing libraries." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "notebook_utils.py already exists\n", + "cmd_helper.py already exists\n", + "pip_helper.py already exists\n", + "ov_funasr_helper.py already exists\n", + "gradio_helper.py already exists\n" + ] + } + ], + "source": [ + "# Fetch utility modules from openvino_notebooks repository\n", + "import requests\n", + "from pathlib import Path\n", + "\n", + "utils = {\n", + " # General OpenVINO notebook utilities\n", + " \"notebook_utils.py\": \"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + " \"cmd_helper.py\": \"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py\",\n", + " \"pip_helper.py\": \"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py\",\n", + " # FunASR-specific helpers\n", + " \"ov_funasr_helper.py\": \"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/funasr-nano/ov_funasr_helper.py\",\n", + " \"gradio_helper.py\": \"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/funasr-nano/gradio_helper.py\",\n", + "}\n", + "\n", + "for filename, url in utils.items():\n", + " if not Path(filename).exists():\n", + " r = requests.get(url=url)\n", + " r.raise_for_status()\n", + " Path(filename).write_text(r.text)\n", + " print(f\"Downloaded {filename}\")\n", + " else:\n", + " print(f\"{filename} already exists\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cmd_helper import clone_repo\n", + "from pip_helper import pip_install\n", + "import 
platform\n", + "\n", + "# Uninstall potentially conflicting packages before clean install\n", + "!pip uninstall -y -q torch torchaudio optimum-intel optimum\n", + "\n", + "pip_install(\n", + " \"-q\",\n", + " \"--extra-index-url\",\n", + " \"https://download.pytorch.org/whl/cpu\",\n", + " \"torch\",\n", + " \"nncf\",\n", + " \"torchaudio\",\n", + " \"openvino==2025.3.0\", # optimum-intel 1.26.x requires <2025.4\n", + " \"openvino-genai==2025.3.0.0\",\n", + " \"optimum==2.1.0\",\n", + " \"optimum-intel==1.26.1\", # last version compatible with openvino 2025.3\n", + " \"transformers>=4.51,<4.56\", # 4.51+ for Qwen3; <4.56 for optimum-intel 1.26\n", + " \"funasr>=1.2.7\",\n", + " \"gradio\",\n", + " \"huggingface_hub\",\n", + " \"librosa\",\n", + ")\n", + "\n", + "# Clone the Fun-ASR repository (contains model.py needed for model loading)\n", + "repo_dir = Path(\"Fun-ASR\")\n", + "revision = \"efe63c122929bcca095fedc537c3081c5c4ee062\"\n", + "clone_repo(\"https://github.com/FunAudioLLM/Fun-ASR.git\", revision)\n", + "\n", + "if platform.system() == \"Darwin\":\n", + " pip_install(\"numpy<2.0\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Hugging Face Authentication\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "FunASR Nano is a **public model** — no authentication token is required to download it. However, if you work with gated models in the future, you can authenticate using one of these methods:\n", + "\n", + "- Set an environment variable: `export HF_TOKEN=your_token_here`\n", + "- Or run: `huggingface-cli login`\n", + "\n", + "The cell below will use the token from the environment if available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No HF_TOKEN found — proceeding without authentication (OK for public models)\n" + ] + } + ], + "source": [ + "import os\n", + "from huggingface_hub import login\n", + "\n", + "token = os.getenv(\"HF_TOKEN\")\n", + "if token:\n", + " login(token=token, add_to_git_credential=False)\n", + " print(\"Logged into Hugging Face using HF_TOKEN\")\n", + "else:\n", + " print(\"No HF_TOKEN found — proceeding without authentication (OK for public models)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Model Download and Analysis\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "FunASR Nano is available in two variants:\n", + "\n", + "| Model | Languages | Training Data |\n", + "|---|---|---|\n", + "| **Fun-ASR-Nano-2512** | Chinese, English, Japanese + 7 dialects & 26 accents | Tens of millions of hours |\n", + "| **Fun-ASR-MLT-Nano-2512** | 31 languages (incl. East/Southeast Asian, European) | Hundreds of thousands of hours |\n", + "\n", + "Both models share the same architecture (~800M parameters) based on **Qwen3-0.6B** as the language model backbone.\n", + "\n", + "### Model architecture detail\n", + "\n", + "The model processes audio through a multi-stage pipeline:\n", + "\n", + "1. **WavFrontend**: Extracts 80-dimensional mel-filterbank features at 10ms frame shift\n", + "2. **Audio Encoder**: Convolutional + Transformer layers that downsample and encode audio features\n", + "3. **Audio Adaptor**: Projects encoder output to the LLM hidden dimension\n", + "4. **Embedding Merge**: Audio embeddings replace placeholder tokens in the text prompt, then are concatenated with text embeddings\n", + "5. **Qwen3-0.6B LLM**: Autoregressive decoder that generates the transcription\n", + "\n", + "**Model inputs:**\n", + "- Raw audio waveform (WAV, MP3, etc.) 
at any sample rate (resampled internally to 16kHz)\n", + "- Text prompt template with `<|startofspeech|>...<|endofspeech|>` markers\n", + "\n", + "**Model outputs:**\n", + "- Transcribed text string" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "500b1db8193c4d9799062200347cb54b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Model:', options=('FunAudioLLM/Fun-ASR-Nano-2512', 'FunAudioLLM/Fun-ASR-MLT-Nano-2512'),…" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "from pathlib import Path\n", + "\n", + "model_ids = [\"FunAudioLLM/Fun-ASR-Nano-2512\", \"FunAudioLLM/Fun-ASR-MLT-Nano-2512\"]\n", + "\n", + "model_selector = widgets.Dropdown(\n", + " options=model_ids,\n", + " default=model_ids[0],\n", + " description=\"Model:\",\n", + ")\n", + "\n", + "model_selector" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "88d624813c4a43a1b3965cda0db041fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 21 files: 0%| | 0/21 [00:00 decoded to text string\n" + ] + } + ], + "source": [ + "import torch\n", + "from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video\n", + "\n", + "# Load a sample audio file\n", + "wav_path = str(model_dir / \"example\" / \"en.mp3\")\n", + "print(f\"Sample audio: {wav_path}\")\n", + "\n", + "data_src = load_audio_text_image_video(wav_path, fs=frontend.fs)\n", + "speech, speech_lengths = extract_fbank(\n", + " data_src, data_type=\"sound\", frontend=frontend, is_final=True\n", + ") # speech: [B, T, D]\n", + "\n", + "print(f\"\\n--- Tensor shapes through the pipeline ---\")\n", + "print(f\"[Frontend output]\")\n", + 
"print(f\" speech (Fbank features): {speech.shape} (batch, time_frames, feat_dim)\")\n", + "print(f\" speech_lengths: {speech_lengths}\")\n", + "\n", + "# Run through encoder (expects [B, T, D])\n", + "with torch.no_grad():\n", + " encoder_out, encoder_out_lens = pt_model.audio_encoder(speech, speech_lengths)\n", + "\n", + "print(f\"\\n[Encoder output]\")\n", + "print(f\" encoder_out: {encoder_out.shape} (batch, time_frames, hidden_dim)\")\n", + "print(f\" encoder_out_lens: {encoder_out_lens}\")\n", + "\n", + "# Run through adaptor\n", + "if hasattr(pt_model, 'audio_adaptor'):\n", + " with torch.no_grad():\n", + " adapted_out, adapted_lens = pt_model.audio_adaptor(encoder_out, encoder_out_lens)\n", + " print(f\"\\n[Adaptor output]\")\n", + " print(f\" adapted_out: {adapted_out.shape} (batch, time_frames, llm_hidden_dim)\")\n", + " print(f\" adapted_out_lens: {adapted_lens}\")\n", + "\n", + "# Token embeddings\n", + "tokenizer = kwargs[\"tokenizer\"]\n", + "sample_text = \"Hello world\"\n", + "tokens = tokenizer.encode(sample_text, return_tensors=\"pt\")\n", + "with torch.no_grad():\n", + " text_embeds = pt_model.llm.model.get_input_embeddings()(tokens)\n", + "print(f\"\\n[Text embeddings]\")\n", + "print(f\" input_ids: {tokens.shape}\")\n", + "print(f\" text_embeds: {text_embeds.shape} (batch, seq_len, llm_hidden_dim)\")\n", + "\n", + "print(f\"\\n[LLM output]\")\n", + "print(f\" Generates tokens autoregressively -> decoded to text string\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PyTorch model freed from memory\n" + ] + } + ], + "source": [ + "# Free PyTorch model memory before OpenVINO conversion\n", + "import gc\n", + "\n", + "del pt_model\n", + "gc.collect()\n", + "print(\"PyTorch model freed from memory\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Conversion to OpenVINO IR\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "FunASR Nano is a **multi-component model** that cannot be exported as a single OpenVINO IR. Instead, we convert each component separately:\n", + "\n", + "| Component | File | Conversion method |\n", + "|---|---|---|\n", + "| Text Embeddings | `openvino_text_embeddings_model.xml` | `ov.convert_model` from PyTorch embedding layer |\n", + "| Audio Encoder + Adaptor | `openvino_encoder_model.xml` | `ov.convert_model` with wrapped forward |\n", + "| Language Model (Qwen3) | `openvino_model.xml` | `ov.convert_model` → make stateful (KV-cache) |\n", + "\n", + "The conversion also saves the tokenizer and frontend configuration so the pipeline is self-contained.\n", + "\n", + "The helper `convert_funasr()` from the OpenVINO Notebooks repository handles the full conversion pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ov_funasr_helper.py already patched\n" + ] + } + ], + "source": [ + "# Apply Python 3.9 compatibility patch to ov_funasr_helper.py\n", + "# The helper uses X | Y union type syntax (PEP 604) which requires Python 3.10+.\n", + "# Adding `from __future__ import annotations` makes annotations lazily evaluated,\n", + "# fixing the runtime TypeError on Python 3.9.\n", + "helper_path = Path(\"ov_funasr_helper.py\")\n", + "content = helper_path.read_text()\n", + "if not content.startswith(\"from __future__ import annotations\"):\n", + " helper_path.write_text(\"from __future__ import annotations\\n\\n\" + content)\n", + " print(\"Applied Python 3.9 compatibility patch to ov_funasr_helper.py\")\n", + "else:\n", + " print(\"ov_funasr_helper.py already patched\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process 
just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[OK] Fun-ASR-Nano-2512 model already converted. You can find results in Fun-ASR-Nano-2512-ov\n", + "\n", + "OpenVINO IR files in Fun-ASR-Nano-2512-ov:\n", + " openvino_encoder_model.xml weights: 445.8 MB\n", + " openvino_model.xml weights: 1136.9 MB\n", + " openvino_text_embeddings_model.xml weights: 296.8 MB\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pkrzemin/tasks/benchmark/venv/lib/python3.9/site-packages/openvino/runtime/__init__.py:10: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. Please replace `openvino.runtime` with `openvino`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from ov_funasr_helper import convert_funasr\n", + "\n", + "ov_model_dir = Path(model_name + \"-ov\")\n", + "convert_funasr(str(model_dir), ov_model_dir)\n", + "\n", + "# Show resulting IR files\n", + "print(f\"\\nOpenVINO IR files in {ov_model_dir}:\")\n", + "for p in sorted(ov_model_dir.glob(\"*.xml\")):\n", + " bin_path = p.with_suffix(\".bin\")\n", + " bin_size = bin_path.stat().st_size / (1024 * 1024) if bin_path.exists() else 0\n", + " print(f\" {p.name:45s} weights: {bin_size:.1f} MB\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect OpenVINO IR models\n", + "\n", + "Let's verify the converted IR files and examine their input/output specifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n", + "[Text Embeddings] — openvino_text_embeddings_model.xml\n", + "==================================================\n", + " Inputs (1):\n", + " input shape=[?,?] dtype=\n", + " Outputs (1):\n", + " (unnamed) shape=[?,?,1024] dtype=\n", + "\n", + "==================================================\n", + "[Audio Encoder] — openvino_encoder_model.xml\n", + "==================================================\n", + " Inputs (2):\n", + " speech shape=[?,?,?] dtype=\n", + " speech_lengths shape=[?] dtype=\n", + " Outputs (2):\n", + " (unnamed) shape=[?,?,1024] dtype=\n", + " lengths.1 shape=[?] dtype=\n", + "\n", + "==================================================\n", + "[Language Model] — openvino_model.xml\n", + "==================================================\n", + " Inputs (4):\n", + " attention_mask shape=[?,?] dtype=\n", + " position_ids shape=[?,?] dtype=\n", + " inputs_embeds shape=[?,?,1024] dtype=\n", + " beam_idx shape=[?] 
dtype=\n", + " Outputs (1):\n", + " logits shape=[?,?,151936] dtype=\n", + " Stateful: Yes (56 state variables — KV-cache hidden inside model)\n" + ] + } + ], + "source": [ + "import openvino as ov\n", + "\n", + "core = ov.Core()\n", + "\n", + "ir_files = {\n", + " \"Text Embeddings\": ov_model_dir / \"openvino_text_embeddings_model.xml\",\n", + " \"Audio Encoder\": ov_model_dir / \"openvino_encoder_model.xml\",\n", + " \"Language Model\": ov_model_dir / \"openvino_model.xml\",\n", + "}\n", + "\n", + "def tensor_name(t):\n", + " try:\n", + " return t.get_any_name()\n", + " except RuntimeError:\n", + " return \"(unnamed)\"\n", + "\n", + "for name, xml_path in ir_files.items():\n", + " model = core.read_model(xml_path)\n", + " print(f\"\\n{'=' * 50}\")\n", + " print(f\"[{name}] — {xml_path.name}\")\n", + " print(f\"{'=' * 50}\")\n", + " print(f\" Inputs ({len(model.inputs)}):\")\n", + " for inp in model.inputs:\n", + " print(f\" {tensor_name(inp):35s} shape={inp.get_partial_shape()} dtype={inp.get_element_type()}\")\n", + " print(f\" Outputs ({len(model.outputs)}):\")\n", + " for out in model.outputs[:5]: # show first 5 to avoid flooding\n", + " print(f\" {tensor_name(out):35s} shape={out.get_partial_shape()} dtype={out.get_element_type()}\")\n", + " if len(model.outputs) > 5:\n", + " print(f\" ... and {len(model.outputs) - 5} more outputs (KV-cache)\")\n", + " if len(model.get_sinks()) > 0:\n", + " print(f\" Stateful: Yes ({len(model.get_sinks())} state variables — KV-cache hidden inside model)\")\n", + " del model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. OpenVINO Runtime Inference\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Now we load the converted models and run inference using the `OVFunASRNano` wrapper class. 
This class:\n", + "- Loads all three IR components (text embeddings, encoder, LLM)\n", + "- Orchestrates the full pipeline: audio preprocessing -> encoding -> embedding merge -> LLM generation\n", + "- Uses `OVModelForCausalLMWithEmbed` to support `inputs_embeds` input for the LLM (needed for multimodal fusion)\n", + "\n", + "### Select Inference Device" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9e45878a2b434aaba08b7da58d23a514", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', options=('CPU',), value='CPU')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from notebook_utils import device_widget\n", + "\n", + "device = device_widget(\"CPU\", exclude=[\"AUTO\"])\n", + "\n", + "device" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Device-specific LLM configuration\n", + "llm_ov_config = {\n", + " \"CPU\": {},\n", + " \"GPU\": {\"ACTIVATIONS_SCALE_FACTOR\": \"8.0\"},\n", + " \"NPU\": {\n", + " \"ACTIVATIONS_SCALE_FACTOR\": \"8.0\",\n", + " \"NPU_USE_NPUW\": \"YES\",\n", + " \"NPUW_LLM\": \"YES\",\n", + " \"NPUW_ONLINE_PIPELINE\": \"NONE\",\n", + " \"MAX_PROMPT_LEN\": 1024,\n", + " \"NPUW_LLM_MIN_RESPONSE_LEN\": 512,\n", + " },\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[OK] Tokenizer loaded from Fun-ASR-Nano-2512-ov\n", + "[OK] Frontend and inference config loaded from Fun-ASR-Nano-2512-ov/frontend_config.json\n" + ] + } + ], + "source": [ + "from ov_funasr_helper import OVFunASRNano\n", + "\n", + "ov_model = OVFunASRNano(ov_model_dir, device=device.value, llm_ov_config=llm_ov_config[device.value])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Run Speech Recognition\n", + "\n", + "Let's transcribe a sample English audio file included with the model." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Audio file: Fun-ASR-Nano-2512/example/en.mp3\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transcription: The tribal chieftain called for the boy, and presented him with fifty pieces of gold.\n", + "\n", + "Inference time: 1.50s\n", + "Audio duration: 7.20s\n", + "Real-time factor: 0.21x\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "# Transcribe English sample\n", + "wav_path_en = str(model_dir / \"example\" / \"en.mp3\")\n", + "print(f\"Audio file: {wav_path_en}\\n\")\n", + "\n", + "start = time.perf_counter()\n", + "res, meta_data = ov_model.inference(data_in=[wav_path_en])\n", + "elapsed = time.perf_counter() - start\n", + "\n", + "text = res[0][\"text\"]\n", + "print(f\"Transcription: {text}\")\n", + "print(f\"\\nInference time: {elapsed:.2f}s\")\n", + "if \"batch_data_time\" in meta_data:\n", + " print(f\"Audio duration: {meta_data['batch_data_time']:.2f}s\")\n", + " print(f\"Real-time factor: {elapsed / meta_data['batch_data_time']:.2f}x\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chinese transcription: 开饭时间早上九点至下午五点。\n", + "Inference time: 0.82s\n" + ] + } + ], + "source": [ + "# Transcribe Chinese sample (if available)\n", + "wav_path_zh = str(model_dir / \"example\" / 
\"zh.mp3\")\n", + "if Path(wav_path_zh).exists():\n", + " start = time.perf_counter()\n", + " res_zh, meta_zh = ov_model.inference(data_in=[wav_path_zh])\n", + " elapsed = time.perf_counter() - start\n", + " print(f\"Chinese transcription: {res_zh[0]['text']}\")\n", + " print(f\"Inference time: {elapsed:.2f}s\")\n", + "else:\n", + " print(\"Chinese sample not available in this model variant\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Multi-Device Inference (CPU / GPU / NPU)\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "OpenVINO supports multiple hardware backends. The following subsections validate inference on CPU and provide ready-to-run code for GPU and NPU devices." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available OpenVINO devices: ['CPU']\n", + "\n", + "=======================================================\n", + "6.1 CPU Inference\n", + "=======================================================\n", + "[OK] Tokenizer loaded from Fun-ASR-Nano-2512-ov\n", + "[OK] Frontend and inference config loaded from Fun-ASR-Nano-2512-ov/frontend_config.json\n", + " Result: The tribal chieftain called for the boy and presented him with fifty pieces of gold.\n", + " Time: 1.58s\n" + ] + } + ], + "source": [ + "import openvino as ov\n", + "import time\n", + "\n", + "core = ov.Core()\n", + "available_devices = core.available_devices\n", + "print(f\"Available OpenVINO devices: {available_devices}\")\n", + "\n", + "wav_test = str(model_dir / \"example\" / \"en.mp3\")\n", + "\n", + "# --- CPU inference (always available) ----------------------------------------\n", + "print(\"\\n\" + \"=\" * 55)\n", + "print(\"6.1 CPU Inference\")\n", + "print(\"=\" * 55)\n", + "\n", + "ov_cpu = OVFunASRNano(ov_model_dir, device=\"CPU\", llm_ov_config={})\n", + "start = time.perf_counter()\n", + "res_cpu, _ = 
ov_cpu.inference(data_in=[wav_test])\n", + "cpu_time = time.perf_counter() - start\n", + "print(f\" Result: {res_cpu[0]['text']}\")\n", + "print(f\" Time: {cpu_time:.2f}s\")\n", + "del ov_cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.2 GPU Inference\n", + "\n", + "Intel Xe / Arc / Iris Xe GPUs are supported via the `GPU` OpenVINO plugin.\n", + "Run the cell below on a machine with an Intel GPU to compare throughput against CPU.\n", + "\n", + "Key configuration knob:\n", + "- `ACTIVATIONS_SCALE_FACTOR` (`\"8.0\"`) - scales activations to reduce numeric range overflow on GPU, improving accuracy for quantized attention layers.\n", + "\n", + "> **Tip**: The first run may be slower due to kernel compilation. Use `CACHE_DIR` to persist compiled kernels across sessions." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No GPU device found. Available devices: ['CPU']\n", + "To run GPU inference, use a machine with an Intel GPU (Arc, Iris Xe, UHD).\n" + ] + } + ], + "source": [ + "gpu_llm_config = {\n", + " \"ACTIVATIONS_SCALE_FACTOR\": \"8.0\",\n", + " \"CACHE_DIR\": \".ovms_cache_gpu\",\n", + "}\n", + "\n", + "if \"GPU\" in core.available_devices:\n", + " print(\"=\" * 55)\n", + " print(\"GPU Inference\")\n", + " print(\"=\" * 55)\n", + " try:\n", + " ov_gpu = OVFunASRNano(\n", + " model_dir=ov_model_dir,\n", + " device=\"GPU\",\n", + " llm_ov_config=gpu_llm_config,\n", + " )\n", + " wav_test_en = str(model_dir / \"example\" / \"en.mp3\")\n", + " wav_test_zh = str(model_dir / \"example\" / \"zh.mp3\")\n", + "\n", + " start = time.perf_counter()\n", + " res_gpu_en, _ = ov_gpu.inference(data_in=[wav_test_en])\n", + " gpu_time_en = time.perf_counter() - start\n", + " print(f\" [EN] Result: {res_gpu_en[0]['text']}\")\n", + " print(f\" Time: {gpu_time_en:.2f}s\")\n", + "\n", + " start = time.perf_counter()\n", + " res_gpu_zh, _ 
= ov_gpu.inference(data_in=[wav_test_zh])\n", + " gpu_time_zh = time.perf_counter() - start\n", + " print(f\" [ZH] Result: {res_gpu_zh[0]['text']}\")\n", + " print(f\" Time: {gpu_time_zh:.2f}s\")\n", + "\n", + " del ov_gpu\n", + " except Exception as e:\n", + " print(f\"GPU inference failed: {e}\")\n", + "else:\n", + " print(f\"No GPU device found. Available devices: {core.available_devices}\")\n", + " print(\"To run GPU inference, use a machine with an Intel GPU (Arc, Iris Xe, UHD).\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.3 NPU Inference\n", + "\n", + "Intel NPU (Neural Processing Unit) is available on Intel Core Ultra (Series 1 / 2), Meteor Lake, and Lunar Lake processors.\n", + "\n", + "For FunASR Nano, **the LLM component runs on the NPU** while the audio encoder runs on CPU (NPU does not support dynamic shapes required by the encoder). This hybrid approach can reduce CPU load significantly.\n", + "\n", + "> **Note**: NPU compilation can take 30-60 seconds on the first run. Use `CACHE_DIR` to cache compiled models for instant re-use." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No NPU device found. 
Available devices: ['CPU']\n", + "To run NPU inference, use an Intel Core Ultra (Meteor Lake / Lunar Lake) CPU.\n" + ] + } + ], + "source": [ + "npu_llm_config = {\n", + " \"CACHE_DIR\": \".ovms_cache_npu\",\n", + " # Uncomment the line below for higher numerical precision on NPU layers:\n", + " # \"NPU_COMPILATION_MODE_PARAMS\": \"compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add\",\n", + "}\n", + "\n", + "if \"NPU\" in core.available_devices:\n", + " print(\"=\" * 55)\n", + " print(\"NPU Inference (LLM → NPU | encoder → CPU)\")\n", + " print(\"=\" * 55)\n", + " try:\n", + " ov_npu = OVFunASRNano(\n", + " model_dir=ov_model_dir,\n", + " device=\"NPU\",\n", + " llm_ov_config=npu_llm_config,\n", + " )\n", + " wav_test_en = str(model_dir / \"example\" / \"en.mp3\")\n", + " wav_test_zh = str(model_dir / \"example\" / \"zh.mp3\")\n", + "\n", + " start = time.perf_counter()\n", + " res_npu_en, _ = ov_npu.inference(data_in=[wav_test_en])\n", + " npu_time_en = time.perf_counter() - start\n", + " print(f\" [EN] Result: {res_npu_en[0]['text']}\")\n", + " print(f\" Time: {npu_time_en:.2f}s\")\n", + "\n", + " start = time.perf_counter()\n", + " res_npu_zh, _ = ov_npu.inference(data_in=[wav_test_zh])\n", + " npu_time_zh = time.perf_counter() - start\n", + " print(f\" [ZH] Result: {res_npu_zh[0]['text']}\")\n", + " print(f\" Time: {npu_time_zh:.2f}s\")\n", + "\n", + " del ov_npu\n", + " except Exception as e:\n", + " print(f\"NPU inference failed: {e}\")\n", + "else:\n", + " print(f\"No NPU device found. Available devices: {core.available_devices}\")\n", + " print(\"To run NPU inference, use an Intel Core Ultra (Meteor Lake / Lunar Lake) CPU.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. OpenVINO GenAI Integration\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "### Can FunASR Nano use OpenVINO GenAI?\n", + "\n", + "OpenVINO GenAI provides high-level pipeline APIs for common model types. 
The relevant pipelines for an ASR model are:\n", + "\n", + "| GenAI Pipeline | Architecture | Applicable to FunASR? |\n", + "|---|---|---|\n", + "| `WhisperPipeline` | Whisper encoder-decoder with cross-attention | No - Different architecture |\n", + "| `LLMPipeline` | Text-only autoregressive LLMs (`input_ids`) | No - Needs `inputs_embeds` |\n", + "| `VLMPipeline` | Vision-Language models (image + text) | No - Not audio-based |\n", + "\n", + "The following cell **actually attempts** `WhisperPipeline` and `LLMPipeline` and captures the resulting errors to show exactly why they fail; `VLMPipeline` is not attempted because it accepts image inputs, not audio.\n", + "\n", + "---\n", + "\n", + "**Why `WhisperPipeline` fails:**\n", + "\n", + "Whisper is a self-contained encoder-decoder where the encoder processes mel spectrograms and the decoder attends to encoder outputs via cross-attention. `WhisperPipeline` expects this specific two-model structure (e.g. `encoder_model.xml` + `decoder_model.xml`).\n", + "\n", + "FunASR is fundamentally different: its audio encoder produces embeddings that are **spliced directly into the LLM's token embedding sequence** before being fed to a standard causal LM. There is no cross-attention - the architecture is closer to a Vision-Language Model than to Whisper.\n", + "\n", + "**Why `LLMPipeline` fails:**\n", + "\n", + "The Qwen3-0.6B backbone *is* a standard causal LM, so at first glance `LLMPipeline` looks promising. However, `LLMPipeline.generate()` only accepts raw text or `input_ids` - it has **no `inputs_embeds` pathway**. FunASR requires passing audio embeddings as `inputs_embeds` so they can be merged with text token embeddings before the first transformer layer. Without that, audio context cannot reach the LLM." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OpenVINO GenAI version: 2025.3.0.0-2463-3c0e2d3e7e1\n", + "\n", + "Available GenAI pipeline classes:\n", + " - ContinuousBatchingPipeline\n", + " - Image2ImagePipeline\n", + " - InpaintingPipeline\n", + " - LLMPipeline\n", + " - Text2ImagePipeline\n", + " - Text2SpeechPipeline\n", + " - TextEmbeddingPipeline\n", + " - TextRerankPipeline\n", + " - VLMPipeline\n", + " - WhisperPipeline\n", + "\n", + "======================================================================\n", + "Attempt 1: openvino_genai.WhisperPipeline(ov_model_dir, 'CPU')\n", + "======================================================================\n", + "Expected: FAILS -- WhisperPipeline needs encoder_model.xml / decoder_model.xml\n", + " FunASR uses openvino_encoder_model.xml + openvino_model.xml\n", + "\n", + " [FAIL] RuntimeError: Exception from src/inference/src/cpp/core.cpp:126:\n", + "Exception from src/inference/src/dev/plugin.cpp:58:\n", + "Check 'consumer.get_expr()->get_loop_ids() == loop_ids' failed at src/common/snippets/src/lowered/pass/move_scalar_to_consumer.cpp:34:\n", + "All consumers of a Scalar expression are expected to have the same loop IDs\n", + "\n", + "\n", + "\n", + "\n", + "======================================================================\n", + "Attempt 2: openvino_genai.LLMPipeline(ov_model_dir, 'CPU')\n", + "======================================================================\n", + "Expected: FAILS or gives wrong answers -- openvino_model.xml uses\n", + " 'inputs_embeds' not 'input_ids'; audio context is not injected.\n", + "\n", + " Pipeline loaded. 
Attempting text-only generation (no audio)...\n", + " [FAIL] RuntimeError: Check 'm_ireq_queue_tokenizer' failed at /home/jenkins/agent/workspace/private-ci/ie/build-linux-manylinux2014/b/repos/openvino.genai/src/cpp/src/tokenizer/tokenizer_impl.cpp:554:\n", + "Either openvino_tokenizer.xml was not provided or it was not loaded correctly. Tokenizer::encode is not available\n", + "\n", + "\n", + "======================================================================\n", + "Result: FunASR Nano cannot be used with any standard GenAI pipeline.\n", + " The OVFunASRNano wrapper in Section 5 is the correct approach.\n", + "======================================================================\n" + ] + } + ], + "source": [ + "import openvino_genai\n", + "\n", + "print(\"OpenVINO GenAI version:\", openvino_genai.__version__)\n", + "print()\n", + "print(\"Available GenAI pipeline classes:\")\n", + "for attr in sorted(dir(openvino_genai)):\n", + " if \"Pipeline\" in attr:\n", + " print(f\" - {attr}\")\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# Attempt 1: WhisperPipeline on the FunASR IR directory\n", + "# -----------------------------------------------------------------------------\n", + "print()\n", + "print(\"=\" * 70)\n", + "print(\"Attempt 1: openvino_genai.WhisperPipeline(ov_model_dir, 'CPU')\")\n", + "print(\"=\" * 70)\n", + "print(\"Expected: FAILS -- WhisperPipeline needs encoder_model.xml / decoder_model.xml\")\n", + "print(\" FunASR uses openvino_encoder_model.xml + openvino_model.xml\")\n", + "print()\n", + "try:\n", + " whisper_pipe = openvino_genai.WhisperPipeline(str(ov_model_dir), \"CPU\")\n", + " import numpy as np\n", + " import soundfile as sf\n", + " audio, sr = sf.read(wav_path_en, dtype=\"float32\")\n", + " result = whisper_pipe.generate(audio.tolist())\n", + " print(f\" [Unexpected success] Result: {result}\")\n", + "except Exception as e:\n", + " print(f\" [FAIL] {type(e).__name__}: {e}\")\n", 
+ "\n", + "# -----------------------------------------------------------------------------\n", + "# Attempt 2: LLMPipeline on the FunASR IR directory\n", + "# -----------------------------------------------------------------------------\n", + "print()\n", + "print(\"=\" * 70)\n", + "print(\"Attempt 2: openvino_genai.LLMPipeline(ov_model_dir, 'CPU')\")\n", + "print(\"=\" * 70)\n", + "print(\"Expected: FAILS or gives wrong answers -- openvino_model.xml uses\")\n", + "print(\" 'inputs_embeds' not 'input_ids'; audio context is not injected.\")\n", + "print()\n", + "try:\n", + " llm_pipe = openvino_genai.LLMPipeline(str(ov_model_dir), \"CPU\")\n", + " print(\" Pipeline loaded. Attempting text-only generation (no audio)...\")\n", + " text_out = llm_pipe.generate(\n", + " \"Transcribe the following audio:\",\n", + " openvino_genai.GenerationConfig(max_new_tokens=20),\n", + " )\n", + " print(f\" Output (no audio context): {text_out!r}\")\n", + " print()\n", + " print(\" NOTE: Even if loading succeeds, there is no way to pass audio into\")\n", + " print(\" LLMPipeline -- it only accepts text/input_ids, not inputs_embeds.\")\n", + "except Exception as e:\n", + " print(f\" [FAIL] {type(e).__name__}: {e}\")\n", + "\n", + "print()\n", + "print(\"=\" * 70)\n", + "print(\"Result: FunASR Nano cannot be used with any standard GenAI pipeline.\")\n", + "print(\" The OVFunASRNano wrapper in Section 5 is the correct approach.\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Interactive Demo\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Launch a Gradio interface for interactive audio transcription. You can upload audio files or record from microphone." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gradio_helper import make_demo\n", + "\n", + "demo = make_demo(ov_model, model_dir)\n", + "\n", + "try:\n", + " demo.launch(debug=True)\n", + "except Exception:\n", + " demo.launch(debug=True, share=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + }, + "openvino_notebooks": { + "imageUrl": "https://github.com/user-attachments/assets/d55ea91b-0dd2-4a92-b6a1-3460edb41b6f", + "tags": { + "categories": [ + "Model Demos", + "AI Trends" + ], + "libraries": [], + "other": [], + "tasks": [ + "Speech Recognition" + ] + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/funasr-nano/notebook_utils.py b/notebooks/funasr-nano/notebook_utils.py new file mode 100644 index 00000000000..7b6967b6271 --- /dev/null +++ b/notebooks/funasr-nano/notebook_utils.py @@ -0,0 +1,756 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +import platform +import sys +import threading +import time +from os import PathLike +from pathlib import Path +from typing import List, NamedTuple, Optional + + +# ## Files +# +# Load an image, download a file, download an IR model, and create a progress bar to show download progress. 
+ +# In[ ]: + + +def device_widget(default="AUTO", exclude=None, added=None, description="Device:"): + import openvino as ov + import ipywidgets as widgets + + core = ov.Core() + + supported_devices = core.available_devices + ["AUTO"] + exclude = exclude or [] + if exclude: + for ex_device in exclude: + if ex_device in supported_devices: + supported_devices.remove(ex_device) + + added = added or [] + if added: + for add_device in added: + if add_device not in supported_devices: + supported_devices.append(add_device) + + device = widgets.Dropdown( + options=supported_devices, + value=default, + description=description, + disabled=False, + ) + return device + + +def quantization_widget(default=True): + import ipywidgets as widgets + + to_quantize = widgets.Checkbox( + value=default, + description="Quantization", + disabled=False, + ) + + return to_quantize + + +def pip_install(*args): + import subprocess # nosec - disable B404:import-subprocess check + + cli_args = [] + for arg in args: + cli_args.extend(str(arg).split(" ")) + subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], shell=(platform.system() == "Windows"), check=True) + + +def load_image(name: str, url: str = None): + """ + Loads an image by `url` and returns it as BGR numpy array. The image is + stored to the filesystem with name `name`. If the image file already exists + loads the local image. + + :param name: Local path name of the image. 
+ :param url: url to the image + :return: image as BGR numpy array + """ + import cv2 + import numpy as np + import requests + + if not Path(name).exists(): + # Set User-Agent to Mozilla because some websites block + # requests with User-Agent Python + response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}) + array = np.asarray(bytearray(response.content), dtype="uint8") + image = cv2.imdecode(array, -1) # Loads the image as BGR + cv2.imwrite(name, image) + else: + image = cv2.imread(name) + + return image + + +def download_file( + url: PathLike, + filename: PathLike = None, + directory: PathLike = None, + show_progress: bool = True, +) -> PathLike: + """ + Download a file from a url and save it to the local filesystem. The file is saved to the + current directory by default, or to `directory` if specified. If a filename is not given, + the filename of the URL will be used. + + :param url: URL that points to the file to download + :param filename: Name of the local file to save. Should point to the name of the file only, + not the full path. If None the filename from the url will be used + :param directory: Directory to save the file to. Will be created if it doesn't exist + If None the file will be saved to the current working directory + :param show_progress: If True, show an TQDM ProgressBar + :param silent: If True, do not print a message if the file already exists + :param timeout: Number of seconds before cancelling the connection attempt + :return: path to downloaded file + """ + from tqdm.notebook import tqdm_notebook + import requests + import urllib.parse + + filename = filename or Path(urllib.parse.urlparse(url).path).name + chunk_size = 16384 # make chunks bigger so that not too many updates are triggered for Jupyter front-end + + filename = Path(filename) + if len(filename.parts) > 1: + raise ValueError( + "`filename` should refer to the name of the file, excluding the directory. 
" + "Use the `directory` parameter to specify a target directory for the downloaded file." + ) + + filepath = Path(directory) / filename if directory is not None else filename + if filepath.exists(): + return filepath.resolve() + + # create the directory if it does not exist, and add the directory to the filename + if directory is not None: + Path(directory).mkdir(parents=True, exist_ok=True) + + try: + response = requests.get(url=url, headers={"User-agent": "Mozilla/5.0"}, stream=True) + response.raise_for_status() + except ( + requests.exceptions.HTTPError + ) as error: # For error associated with not-200 codes. Will output something like: "404 Client Error: Not Found for url: {url}" + raise Exception(error) from None + except requests.exceptions.Timeout: + raise Exception( + "Connection timed out. If you access the internet through a proxy server, please " + "make sure the proxy is set in the shell from where you launched Jupyter." + ) from None + except requests.exceptions.RequestException as error: + raise Exception(f"File downloading failed with error: {error}") from None + + # download the file if it does not exist + filesize = int(response.headers.get("Content-length", 0)) + if not filepath.exists(): + with tqdm_notebook( + total=filesize, + unit="B", + unit_scale=True, + unit_divisor=1024, + desc=str(filename), + disable=not show_progress, + ) as progress_bar: + with open(filepath, "wb") as file_object: + for chunk in response.iter_content(chunk_size): + file_object.write(chunk) + progress_bar.update(len(chunk)) + progress_bar.refresh() + else: + print(f"'{filepath}' already exists.") + + response.close() + + return filepath.resolve() + + +def download_ir_model(model_xml_url: str, destination_folder: PathLike = None) -> PathLike: + """ + Download IR model from `model_xml_url`. Downloads model xml and bin file; the weights file is + assumed to exist at the same location and name as model_xml_url with a ".bin" extension. 
+ + :param model_xml_url: URL to model xml file to download + :param destination_folder: Directory where downloaded model xml and bin are saved. If None, model + files are saved to the current directory + :return: path to downloaded xml model file + """ + model_bin_url = model_xml_url[:-4] + ".bin" + model_xml_path = download_file(model_xml_url, directory=destination_folder, show_progress=False) + download_file(model_bin_url, directory=destination_folder) + return model_xml_path + + +# ## Images + +# ### Convert Pixel Data +# +# Normalize image pixel values between 0 and 1, and convert images to RGB and BGR. + +# In[ ]: + + +def normalize_minmax(data): + """ + Normalizes the values in `data` between 0 and 1 + """ + if data.max() == data.min(): + raise ValueError("Normalization is not possible because all elements of" f"`data` have the same value: {data.max()}.") + return (data - data.min()) / (data.max() - data.min()) + + +def to_rgb(image_data): + """ + Convert image_data from BGR to RGB + """ + import cv2 + + return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB) + + +def to_bgr(image_data): + """ + Convert image_data from RGB to BGR + """ + import cv2 + + return cv2.cvtColor(image_data, cv2.COLOR_RGB2BGR) + + +# ## Videos + +# ### Video Player +# +# Custom video player to fulfill FPS requirements. You can set target FPS and output size, flip the video horizontally or skip first N frames. + +# In[ ]: + + +class VideoPlayer: + """ + Custom video player to fulfill FPS requirements. You can set target FPS and output size, + flip the video horizontally or skip first N frames. + + :param source: Video source. It could be either camera device or video file. + :param size: Output frame size. + :param flip: Flip source horizontally. + :param fps: Target FPS. + :param skip_first_frames: Skip first N frames. 
+ """ + + def __init__(self, source, size=None, flip=False, fps=None, skip_first_frames=0, width=1280, height=720): + import cv2 + + self.cv2 = cv2 # This is done to access the package in class methods + self.__cap = cv2.VideoCapture(source) + # try HD by default to get better video quality + self.__cap.set(cv2.CAP_PROP_FRAME_WIDTH, width) + self.__cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height) + + if not self.__cap.isOpened(): + raise RuntimeError(f"Cannot open {'camera' if isinstance(source, int) else ''} {source}") + # skip first N frames + self.__cap.set(cv2.CAP_PROP_POS_FRAMES, skip_first_frames) + # fps of input file + self.__input_fps = self.__cap.get(cv2.CAP_PROP_FPS) + if self.__input_fps <= 0: + self.__input_fps = 60 + # target fps given by user + self.__output_fps = fps if fps is not None else self.__input_fps + self.__flip = flip + self.__size = None + self.__interpolation = None + if size is not None: + self.__size = size + # AREA better for shrinking, LINEAR better for enlarging + self.__interpolation = cv2.INTER_AREA if size[0] < self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH) else cv2.INTER_LINEAR + # first frame + _, self.__frame = self.__cap.read() + self.__lock = threading.Lock() + self.__thread = None + self.__stop = False + + """ + Start playing. + """ + + def start(self): + self.__stop = False + self.__thread = threading.Thread(target=self.__run, daemon=True) + self.__thread.start() + + """ + Stop playing and release resources. 
+ """ + + def stop(self): + self.__stop = True + if self.__thread is not None: + self.__thread.join() + self.__cap.release() + + def __run(self): + prev_time = 0 + while not self.__stop: + t1 = time.time() + ret, frame = self.__cap.read() + if not ret: + break + + # fulfill target fps + if 1 / self.__output_fps < time.time() - prev_time: + prev_time = time.time() + # replace by current frame + with self.__lock: + self.__frame = frame + + t2 = time.time() + # time to wait [s] to fulfill input fps + wait_time = 1 / self.__input_fps - (t2 - t1) + # wait until + time.sleep(max(0, wait_time)) + + self.__frame = None + + """ + Get current frame. + """ + + def next(self): + import cv2 + + with self.__lock: + if self.__frame is None: + return None + # need to copy frame, because can be cached and reused if fps is low + frame = self.__frame.copy() + if self.__size is not None: + frame = self.cv2.resize(frame, self.__size, interpolation=self.__interpolation) + if self.__flip: + frame = self.cv2.flip(frame, 1) + return frame + + +# ## Visualization + +# ### Segmentation +# +# Define a SegmentationMap NamedTuple that keeps the labels and colormap for a segmentation project/dataset. Create CityScapesSegmentation and BinarySegmentation SegmentationMaps. Create a function to convert a segmentation map to an RGB image with a colormap, and to show the segmentation result as an overlay over the original image. 
+ +# In[ ]: + + +class Label(NamedTuple): + index: int + color: tuple + name: Optional[str] = None + + +# In[ ]: + + +class SegmentationMap(NamedTuple): + labels: list + + def get_colormap(self): + import numpy as np + + return np.array([label.color for label in self.labels]) + + def get_labels(self): + labelnames = [label.name for label in self.labels] + if any(labelnames): + return labelnames + else: + return None + + +# In[ ]: + + +cityscape_labels = [ + Label(index=0, color=(128, 64, 128), name="road"), + Label(index=1, color=(244, 35, 232), name="sidewalk"), + Label(index=2, color=(70, 70, 70), name="building"), + Label(index=3, color=(102, 102, 156), name="wall"), + Label(index=4, color=(190, 153, 153), name="fence"), + Label(index=5, color=(153, 153, 153), name="pole"), + Label(index=6, color=(250, 170, 30), name="traffic light"), + Label(index=7, color=(220, 220, 0), name="traffic sign"), + Label(index=8, color=(107, 142, 35), name="vegetation"), + Label(index=9, color=(152, 251, 152), name="terrain"), + Label(index=10, color=(70, 130, 180), name="sky"), + Label(index=11, color=(220, 20, 60), name="person"), + Label(index=12, color=(255, 0, 0), name="rider"), + Label(index=13, color=(0, 0, 142), name="car"), + Label(index=14, color=(0, 0, 70), name="truck"), + Label(index=15, color=(0, 60, 100), name="bus"), + Label(index=16, color=(0, 80, 100), name="train"), + Label(index=17, color=(0, 0, 230), name="motorcycle"), + Label(index=18, color=(119, 11, 32), name="bicycle"), + Label(index=19, color=(255, 255, 255), name="background"), +] + +CityScapesSegmentation = SegmentationMap(cityscape_labels) + +binary_labels = [ + Label(index=0, color=(255, 255, 255), name="background"), + Label(index=1, color=(0, 0, 0), name="foreground"), +] + +BinarySegmentation = SegmentationMap(binary_labels) + + +# In[ ]: + + +def segmentation_map_to_image(result, colormap, remove_holes: bool = False): + """ + Convert network result of floating point numbers to an RGB image with + 
integer values from 0-255 by applying a colormap. + + :param result: A single network result after converting to pixel values in H,W or 1,H,W shape. + :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class. + :param remove_holes: If True, remove holes in the segmentation result. + :return: An RGB image where each pixel is an int8 value according to colormap. + """ + import cv2 + import numpy as np + + if len(result.shape) != 2 and result.shape[0] != 1: + raise ValueError(f"Expected result with shape (H,W) or (1,H,W), got result with shape {result.shape}") + + if len(np.unique(result)) > colormap.shape[0]: + raise ValueError( + f"Expected max {colormap[0]} classes in result, got {len(np.unique(result))} " + "different output values. Please make sure to convert the network output to " + "pixel values before calling this function." + ) + elif result.shape[0] == 1: + result = result.squeeze(0) + + result = result.astype(np.uint8) + + contour_mode = cv2.RETR_EXTERNAL if remove_holes else cv2.RETR_TREE + mask = np.zeros((result.shape[0], result.shape[1], 3), dtype=np.uint8) + for label_index, color in enumerate(colormap): + label_index_map = result == label_index + label_index_map = label_index_map.astype(np.uint8) * 255 + contours, hierarchies = cv2.findContours(label_index_map, contour_mode, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours( + mask, + contours, + contourIdx=-1, + color=color.tolist(), + thickness=cv2.FILLED, + ) + + return mask + + +def segmentation_map_to_overlay(image, result, alpha, colormap, remove_holes=False): + """ + Returns a new image where a segmentation mask (created with colormap) is overlayed on + the source image. + + :param image: Source image. + :param result: A single network result after converting to pixel values in H,W or 1,H,W shape. + :param alpha: Alpha transparency value for the overlay image. + :param colormap: A numpy array of shape (num_classes, 3) with an RGB value per class. 
+ :param remove_holes: If True, remove holes in the segmentation result. + :return: An RGP image with segmentation mask overlayed on the source image. + """ + import cv2 + import numpy as np + + if len(image.shape) == 2: + image = np.repeat(np.expand_dims(image, -1), 3, 2) + mask = segmentation_map_to_image(result, colormap, remove_holes) + image_height, image_width = image.shape[:2] + mask = cv2.resize(src=mask, dsize=(image_width, image_height)) + return cv2.addWeighted(mask, alpha, image, 1 - alpha, 0) + + +# ### Network Results +# +# Show network result image, optionally together with the source image and a legend with labels. + +# In[ ]: + + +def viz_result_image( + result_image, + source_image=None, + source_title: str = None, + result_title: str = None, + labels: Optional[List[Label]] = None, + resize: bool = False, + bgr_to_rgb: bool = False, + hide_axes: bool = False, +): + """ + Show result image, optionally together with source images, and a legend with labels. + + :param result_image: Numpy array of RGB result image. + :param source_image: Numpy array of source image. If provided this image will be shown + next to the result image. source_image is expected to be in RGB format. + Set bgr_to_rgb to True if source_image is in BGR format. + :param source_title: Title to display for the source image. + :param result_title: Title to display for the result image. + :param labels: list of labels. If provided, a legend will be shown with the given labels. + :param resize: If true, resize the result image to the same shape as the source image. + :param bgr_to_rgb: If true, convert the source image from BGR to RGB. Use this option if + source_image is a BGR image. + :param hide_axes: If true, do not show matplotlib axes. 
+ :return: Matplotlib figure with result image + """ + import cv2 + import numpy as np + import matplotlib.pyplot as plt + from matplotlib.lines import Line2D + + if bgr_to_rgb: + source_image = to_rgb(source_image) + if resize: + result_image = cv2.resize(result_image, (source_image.shape[1], source_image.shape[0])) + + num_images = 1 if source_image is None else 2 + + fig, ax = plt.subplots(1, num_images, figsize=(16, 8), squeeze=False) + if source_image is not None: + ax[0, 0].imshow(source_image) + ax[0, 0].set_title(source_title) + + ax[0, num_images - 1].imshow(result_image) + ax[0, num_images - 1].set_title(result_title) + + if hide_axes: + for a in ax.ravel(): + a.axis("off") + if labels: + colors = labels.get_colormap() + lines = [ + Line2D( + [0], + [0], + color=[item / 255 for item in c.tolist()], + linewidth=3, + linestyle="-", + ) + for c in colors + ] + plt.legend( + lines, + labels.get_labels(), + bbox_to_anchor=(1, 1), + loc="upper left", + prop={"size": 12}, + ) + plt.close(fig) + return fig + + +# ### Live Inference + +# In[ ]: + + +def show_array(frame, display_handle=None): + """ + Display array `frame`. Replace information at `display_handle` with `frame` + encoded as jpeg image. `frame` is expected to have data in BGR order. + + Create a display_handle with: `display_handle = display(display_id=True)` + """ + import cv2 + from IPython.display import Image, display + + _, frame = cv2.imencode(ext=".jpeg", img=frame) + if display_handle is None: + display_handle = display(Image(data=frame.tobytes()), display_id=True) + else: + display_handle.update(Image(data=frame.tobytes())) + return display_handle + + +# ## Checks and Alerts +# +# Create an alert class to show stylized info/error/warning messages and a `check_device` function that checks whether a given device is available. + +# In[ ]: + + +class NotebookAlert(Exception): + def __init__(self, message: str, alert_class: str): + """ + Show an alert box with the given message. 
+ + :param message: The message to display. + :param alert_class: The class for styling the message. Options: info, warning, success, danger. + """ + self.message = message + self.alert_class = alert_class + self.show_message() + + def show_message(self): + from IPython.display import HTML, display + + display(HTML(f"""
{self.message}""")) + + +class DeviceNotFoundAlert(NotebookAlert): + def __init__(self, device: str): + """ + Show a warning message about an unavailable device. This class does not check whether or + not the device is available, use the `check_device` function to check this. `check_device` + also shows the warning if the device is not found. + + :param device: The unavailable device. + :return: A formatted alert box with the message that `device` is not available, and a list + of devices that are available. + """ + import openvino as ov + + core = ov.Core() + supported_devices = core.available_devices + self.message = f"Running this cell requires a {device} device, " "which is not available on this system. " + self.alert_class = "warning" + if len(supported_devices) == 1: + self.message += f"The following device is available: {core.available_devices[0]}" + else: + self.message += "The following devices are available: " f"{', '.join(core.available_devices)}" + super().__init__(self.message, self.alert_class) + + +def check_device(device: str) -> bool: + """ + Check if the specified device is available on the system. + + :param device: Device to check. e.g. CPU, GPU + :return: True if the device is available, False if not. If the device is not available, + a DeviceNotFoundAlert will be shown. + """ + import openvino as ov + + core = ov.Core() + if device not in core.available_devices: + DeviceNotFoundAlert(device) + return False + else: + return True + + +def check_openvino_version(version: str) -> bool: + """ + Check if the specified OpenVINO version is installed. + + :param version: the OpenVINO version to check. Example: 2021.4 + :return: True if the version is installed, False if not. If the version is not installed, + an alert message will be shown. + """ + import openvino as ov + + installed_version = ov.get_version() + if version not in installed_version: + NotebookAlert( + f"This notebook requires OpenVINO {version}. 
" + f"The version on your system is: {installed_version}.
" + "Please run pip install --upgrade -r requirements.txt " + "in the openvino_env environment to install this version. " + "See the " + "OpenVINO Notebooks README for detailed instructions", + alert_class="danger", + ) + return False + else: + return True + + +def optimize_bge_embedding(model_path, output_model_path): + """ + optimize_bge_embedding used to optimize BGE model for NPU device + + Arguments: + model_path {str} -- original BGE IR model path + output_model_path {str} -- Converted BGE IR model path + """ + import openvino as ov + + try: + from openvino.passes import Manager, MatcherPass, WrapType, Matcher + from openvino import opset10 as ops + except ImportError: + from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher + from openvino.runtime import opset10 as ops + core = ov.Core() + ov_model = core.read_model(model_path) + manager = Manager() + packed_layername_tensor_dict_list = [{"name": "aten::mul/Multiply"}] + + class ReplaceTensor(MatcherPass): + def __init__(self, packed_layername_tensor_dict_list): + MatcherPass.__init__(self) + self.model_changed = False + + param = WrapType("opset10.Multiply") + + def callback(matcher: Matcher) -> bool: + import numpy as np + + root = matcher.get_match_root() + if root is None: + return False + for y in packed_layername_tensor_dict_list: + root_name = root.get_friendly_name() + if root_name.find(y["name"]) != -1: + max_fp16 = np.array([[[[-np.finfo(np.float16).max]]]]).astype(np.float32) + new_tenser = ops.constant(max_fp16, ov.Type.f32, name="Constant_4431") + root.set_arguments([root.input_value(0).node, new_tenser]) + packed_layername_tensor_dict_list.remove(y) + + return True + + self.register_matcher(Matcher(param, "ReplaceTensor"), callback) + + manager.register_pass(ReplaceTensor(packed_layername_tensor_dict_list)) + manager.run_passes(ov_model) + ov.save_model(ov_model, output_model_path, compress_to_fp16=False) + + +def collect_telemetry(file: str = ""): + """ + The function only 
tracks that the notebooks cell was executed and does not include any personally identifiable information (PII). + """ + try: + import os + import requests + import platform + from pathlib import Path + + if os.getenv("SCARF_NO_ANALYTICS") == "1" or os.getenv("DO_NOT_TRACK") == "1": + return + url = "https://openvino.gateway.scarf.sh/telemetry" + params = { + "notebook_dir": Path(__file__).parent.name, + "platform": platform.system(), + "arch": platform.machine(), + "python_version": platform.python_version(), + } + if file: + params["file"] = file + requests.get(url, params=params) + except Exception: + pass diff --git a/notebooks/funasr-nano/ov_funasr_helper.py b/notebooks/funasr-nano/ov_funasr_helper.py index 0f2aa53ddfc..31892ccb91c 100644 --- a/notebooks/funasr-nano/ov_funasr_helper.py +++ b/notebooks/funasr-nano/ov_funasr_helper.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import openvino as ov import nncf from pathlib import Path @@ -417,7 +419,7 @@ def convert_funasr(model_id, model_path=None, quantization_config=None): model_path = Path(model_path) if all((model_path / model_name).exists() for model_name in [FRONTEND_CONFIG_PATH, TEXT_EMBEDDINGS_PATH, ENCODER_PATH, LANGUAGE_PATH]): - print(f"✅ {model_id} model already converted. You can find results in {model_path}") + print(f"[OK] {model_id} model already converted. You can find results in {model_path}") return model_path print(f"⌛ {model_id} conversion started. 
Be patient, it may takes some time.") print("⌛ Load Original model") @@ -426,7 +428,7 @@ def convert_funasr(model_id, model_path=None, quantization_config=None): pt_model, kwargs = FunASRNano.from_pretrained(model=model_id, device="cpu") kwargs pt_model = pt_model.to(torch.float32) - print("✅ Original model successfully loaded") + print("[OK] Original model successfully loaded") print("⌛ Export tokenizer and config") kwargs["tokenizer"].save_pretrained(model_path) for json_file in Path(model_id + "/Qwen3-0.6B").glob("*.json"): @@ -457,7 +459,7 @@ def convert_funasr(model_id, model_path=None, quantization_config=None): } with open(model_path / FRONTEND_CONFIG_PATH, "w") as f: json.dump(frontend_config, f, indent=2) - print("✅ Frontend config exported") + print("[OK] Frontend config exported") if not (model_path / TEXT_EMBEDDINGS_PATH).exists(): print("⌛ Convert TEXT_EMBEDDINGS model") @@ -467,7 +469,7 @@ def convert_funasr(model_id, model_path=None, quantization_config=None): del ov_model cleanup_torchscript_cache() gc.collect() - print("✅ TEXT_EMBEDDINGS model successfully converted") + print("[OK] TEXT_EMBEDDINGS model successfully converted") if not (model_path / ENCODER_PATH).exists(): print("⌛ Convert ENCODER_PATH model") @@ -493,7 +495,7 @@ def forward_wrap_encoder(self, speech: torch.Tensor, speech_lengths: torch.Tenso del pt_model._orig_forward cleanup_torchscript_cache() gc.collect() - print("✅ ENCODER model successfully converted") + print("[OK] ENCODER model successfully converted") if not (model_path / LANGUAGE_PATH).exists(): print("⌛ Convert LANGUAGE_MODEL model") @@ -584,11 +586,11 @@ def forward_wrap( for output, output_name in zip(ov_model.outputs, output_names): output.get_tensor().set_names({output_name}) patch_stateful(ov_model) - print("✅ Decoder model successfully converted") + print("[OK] Decoder model successfully converted") if quantization_config is not None and "llm" in quantization_config: print(f"⌛ Weights compression with 
def pip_install(*args):
    """
    Run ``pip install`` with the current Python interpreter.

    Every positional argument is converted to ``str`` and split on
    whitespace, so callers may pass one token per argument or several
    space-separated tokens in a single string.

    Arguments:
        *args -- pip command-line tokens (package specifiers and options)

    Raises:
        subprocess.CalledProcessError -- pip exited with a non-zero status
    """
    import subprocess  # nosec - disable B404:import-subprocess check
    import sys

    cli_args = []
    for arg in args:
        # split() with no separator collapses runs of whitespace and drops
        # leading/trailing blanks; split(" ") would yield empty-string tokens
        # that pip rejects as invalid requirements.
        cli_args.extend(str(arg).split())
    subprocess.run([sys.executable, "-m", "pip", "install", *cli_args], check=True)