NexaAI · ximing-crypto · Oct 22, 2025 · Oct 23, 2025 · mengshengwu · Oct 22, 2025
diff --git a/bindings/python/npu.ipynb b/bindings/python/npu.ipynb
@@ -12,6 +12,7 @@
         "- **VLM (Vision Language Model)**: Multimodal understanding and generation\n",
         "- **Embedder**: Text vectorization and similarity computation\n",
         "- **Reranker**: Document reranking\n",
+        "- **CV (Computer Vision)**: Image processing and OCR tasks\n",
         "- **ASR (Automatic Speech Recognition)**: Speech-to-text transcription\n",
         "\n",
         "## Prerequisites\n",
@@ -49,6 +50,12 @@
         "```bash\n",
         "export NEXA_TOKEN=\"YOUR_NEXA_TOKEN_HERE\"\n",
         "```\n"
+        "\n",
+        "```bash\n",
+        "# CV model\n",
+        "nexa pull NexaAI/paddleocr-npu\n",
+        "\n",
+        "# ASR model\n",
+        "nexa pull NexaAI/parakeet-npu\n",
+        "```"
       ]
     },
     {
@@ -61,6 +68,19 @@
         "\n",
         "# Replace \"YOUR_NEXA_TOKEN_HERE\" with your actual token from https://sdk.nexa.ai/\n",
         "os.environ[\"NEXA_TOKEN\"] = \"YOUR_NEXA_TOKEN_HERE\"\n",
+        "import io\n",
+        "import time\n",
+        "import numpy as np\n",
+        "from typing import List, Optional\n",
+        "\n",
+        "# NexaAI SDK imports\n",
+        "from nexaai.llm import LLM, GenerationConfig\n",
+        "from nexaai.vlm import VLM\n",
+        "from nexaai.embedder import Embedder, EmbeddingConfig\n",
+        "from nexaai.rerank import Reranker, RerankConfig\n",
+        "from nexaai.cv import CV, CVConfig, CVCapabilities\n",
+        "from nexaai.asr import ASR, ASRConfig\n",
+        "from nexaai.common import ModelConfig, ChatMessage, MultiModalMessage, MultiModalMessageContent\n",
         "\n",
         "if os.environ.get(\"NEXA_TOKEN\") and os.environ[\"NEXA_TOKEN\"] != \"YOUR_NEXA_TOKEN_HERE\":\n",
         "    print(\"NEXA_TOKEN is set successfully!\")\n",
@@ -373,7 +393,101 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 4. ASR (Automatic Speech Recognition) NPU Inference\n",
+        "## 4. CV (Computer Vision) NPU Inference\n",
+        "\n",
+        "This section uses an NPU-accelerated computer vision model for image processing tasks such as OCR (Optical Character Recognition). The model detects and recognizes text in images with NPU acceleration.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# CV NPU Inference Example\n",
+        "def cv_npu_example(image_file=\"path/to/your/image.jpg\", model_name=\"NexaAI/paddleocr-npu\"):\n",
+        "    \"\"\"Run OCR on one image with an NPU-accelerated CV model and print the results.\n",
+        "\n",
+        "    Args:\n",
+        "        image_file: Path of the image to process. The default is a placeholder;\n",
+        "            pass a real path to actually run inference.\n",
+        "        model_name: Name or path of the CV model to load.\n",
+        "\n",
+        "    Returns:\n",
+        "        The value returned by ``cv.infer``, or ``None`` when ``image_file``\n",
+        "        is not an existing file.\n",
+        "    \"\"\"\n",
+        "    print(\"=== CV NPU Inference Example ===\")\n",
+        "\n",
+        "    # Validate the input before loading the model: the load is the expensive\n",
+        "    # step, and the placeholder default path never exists.\n",
+        "    print(f\"Current image_file: {image_file}\")\n",
+        "    if not os.path.isfile(image_file):\n",
+        "        print(f\"Error: Image file not found: {image_file}\")\n",
+        "        print(\"Please provide a valid image file path to test CV functionality.\")\n",
+        "        print(\"Example: cv_npu_example(image_file='my_image.jpg')\")\n",
+        "        return None\n",
+        "\n",
+        "    # Model configuration\n",
+        "    plugin_id = \"npu\"\n",
+        "    device = \"npu\"\n",
+        "\n",
+        "    print(f\"\\nLoading model: {model_name}\")\n",
+        "    print(f\"Using plugin: {plugin_id}\")\n",
+        "    print(f\"Device: {device}\")\n",
+        "\n",
+        "    # Create CV instance\n",
+        "    cv = CV.from_(name_or_path=model_name, plugin_id=plugin_id, device_id=device)\n",
+        "    print(\"CV model loaded successfully!\")\n",
+        "\n",
+        "    # Basic CV configuration for OCR\n",
+        "    config = CVConfig(\n",
+        "        capabilities=CVCapabilities.OCR,\n",
+        "        det_model_path=None,  # Will use default detection model\n",
+        "        rec_model_path=None   # Will use default recognition model\n",
+        "    )\n",
+        "\n",
+        "    print(\"\\n=== Starting CV Inference ===\")\n",
+        "\n",
+        "    # perf_counter is monotonic, so the measured interval cannot be skewed by\n",
+        "    # system clock adjustments the way time.time() can.\n",
+        "    start_time = time.perf_counter()\n",
+        "    result = cv.infer(image_path=image_file, config=config)\n",
+        "    inference_time = time.perf_counter() - start_time\n",
+        "\n",
+        "    # Display results\n",
+        "    print(\"\\n=== CV Inference Results ===\")\n",
+        "    print(f\"Number of detected text regions: {result.result_count}\")\n",
+        "    print(f\"Processing time: {inference_time:.2f} seconds\")\n",
+        "\n",
+        "    # Display OCR results; entries without text are skipped\n",
+        "    if result.results:\n",
+        "        print(\"\\nDetected text:\")\n",
+        "        for index, cv_result in enumerate(result.results, start=1):\n",
+        "            if not cv_result.text:\n",
+        "                continue\n",
+        "            text = cv_result.text.strip()\n",
+        "            print(f\"  {index}. [{cv_result.confidence:.2f}] {text}\")\n",
+        "\n",
+        "            # Display bounding box if available\n",
+        "            bbox = cv_result.bbox\n",
+        "            if bbox:\n",
+        "                print(f\"      Bounding box: x={bbox.x:.1f}, y={bbox.y:.1f}, \"\n",
+        "                      f\"w={bbox.width:.1f}, h={bbox.height:.1f}\")\n",
+        "    else:\n",
+        "        print(\"No text detected in the image.\")\n",
+        "\n",
+        "    # Get profiling data\n",
+        "    profiling_data = cv.get_profiling_data()\n",
+        "    if profiling_data:\n",
+        "        print(f\"\\nProfiling data: {profiling_data}\")\n",
+        "\n",
+        "    return result\n",
+        "\n",
+        "# Run CV example (pass image_file=... to process a real image)\n",
+        "cv_result = cv_npu_example()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 5. ASR (Automatic Speech Recognition) NPU Inference\n",
         "\n",
         "Using NPU-accelerated speech recognition models for speech-to-text transcription. parakeet-npu provides high-quality speech recognition with NPU acceleration.\n"
       ]