diff --git a/bindings/python/npu.ipynb b/bindings/python/npu.ipynb index 221e5790..7a1715bd 100644 --- a/bindings/python/npu.ipynb +++ b/bindings/python/npu.ipynb @@ -12,6 +12,7 @@ "- **VLM (Vision Language Model)**: Multimodal understanding and generation\n", "- **Embedder**: Text vectorization and similarity computation\n", "- **Reranker**: Document reranking\n", + "- **CV (Computer Vision)**: Image processing and OCR tasks\n", "- **ASR (Automatic Speech Recognition)**: Speech-to-text transcription\n", "\n", "## Prerequisites\n", @@ -49,6 +50,12 @@ "```bash\n", "export NEXA_TOKEN=\"YOUR_NEXA_TOKEN_HERE\"\n", - "```\n" + "\n", + "# CV model\n", + "nexa pull NexaAI/paddleocr-npu\n", + "\n", + "# ASR model\n", + "nexa pull NexaAI/parakeet-npu\n", + "```" ] }, { @@ -61,6 +68,19 @@ "\n", "# Replace \"YOUR_NEXA_TOKEN_HERE\" with your actual token from https://sdk.nexa.ai/\n", "os.environ[\"NEXA_TOKEN\"] = \"YOUR_NEXA_TOKEN_HERE\"\n", + "import io\n", + "import time\n", + "import numpy as np\n", + "from typing import List, Optional\n", + "\n", + "# NexaAI SDK imports\n", + "from nexaai.llm import LLM, GenerationConfig\n", + "from nexaai.vlm import VLM\n", + "from nexaai.embedder import Embedder, EmbeddingConfig\n", + "from nexaai.rerank import Reranker, RerankConfig\n", + "from nexaai.cv import CV, CVConfig, CVCapabilities\n", + "from nexaai.asr import ASR, ASRConfig\n", + "from nexaai.common import ModelConfig, ChatMessage, MultiModalMessage, MultiModalMessageContent\n", "\n", "if os.environ.get(\"NEXA_TOKEN\") and os.environ[\"NEXA_TOKEN\"] != \"YOUR_NEXA_TOKEN_HERE\":\n", " print(\"NEXA_TOKEN is set successfully!\")\n", @@ -373,7 +393,101 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. ASR (Automatic Speech Recognition) NPU Inference\n", + "## 4. CV (Computer Vision) NPU Inference\n", + "\n", + "Using NPU-accelerated computer vision models for image processing tasks such as OCR (Optical Character Recognition). 
CV models can perform text detection and recognition on images with NPU acceleration.\n" ] }, { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# CV NPU Inference Example\n", + "def cv_npu_example():\n", + " \"\"\"Run OCR on an image with an NPU-accelerated CV model; return the result or None.\"\"\"\n", + " print(\"=== CV NPU Inference Example ===\")\n", + " \n", + " # Model configuration\n", + " model_name = \"NexaAI/paddleocr-npu\" # Example CV model for OCR\n", + " plugin_id = \"npu\"\n", + " device = \"npu\"\n", + " \n", + " print(f\"Loading model: {model_name}\")\n", + " print(f\"Using plugin: {plugin_id}\")\n", + " print(f\"Device: {device}\")\n", + " \n", + " # Create CV instance\n", + " cv = CV.from_(name_or_path=model_name, plugin_id=plugin_id, device_id=device)\n", + " print(\"CV model loaded successfully!\")\n", + " \n", + " # Example image file (replace with your actual image file)\n", + " image_file = \"path/to/your/image.jpg\" # Replace with actual image file path\n", + " \n", + " print(\"\\nNote: Please update the image_file path to point to your image file\")\n", + " print(f\"Current image_file: {image_file}\")\n", + " \n", + " # Check if image file exists\n", + " if not os.path.exists(image_file):\n", + " print(f\"Error: Image file not found: {image_file}\")\n", + " print(\"Please provide a valid image file path to test CV functionality.\")\n", + " return None\n", + " \n", + " # Basic CV configuration for OCR\n", + " config = CVConfig(\n", + " capabilities=CVCapabilities.OCR,\n", + " det_model_path=None, # Will use default detection model\n", + " rec_model_path=None # Will use default recognition model\n", + " )\n", + " \n", + " print(\"\\n=== Starting CV Inference ===\")\n", + " start_time = time.time()\n", + " \n", + " # Perform CV inference (OCR)\n", + " result = cv.infer(image_path=image_file, config=config)\n", + " \n", + " end_time = time.time()\n", + " inference_time = end_time - start_time\n", + " \n", + " # Display results\n", + " print(\"\\n=== CV Inference Results ===\")\n", + " print(f\"Number of detected text regions: {result.result_count}\")\n", + " print(f\"Processing time: {inference_time:.2f} seconds\")\n", + " \n", + " # Display OCR results\n", + " if result.results:\n", + " print(\"\\nDetected text:\")\n", + " for i, cv_result in enumerate(result.results):\n", + " if cv_result.text:\n", + " confidence = cv_result.confidence\n", + " text = cv_result.text.strip()\n", + " print(f\" {i+1}. [{confidence:.2f}] {text}\")\n", + " \n", + " # Display bounding box if available\n", + " if cv_result.bbox:\n", + " bbox = cv_result.bbox\n", + " print(f\" Bounding box: x={bbox.x:.1f}, y={bbox.y:.1f}, \"\n", + " f\"w={bbox.width:.1f}, h={bbox.height:.1f}\")\n", + " else:\n", + " print(\"No text detected in the image.\")\n", + " \n", + " # Get profiling data\n", + " profiling_data = cv.get_profiling_data()\n", + " if profiling_data:\n", + " print(f\"\\nProfiling data: {profiling_data}\")\n", + " \n", + " return result\n", + "\n", + "# Run CV example\n", + "cv_result = cv_npu_example()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. ASR (Automatic Speech Recognition) NPU Inference\n", "\n", "Using NPU-accelerated speech recognition models for speech-to-text transcription. parakeet-npu provides high-quality speech recognition with NPU acceleration.\n" ]