Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 115 additions & 1 deletion bindings/python/npu.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"- **VLM (Vision Language Model)**: Multimodal understanding and generation\n",
"- **Embedder**: Text vectorization and similarity computation\n",
"- **Reranker**: Document reranking\n",
"- **CV (Computer Vision)**: Image processing and OCR tasks\n",
"- **ASR (Automatic Speech Recognition)**: Speech-to-text transcription\n",
"\n",
"## Prerequisites\n",
Expand Down Expand Up @@ -49,6 +50,12 @@
"```bash\n",
"export NEXA_TOKEN=\"YOUR_NEXA_TOKEN_HERE\"\n",
"```\n"
"# CV model\n",
"nexa pull NexaAI/paddleocr-npu\n",
"\n",
"# ASR model\n",
"nexa pull NexaAI/parakeet-npu\n",
"```"
]
},
{
Expand All @@ -61,6 +68,19 @@
"\n",
"# Replace \"YOUR_NEXA_TOKEN_HERE\" with your actual token from https://sdk.nexa.ai/\n",
"os.environ[\"NEXA_TOKEN\"] = \"YOUR_NEXA_TOKEN_HERE\"\n",
"import io\n",
"import time\n",
"import numpy as np\n",
"from typing import List, Optional\n",
"\n",
"# NexaAI SDK imports\n",
"from nexaai.llm import LLM, GenerationConfig\n",
"from nexaai.vlm import VLM\n",
"from nexaai.embedder import Embedder, EmbeddingConfig\n",
"from nexaai.rerank import Reranker, RerankConfig\n",
"from nexaai.cv import CV, CVConfig, CVCapabilities\n",
"from nexaai.asr import ASR, ASRConfig\n",
"from nexaai.common import ModelConfig, ChatMessage, MultiModalMessage, MultiModalMessageContent\n",
"\n",
"if os.environ.get(\"NEXA_TOKEN\") and os.environ[\"NEXA_TOKEN\"] != \"YOUR_NEXA_TOKEN_HERE\":\n",
" print(\"NEXA_TOKEN is set successfully!\")\n",
Expand Down Expand Up @@ -373,7 +393,101 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. ASR (Automatic Speech Recognition) NPU Inference\n",
"## 4. CV (Computer Vision) NPU Inference\n",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incorrect number: section 4 already exists.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

"\n",
"Using NPU-accelerated computer vision models for image processing tasks such as OCR (Optical Character Recognition). CV models can perform text detection and recognition on images with NPU acceleration.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CV NPU Inference Example\n",
"def cv_npu_example():\n",
"    \"\"\"Run OCR on a local image with an NPU-accelerated CV model.\n",
"\n",
"    The image path is validated before the model is loaded, so a missing\n",
"    file is reported without paying for model initialisation.\n",
"\n",
"    Returns:\n",
"        The object returned by ``cv.infer``, or ``None`` when the image\n",
"        file does not exist.\n",
"    \"\"\"\n",
"    print(\"=== CV NPU Inference Example ===\")\n",
"\n",
"    # Model configuration\n",
"    model_name = \"NexaAI/paddleocr-npu\"  # Example CV model for OCR\n",
"    plugin_id = \"npu\"\n",
"    device = \"npu\"\n",
"\n",
"    # Example image file (replace with your actual image file)\n",
"    image_file = \"path/to/your/image.jpg\"  # Replace with actual image file path\n",
"\n",
"    print(\"\\nNote: Please update the image_file path to point to your image file\")\n",
"    print(f\"Current image_file: {image_file}\")\n",
"\n",
"    # Fail fast: check the input before the (expensive) model load so that a\n",
"    # placeholder or mistyped path does not leave a loaded model behind.\n",
"    # isfile() also rejects directories, which exists() would accept.\n",
"    if not os.path.isfile(image_file):\n",
"        print(f\"Error: Image file not found: {image_file}\")\n",
"        print(\"Please provide a valid image file path to test CV functionality.\")\n",
"        return None\n",
"\n",
"    print(f\"Loading model: {model_name}\")\n",
"    print(f\"Using plugin: {plugin_id}\")\n",
"    print(f\"Device: {device}\")\n",
"\n",
"    # Create CV instance\n",
"    cv = CV.from_(name_or_path=model_name, plugin_id=plugin_id, device_id=device)\n",
"    print('CV model loaded successfully!')\n",
"\n",
"    # Basic CV configuration for OCR\n",
"    config = CVConfig(\n",
"        capabilities=CVCapabilities.OCR,\n",
"        det_model_path=None,  # Will use default detection model\n",
"        rec_model_path=None  # Will use default recognition model\n",
"    )\n",
"\n",
"    print(\"\\n=== Starting CV Inference ===\")\n",
"    # perf_counter() is monotonic, so the measured duration cannot be skewed\n",
"    # by system clock adjustments the way time.time() can.\n",
"    start_time = time.perf_counter()\n",
"\n",
"    # Perform CV inference (OCR)\n",
"    result = cv.infer(image_path=image_file, config=config)\n",
"\n",
"    inference_time = time.perf_counter() - start_time\n",
"\n",
"    # Display results\n",
"    print(\"\\n=== CV Inference Results ===\")\n",
"    print(f\"Number of detected text regions: {result.result_count}\")\n",
"    print(f\"Processing time: {inference_time:.2f} seconds\")\n",
"\n",
"    # Display OCR results\n",
"    if result.results:\n",
"        print(\"\\nDetected text:\")\n",
"        for i, cv_result in enumerate(result.results):\n",
"            # Entries without text are skipped entirely.\n",
"            if cv_result.text:\n",
"                confidence = cv_result.confidence\n",
"                text = cv_result.text.strip()\n",
"                print(f\" {i+1}. [{confidence:.2f}] {text}\")\n",
"\n",
"                # Display bounding box if available\n",
"                if cv_result.bbox:\n",
"                    bbox = cv_result.bbox\n",
"                    print(f\" Bounding box: x={bbox.x:.1f}, y={bbox.y:.1f}, \"\n",
"                          f\"w={bbox.width:.1f}, h={bbox.height:.1f}\")\n",
"    else:\n",
"        print(\"No text detected in the image.\")\n",
"\n",
"    # Get profiling data\n",
"    profiling_data = cv.get_profiling_data()\n",
"    if profiling_data:\n",
"        print(f\"\\nProfiling data: {profiling_data}\")\n",
"\n",
"    return result\n",
"\n",
"# Run CV example\n",
"cv_result = cv_npu_example()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. ASR (Automatic Speech Recognition) NPU Inference\n",
"\n",
"Using NPU-accelerated speech recognition models for speech-to-text transcription. parakeet-npu provides high-quality speech recognition with NPU acceleration.\n"
]
Expand Down