
Commit a4817e3

Merge pull request #320 from open-edge-platform/update-branch
feat: add multiserve microservice (#861)
2 parents 7664670 + c0510f6 commit a4817e3

36 files changed: +7662 −0 lines
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

# Vscode
.vscode

# Sources
logs
models
engine/*
config.yaml
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# Multi-Backend Inference Server

A unified, local inference server for managing, running, and monitoring multiple LLM backends (Llama.cpp, OpenAI-compatible APIs, and more) with a modern web dashboard and tray integration.

## Features

- **Model Management**: Download, verify, start, stop, and delete LLM models from Hugging Face or manually.
- **Multi-Task Support**: Supports text generation, embeddings, reranking, and multimodal models.
- **Device Selection**: Run models on CPU, GPU, or NPU (if supported).
- **Web Dashboard**: Modern UI for status, logs, and model management.
- **Tray App**: System tray integration for quick access and server control.
- **OpenAI Proxy**: Exposes OpenAI-compatible endpoints for easy integration.
- **Cross-Platform**: Windows and Linux support.

## Directory Structure

```
.
├── app.py               # Main FastAPI application entrypoint
├── modules/             # Core Python modules
│   ├── llamacpp/        # Llama.cpp management and GGUF downloader
│   ├── gpu_metrics.py   # XPU/GPU metrics collection
│   ├── tray_app.py      # System tray integration
│   └── utils.py         # Utility functions
├── routers/             # FastAPI routers (API endpoints)
├── engine/              # Native binaries, licenses, and XPU headers
├── static/              # Web dashboard static files
├── tests/               # Example tests
├── config.yaml          # Model/task configuration
├── verified.yaml        # List of verified models
├── pyproject.toml       # Python dependencies
└── README.md            # This file
```

## Quick Start

1. **Install dependencies**
   Python 3.12+ is required. This project uses `uv` for fast dependency management.

   ```sh
   # Install uv (if you don't have it)
   pip install uv

   # Create a virtual environment and install dependencies
   uv sync
   ```

2. **Run the server**

   ```sh
   uv run app.py
   ```

3. **Select the backend (LlamaCPP or OVMS)**

   ```sh
   uv run app.py --backend ovms      # OVMS backend
   uv run app.py --backend llamacpp  # LlamaCPP backend (default)
   ```

4. **Access the dashboard**
   Open [http://127.0.0.1:8000](http://127.0.0.1:8000) in your browser (or query the status endpoint from the command line, as sketched after these steps).

5. **Tray App**
   The tray icon should appear automatically when running on supported platforms.

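As a quick check that the server came up (not part of the original steps), querying the status endpoint listed under API Endpoints should work; the address assumes the default 127.0.0.1:8000 used above, and the response shape is not documented in this diff:

```sh
# Quick sanity check against the default address
curl http://127.0.0.1:8000/v1/status
```
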
## Model Management

- **Download**: Use the dashboard or API to download models by Hugging Face repo ID (a hedged request example follows this list).
- **Start/Stop**: Start or stop models for different tasks (text generation, embeddings, rerank, multimodal).
- **Device Selection**: Choose CPU/GPU/NPU for inference (if available).
- **Logs**: View download and runtime logs in the dashboard.

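For the API route, a request along these lines may work; the JSON field name for the repo ID and the HTTP method are not shown anywhere in this diff, so treat them as assumptions:

```sh
# Hypothetical request; the "repo_id" field name is an assumption, not confirmed by this diff
curl -X POST http://127.0.0.1:8000/v1/download \
  -H "Content-Type: application/json" \
  -d '{"repo_id": "<hf-org>/<model>-GGUF"}'
```
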
## API Endpoints

- **/v1/model**: List available/downloaded models
- **/v1/start**: Start or swap a model
- **/v1/stop**: Stop a running model
- **/v1/download**: Download a model
- **/v1/delete**: Delete a model
- **/v1/status**: Get server and model status
- **/v1/chat/completions**: OpenAI-compatible chat completions (example below)
- **/v1/embeddings**: OpenAI-compatible embeddings
- **/v1/rerank**: OpenAI-compatible reranking

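Because the proxy is OpenAI-compatible, a standard chat-completions request should be accepted; the model name below is a placeholder for whatever model has been started on this server:

```sh
# Standard OpenAI-style request against the local proxy; "<model-name>" is a placeholder
curl http://127.0.0.1:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "<model-name>",
        "messages": [{"role": "user", "content": "Hello!"}]
      }'
```
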
## Configuration

- **config.yaml**: Controls active models and default parameters.
- **verified.yaml**: List of models considered "verified" for auto-discovery.

## Building a Standalone App

This project supports PyInstaller for packaging as a standalone executable. See [app.spec](app.spec) for build configuration.

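A typical invocation is sketched below; it assumes PyInstaller is available in the uv-managed environment, which this diff does not confirm:

```sh
# Assumes PyInstaller is installed in the project environment
uv run pyinstaller app.spec
```
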
## Running the App

The app launches in tray mode:

![tray](./images/tray.png)

Clicking **Open Management UI** launches the dashboard in the browser:

![dashboard](./images/dashboard.png)

Clicking **Open API Docs** launches the Swagger API docs in the browser:

![api](./images/api.png)

## License

- Third-party binaries and libraries: See `engine/llama.cpp/` for individual licenses.

---
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import sys
# if getattr(sys, 'frozen', False):
#     sys.stdout = open(os.devnull, "w")

import threading
import argparse

from fastapi import FastAPI
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager

from modules.utils import get_resource_path
from modules.tray_app import InferenceServerTrayApp

from modules.llamacpp.cli import LlamaManagerCLI
from routers.llamacpp_api_router import create_llamacpp_api_router
from routers.llamacpp_openai_proxy_router import create_llamacpp_openai_proxy_router

from modules.ovms.cli import OVMSManagerCLI
from routers.ovms_api_router import create_ovms_api_router
from routers.ovms_openai_proxy_router import create_ovms_openai_proxy_router

argparser = argparse.ArgumentParser()
argparser.add_argument("--backend", default="llamacpp", help="Inference Backend (eg: ovms / llamacpp)")
args = argparser.parse_args()

# Select the backend-specific dashboard page, manager, and routers.
if args.backend == "llamacpp":
    index_file = "index.html"
    manager = LlamaManagerCLI(verified_model_path=get_resource_path("verified.yaml"), models_directory="models/GGUF")
    create_api_router = create_llamacpp_api_router
    create_openai_proxy_router = create_llamacpp_openai_proxy_router
else:
    index_file = "index_ov.html"
    manager = OVMSManagerCLI(verified_model_path=get_resource_path("verified.yaml"), models_directory="models/OV")
    create_api_router = create_ovms_api_router
    create_openai_proxy_router = create_ovms_openai_proxy_router

def start_server():
    manager.start_server()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Run the inference backend in a background thread for the lifetime of the FastAPI app.
    server_thread = threading.Thread(target=start_server)
    server_thread.start()

    yield

    print("FastAPI shutdown: Stopping Inference server thread...")

app = FastAPI(
    title="Inference Server Manager API",
    description="API to control and configure Inference Server (Start, Stop, Download, Config Management).",
    version="0.0.1",
    lifespan=lifespan
)

app.mount("/static", StaticFiles(directory=get_resource_path("static")), name="static")
app.mount("/webfonts", StaticFiles(directory=get_resource_path("static/webfonts")), name="webfonts")

@app.get("/", include_in_schema=False)
async def root():
    html_path = get_resource_path(f"./static/{index_file}")
    return FileResponse(html_path)

api_router = create_api_router(manager)
app.include_router(api_router)
openai_router = create_openai_proxy_router(manager)
app.include_router(openai_router)

if __name__ == "__main__":
    tray_app = InferenceServerTrayApp(app, manager)
    tray_app.start(False)
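The backend wiring above is a simple two-way branch (any value other than "llamacpp" falls through to OVMS). If further backends were added, the same wiring could be collected into a small registry; the sketch below is only an illustration of that refactor, reusing the names already imported in app.py, and is not part of this commit:

```python
# Hypothetical refactor sketch (not in this commit): backend wiring collected in one table.
BACKENDS = {
    "llamacpp": {
        "index_file": "index.html",
        "manager_cls": LlamaManagerCLI,
        "models_directory": "models/GGUF",
        "api_router": create_llamacpp_api_router,
        "proxy_router": create_llamacpp_openai_proxy_router,
    },
    "ovms": {
        "index_file": "index_ov.html",
        "manager_cls": OVMSManagerCLI,
        "models_directory": "models/OV",
        "api_router": create_ovms_api_router,
        "proxy_router": create_ovms_openai_proxy_router,
    },
}

# Fall back to OVMS for unrecognized values, mirroring the else branch in app.py.
cfg = BACKENDS.get(args.backend, BACKENDS["ovms"])
index_file = cfg["index_file"]
manager = cfg["manager_cls"](
    verified_model_path=get_resource_path("verified.yaml"),
    models_directory=cfg["models_directory"],
)
create_api_router = cfg["api_router"]
create_openai_proxy_router = cfg["proxy_router"]
```
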
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# -*- mode: python ; coding: utf-8 -*-
from PyInstaller.utils.hooks import collect_submodules

hiddenimports = collect_submodules('uvicorn')

a = Analysis(
    ['app.py'],
    pathex=[],
    binaries=[('icon.ico', '.'), ('config.yaml', '.'), ('verified.yaml', '.')],
    datas=[('engine', './engine'), ('static', './static')],
    hiddenimports=hiddenimports,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name='InferenceServerManager',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
    icon=['icon.ico'],
)
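The `binaries` and `datas` entries above bundle `icon.ico`, the YAML files, `engine/`, and `static/` into the executable so that `get_resource_path` (defined in `modules/utils.py`, which is not shown in this diff) can resolve them at runtime. Its real implementation may differ, but the usual PyInstaller-aware pattern looks roughly like this:

```python
# Assumed sketch of a PyInstaller-aware resource resolver;
# the actual modules/utils.get_resource_path may differ.
import os
import sys

def get_resource_path(relative_path: str) -> str:
    # When frozen by PyInstaller, bundled data files are unpacked under sys._MEIPASS;
    # otherwise, resolve relative to the current working directory.
    base_path = getattr(sys, "_MEIPASS", os.path.abspath("."))
    return os.path.join(base_path, relative_path)
```
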
Binary files added (4.19 KB; 91.9 KB; 188 KB; 57.6 KB) — contents not shown.
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import subprocess  # nosec - disable B404:import-subprocess check
import json
from pprint import pprint
from typing import List, Dict, Any, Optional

class XpuManager:
    def __init__(self):
        # Maps xpu-smi metric identifiers to friendlier keys (units encoded in the key names).
        self._key_metrics_map = {
            "XPUM_STATS_POWER": "power_W",
            "XPUM_STATS_GPU_UTILIZATION": "gpu_utilization_pct",
            "XPUM_STATS_MEMORY_USED": "memory_used_MB",
            "XPUM_STATS_MEMORY_UTILIZATION": "memory_utilization_pct",
            "XPUM_STATS_GPU_FREQUENCY": "gpu_frequency_MHz",
            "XPUM_STATS_CORE_TEMPERATURE": "core_temperature_C"
        }

    def _run_xpusmi_command(self, command_args: List[str]) -> Optional[str]:
        # Runs the bundled xpu-smi binary and returns its stdout, or None on any failure.
        full_command = ['./engine/xpu-smi/xpu-smi.exe'] + command_args

        try:
            result = subprocess.run(
                full_command,
                capture_output=True,
                text=True,
                check=True,
                timeout=10
            )
            return result.stdout
        except FileNotFoundError:
            print("Error: 'xpu-smi' command not found. Ensure it's installed and in your PATH.")
            return None
        except subprocess.CalledProcessError:
            return None
        except subprocess.TimeoutExpired:
            print("Error: Command timed out.")
            return None

    def _parse_xpu_stats(self, stats_data: Dict[str, Any]) -> Dict[str, Any]:
        if not stats_data or "device_level" not in stats_data:
            return {"error": "Invalid stats data structure."}

        extracted_metrics = {
            "device_id": stats_data.get("device_id"),
        }

        for key in self._key_metrics_map.values():
            extracted_metrics[key] = None

        for metric in stats_data.get("device_level", []):
            metric_type = metric.get("metrics_type")
            metric_value = metric.get("value")

            if metric_type in self._key_metrics_map:
                key = self._key_metrics_map[metric_type]
                if isinstance(metric_value, (float, int)):
                    extracted_metrics[key] = round(metric_value, 2) if isinstance(metric_value, float) else metric_value
                else:
                    extracted_metrics[key] = metric_value

        return extracted_metrics

    def discover_devices(self) -> List[Dict[str, Any]]:
        discovery_stdout = self._run_xpusmi_command(['discovery', '-j'])

        if not discovery_stdout:
            return []

        try:
            discovery_data = json.loads(discovery_stdout)
            return discovery_data.get("device_list", [])
        except json.JSONDecodeError:
            print("Failed to parse discovery JSON.")
            return []

    def get_device_stats(self, device_id: int) -> Optional[Dict[str, Any]]:
        stats_stdout = self._run_xpusmi_command(['stats', '-d', str(device_id), '-j'])

        if stats_stdout:
            try:
                raw_stats = json.loads(stats_stdout)
                return self._parse_xpu_stats(raw_stats)
            except json.JSONDecodeError:
                return None
        return None

    def get_all_device_data(self) -> Dict[int, Dict[str, Any]]:
        # Combines discovery info and parsed stats for every device that reports cleanly.
        devices = self.discover_devices()
        if not devices:
            return {}

        all_data: Dict[int, Dict[str, Any]] = {}

        for device in devices:
            dev_id = device.get('device_id')
            if dev_id is not None:
                stats = self.get_device_stats(dev_id)
                if stats and "error" not in stats:
                    full_info = {**device, **stats}
                    all_data[dev_id] = full_info

        return all_data

if __name__ == "__main__":
    manager = XpuManager()

    print("Initializing XPU Manager and querying all devices...")

    xpu_data = manager.get_all_device_data()

    print("\n" + "=" * 60)
    if xpu_data:
        print(f"Successfully retrieved data for {len(xpu_data)} XPU device(s).")
        print("=" * 60)

        for dev_id, data in xpu_data.items():
            print(f"Device ID {dev_id}: {data.get('device_name', 'N/A')}")
            print(f"  - Power Draw: {data.get('power_W', 'N/A')} W")
            print(f"  - GPU Util:   {data.get('gpu_utilization_pct', 'N/A')}%")
            print(f"  - Mem Util:   {data.get('memory_utilization_pct', 'N/A')}%")
            print(f"  - Mem Used:   {data.get('memory_used_MB', 'N/A')} MB")
            print(f"  - Core Temp:  {data.get('core_temperature_C', 'N/A')}°C")
            print("-" * 60)

        print("\n--- Full Data Dictionary (Combined Discovery and Stats) ---")
        pprint(xpu_data)

    else:
        print("No XPU data could be retrieved. Check your XPU-SMI installation and device status.")
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0