diff --git a/.gitignore b/.gitignore
index 9a79cad0..7901ca20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,3 +70,7 @@ inference-api/__pycache__/
 CLAUDE.md
 docs/RAG_PRODUCTIONIZATION_PLAN.md
 docs/DOCKER_CONTROL_SERVICE_PLAN.md
+!app/backend/shared_config/models_from_inference_server.json
+
+request-venv/*
+app/.env-old
diff --git a/app/.env.default b/app/.env.default
index 58a4a7a1..5f01b3f2 100644
--- a/app/.env.default
+++ b/app/.env.default
@@ -13,6 +13,9 @@ TT_INFERENCE_ARTIFACT_VERSION=v0.8.0
 # Security Credentials (REQUIRED - keep secret in production!)
 JWT_SECRET=test-secret-456
 DJANGO_SECRET_KEY=django-insecure-default
+
+# TTS Inference Server API Key (media inference engine)
+TTS_API_KEY=your-tts-api-key
 HF_TOKEN=hf_***
 
 # Docker Control Service (secure Docker operations API)
diff --git a/app/backend/Dockerfile b/app/backend/Dockerfile
index 2ea935eb..9ec42f65 100644
--- a/app/backend/Dockerfile
+++ b/app/backend/Dockerfile
@@ -34,7 +34,7 @@ RUN if [ "$VITE_ENABLE_DEPLOYED" != "true" ]; then \
     . "$HOME/.cargo/env" && \
     # Clone and install tt-smi
     mkdir -p /opt/tenstorrent-tools && \
-    git clone https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \
+    git clone --branch v4.0.0 --depth 1 https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \
     cd /opt/tenstorrent-tools/tt-smi && \
     pip3 install --upgrade pip && \
     pip3 install . && \
diff --git a/app/backend/api/settings.py b/app/backend/api/settings.py
index cf7d799e..06671024 100644
--- a/app/backend/api/settings.py
+++ b/app/backend/api/settings.py
@@ -64,11 +64,6 @@
 # Application definition
 
 INSTALLED_APPS = [
-    "django.contrib.admin",
-    "django.contrib.auth",
-    "django.contrib.contenttypes",
-    "django.contrib.sessions",
-    "django.contrib.messages",
     "django.contrib.staticfiles",
     "docker_control.apps.DockerControlConfig",
     "model_control",
@@ -81,11 +76,8 @@
 MIDDLEWARE = [
     "corsheaders.middleware.CorsMiddleware",
     "django.middleware.security.SecurityMiddleware",
-    "django.contrib.sessions.middleware.SessionMiddleware",
     "django.middleware.common.CommonMiddleware",
     "django.middleware.csrf.CsrfViewMiddleware",
-    "django.contrib.auth.middleware.AuthenticationMiddleware",
-    "django.contrib.messages.middleware.MessageMiddleware",
     "django.middleware.clickjacking.XFrameOptionsMiddleware",
 ]
 
@@ -100,25 +92,12 @@
             "context_processors": [
                 "django.template.context_processors.debug",
                 "django.template.context_processors.request",
-                "django.contrib.auth.context_processors.auth",
-                "django.contrib.messages.context_processors.messages",
             ],
         },
     },
 ]
 
 WSGI_APPLICATION = "api.wsgi.application"
-SESSIONS_ENGINE = "django.contrib.sessions.backends.cache"
-# Database
-# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
-
-# SQLite database for deployment history and other persistent data
-DATABASES = {
-    "default": {
-        "ENGINE": "django.db.backends.sqlite3",
-        "NAME": backend_config.backend_cache_root / "db.sqlite3",
-    }
-}
 
 # local memory thread-safe default
 # the LOCATION for locmem.LocMemCache cache backend is just a name for tracking
@@ -135,24 +114,6 @@
     },
 }
 
-# Password validation
-# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
-
-AUTH_PASSWORD_VALIDATORS = [
-    {
-        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
-    },
-    {
-        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
-    },
-    {
-        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
-    },
-    {
-        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
-    },
-]
-
 # Internationalization
 # https://docs.djangoproject.com/en/4.2/topics/i18n/
 
diff --git a/app/backend/api/urls.py b/app/backend/api/urls.py
index 441f06b7..34717c7b 100644
--- a/app/backend/api/urls.py
+++ b/app/backend/api/urls.py
@@ -19,12 +19,11 @@
     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
 """
 
-from django.contrib import admin
 from api.views import UpStatusView
 from django.urls import include, path
+from model_control.views import OpenAIAudioSpeechView
 
 urlpatterns = [
-    path("admin/", admin.site.urls),
     path("up/", UpStatusView.as_view()),
     path("docker/", include("docker_control.urls")),
     path("models/", include("model_control.urls")),
@@ -32,4 +31,6 @@
     path("collections/", include("vector_db_control.urls")),
     path("logs/", include("logs_control.urls")),
     path("board/", include("board_control.urls")),
+    # OpenAI-compatible audio endpoint
+    path("v1/audio/speech", OpenAIAudioSpeechView.as_view()),
 ]
diff --git a/app/backend/board_control/services.py b/app/backend/board_control/services.py
index 2c08a231..eaa39b31 100644
--- a/app/backend/board_control/services.py
+++ b/app/backend/board_control/services.py
@@ -16,15 +16,19 @@
 
 class SystemResourceService:
     """Service for monitoring system resources and TT device telemetry"""
-    
+
     # Cache keys and timeout
     TT_SMI_CACHE_KEY = "tt_smi_data"
     TT_SMI_CACHE_TIMEOUT = 3600  # Cache for 1 hour (since we'll refresh on events only)
     BOARD_TYPE_CACHE_KEY = "board_type_data"
     BOARD_TYPE_CACHE_TIMEOUT = 3600  # Cache board type for 1 hour (since it rarely changes)
+
+    # Device state cache keys
+    DEVICE_STATE_CACHE_KEY = "device_state_v2"
+    DEVICE_RESETTING_KEY = "device_resetting"
     
     @staticmethod
-    def get_tt_smi_data(timeout=10):
+    def get_tt_smi_data(timeout=30):
         """Get raw tt-smi data with caching to reduce expensive calls"""
         # Check cache first
         cached_data = cache.get(SystemResourceService.TT_SMI_CACHE_KEY)
@@ -412,9 +416,245 @@ def force_refresh_tt_smi_cache():
         # Clear the existing cache
         cache.delete(SystemResourceService.TT_SMI_CACHE_KEY)
         cache.delete(SystemResourceService.BOARD_TYPE_CACHE_KEY)
-        
+
         # Fetch fresh data
         SystemResourceService.get_tt_smi_data()
         SystemResourceService.get_board_type()
-        
-        logger.info("tt-smi cache refreshed successfully") 
\ No newline at end of file
+
+        logger.info("tt-smi cache refreshed successfully")
+
+    # -------------------------------------------------------------------------
+    # Device State Machine — single source of truth
+    # -------------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_board_type_from_data(data):
+        """Return the canonical board-type string for tt-smi JSON data.
+
+        Falls back to "unknown" for empty, mixed or unrecognised board types.
+        """
+        if not data or "device_info" not in data or not data["device_info"]:
+            return "unknown"
+
+        # A null/missing "board_info" or "board_type" counts as "unknown"
+        board_types = [
+            str((info.get("board_info") or {}).get("board_type") or "unknown")
+            for info in data["device_info"]
+        ]
+
+        # Strip the trailing "local"/"remote" marker (text after the last space)
+        unique = {bt.rsplit(" ", 1)[0] for bt in board_types}
+
+        if len(unique) > 1:
+            logger.warning(f"Mixed board types detected: {unique}")
+            return "unknown"
+
+        raw = unique.pop()
+        num_devices = len(data["device_info"])
+        raw_lower = raw.lower()
+
+        if "n150" in raw_lower:
+            return "N150X4" if num_devices >= 4 else "N150"
+        if "n300" in raw_lower:
+            return "T3K" if num_devices >= 4 else "N300"
+        if "p300" in raw_lower:
+            if num_devices >= 8:
+                return "P300Cx4"
+            if num_devices >= 4:
+                return "P300Cx2"
+            return "P300c"
+        if "p150" in raw_lower:
+            if num_devices >= 8:
+                return "P150X8"
+            if num_devices >= 4:
+                return "P150X4"
+            return "P150"
+        if "p100" in raw_lower:
+            return "P100"
+        if "e150" in raw_lower:
+            return "E150"
+        if "galaxy" in raw_lower:
+            return "GALAXY_T3K" if "t3k" in raw_lower else "GALAXY"
+
+        logger.warning(f"Unknown board type string: {raw!r}")
+        return "unknown"
+
+    @staticmethod
+    def _extract_devices_from_data(data):
+        """Extract device summary list from tt-smi JSON data.
+
+        Null/missing sections are tolerated; unparsable readings become 0.0.
+        """
+        def _f(v):
+            try:
+                return float(v) if v is not None else 0.0
+            except (TypeError, ValueError):
+                return 0.0
+
+        device_info = data.get("device_info") if isinstance(data, dict) else None
+        devices = []
+        for idx, device in enumerate(device_info or []):
+            board_info = device.get("board_info") or {}
+            telemetry = device.get("telemetry") or {}
+            devices.append({
+                "index": idx,
+                "board_type": board_info.get("board_type", "Unknown"),
+                "bus_id": board_info.get("bus_id", "N/A"),
+                "temperature": _f(telemetry.get("asic_temperature")),
+                "power": _f(telemetry.get("power")),
+                "voltage": _f(telemetry.get("voltage")),
+            })
+        return devices
+
+    @staticmethod
+    def get_device_state():
+        """
+        Single authoritative device state resolver (returns a state dict).
+
+        States:
+          HEALTHY     — tt-smi -s succeeded, devices visible
+          BAD_STATE   — /dev/tenstorrent present but tt-smi timed out / errored
+          RESETTING   — tt-smi -r is actively running
+          NOT_PRESENT — /dev/tenstorrent path does not exist
+          UNKNOWN     — can't determine (startup / tt-smi missing)
+        """
+        # RESETTING takes priority — check before cache
+        if cache.get(SystemResourceService.DEVICE_RESETTING_KEY):
+            return {
+                "state": "RESETTING",
+                "board_type": "unknown",
+                "board_name": "Resetting…",
+                "devices": [],
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }
+
+        # Return cached result if still fresh
+        cached = cache.get(SystemResourceService.DEVICE_STATE_CACHE_KEY)
+        if cached is not None:
+            return cached
+
+        # Check physical device presence
+        if not os.path.exists("/dev/tenstorrent"):
+            result = {
+                "state": "NOT_PRESENT",
+                "board_type": "unknown",
+                "board_name": "Not Present",
+                "devices": [],
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }
+            cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=15)
+            return result
+
+        # Try tt-smi -s with 30-second timeout (Docker cold-start can be slower than host)
+        try:
+            logger.info("Running tt-smi -s for device state check")
+            process = subprocess.Popen(
+                ["tt-smi", "-s"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                stdin=subprocess.DEVNULL,
+                text=True,
+                start_new_session=True,  # setsid() in the child; thread-safe unlike preexec_fn
+            )
+
+            try:
+                stdout, stderr = process.communicate(timeout=30)
+            except subprocess.TimeoutExpired:
+                logger.error("tt-smi -s timed out after 30s — board in BAD_STATE")
+                # Kill the process group, then reap the child (no zombie/fd leak)
+                for sig in (signal.SIGTERM, signal.SIGKILL):
+                    try:
+                        os.killpg(os.getpgid(process.pid), sig)
+                        process.communicate(timeout=2)
+                        break
+                    except Exception as kill_err:
+                        logger.warning(f"tt-smi cleanup ({sig!r}) failed: {kill_err}")
+                result = {
+                    "state": "BAD_STATE",
+                    "board_type": "unknown",
+                    "board_name": "Bad State",
+                    "devices": [],
+                    "last_updated": timezone.now().isoformat(),
+                    "reset_suggested": True,
+                }
+                cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10)
+                return result
+
+            if process.returncode != 0:
+                logger.error(f"tt-smi -s exit code {process.returncode}: {stderr.strip()!r}")
+                result = {
+                    "state": "BAD_STATE",
+                    "board_type": "unknown",
+                    "board_name": "Bad State",
+                    "devices": [],
+                    "last_updated": timezone.now().isoformat(),
+                    "reset_suggested": True,
+                }
+                cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10)
+                return result
+
+            try:
+                data = json.loads(stdout)
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to parse tt-smi JSON: {e}")
+                result = {
+                    "state": "BAD_STATE",
+                    "board_type": "unknown",
+                    "board_name": "Bad State",
+                    "devices": [],
+                    "last_updated": timezone.now().isoformat(),
+                    "reset_suggested": True,
+                }
+                cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10)
+                return result
+
+            board_type = SystemResourceService._extract_board_type_from_data(data)
+            devices = SystemResourceService._extract_devices_from_data(data)
+            result = {
+                "state": "HEALTHY",
+                "board_type": board_type,
+                "board_name": board_type,
+                "devices": devices,
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }
+            cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=30)
+            return result
+
+        except FileNotFoundError:
+            logger.error("tt-smi command not found")
+            # Don't cache UNKNOWN so each call re-checks (tt-smi may be installed later)
+            return {
+                "state": "UNKNOWN",
+                "board_type": "unknown",
+                "board_name": "Unknown",
+                "devices": [],
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }
+        except Exception as e:
+            logger.error(f"Unexpected error in get_device_state: {e}")
+            return {
+                "state": "UNKNOWN",
+                "board_type": "unknown",
+                "board_name": "Unknown",
+                "devices": [],
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }
+
+    @staticmethod
+    def set_resetting_state():
+        """Flag the device as resetting (flag expires after 120 s) and drop the cached state."""
+        cache.set(SystemResourceService.DEVICE_RESETTING_KEY, True, timeout=120)  # get_device_state() checks this first
+        cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY)  # a stale pre-reset state must not be served
+        logger.info("Device state set to RESETTING")
+
+    @staticmethod
+    def clear_device_state_cache():
+        """Drop both the cached device state and the resetting flag (for use once a reset completes)."""
+        cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY)  # next get_device_state() probes again
+        cache.delete(SystemResourceService.DEVICE_RESETTING_KEY)  # stop reporting RESETTING
+        logger.info("Device state cache cleared")
\ No newline at end of file
diff --git a/app/backend/board_control/urls.py b/app/backend/board_control/urls.py
index 42e59361..3e2b323d 100644
--- a/app/backend/board_control/urls.py
+++ b/app/backend/board_control/urls.py
@@ -19,4 +19,8 @@
     
     # Cache management
     path("refresh-cache/", views.RefreshCacheView.as_view(), name="refresh-cache"),
-] 
\ No newline at end of file
+
+    # Unified device state & reset (new)
+    path("device-state/", views.DeviceStateView.as_view(), name="device-state"),
+    path("device-reset/", views.DeviceResetView.as_view(), name="device-reset"),
+]
\ No newline at end of file
diff --git a/app/backend/board_control/views.py b/app/backend/board_control/views.py
index f904557c..7dd83428 100644
--- a/app/backend/board_control/views.py
+++ b/app/backend/board_control/views.py
@@ -228,20 +228,78 @@ def patch(self, request, alert_id, *args, **kwargs):
 @method_decorator(csrf_exempt, name='dispatch')
 class RefreshCacheView(APIView):
     """Manual cache refresh endpoint for debugging and manual triggering"""
-    
+
     def post(self, request, *args, **kwargs):
         try:
             logger.info("Manual cache refresh requested")
             SystemResourceService.force_refresh_tt_smi_cache()
-            
+
             return Response({
                 "status": "success",
                 "message": "tt-smi cache refreshed successfully"
             }, status=status.HTTP_200_OK)
-            
+
         except Exception as e:
             logger.error(f"Error manually refreshing cache: {str(e)}")
             return Response(
                 {"error": "Failed to refresh cache", "details": str(e)},
                 status=status.HTTP_500_INTERNAL_SERVER_ERROR
-            ) 
\ No newline at end of file
+            )
+
+
+@method_decorator(csrf_exempt, name='dispatch')
+class DeviceStateView(APIView):
+    """
+    GET /board-api/device-state/
+
+    Single source of truth for board state.  Replaces the need to call
+    /board-api/status/, /board-api/footer-data/, and /docker-api/board-info/
+    separately.  All components should poll this endpoint.
+    """
+
+    def get(self, request, *args, **kwargs):
+        try:
+            state = SystemResourceService.get_device_state()  # dict, returned to the client unchanged
+            return Response(state, status=status.HTTP_200_OK)
+        except Exception as e:  # fall back to an UNKNOWN payload instead of an error response
+            logger.error(f"Error getting device state: {e}")
+            return Response({
+                "state": "UNKNOWN",
+                "board_type": "unknown",
+                "board_name": "Unknown",
+                "devices": [],
+                "last_updated": timezone.now().isoformat(),
+                "reset_suggested": False,
+            }, status=status.HTTP_200_OK)  # NOTE(review): 200 even on failure — presumably so pollers always get one shape; confirm
+
+
+@method_decorator(csrf_exempt, name='dispatch')
+class DeviceResetView(APIView):
+    """
+    POST /board-api/device-reset/
+
+    Dedicated board reset endpoint.  Separated from the Docker-coupled
+    /docker-api/reset_board/ for clarity; the old endpoint keeps working via
+    the same perform_reset() logic.
+    """
+
+    def post(self, request, *args, **kwargs):
+        from docker_control.docker_utils import perform_reset  # NOTE(review): function-scope import, presumably to avoid an import cycle — confirm
+        try:
+            logger.info("Device reset requested via /board-api/device-reset/")
+            result = perform_reset()
+            http_status_code = result.pop("http_status", 200)  # HTTP code travels inside the result; 200 when absent
+
+            success = result.get("status") == "success"  # any other status value is reported as a failure
+            return Response({
+                "success": success,
+                "message": result.get("message", ""),
+                "attempts_used": result.get("attempts_used", 0),
+            }, status=http_status_code)
+        except Exception as e:
+            logger.error(f"Error in device reset: {e}")
+            return Response({
+                "success": False,
+                "message": str(e),
+                "attempts_used": 0,
+            }, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
\ No newline at end of file
diff --git a/app/backend/docker_control/admin.py b/app/backend/docker_control/admin.py
index 2c79060a..917beb36 100644
--- a/app/backend/docker_control/admin.py
+++ b/app/backend/docker_control/admin.py
@@ -1,7 +1,3 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-
-from django.contrib import admin
-
-# Register your models here.
diff --git a/app/backend/docker_control/apps.py b/app/backend/docker_control/apps.py
index 0a263c9c..68dfa377 100644
--- a/app/backend/docker_control/apps.py
+++ b/app/backend/docker_control/apps.py
@@ -14,32 +14,15 @@ class DockerControlConfig(AppConfig):
     def ready(self):
         """Initialize docker control services"""
         logger.info("Docker control app is ready")
-        
-        # Verify database migrations are applied
+
+        # Log how many deployments are already tracked
         try:
-            from django.db import connection
-            
-            # Check if ModelDeployment table exists
-            with connection.cursor() as cursor:
-                cursor.execute("""
-                    SELECT name FROM sqlite_master 
-                    WHERE type='table' AND name='docker_control_modeldeployment'
-                """)
-                table_exists = cursor.fetchone() is not None
-            
-            if not table_exists:
-                logger.warning(
-                    "ModelDeployment table not found. Database migrations may not be applied. "
-                    "Run: python manage.py migrate docker_control"
-                )
-            else:
-                # Count existing deployment records
-                from docker_control.models import ModelDeployment
-                count = ModelDeployment.objects.count()
-                logger.info(f"Deployment history table verified. Existing records: {count}")
+            from docker_control.models import ModelDeployment
+            count = ModelDeployment.objects.count()
+            logger.info(f"Deployment store loaded. Existing records: {count}")
         except Exception as e:
-            logger.warning(f"Could not verify deployment history table: {e}")
-        
+            logger.warning(f"Could not read deployment store: {e}")
+
         # Start container health monitoring service
         try:
             from docker_control.health_monitor import start_health_monitoring
diff --git a/app/backend/docker_control/chip_allocator.py b/app/backend/docker_control/chip_allocator.py
new file mode 100644
index 00000000..ce3c3dfe
--- /dev/null
+++ b/app/backend/docker_control/chip_allocator.py
@@ -0,0 +1,349 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+"""
+Chip slot allocator for automatic device_id assignment.
+
+Manages automatic chip slot allocation based on:
+- Current deployments (from deployment_store)
+- Model chip requirements (single vs multi-chip)
+- Board topology
+"""
+
+import threading
+from typing import Dict, List, Optional, Set
+
+from shared_config.logger_config import get_logger
+from shared_config.model_config import get_model_chip_requirement
+from docker_control.deployment_store import ModelDeployment
+
+logger = get_logger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Exception Classes
+# ---------------------------------------------------------------------------
+
+class AllocationError(Exception):
+    """Base exception for chip slot allocation errors (MultiChipConflictError derives from it)"""
+    pass
+
+
+class MultiChipConflictError(AllocationError):
+    """Raised when a multi-chip model deployment conflicts with existing deployments.
+
+    Attributes:
+        message: Error message
+        conflicts: List of conflicting deployment info dicts (empty if none given)
+    """
+    def __init__(self, message: str, conflicts: Optional[List[Dict]] = None):
+        super().__init__(message)
+        self.message = message  # the docstring promises this attribute
+        self.conflicts = conflicts or []
+
+
+# ---------------------------------------------------------------------------
+# Chip Slot Allocator
+# ---------------------------------------------------------------------------
+
+# Board type to slot count mapping (matching frontend MULTI_CHIP_BOARD_SLOTS)
+MULTI_CHIP_BOARD_SLOTS = {
+    "T3K": 4,
+    "T3000": 4,
+    "N150X4": 4,
+    "N300x4": 4,  # NOTE(review): lowercase "x", unlike "N150X4"/"P150X4" — confirm the board detector emits this spelling
+    "P150X4": 4,
+    "P150X8": 8,
+    "P300Cx2": 4,
+    "P300Cx4": 8,
+    "GALAXY": 32,
+    "GALAXY_T3K": 32,
+}  # boards not listed here are treated as having a single slot by _get_total_slots()
+
+
+class ChipSlotAllocator:
+    """
+    Manages automatic chip slot allocation.
+
+    Thread-safe allocator that determines the best chip slot for a model
+    based on current deployments and chip requirements.
+    """
+
+    def __init__(self):
+        """Create the allocation lock, detect the board type once, and derive its slot count"""
+        self._lock = threading.Lock()  # held by allocate_chip_slot() while it picks a slot
+        self.board_type = self._detect_board_type()
+        self.total_slots = self._get_total_slots()  # reads self.board_type, so it must be set first
+        logger.info(f"ChipSlotAllocator initialized: board={self.board_type}, slots={self.total_slots}")
+
+    def _detect_board_type(self) -> str:
+        """Return whatever docker_utils.detect_board_type() reports"""
+        from docker_control.docker_utils import detect_board_type  # NOTE(review): function-scope import, presumably to avoid an import cycle — confirm
+        return detect_board_type()
+
+    def _get_total_slots(self) -> int:
+        """Return how many chip slots the detected board exposes.
+
+        Boards listed in MULTI_CHIP_BOARD_SLOTS report their mapped count;
+        every other board (N150, N300, E150, P100, P150, P300c) has 1 slot.
+        """
+        slot_count = MULTI_CHIP_BOARD_SLOTS.get(self.board_type, 1)
+        return slot_count
+
+    def get_chip_status(self) -> Dict:
+        """
+        Returns current chip slot occupancy status, built from the active deployments.
+
+        Returns:
+            Dictionary with board_type, total_slots, and per-slot status:
+            {
+              "board_type": "T3K",
+              "total_slots": 4,
+              "slots": [
+                {"slot_id": 0, "status": "occupied", "model_name": "...", "deployment_id": 123, "is_multi_chip": False, "port": ...},
+                {"slot_id": 1, "status": "available"},
+                ...
+              ]
+            }
+        """
+        active_deployments = self._get_active_deployments()
+        slots_info = []
+        occupied_map = {}
+
+        # Build occupied slots map (slot_id -> details of the deployment using it)
+        for deployment in active_deployments:
+            model_chips = self._get_chips_required(deployment.model_name)
+
+            if model_chips == 4:
+                # Multi-chip: mark slots 0 .. min(4, total_slots) - 1 as occupied
+                for slot_id in range(min(4, self.total_slots)):  # Multi-chip models use up to 4 slots
+                    occupied_map[slot_id] = {
+                        "model_name": deployment.model_name,
+                        "deployment_id": deployment.id,
+                        "is_multi_chip": True,
+                        "port": deployment.port,
+                    }
+            else:
+                # Single-chip: mark specific slot
+                if deployment.device_id < self.total_slots:  # ids at or beyond total_slots are left out
+                    occupied_map[deployment.device_id] = {
+                        "model_name": deployment.model_name,
+                        "deployment_id": deployment.id,
+                        "is_multi_chip": False,
+                        "port": deployment.port,
+                    }
+
+        # Build slot status list (one entry per slot, in slot order)
+        for slot_id in range(self.total_slots):
+            if slot_id in occupied_map:
+                slots_info.append({
+                    "slot_id": slot_id,
+                    "status": "occupied",
+                    **occupied_map[slot_id]
+                })
+            else:
+                slots_info.append({
+                    "slot_id": slot_id,
+                    "status": "available"
+                })
+
+        return {
+            "board_type": self.board_type,
+            "total_slots": self.total_slots,
+            "slots": slots_info
+        }
+
+    def allocate_chip_slot(self, model_name: str, manual_override: Optional[int] = None) -> int:
+        """
+        Auto-allocate chip slot or use manual override.
+
+        Args:
+            model_name: Name of the model being deployed
+            manual_override: Optional manual device_id for advanced mode
+
+        Returns:
+            Allocated device_id (0-based slot number)
+
+        Raises:
+            AllocationError: If allocation fails (all slots occupied, or the manual override is invalid)
+            MultiChipConflictError: If multi-chip model conflicts with existing deployments
+        """
+        with self._lock:  # NOTE(review): nothing is recorded here, so the lock only covers choosing the slot — presumably the caller persists the deployment; confirm
+            chips_required = self._get_chips_required(model_name)
+
+            # Advanced mode: manual override (validated, then returned as given)
+            if manual_override is not None:
+                validation = self._validate_manual_allocation(manual_override, chips_required, model_name)
+                if not validation["valid"]:
+                    raise AllocationError(validation["message"])
+                logger.info(f"Manual allocation: device_id={manual_override} for {model_name}")
+                return manual_override  # also for multi-chip models, where auto-allocation would return 0
+
+            # Auto-allocation
+            if chips_required == 4:  # exactly 4 is the only requirement treated as multi-chip
+                device_id = self._allocate_multi_chip(model_name)
+            else:
+                device_id = self._allocate_single_chip(model_name)
+
+            logger.info(f"Auto-allocated: device_id={device_id} for {model_name} ({chips_required} chips)")
+            return device_id
+
+    def _allocate_single_chip(self, model_name: str) -> int:
+        """Pick the lowest-numbered free slot for a single-chip model.
+
+        Args:
+            model_name: Name of the model
+
+        Returns:
+            Device ID of first available slot
+
+        Raises:
+            AllocationError: If all slots are occupied
+        """
+        taken = self._get_occupied_slots()
+        first_free = next(
+            (slot for slot in range(self.total_slots) if slot not in taken),
+            None,
+        )
+        if first_free is None:
+            raise AllocationError(
+                f"All {self.total_slots} chip slots are occupied. "
+                f"Stop at least one model to free up a slot."
+            )
+        return first_free
+
+    def _allocate_multi_chip(self, model_name: str) -> int:
+        """
+        Allocate for a multi-chip model, which needs every slot to be free.
+
+        When something is already deployed, the raised error carries one entry
+        per active deployment (model, deployment_id, slot, chips).
+
+        Args:
+            model_name: Name of the model
+
+        Returns:
+            Device ID 0 (multi-chip models always use device_id=0)
+
+        Raises:
+            MultiChipConflictError: If any slots are occupied
+        """
+        occupied_slots = self._get_occupied_slots()
+        if not occupied_slots:
+            return 0  # Multi-chip models always use device_id=0
+
+        # Describe every active deployment so the caller can report conflicts
+        conflicts = []
+        for dep in self._get_active_deployments():
+            chips_needed = self._get_chips_required(dep.model_name)
+            conflicts.append(dict(
+                model=dep.model_name,
+                deployment_id=dep.id,
+                slot=dep.device_id,
+                chips=chips_needed,
+            ))
+
+        raise MultiChipConflictError(
+            f"{model_name} requires all 4 chip slots. "
+            f"Currently occupied: {len(occupied_slots)} slot(s). "
+            f"Stop all running models first.",
+            conflicts=conflicts
+        )
+
+    def _validate_manual_allocation(self, device_id: int, chips_required: int, model_name: str) -> Dict:
+        """
+        Validate manual chip slot selection in advanced mode.
+
+        Args:
+            device_id: Manually selected device ID
+            chips_required: Number of chips required by model
+            model_name: Name of the model
+
+        Returns:
+            Dictionary with "valid" boolean and, when invalid, a "message"
+        """
+        # Check bounds
+        if device_id < 0 or device_id >= self.total_slots:
+            return {
+                "valid": False,
+                "message": f"Invalid device_id {device_id}. Must be 0-{self.total_slots - 1}."
+            }
+
+        occupied_slots = self._get_occupied_slots()
+
+        if chips_required == 4:
+            # Multi-chip: ensure all slots are free
+            if occupied_slots:  # any occupied slot blocks the model, whichever device_id was picked
+                return {
+                    "valid": False,
+                    "message": f"{model_name} requires all 4 chip slots. Currently occupied: {len(occupied_slots)} slot(s)."
+                }
+        else:
+            # Single-chip: ensure selected slot is free
+            if device_id in occupied_slots:
+                # Find which model is using this slot (only used to build the message)
+                active_deployments = self._get_active_deployments()
+                occupying_model = None
+                for deployment in active_deployments:
+                    if deployment.device_id == device_id:  # deployment recorded on the requested slot
+                        occupying_model = deployment.model_name
+                        break
+                    # Check if a multi-chip model is occupying all slots
+                    model_chips = self._get_chips_required(deployment.model_name)
+                    if model_chips == 4:
+                        occupying_model = f"{deployment.model_name} (multi-chip)"
+                        break
+
+                return {
+                    "valid": False,
+                    "message": f"Chip slot {device_id} is occupied by {occupying_model or 'another model'}."
+                }
+
+        return {"valid": True}  # no "message" key on success
+
+    def _get_active_deployments(self) -> List[ModelDeployment]:
+        """Return the deployments whose status is 'starting' or 'running'.
+
+        Returns:
+            List of ModelDeployment objects
+        """
+        active_query = ModelDeployment.objects.filter(status__in=['starting', 'running'])
+        return list(active_query)
+
+    def _get_occupied_slots(self) -> Set[int]:
+        """
+        Collect the slot IDs claimed by active deployments.
+
+        Multi-chip deployments occupy slots 0-3.
+        Single-chip deployments occupy their specific device_id slot.
+
+        Returns:
+            Set of occupied slot IDs
+        """
+        deployments = self._get_active_deployments()
+        multi_chip_span = range(min(4, self.total_slots))
+        taken = set()
+
+        for dep in deployments:
+            if self._get_chips_required(dep.model_name) == 4:
+                # Multi-chip: occupies all 4 slots
+                taken.update(multi_chip_span)
+                continue
+            # Single-chip: occupies specific slot
+            if dep.device_id < self.total_slots:
+                taken.add(dep.device_id)
+
+        return taken
+
+    def _get_chips_required(self, model_name: str) -> int:
+        """
+        Get number of chips required for a model (delegates to get_model_chip_requirement).
+
+        Args:
+            model_name: Name of the model
+
+        Returns:
+            Number of chips required (1 or 4; callers in this class only special-case 4)
+        """
+        return get_model_chip_requirement(model_name)
diff --git a/app/backend/docker_control/deployment_store.py b/app/backend/docker_control/deployment_store.py
new file mode 100644
index 00000000..ba5421fd
--- /dev/null
+++ b/app/backend/docker_control/deployment_store.py
@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+"""
+Thread-safe JSON file store replacing Django ORM for ModelDeployment.
+
+Provides a drop-in ORM-like interface (objects.create, filter, all, get, save)
+backed by a single JSON file in the persistent storage volume.
+"""
+
+import json
+import os
+import threading
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, List, Optional
+
+from shared_config.logger_config import get_logger
+
+logger = get_logger(__name__)
+
+# Location of the JSON store: <volume>/backend_volume/deployments.json, where
+# <volume> is taken from INTERNAL_PERSISTENT_STORAGE_VOLUME and defaults to
+# "/tt_studio_persistent_volume". Resolved once, when this module is imported.
+_STORE_PATH = (
+    Path(os.getenv("INTERNAL_PERSISTENT_STORAGE_VOLUME", "/tt_studio_persistent_volume"))
+    / "backend_volume"
+    / "deployments.json"
+)
+
+# Held around each load (and load-modify-save) of the store file. This is a
+# threading.Lock, so it serializes threads inside one process only.
+_lock = threading.Lock()
+
+
+def _now() -> datetime:
+    """Return the current moment as a timezone-aware datetime in UTC."""
+    utc = timezone.utc
+    return datetime.now(tz=utc)
+
+
+def _parse_dt(s: Optional[str]) -> Optional[datetime]:
+    """Parse an ISO-8601 string read from the store into a datetime.
+
+    Args:
+        s: Value of a record's timestamp field, or None.
+
+    Returns:
+        The parsed datetime, or None when ``s`` is None, is not a string, or
+        is not accepted by ``datetime.fromisoformat``.
+    """
+    if s is None:
+        return None
+    try:
+        return datetime.fromisoformat(s)
+    except (TypeError, ValueError):
+        # ValueError: malformed string. TypeError: a non-string value ended up
+        # in the JSON. Either way the record simply has no usable timestamp.
+        return None
+
+
+def _sort_key(record: dict, field: str):
+    """Return a sortable key for a field, handling None and datetime strings.
+
+    The key is a ``(rank, value)`` tuple. Missing or None values get rank 0
+    and therefore sort before every present value (rank 1); present values
+    are then compared with each other. Ranking first means None is never
+    compared against an int or other non-string value, which would raise
+    TypeError during sorting.
+
+    Args:
+        record: Raw record dict from the store.
+        field: Name of the field to sort by.
+
+    Returns:
+        A tuple usable as a ``key=`` result for ``list.sort`` / ``sorted``.
+    """
+    val = record.get(field)
+    if val is None:
+        return (0, "")
+    return (1, val)  # ISO strings sort lexicographically = chronologically
+
+
+def _load_raw() -> dict:
+    """Read the store file and return it as a ``{"next_id", "records"}`` dict.
+
+    Callers are expected to hold ``_lock`` while calling this.
+
+    Returns:
+        The parsed store. A fresh empty store (``next_id`` 1, no records) is
+        returned when the file does not exist, cannot be read or parsed, or
+        does not have the expected top-level shape (a dict with a ``records``
+        list). A missing or non-integer ``next_id`` is rebuilt from the
+        largest integer record id, so callers can always index both keys.
+    """
+    try:
+        with open(_STORE_PATH, "r") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        # Nothing has been saved yet; not worth a warning.
+        return {"next_id": 1, "records": []}
+    except Exception as e:
+        logger.warning(f"Could not read deployment store, starting fresh: {e}")
+        return {"next_id": 1, "records": []}
+
+    # Valid JSON is not necessarily a valid store (e.g. a hand-edited file).
+    if not isinstance(data, dict) or not isinstance(data.get("records"), list):
+        logger.warning("Deployment store has unexpected structure, starting fresh")
+        return {"next_id": 1, "records": []}
+
+    next_id = data.get("next_id")
+    if not isinstance(next_id, int) or isinstance(next_id, bool):
+        # Rebuild the counter so newly created records cannot reuse an id.
+        known_ids = [
+            r["id"]
+            for r in data["records"]
+            if isinstance(r, dict)
+            and isinstance(r.get("id"), int)
+            and not isinstance(r.get("id"), bool)
+        ]
+        data["next_id"] = max(known_ids, default=0) + 1
+    return data
+
+
+def _save_raw(data: dict) -> None:
+    """Write ``data`` to the store file via a temporary sibling file.
+
+    The JSON is first written to ``<store>.tmp`` and then moved over the real
+    file with ``os.replace``. Any failure is logged and swallowed; the
+    temporary file is removed on a best-effort basis.
+    """
+    target = _STORE_PATH
+    target.parent.mkdir(parents=True, exist_ok=True)
+    scratch = target.with_suffix(".tmp")
+    try:
+        with open(scratch, "w") as handle:
+            json.dump(data, handle, indent=2, default=str)
+        os.replace(scratch, target)
+    except Exception as err:
+        logger.error(f"Failed to save deployment store: {err}")
+        # Cleanup must never mask the original failure.
+        try:
+            scratch.unlink(missing_ok=True)
+        except Exception:
+            pass
+
+
+def _match(record: dict, kwargs: dict) -> bool:
+    """Tell whether ``record`` satisfies every lookup in ``kwargs``.
+
+    Supported lookups: ``field__in`` (membership), ``field__isnull``
+    (None-ness compared with the given flag) and plain ``field`` (equality).
+    Missing fields are treated as None. An empty ``kwargs`` matches anything.
+    """
+
+    def _satisfied(lookup, expected) -> bool:
+        if lookup.endswith("__in"):
+            return record.get(lookup[: -len("__in")]) in expected
+        if lookup.endswith("__isnull"):
+            missing = record.get(lookup[: -len("__isnull")]) is None
+            return missing == expected
+        return record.get(lookup) == expected
+
+    return all(_satisfied(lookup, expected) for lookup, expected in kwargs.items())
+
+
+class _QuerySet:
+    """List-backed result set offering a small QuerySet-like interface.
+
+    Holds raw record dicts; they are turned into ModelDeployment objects only
+    when individual items are handed out (first, get, iteration, indexing).
+    """
+
+    def __init__(self, records: List[dict]):
+        self._records = records
+
+    def filter(self, **kwargs) -> "_QuerySet":
+        """Return a new result set with only the records matching ``kwargs``."""
+        kept = []
+        for row in self._records:
+            if _match(row, kwargs):
+                kept.append(row)
+        return _QuerySet(kept)
+
+    def order_by(self, *fields) -> "_QuerySet":
+        """Return a new result set sorted by ``fields``; a leading "-" means descending."""
+        ordered = list(self._records)
+        # Sort by the least significant field first; list.sort is stable, so
+        # earlier fields end up taking precedence.
+        for spec in fields[::-1]:
+            descending = spec.startswith("-")
+            column = spec.lstrip("-")
+            ordered.sort(
+                key=lambda row, column=column: _sort_key(row, column),
+                reverse=descending,
+            )
+        return _QuerySet(ordered)
+
+    def first(self) -> Optional["ModelDeployment"]:
+        """Return the first record as a ModelDeployment, or None when empty."""
+        return ModelDeployment._from_dict(self._records[0]) if self._records else None
+
+    def exists(self) -> bool:
+        """Return True when at least one record is present."""
+        return bool(self._records)
+
+    def count(self) -> int:
+        """Return the number of records."""
+        return len(self)
+
+    def get(self, **kwargs) -> "ModelDeployment":
+        """Return the single record matching ``kwargs``.
+
+        Raises:
+            ModelDeployment.DoesNotExist: no record matches.
+            Exception: more than one record matches.
+        """
+        hits = [row for row in self._records if _match(row, kwargs)]
+        if len(hits) == 0:
+            raise ModelDeployment.DoesNotExist(f"No record matching {kwargs}")
+        if len(hits) > 1:
+            raise Exception(f"Multiple records matching {kwargs}")
+        (only,) = hits
+        return ModelDeployment._from_dict(only)
+
+    def __iter__(self):
+        return (ModelDeployment._from_dict(row) for row in self._records)
+
+    def __getitem__(self, key):
+        picked = self._records[key]
+        if isinstance(key, slice):
+            # Slices stay lazy: wrap the sub-list instead of converting it.
+            return _QuerySet(picked)
+        return ModelDeployment._from_dict(picked)
+
+    def __len__(self) -> int:
+        return len(self._records)
+
+
+class _Manager:
+    """Entry point for store access, exposed as ``ModelDeployment.objects``."""
+
+    def create(self, **kwargs) -> "ModelDeployment":
+        """Append a new record built from ``kwargs`` and return it.
+
+        The id comes from the store's ``next_id`` counter; ``deployed_at`` is
+        set to the current UTC time and ``stopped_at`` to None. Fields not
+        supplied fall back to their defaults.
+        """
+        pick = kwargs.get
+        with _lock:
+            store = _load_raw()
+            new_id = store["next_id"]
+            # Key order is kept as-is because it is the order written to disk.
+            row = {
+                "id": new_id,
+                "container_id": pick("container_id", ""),
+                "container_name": pick("container_name", ""),
+                "model_name": pick("model_name", ""),
+                "device": pick("device", ""),
+                "deployed_at": _now().isoformat(),
+                "stopped_at": None,
+                "status": pick("status", "running"),
+                "stopped_by_user": pick("stopped_by_user", False),
+                "port": pick("port", None),
+                "device_id": pick("device_id", 0),
+                "workflow_log_path": pick("workflow_log_path", None),
+            }
+            store["next_id"] = new_id + 1
+            store["records"].append(row)
+            _save_raw(store)
+        return ModelDeployment._from_dict(row)
+
+    def all(self) -> _QuerySet:
+        """Return a result set over a snapshot of every stored record."""
+        with _lock:
+            store = _load_raw()
+        rows = store["records"]
+        return _QuerySet(list(rows))
+
+    def filter(self, **kwargs) -> _QuerySet:
+        """Return the stored records matching ``kwargs``."""
+        snapshot = self.all()
+        return snapshot.filter(**kwargs)
+
+    def get(self, **kwargs) -> "ModelDeployment":
+        """Return the single stored record matching ``kwargs``."""
+        snapshot = self.all()
+        return snapshot.get(**kwargs)
+
+
+class ModelDeployment:
+    """One deployment record from the JSON store, with an ORM-like surface.
+
+    Instances are plain attribute holders. ``save()`` writes the instance
+    back to the store; lookups go through ``ModelDeployment.objects``.
+    """
+
+    class DoesNotExist(Exception):
+        """Raised by ``get()`` when no record matches."""
+
+    # String annotation: the manager is attached after the class is defined.
+    objects: "_Manager"  # set below
+
+    def __init__(self):
+        # id stays None until the record has been written to the store.
+        self.id: Optional[int] = None
+        self.container_id: str = ""
+        self.container_name: str = ""
+        self.model_name: str = ""
+        self.device: str = ""
+        self.deployed_at: Optional[datetime] = None
+        self.stopped_at: Optional[datetime] = None
+        self.status: str = "running"
+        self.stopped_by_user: bool = False
+        self.port: Optional[int] = None
+        self.device_id: int = 0
+        self.workflow_log_path: Optional[str] = None
+
+    @classmethod
+    def _from_dict(cls, d: dict) -> "ModelDeployment":
+        """Build an instance from a raw store record, defaulting absent keys."""
+        obj = cls()
+        obj.id = d.get("id")
+        obj.container_id = d.get("container_id", "")
+        obj.container_name = d.get("container_name", "")
+        obj.model_name = d.get("model_name", "")
+        obj.device = d.get("device", "")
+        # Timestamps are stored as ISO strings; unparsable values become None.
+        obj.deployed_at = _parse_dt(d.get("deployed_at"))
+        obj.stopped_at = _parse_dt(d.get("stopped_at"))
+        obj.status = d.get("status", "running")
+        obj.stopped_by_user = d.get("stopped_by_user", False)
+        obj.port = d.get("port")
+        obj.device_id = d.get("device_id", 0)
+        obj.workflow_log_path = d.get("workflow_log_path")
+        return obj
+
+    def _to_dict(self) -> dict:
+        """Return the JSON-serializable store record for this instance."""
+        return {
+            "id": self.id,
+            "container_id": self.container_id,
+            "container_name": self.container_name,
+            "model_name": self.model_name,
+            "device": self.device,
+            "deployed_at": self.deployed_at.isoformat() if self.deployed_at else None,
+            "stopped_at": self.stopped_at.isoformat() if self.stopped_at else None,
+            "status": self.status,
+            "stopped_by_user": self.stopped_by_user,
+            "port": self.port,
+            "device_id": self.device_id,
+            "workflow_log_path": self.workflow_log_path,
+        }
+
+    def save(self) -> None:
+        """Write this instance to the store.
+
+        - ``id is None``: treated as a new record. An id is allocated from the
+          store's ``next_id`` counter and ``deployed_at`` is stamped with the
+          current UTC time if unset, mirroring ``objects.create()``.
+        - ``id`` matches a stored record: that record is replaced.
+        - ``id`` is set but not stored: the record is appended with a warning
+          and ``next_id`` is moved past it so the id cannot be handed out again.
+        """
+        with _lock:
+            data = _load_raw()
+
+            if self.id is None:
+                # Without this branch a None id would match (and overwrite)
+                # any stored record that happens to lack an "id" key.
+                self.id = data["next_id"]
+                data["next_id"] += 1
+                if self.deployed_at is None:
+                    self.deployed_at = _now()
+                data["records"].append(self._to_dict())
+                _save_raw(data)
+                return
+
+            for i, r in enumerate(data["records"]):
+                if r.get("id") == self.id:
+                    data["records"][i] = self._to_dict()
+                    _save_raw(data)
+                    return
+
+            # Not found — append as new (shouldn't happen in normal flow)
+            logger.warning(f"save() called on deployment id={self.id} not found in store; appending")
+            data["records"].append(self._to_dict())
+            if isinstance(self.id, int) and self.id >= data["next_id"]:
+                data["next_id"] = self.id + 1
+            _save_raw(data)
+
+    def __str__(self) -> str:
+        return f"{self.model_name} on {self.device} - {self.status}"
+
+
+# Attach the manager now that both classes exist; this provides the
+# Django-style ``ModelDeployment.objects`` access point.
+ModelDeployment.objects = _Manager()
diff --git a/app/backend/docker_control/docker_utils.py b/app/backend/docker_control/docker_utils.py
index 0c4ab8f5..e98d775a 100644
--- a/app/backend/docker_control/docker_utils.py
+++ b/app/backend/docker_control/docker_utils.py
@@ -50,6 +50,33 @@ def _ensure_network():
 # Initialize network on module load
 _ensure_network()
 
+# When deploying a single-chip model on a multi-chip board, the inference
+# server needs the constituent single-chip device name (e.g. "n300" for one
+# chip of a T3K board), not the board-level name ("t3k").
+#
+# Keys are matched exactly (case-sensitive) against the detected board type;
+# run_container falls back to "cpu" for any board type not listed here.
+# NOTE(review): key spellings mix "x"/"X" and "c"/"C" (e.g. "N300x4" vs
+# "N150X4") — presumably they mirror what detect_board_type() returns; confirm.
+_BOARD_TO_SINGLE_CHIP_DEVICE = {
+    # Multi-chip Wormhole boards → constituent N300 chip
+    "T3K":    "n300",
+    "T3000":  "n300",
+    "N300x4": "n300",
+    "N150X4": "n150",
+    # Multi-chip Blackhole boards → constituent single-chip device
+    "P150X4":  "p150",
+    "P150X8":  "p150",
+    "P300Cx2": "p300c",
+    "P300Cx4": "p300c",
+    # Galaxy (N300-based)
+    "GALAXY":     "n300",
+    "GALAXY_T3K": "n300",
+    # True single-chip boards are unchanged
+    "N150":  "n150",
+    "N300":  "n300",
+    "E150":  "e150",
+    "P100":  "p100",
+    "P150":  "p150",
+    "P300c": "p300c",
+    # Unrecognized hardware maps to the "cpu" device name
+    "unknown": "cpu",
+}
+
 
 def map_board_type_to_device_name(board_type):
     """Map our internal board type names to TT Inference Server device names"""
@@ -86,217 +113,114 @@ def map_board_type_to_device_name(board_type):
     logger.info(f"Mapped board type '{board_type}' to device name '{device_name}'")
     return device_name
 
-def run_container(impl, weights_id):
+def run_container(impl, weights_id, device_id=0):
     """Run a docker container via TT Inference Server API"""
-    if (impl.model_type == ModelTypes.CHAT):
-        # For chat models, we use the TT Inference Server API to run the container
-        try:
-            logger.info(f"Calling TT Inference Server API")
-            logger.info(f"run_container called for {impl.model_name}")
-
-            board_type = detect_board_type()
+    try:
+        logger.info(f"Calling TT Inference Server API")
+        logger.info(f"run_container called for {impl.model_name}")
+
+        # Determine the correct inference-server device name.
+        # A single-chip model on a multi-chip board (e.g. Llama-8B on T3K)
+        # must use the constituent chip device ("n300"), not the board device
+        # ("t3k"). We use chips_required + board_type to pick the right name.
+        from shared_config.model_config import infer_chips_required
+        board_type = detect_board_type()
+        chips_required = infer_chips_required(impl.device_configurations)
+        if chips_required == 1:
+            device = _BOARD_TO_SINGLE_CHIP_DEVICE.get(board_type, "cpu")
+        else:
             device = map_board_type_to_device_name(board_type)
-            
-            # Create payload for the API call
-            payload = {
-                "model": impl.model_name,
-                "workflow": "server",  # Default workflow for container runs
-                "device": device,  # Use mapped device name
-                "docker_server": True,
-                "dev_mode": True
-            }
-
-            logger.info(f"API payload: {payload}")
-
-            # Make POST request to TT Inference Server API
-            api_url = "http://172.18.0.1:8001/run"
+        logger.info(
+            f"Device name '{device}' for {impl.model_name} "
+            f"(board={board_type}, chips_required={chips_required})"
+        )
+
+        BASE_SERVICE_PORT = 7000
+
+        # Create payload for the API call
+        payload = {
+            "model": impl.model_name,
+            "workflow": "server",  # Default workflow for container runs
+            "device": device,  # Use mapped device name
+            "docker_server": True,
+            "dev_mode": True,
+        }
 
-            response = requests.post(
-                api_url,
-                json=payload,
-                timeout=DEPLOYMENT_TIMEOUT_SECONDS  # 5 hour timeout for container startup and weight downloads
-            )
+        # Only pin to a specific chip slot for multi-chip boards
+        if chips_required > 1:
+            payload["device_id"] = str(device_id)
+            payload["service_port"] = str(BASE_SERVICE_PORT + device_id)
+            service_port = BASE_SERVICE_PORT + device_id
+        else:
+            service_port = BASE_SERVICE_PORT  # single chip always uses base port
 
-            if response.status_code in [200, 202]:
-                api_result = response.json()
-                logger.info(f"API call successful (status {response.status_code}): {api_result}")
-                logger.info(f"api_result contains docker_log_file_path: {'docker_log_file_path' in api_result}")
-                if 'docker_log_file_path' in api_result:
-                    logger.info(f"api_result['docker_log_file_path'] = {api_result.get('docker_log_file_path')}")
-                else:
-                    logger.warning(f"docker_log_file_path NOT found in api_result. Available keys: {list(api_result.keys())}")
+        # media/forge models require skipping system software validation; vLLM models do not
+        if impl.model_type != ModelTypes.CHAT:
+            payload["skip_system_sw_validation"] = True
 
-                # Update deploy cache on success
-                update_deploy_cache()
-                
-                # Notify agent about new container deployment
-                notify_agent_of_new_container(api_result["container_name"])
-                
-                # Save deployment record to database
-                container_id = None
-                container_name = "unknown"
-                try:
-                    container_id = api_result.get("container_id")
-                    container_name = api_result.get("container_name", "unknown")
-                    
-                    # If container_id is not in response, try to get it from Docker by name
-                    if not container_id and container_name:
-                        try:
-                            docker_client = get_docker_client()
-                            container_info = docker_client.get_container(container_name)
-                            container_id = container_info.get("id")
-                            logger.info(f"Retrieved container_id {container_id} from Docker for {container_name}")
-                        except Exception as docker_error:
-                            logger.warning(f"Could not get container_id from Docker: {docker_error}")
-                            # Use container_name as fallback ID if we can't get the actual ID
-                            container_id = container_name
-                    
-                    if container_id:
-                        # Extract workflow log path from API response
-                        workflow_log_path = api_result.get("docker_log_file_path")
-                        logger.info(f"Extracted workflow_log_path from api_result: {workflow_log_path}")
-                        logger.info(f"workflow_log_path type: {type(workflow_log_path)}, is None: {workflow_log_path is None}")
-                        
-                        ModelDeployment.objects.create(
-                            container_id=container_id,
-                            container_name=container_name,
-                            model_name=impl.model_name,
-                            device=device,
-                            status="running",
-                            stopped_by_user=False,
-                            port=7000,  # TT Inference Server default port
-                            workflow_log_path=workflow_log_path
-                        )
-                        logger.info(f"Saved deployment record for {container_name} (ID: {container_id})")
-                        if workflow_log_path:
-                            logger.info(f"Workflow log path saved: {workflow_log_path}")
-                        else:
-                            logger.warning(f"Workflow log path is None/empty for {container_name}")
-                    else:
-                        logger.warning(f"Could not save deployment record: no container_id or container_name")
-                except Exception as e:
-                    import traceback
-                    logger.error(
-                        f"Failed to save deployment record for {container_name} (ID: {container_id}): {type(e).__name__}: {e}\n"
-                        f"Traceback: {traceback.format_exc()}"
-                    )
-                    # Don't fail the deployment if we can't save the record
-
-                return {
-                    "status": "success",
-                    "container_name": api_result["container_name"],
-                    "container_id": api_result.get("container_id"),  # Pass through container_id
-                    "job_id": api_result.get("job_id") or api_result.get("container_id"),  # Use job_id or container_id as fallback
-                    "api_response": api_result
-                }
-            else:
-                error_msg = f"API call failed with status {response.status_code}: {response.text}"
-                logger.error(error_msg)
-                
-                # Try to extract job_id and error details from response
-                job_id = None
-                error_detail = error_msg
-                try:
-                    error_data = response.json()
-                    if isinstance(error_data, dict):
-                        # Extract job_id if present
-                        job_id = error_data.get('job_id')
-                        # Extract error message if present
-                        error_detail = error_data.get('message', error_msg)
-                        logger.info(f"Extracted job_id from error response: {job_id}")
-                except Exception as parse_error:
-                    logger.warning(f"Could not parse error response: {parse_error}")
-                
-                return {
-                    "status": "error",
-                    "message": error_detail,
-                    "job_id": job_id
-                }
-
-        except requests.exceptions.RequestException as e:
-            error_msg = f"Network error calling TT Inference Server API: {str(e)}"
-            logger.error(error_msg)
-            return {"status": "error", "message": error_msg}
-        except Exception as e:
-            error_msg = f"Unexpected error in run_container: {str(e)}"
-            logger.error(error_msg)
-            return {"status": "error", "message": error_msg}
-    else:
-        # For non-chat models, we use the docker client to run the container
-        try:
-            logger.info(f"run_container called for {impl.model_name}")
-
-
-            run_kwargs = copy.deepcopy(impl.docker_config)
-            # handle runtime configuration changes to docker kwargs
-            device_mounts = get_devices_mounts(impl)
-            if device_mounts:
-                run_kwargs.update({"devices": device_mounts})
-            run_kwargs.update({"ports": get_port_mounts(impl)})
-            # add bridge inter-container network
-            run_kwargs.update({"network": backend_config.docker_bridge_network_name})
-            # add unique container name suffixing with host port
-            host_port = list(run_kwargs["ports"].values())[0]
-            logger.info(f"!!!host_port:= {host_port}")
-            run_kwargs.update({"name": f"{impl.container_base_name}_p{host_port}"})
-            run_kwargs.update({"hostname": f"{impl.container_base_name}_p{host_port}"})
-            # add environment variables
-            run_kwargs["environment"]["MODEL_WEIGHTS_ID"] = weights_id
-            # container path, not backend path
-            run_kwargs["environment"]["MODEL_WEIGHTS_PATH"] = get_model_weights_path(
-                impl.model_container_weights_dir, weights_id
-            )
-            logger.info(f"run_kwargs:= {run_kwargs}")
-
-            # Convert run_kwargs to docker-control-service API format
-            docker_client = get_docker_client()
-            api_kwargs = {
-                "image": impl.image_version,
-                "name": run_kwargs.get("name"),
-                "command": run_kwargs.get("command"),
-                "environment": run_kwargs.get("environment", {}),
-                "ports": run_kwargs.get("ports", {}),
-                "volumes": run_kwargs.get("volumes"),
-                "network": run_kwargs.get("network"),
-                "detach": run_kwargs.get("detach", True),
-            }
+        logger.info(f"API payload: {payload}")
 
-            # Add devices if present
-            if "devices" in run_kwargs:
-                api_kwargs["devices"] = run_kwargs["devices"]
+        # Make POST request to TT Inference Server API
+        api_url = "http://172.18.0.1:8001/run"
 
-            # Add hostname if present
-            if "hostname" in run_kwargs:
-                api_kwargs["hostname"] = run_kwargs["hostname"]
+        response = requests.post(
+            api_url,
+            json=payload,
+            timeout=DEPLOYMENT_TIMEOUT_SECONDS  # 5 hour timeout for container startup and weight downloads
+        )
 
-            container_result = docker_client.run_container(**api_kwargs)
-            logger.info(f"Container started via docker-control-service: {container_result}")
+        if response.status_code in [200, 202]:
+            api_result = response.json()
+            logger.info(f"API call successful (status {response.status_code}): {api_result}")
+            logger.info(f"api_result contains docker_log_file_path: {'docker_log_file_path' in api_result}")
+            if 'docker_log_file_path' in api_result:
+                logger.info(f"api_result['docker_log_file_path'] = {api_result.get('docker_log_file_path')}")
+            else:
+                logger.warning(f"docker_log_file_path NOT found in api_result. Available keys: {list(api_result.keys())}")
 
-            # Extract container info from API response
-            container_id = container_result.get("id")
-            container_name = container_result.get("name")
-            # on changes to containers, update deploy cache
+            # Update deploy cache on success
             update_deploy_cache()
 
             # Notify agent about new container deployment
-            notify_agent_of_new_container(container_name)
+            notify_agent_of_new_container(api_result["container_name"])
 
-            # Save deployment record to database
+            # Create the deployment record only after successful API response
+            # (never before — avoids stale "starting" records blocking slots on failure)
+            container_id = None
+            container_name = "unknown"
             try:
-                # Get device from impl configuration
-                device_config = impl.device_configurations[0] if impl.device_configurations else None
-                device_name = device_config.name if device_config else "unknown"
-
-                ModelDeployment.objects.create(
-                    container_id=container_id,
-                    container_name=container_name,
-                    model_name=impl.model_name,
-                    device=device_name,
-                    status="running",
-                    stopped_by_user=False,
-                    port=host_port
-                )
-                logger.info(f"Saved deployment record for {container_name} (ID: {container_id})")
+                container_id = api_result.get("container_id")
+                container_name = api_result.get("container_name", "unknown")
+
+                # If container_id is not in response, try to get it from Docker by name
+                if not container_id and container_name:
+                    try:
+                        docker_client = get_docker_client()
+                        container_info = docker_client.get_container(container_name)
+                        container_id = container_info.get("id")
+                        logger.info(f"Retrieved container_id {container_id} from Docker for {container_name}")
+                    except Exception as docker_error:
+                        logger.warning(f"Could not get container_id from Docker: {docker_error}")
+                        container_id = container_name
+
+                if container_id:
+                    workflow_log_path = api_result.get("docker_log_file_path")
+                    logger.info(f"Extracted workflow_log_path from api_result: {workflow_log_path}")
+
+                    ModelDeployment.objects.create(
+                        container_id=container_id,
+                        container_name=container_name,
+                        model_name=impl.model_name,
+                        device=device,
+                        device_id=device_id,
+                        status="running",
+                        stopped_by_user=False,
+                        port=service_port,
+                        workflow_log_path=workflow_log_path
+                    )
+                    logger.info(f"Saved deployment record for {container_name} (ID: {container_id})")
+                else:
+                    logger.warning(f"Could not save deployment record: no container_id or container_name")
             except Exception as e:
                 import traceback
                 logger.error(
@@ -307,13 +231,43 @@ def run_container(impl, weights_id):
 
             return {
                 "status": "success",
-                "container_id": container_id,
-                "container_name": container_name,
-                "service_route": impl.service_route,
-                "port_bindings": run_kwargs["ports"],
+                "container_name": api_result["container_name"],
+                "container_id": api_result.get("container_id"),  # Pass through container_id
+                "job_id": api_result.get("job_id") or api_result.get("container_id"),  # Use job_id or container_id as fallback
+                "api_response": api_result
+            }
+        else:
+            error_msg = f"API call failed with status {response.status_code}: {response.text}"
+            logger.error(error_msg)
+
+            # Try to extract job_id and error details from response
+            job_id = None
+            error_detail = error_msg
+            try:
+                error_data = response.json()
+                if isinstance(error_data, dict):
+                    # Extract job_id if present
+                    job_id = error_data.get('job_id')
+                    # Extract error message if present
+                    error_detail = error_data.get('message', error_msg)
+                    logger.info(f"Extracted job_id from error response: {job_id}")
+            except Exception as parse_error:
+                logger.warning(f"Could not parse error response: {parse_error}")
+
+            return {
+                "status": "error",
+                "message": error_detail,
+                "job_id": job_id
             }
-        except Exception as e:
-            return {"status": "error", "message": str(e)}
+
+    except requests.exceptions.RequestException as e:
+        error_msg = f"Network error calling TT Inference Server API: {str(e)}"
+        logger.error(error_msg)
+        return {"status": "error", "message": error_msg}
+    except Exception as e:
+        error_msg = f"Unexpected error in run_container: {str(e)}"
+        logger.error(error_msg)
+        return {"status": "error", "message": error_msg}
 
 def run_agent_container(container_name, port_bindings, impl):
     # runs agent container after associated llm container runs
@@ -355,22 +309,47 @@ def get_runtime_device_configuration(device_configurations):
     return next(iter(device_configurations))
 
 
-def get_devices_mounts(impl):
+def get_devices_mounts(impl, device_id=0):
+    """Return the device mount list for ``impl``'s runtime device configuration.
+
+    Args:
+        impl: Model implementation; only ``impl.device_configurations`` is read.
+        device_id: Chip slot to expose for single-chip configurations.
+
+    Returns:
+        ``["/dev/tenstorrent/<device_id>:/dev/tenstorrent/<device_id>"]`` for a
+        single-chip configuration, ``["/dev/tenstorrent:/dev/tenstorrent"]``
+        for a multi-chip configuration, or None when the configuration is in
+        neither set.
+    """
     device_config = get_runtime_device_configuration(impl.device_configurations)
     assert isinstance(device_config, DeviceConfigurations)
-    # TODO: add logic to handle multiple devices and multiple containers
-    single_device_mounts = ["/dev/tenstorrent/0:/dev/tenstorrent/0"]
+
+    # Single-chip device configurations: pin to the requested chip slot
+    single_chip_configs = {
+        DeviceConfigurations.E150,
+        DeviceConfigurations.N150,
+        DeviceConfigurations.N150_WH_ARCH_YAML,
+        DeviceConfigurations.N300,
+        DeviceConfigurations.N300_WH_ARCH_YAML,
+        DeviceConfigurations.P100,
+        DeviceConfigurations.P150,
+        DeviceConfigurations.P300c,
+    }
+
+    # Multi-chip configurations manage their own chip allocation; expose full directory
     all_device_mounts = ["/dev/tenstorrent:/dev/tenstorrent"]
-    device_map = {
-        DeviceConfigurations.E150: single_device_mounts,
-        DeviceConfigurations.N150: single_device_mounts,
-        DeviceConfigurations.N150_WH_ARCH_YAML: single_device_mounts,
-        DeviceConfigurations.N300: single_device_mounts,
-        DeviceConfigurations.N300x4_WH_ARCH_YAML: all_device_mounts,
-        DeviceConfigurations.N300x4: all_device_mounts,
+
+    if device_config in single_chip_configs:
+        return [f"/dev/tenstorrent/{device_id}:/dev/tenstorrent/{device_id}"]
+
+    # Multi-chip (T3K, Galaxy, N300x4, P150X4, P150X8, etc.)
+    multi_chip_configs = {
+        DeviceConfigurations.N150X4,
+        DeviceConfigurations.N300x4,
+        DeviceConfigurations.N300x4_WH_ARCH_YAML,
+        DeviceConfigurations.T3K,
+        DeviceConfigurations.T3K_RING,
+        DeviceConfigurations.T3K_LINE,
+        DeviceConfigurations.P150X4,
+        DeviceConfigurations.P150X8,
+        DeviceConfigurations.P300Cx2,
+        DeviceConfigurations.P300Cx4,
+        DeviceConfigurations.GALAXY,
+        DeviceConfigurations.GALAXY_T3K,
     }
-    device_mounts = device_map.get(device_config)
-    return device_mounts
+    if device_config in multi_chip_configs:
+        return all_device_mounts
+
+    # Configuration is in neither set: no device mounts
+    return None
 
 
 def get_port_mounts(impl):
@@ -493,6 +472,15 @@ def parse_env_var_str(env_var_list):
 
 def get_container_status():
     containers = get_managed_containers()
+
+    # Build container_id → device_id lookup from deployment database
+    device_id_lookup: dict = {}
+    try:
+        for dep in ModelDeployment.objects.filter(status__in=["starting", "running"]):
+            device_id_lookup[dep.container_id] = dep.device_id
+    except Exception as e:
+        logger.warning(f"Could not load device_id lookup: {e}")
+
     data = {}
     for con in containers:
         data[con.id] = {
@@ -508,6 +496,7 @@ def get_container_status():
                 for k, v in con.attrs.get("NetworkSettings").get("Networks").items()
             },
             "env_vars": parse_env_var_str(con.attrs.get("Config").get("Env")),
+            "device_id": device_id_lookup.get(con.id),
         }
     return data
 
@@ -550,12 +539,12 @@ def update_deploy_cache():
             if is_tt_inference_container:
                 logger.info(f"Detected TT Inference Server container: {con['name']} (ID: {con_id})")
                 
-                # Try to find the model implementation from the database
+                # Try to find the model implementation from the deployment store
                 deployment_found = False
                 try:
                     from docker_control.models import ModelDeployment
                     deployment = ModelDeployment.objects.filter(container_id=con_id).first()
-                    
+
                     if deployment:
                         # Find the model implementation by model name
                         model_impl = None
@@ -565,11 +554,12 @@ def update_deploy_cache():
                                 logger.info(f"Matched TT Inference Server container to model_impl: {model_impl.model_name}")
                                 deployment_found = True
                                 break
-                        
+
                         if not model_impl:
                             logger.warning(f"Could not find model_impl for {deployment.model_name} in container {con['name']}")
                     else:
-                        logger.warning(f"No deployment record found for TT Inference Server container {con_id}")
+                        # No record by container_id — could be a pre-existing container or still starting up
+                        logger.debug(f"No deployment record found for TT Inference Server container {con_id}")
                 except Exception as e:
                     # Check if this is a migration/database issue
                     error_str = str(e).lower()
@@ -582,13 +572,25 @@ def update_deploy_cache():
                 if not deployment_found:
                     logger.info(f"Using fallback logic to match container {con['name']}")
                     # Try to match by container name
+                    # First try exact match
                     model_impl = None
                     for k, v in model_implmentations.items():
-                        if v.model_name in con["name"]:
+                        if v.model_name == con["name"]:
                             model_impl = v
-                            logger.info(f"Matched container by name to model_impl: {model_impl.model_name}")
+                            logger.info(f"Matched container by exact name to model_impl: {model_impl.model_name}")
                             break
-                    
+
+                    # Fall back to longest-substring match (prevents short names like "Llama-3.1-8B"
+                    # from beating "Llama-3.1-8B-Instruct" on container name "Llama-3.1-8B-Instruct")
+                    if not model_impl:
+                        best_match_len = 0
+                        for k, v in model_implmentations.items():
+                            if v.model_name in con["name"] and len(v.model_name) > best_match_len:
+                                model_impl = v
+                                best_match_len = len(v.model_name)
+                        if model_impl:
+                            logger.info(f"Matched container by name substring to model_impl: {model_impl.model_name}")
+
                     if not model_impl:
                         logger.warning(f"Could not match TT Inference Server container {con['name']} to any model_impl. Skipping.")
                         continue
@@ -625,11 +627,23 @@ def update_deploy_cache():
             hostname = con["networks"][backend_config.docker_bridge_network_name][
                 "DNSNames"
             ][0]
+            # Use the actual container port from port bindings instead of the
+            # static model_impl.service_port (which is always 7000).  Multi-slot
+            # deployments bind to 7000+device_id, so we must resolve the real port.
+            actual_port = model_impl.service_port  # default fallback
+            port_bindings = con.get("port_bindings", {})
+            if port_bindings:
+                container_port_key = next(iter(port_bindings.keys()), None)
+                if container_port_key:
+                    try:
+                        actual_port = int(container_port_key.split("/")[0])
+                    except (ValueError, IndexError):
+                        pass
             con["internal_url"] = (
-                f"{hostname}:{model_impl.service_port}{model_impl.service_route}"
+                f"{hostname}:{actual_port}{model_impl.service_route}"
             )
             con["health_url"] = (
-                f"{hostname}:{model_impl.service_port}{model_impl.health_route}"
+                f"{hostname}:{actual_port}{model_impl.health_route}"
             )
             cache.set(con_id, con, timeout=None)
             logger.info(f"Added container {con['name']} (ID: {con_id[:12]}) to deploy cache")
@@ -655,195 +669,92 @@ def remove_id_prefix(s):
 
 
 def perform_reset():
+    """
+    Reset the TT board with ``tt-smi -r`` (up to 2 attempts, 30-second timeout each).
+
+    No ``tt-smi -s`` pre-check is run: when the board is in a bad state that
+    command hangs too, which makes recovery worse.  Returns a dict with
+    "status", "message", "attempts_used", "output" and "http_status".
+    """
     try:
-        logger.info("Running initial tt-smi -s command to check device detection.")
-
-        # Initial check to see if Tenstorrent devices are detected
-        def check_device_detection():
-            process = subprocess.Popen(
-                ["tt-smi", "-s"],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                stdin=subprocess.DEVNULL,  # Prevents interactive command-line interface
-                text=True,
-            )
-            output = []
-            detected_chips = 0
-            warnings = []
-            for line in iter(process.stdout.readline, ""):
-                logger.info(f"tt-smi output: {line.strip()}")
-                output.append(line)
-                lower_line = line.lower()
-                if "detected chips" in lower_line:
-                    # Expect format like: "Detected Chips: 2"
-                    try:
-                        parts = line.strip().split(":")
-                        if len(parts) == 2:
-                            detected_chips = int(parts[1].strip().split()[0])
-                    except (ValueError, IndexError) as e:
-                        warnings.append(f"Unable to parse detected chips from line: {line.strip()}")
-                        logger.warning(f"Unable to parse detected chips from line '{line.strip()}': {e}")
-                if "response_q out of sync" in lower_line or "rd_ptr" in lower_line:
-                    warnings.append(line.strip())
-                if "No Tenstorrent devices detected" in line:
-                    return {
-                        "status": "error",
-                        "message": "No Tenstorrent devices detected! Please check your hardware and try again.",
-                        "output": "".join(output),
-                        "http_status": 503,  # Service Unavailable
-                    }
-            process.stdout.close()
-            return_code = process.wait()
-            
-            # Parse JSON output if text parsing didn't find chips
-            if detected_chips == 0:
-                full_output = "".join(output)
-                try:
-                    json_data = json.loads(full_output)
-                    if "device_info" in json_data and isinstance(json_data["device_info"], list):
-                        detected_chips = len(json_data["device_info"])
-                        logger.info(f"Detected {detected_chips} chips from JSON output")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Could not parse tt-smi output as JSON: {e}")
-            
-            # If chips are detected, allow reset but surface warnings/return code
-            if detected_chips > 0:
-                if return_code != 0:
-                    warnings.append(f"tt-smi -s exited with code {return_code}")
-                status_val = "success" if not warnings and return_code == 0 else "warning"
-                return {
-                    "status": status_val,
-                    "output": "".join(output),
-                    "warnings": warnings,
-                    "detected_chips": detected_chips,
-                    "return_code": return_code,
-                }
-            if return_code != 0:
-                return {
-                    "status": "error",
-                    "message": f"tt-smi -s command failed with return code {return_code}. Please check if tt-smi is properly installed.",
-                    "output": "".join(output),
-                    "http_status": 500,  # Internal Server Error
-                }
-            return {
-                "status": "success",
-                "message": "No Tenstorrent devices detected. tt-smi executed successfully.",
-                "output": "".join(output),
-                "detected_chips": 0,
-                "return_code": return_code,
-            }
+        logger.info("Starting board reset — running tt-smi -r directly (no pre-check)")
 
-        # Run the device detection check
-        detection_result = check_device_detection()
-        detection_warnings = detection_result.get("warnings", [])
-        detection_output = detection_result.get("output", "")
-        if detection_result.get("status") == "error":
-            return detection_result
-        if detection_output:
-            cumulative_output = [detection_output]
-        else:
-            cumulative_output = []
-        if detection_warnings:
-            cumulative_output.append("Warnings during device detection:\n")
-            cumulative_output.extend([w + "\n" for w in detection_warnings])
-
-        logger.info("Running tt-smi reset command.")
-
-        def stream_command_output(command):
-            logger.info(f"Executing command: {' '.join(command)}")
-            process = subprocess.Popen(
-                command,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                stdin=subprocess.DEVNULL,  # Prevents interactive command-line interface
-                text=True,
-            )
-            output = []
-            for line in iter(process.stdout.readline, ""):
-                logger.info(f"Command output: {line.strip()}")
-                output.append(line)
-            process.stdout.close()
-            return_code = process.wait()
-            if return_code != 0:
-                logger.info(f"Command failed with return code {return_code}")
-                output.append(f"Command failed with return code {return_code}")
-                error_message = "tt-smi reset failed. Please check if:\n"
-                error_message += "1. The Tenstorrent device is properly connected\n"
-                error_message += "2. You have the correct permissions to access the device\n"
-                error_message += "3. The tt-smi utility is properly installed\n"
-                error_message += "4. The device firmware is up to date"
-                return {
-                    "status": "error",
-                    "message": error_message,
-                    "output": "".join(output),
-                    "http_status": 500,  # Internal Server Error
-                }
-            else:
-                logger.info(
-                    f"Command completed successfully with return code {return_code}"
+        # Signal that a reset is in progress so the device-state endpoint reports RESETTING
+        SystemResourceService.set_resetting_state()
+
+        MAX_ATTEMPTS = 2
+        last_output = ""
+
+        for attempt in range(1, MAX_ATTEMPTS + 1):
+            logger.info(f"Reset attempt {attempt} of {MAX_ATTEMPTS}")
+            try:
+                process = subprocess.Popen(
+                    ["tt-smi", "-r"],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT,
+                    stdin=subprocess.DEVNULL,
+                    text=True,
+                    start_new_session=True,
                 )
-                return {"status": "success", "output": "".join(output)}
-
-        # Attempt software resets first (up to MAX_RESET_ATTEMPTS)
-        MAX_RESET_ATTEMPTS = 3
-        reset_attempts = 0
-        reset_success = False
-
-        # Try tt-smi reset with retries (no reset config file; use default tt-smi behavior)
-        while reset_attempts < MAX_RESET_ATTEMPTS and not reset_success:
-            reset_attempts += 1
-            logger.info(f"Reset attempt {reset_attempts} of {MAX_RESET_ATTEMPTS}")
-            cumulative_output.append(f"Attempting reset {reset_attempts} of {MAX_RESET_ATTEMPTS}...\n")
-
-            # Perform reset using tt-smi default behavior (no reset_config.json)
-            cumulative_output.append("Executing tt-smi -r with default reset configuration.\n")
-            reset_result = stream_command_output(["tt-smi", "-r"])
-            cumulative_output.append(reset_result.get('output', '') + "\n")
-
-            if reset_result.get("status") == "success":
-                logger.info(f"Reset attempt {reset_attempts} succeeded")
-                reset_success = True
-                break
-
-            logger.warning(f"Reset attempt {reset_attempts} failed")
-            # Small delay between attempts
-            time.sleep(2)
-
-        # If all reset attempts failed
-        if not reset_success:
-            all_output = "".join(cumulative_output)
-            logger.error(f"All {MAX_RESET_ATTEMPTS} reset attempts failed")
-            return {
-                "status": "error", 
-                "message": f"All {MAX_RESET_ATTEMPTS} reset attempts failed using tt-smi --reset command.",
-                "output": all_output,
-                "http_status": 500
-            }
 
-        all_output = "".join(cumulative_output)
-        if reset_success:
-            return {
-                "status": "success",
-                "message": f"Reset successful after {reset_attempts} attempt(s)",
-                "output": all_output,
-                "warnings": detection_warnings,
-                "http_status": 200
-            }
-        else:
-            return {
-                "status": "error",
-                "message": "All reset attempts failed with no specific error",
-                "output": all_output,
-                "warnings": detection_warnings,
-                "http_status": 500
-            }
+                try:
+                    stdout, _ = process.communicate(timeout=30)
+                    last_output = stdout
+                    logger.info(f"tt-smi -r attempt {attempt} output: {stdout.strip()!r:.200}")
+
+                    if process.returncode == 0:
+                        logger.info(f"Reset succeeded on attempt {attempt}")
+                        SystemResourceService.clear_device_state_cache()
+                        return {
+                            "status": "success",
+                            "message": f"Board reset successfully after {attempt} attempt(s)",
+                            "attempts_used": attempt,
+                            "output": stdout,
+                            "http_status": 200,
+                        }
+
+                    logger.warning(f"Reset attempt {attempt} failed: exit code {process.returncode}")
+
+                except subprocess.TimeoutExpired:
+                    logger.warning(f"Reset attempt {attempt} timed out after 30s")
+                    # Kill the whole process group, then communicate() again to reap it and close the pipe.
+                    try:
+                        os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+                        process.communicate(timeout=2)
+                    except Exception:
+                        try:
+                            os.killpg(os.getpgid(process.pid), signal.SIGKILL)
+                            process.communicate(timeout=5)
+                        except Exception as kill_exc:
+                            logger.warning(f"Could not clean up tt-smi -r after timeout: {kill_exc}")
+                    last_output = "(timeout)"
+
+            except Exception as exc:
+                logger.error(f"Reset attempt {attempt} raised exception: {exc}")
+                last_output = str(exc)
+
+        # All attempts failed
+        logger.error(f"All {MAX_ATTEMPTS} reset attempts failed")
+        SystemResourceService.clear_device_state_cache()
+        return {
+            "status": "error",
+            "message": (
+                f"Board did not recover after {MAX_ATTEMPTS} reset attempts. "
+                "Manual intervention may be required."
+            ),
+            "attempts_used": MAX_ATTEMPTS,
+            "output": last_output,
+            "http_status": 500,
+        }
 
     except Exception as e:
-        logger.exception("Exception occurred during reset operation.")
+        logger.exception("Unexpected error during reset operation")
+        SystemResourceService.clear_device_state_cache()
         return {
             "status": "error",
             "message": str(e),
-            "output": "An exception occurred during the reset operation.",
+            "attempts_used": 0,
+            "output": "",
             "http_status": 500,
         }
 
diff --git a/app/backend/docker_control/health_monitor.py b/app/backend/docker_control/health_monitor.py
index 1e1a8e86..7e0faf0b 100644
--- a/app/backend/docker_control/health_monitor.py
+++ b/app/backend/docker_control/health_monitor.py
@@ -16,15 +16,45 @@
 _stop_monitoring = False
 
 
+def _cleanup_stale_starting_records():
+    """Mark pending 'starting' records older than 10 minutes as failed.
+
+    These are left behind when a deployment API call fails after the
+    pending record was already created.  They permanently block their
+    chip slot if not cleaned up.
+    """
+    from datetime import timedelta  # `timezone.timedelta` is not a documented attribute of the timezone module
+    try:
+        stale_cutoff = timezone.now() - timedelta(minutes=10)
+        for dep in ModelDeployment.objects.filter(status="starting"):
+            if (
+                dep.container_id.startswith("pending_")
+                and dep.deployed_at is not None
+                and dep.deployed_at < stale_cutoff
+            ):
+                logger.info(
+                    f"Cleaning up stale 'starting' record: {dep.model_name} "
+                    f"(id={dep.id}, deployed_at={dep.deployed_at})"
+                )
+                dep.status = "failed"
+                dep.stopped_at = timezone.now()
+                dep.save()
+    except Exception as e:
+        logger.error(f"Error cleaning up stale starting records: {e}")
+
+
 def check_container_health():
-    """Check for containers that died unexpectedly"""
+    """Check for containers that died unexpectedly and clean up stale records"""
     try:
+        # Clean up stale pending records that block chip slots
+        _cleanup_stale_starting_records()
+
         # Get all running deployments from database
         running_deployments = ModelDeployment.objects.filter(status="running")
-        
+
         if not running_deployments.exists():
             return
-        
+
         logger.debug(f"Checking health of {running_deployments.count()} running deployments")
 
         # Check actual Docker container status via docker-control-service
diff --git a/app/backend/docker_control/migrations/0001_initial.py b/app/backend/docker_control/migrations/0001_initial.py
deleted file mode 100644
index 0b4c168d..00000000
--- a/app/backend/docker_control/migrations/0001_initial.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Generated by Django 5.0.4 on 2025-11-12 15:18
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    initial = True
-
-    dependencies = [
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name='ModelDeployment',
-            fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('container_id', models.CharField(db_index=True, max_length=255, unique=True)),
-                ('container_name', models.CharField(db_index=True, max_length=255)),
-                ('model_name', models.CharField(db_index=True, max_length=255)),
-                ('device', models.CharField(max_length=50)),
-                ('deployed_at', models.DateTimeField(auto_now_add=True, db_index=True)),
-                ('stopped_at', models.DateTimeField(blank=True, null=True)),
-                ('status', models.CharField(db_index=True, default='running', max_length=50)),
-                ('stopped_by_user', models.BooleanField(default=False)),
-                ('port', models.IntegerField(blank=True, null=True)),
-            ],
-            options={
-                'ordering': ['-deployed_at'],
-                'indexes': [models.Index(fields=['status', '-deployed_at'], name='docker_cont_status_a5afde_idx'), models.Index(fields=['model_name', '-deployed_at'], name='docker_cont_model_n_2ecff9_idx')],
-            },
-        ),
-    ]
diff --git a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py b/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py
deleted file mode 100644
index 518dde93..00000000
--- a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Generated by Django 5.0.4 on 2025-11-12 21:35
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('docker_control', '0001_initial'),
-    ]
-
-    operations = [
-        migrations.AddField(
-            model_name='modeldeployment',
-            name='workflow_log_path',
-            field=models.CharField(blank=True, help_text='Path to workflow log file from tt-inference-server', max_length=512, null=True),
-        ),
-    ]
diff --git a/app/backend/docker_control/models.py b/app/backend/docker_control/models.py
index a94f60ff..7f6b1f02 100644
--- a/app/backend/docker_control/models.py
+++ b/app/backend/docker_control/models.py
@@ -2,39 +2,6 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
-from django.db import models
-from django.utils import timezone
+from docker_control.deployment_store import ModelDeployment
 
-
-class ModelDeployment(models.Model):
-    """Track all model deployments with full history"""
-    # Deployment identification
-    container_id = models.CharField(max_length=255, unique=True, db_index=True)
-    container_name = models.CharField(max_length=255, db_index=True)
-    
-    # Model information
-    model_name = models.CharField(max_length=255, db_index=True)
-    device = models.CharField(max_length=50)  # n150, n300, etc.
-    
-    # Deployment metadata
-    deployed_at = models.DateTimeField(auto_now_add=True, db_index=True)
-    stopped_at = models.DateTimeField(null=True, blank=True)
-    
-    # Status tracking
-    status = models.CharField(max_length=50, default="running", db_index=True)
-    # Choices: starting, running, stopped, exited, dead, error
-    stopped_by_user = models.BooleanField(default=False)  # True if user clicked stop/delete
-    
-    # Container details
-    port = models.IntegerField(null=True, blank=True)
-    workflow_log_path = models.CharField(max_length=512, null=True, blank=True, help_text="Path to workflow log file from tt-inference-server")
-    
-    class Meta:
-        ordering = ['-deployed_at']
-        indexes = [
-            models.Index(fields=['status', '-deployed_at']),
-            models.Index(fields=['model_name', '-deployed_at']),
-        ]
-    
-    def __str__(self):
-        return f"{self.model_name} on {self.device} - {self.status}"
+__all__ = ["ModelDeployment"]
diff --git a/app/backend/docker_control/urls.py b/app/backend/docker_control/urls.py
index ae37307f..b94fabd0 100644
--- a/app/backend/docker_control/urls.py
+++ b/app/backend/docker_control/urls.py
@@ -9,6 +9,7 @@
     StopView,
     ContainersView,
     StatusView,
+    ChipStatusView,
     DeployView,
     DeploymentProgressView,
     DeploymentLogsView,
@@ -32,6 +33,7 @@
     path("deploy/progress/stream/<str:job_id>/", views.DeploymentProgressStreamView.as_view(), name="deployment-progress-stream"),
     path("stop/", views.StopView.as_view()),
     path("status/", views.StatusView.as_view()),
+    path("chip-status/", views.ChipStatusView.as_view(), name="chip-status"),
     path("redeploy/", views.RedeployView.as_view()),
     path("reset_board/", views.ResetBoardView.as_view()),
     path("docker/image_status/<str:model_id>/", views.ImageStatusView.as_view(), name="docker-image-status"),
diff --git a/app/backend/docker_control/views.py b/app/backend/docker_control/views.py
index 741a8bcf..bbbb9461 100644
--- a/app/backend/docker_control/views.py
+++ b/app/backend/docker_control/views.py
@@ -11,10 +11,11 @@
 from rest_framework.renderers import JSONRenderer
 from django.views.decorators.csrf import csrf_exempt
 from django.utils.decorators import method_decorator
-import json  
+import json
 import shutil
 import subprocess
 import os
+from pathlib import Path
 
 import re
 import os
@@ -43,6 +44,24 @@
 logger = get_logger(__name__)
 logger.info(f"importing {__name__}")
 
+# Build model_name → status lookup from catalog JSON (falls back to an empty lookup on any load error)
+_CATALOG_PATH = Path(__file__).parent.parent / "shared_config/models_from_inference_server.json"
+try:
+    _catalog = json.loads(_CATALOG_PATH.read_text(encoding="utf-8"))
+    _status_lookup: dict[str, str | None] = {m["model_name"]: m.get("status") for m in _catalog["models"]}
+except Exception as exc:
+    logger.warning(f"Could not load model catalog from {_CATALOG_PATH} ({exc}); status will be null for all models")
+    _status_lookup = {}
+
+# Manual compatibility overrides: model names always shown as compatible (e.g. when sync JSON device_configurations don't match detected board)
+_OVERRIDE_PATH = Path(__file__).parent.parent / "shared_config/model_compatibility_overrides.json"
+try:
+    _override_data = json.loads(_OVERRIDE_PATH.read_text(encoding="utf-8"))
+    _compatibility_override_names: set[str] = set(_override_data.get("model_names", []))
+except Exception as exc:
+    logger.debug(f"No compatibility overrides loaded from {_OVERRIDE_PATH}: {exc}")
+    _compatibility_override_names = set()
+
 # Track when deployment started
 deployment_start_times = {}  # {job_id: timestamp} - Track when deployment started
 
@@ -179,16 +198,30 @@ def get(self, request, *args, **kwargs):
             for board, devices in board_to_device_map.items():
                 if board != 'unknown' and bool(set(devices).intersection(impl.device_configurations)):
                     compatible_boards.append(board)
+
+            # Manual override: always show certain models as compatible (e.g. whisper when sync JSON is incomplete)
+            if impl.model_name in _compatibility_override_names:
+                is_compatible = True
+                if current_board != 'unknown' and current_board not in compatible_boards:
+                    compatible_boards = list(compatible_boards) + [current_board]
+                logger.info(f"Model {impl.model_name}: compatibility overridden to True")
             
             logger.info(f"Model {impl.model_name}: compatible={is_compatible}, boards={compatible_boards}")
-            
+
+            # Infer chip requirements for this model
+            from shared_config.model_config import infer_chips_required
+            chips_required = infer_chips_required(impl.device_configurations)
+
             data.append({
                 "id": impl_id,
                 "name": impl.model_name,
                 "is_compatible": is_compatible,
                 "compatible_boards": compatible_boards,
                 "model_type": impl.model_type.value,
-                "current_board": current_board
+                "display_model_type": impl.display_model_type,
+                "current_board": current_board,
+                "status": _status_lookup.get(impl.model_name),
+                "chips_required": chips_required,
             })
         
         return Response(data, status=status.HTTP_200_OK)
@@ -200,6 +233,32 @@ def get(self, request, *args, **kwargs):
         return Response(data, status=status.HTTP_200_OK)
 
 
+class ChipStatusView(APIView):
+    """Read-only endpoint exposing chip slot occupancy."""
+
+    def get(self, request, *args, **kwargs):
+        """
+        Report the current chip slot status.
+
+        Responds 200 with the value returned by ChipSlotAllocator.get_chip_status(),
+        or 500 with an "error"/"message" JSON body if anything in the lookup raises.
+        """
+        try:
+            # Imported inside the try so an import failure is also reported as a 500.
+            from docker_control.chip_allocator import ChipSlotAllocator
+
+            chip_status = ChipSlotAllocator().get_chip_status()
+            return Response(chip_status, status=status.HTTP_200_OK)
+        except Exception as exc:
+            logger.error(f"Error getting chip status: {str(exc)}")
+            failure_body = {
+                "error": "Failed to get chip status",
+                "message": str(exc),
+            }
+            return Response(
+                failure_body,
+                status=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            )
 
 
 
@@ -207,16 +266,55 @@ class DeployView(APIView):
     def post(self, request, *args, **kwargs):
         serializer = DeploymentSerializer(data=request.data)
         if serializer.is_valid():
+            from docker_control.chip_allocator import ChipSlotAllocator, AllocationError, MultiChipConflictError
+
             impl_id = request.data.get("model_id")
             weights_id = request.data.get("weights_id")
+
+            # Get manual override if in advanced mode (optional)
+            manual_device_id = request.data.get("device_id")
+            if manual_device_id is not None:
+                manual_device_id = int(manual_device_id)
+
             impl = model_implmentations[impl_id]
-            response = run_container(impl, weights_id)
-            
+
+            # Auto-allocate chip slot
+            try:
+                allocator = ChipSlotAllocator()
+                device_id = allocator.allocate_chip_slot(
+                    impl.model_name,
+                    manual_override=manual_device_id
+                )
+                logger.info(f"Allocated device_id={device_id} for {impl.model_name}")
+
+            except MultiChipConflictError as e:
+                logger.warning(f"Multi-chip conflict for {impl.model_name}: {str(e)}")
+                return Response({
+                    "status": "error",
+                    "error_type": "multi_chip_conflict",
+                    "message": str(e),
+                    "conflicts": e.conflicts  # List of conflicting deployments
+                }, status=status.HTTP_409_CONFLICT)
+
+            except AllocationError as e:
+                logger.warning(f"Allocation failed for {impl.model_name}: {str(e)}")
+                return Response({
+                    "status": "error",
+                    "error_type": "allocation_failed",
+                    "message": str(e)
+                }, status=status.HTTP_409_CONFLICT)
+
+            # Continue with deployment using allocated device_id
+            response = run_container(impl, weights_id, device_id=device_id)
+
+            # Add allocated_device_id to response
+            response["allocated_device_id"] = device_id
+
             # Ensure job_id is set for progress tracking
             # Use job_id from API response, or fallback to container_id or container_name
             if not response.get("job_id"):
                 response["job_id"] = response.get("container_id") or response.get("container_name")
-            
+
             # Check if deployment failed
             if response.get("status") == "error":
                 logger.error(f"Deployment failed: {response.get('message', 'Unknown error')}")
@@ -224,14 +322,14 @@ def post(self, request, *args, **kwargs):
                     response,
                     status=status.HTTP_500_INTERNAL_SERVER_ERROR
                 )
-            
+
             # Refresh tt-smi cache after successful deployment
             if response.get("status") == "success":
                 try:
                     SystemResourceService.force_refresh_tt_smi_cache()
                 except Exception as e:
                     logger.warning(f"Failed to refresh tt-smi cache after deployment: {e}")
-            
+
             return Response(response, status=status.HTTP_201_CREATED)
         else:
             return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
@@ -706,14 +804,7 @@ def get(self, request, model_id):
             logger.info(f"Checking status for image: {image_name}:{image_tag}")
             image_status = check_image_exists(image_name, image_tag)
             logger.info(f"Image status result: {image_status}")
-            
-            # Add pull progress if available
-            if model_id in pull_progress:
-                image_status['pull_in_progress'] = True
-                image_status['progress'] = pull_progress[model_id]
-            else:
-                image_status['pull_in_progress'] = False
-            
+            image_status['pull_in_progress'] = False
             return Response(image_status, status=status.HTTP_200_OK)
         except KeyError:
             logger.warning(f"Model {model_id} not found in model_implementations")
@@ -1172,6 +1263,7 @@ def get(self, request):
                     'container_name': deployment.container_name,
                     'model_name': deployment.model_name,
                     'device': deployment.device,
+                    'device_id': deployment.device_id,
                     'deployed_at': deployment.deployed_at.isoformat() if deployment.deployed_at else None,
                     'stopped_at': deployment.stopped_at.isoformat() if deployment.stopped_at else None,
                     'status': deployment.status,
diff --git a/app/backend/model_control/metrics_tracker.py b/app/backend/model_control/metrics_tracker.py
index fcc948ee..091d5180 100644
--- a/app/backend/model_control/metrics_tracker.py
+++ b/app/backend/model_control/metrics_tracker.py
@@ -25,6 +25,20 @@ def __init__(self):
         self.prompt_tokens: int = 0
         self.last_token_count: int = 0
 
+    def record_content_token(self) -> None:
+        """Count one streamed content token and timestamp its arrival."""
+        now = time.time()
+        # Only the first recorded token sets first_token_time.
+        if self.first_token_time is None:
+            self.first_token_time = now
+        self.token_times.append(now)
+        self.num_tokens = self.last_token_count = self.num_tokens + 1
+
+    def set_prompt_tokens(self, prompt_tokens: int) -> None:
+        """Store the prompt token count; once a non-zero count is held, later calls are ignored."""
+        if self.prompt_tokens == 0:
+            self.prompt_tokens = prompt_tokens
+
     def record_token(self, completion_tokens: int, prompt_tokens: int = 0) -> None:
         """
         Record token arrival from usage data
diff --git a/app/backend/model_control/model_utils.py b/app/backend/model_control/model_utils.py
index 4e91e214..9b6454b5 100644
--- a/app/backend/model_control/model_utils.py
+++ b/app/backend/model_control/model_utils.py
@@ -26,6 +26,53 @@
 encoded_jwt = jwt.encode(json_payload, backend_config.jwt_secret, algorithm="HS256")
 AUTH_TOKEN = os.getenv('CLOUD_CHAT_UI_AUTH_TOKEN', '')
 
+def messages_to_prompt(messages: list) -> str:
+    """Flatten chat messages into a plain-text prompt for base/completion models.
+
+    System text is emitted bare, user/assistant turns are labelled, other roles are
+    skipped, a ``None`` content becomes "", and a final "Assistant:" cue is appended.
+    """
+    labels = {"system": "", "user": "User: ", "assistant": "Assistant: "}
+    turns = ((m.get("role", "user"), m.get("content")) for m in messages)
+    parts = [
+        f"{labels[role]}{'' if content is None else content}"
+        for role, content in turns
+        if role in labels
+    ]
+    return "\n\n".join(parts + ["Assistant:"])
+
+
+def get_model_name_from_container(internal_url: str, fallback: str) -> str:
+    """Ask the container's /v1/models endpoint which model it actually serves.
+
+    Args:
+        internal_url: Raw internal URL from deploy cache, without scheme
+            (e.g. "container:7000/v1/chat/completions"); only host:port is used.
+        fallback: Value to return if the query fails (typically hf_model_id)
+
+    Returns:
+        The id of the first model the endpoint lists, or ``fallback`` on a
+        non-200 reply or any exception.
+    """
+    try:
+        host_port, _, _ = internal_url.partition("/")
+        models_url = f"http://{host_port}/v1/models"
+        resp = requests.get(
+            models_url, headers={"Authorization": f"Bearer {encoded_jwt}"}, timeout=3
+        )
+        if resp.status_code != 200:
+            logger.warning(
+                f"GET {models_url} returned {resp.status_code}, using fallback: {fallback}"
+            )
+            return fallback
+        model_id = resp.json()["data"][0]["id"]
+        logger.info(f"Resolved actual model name from /v1/models: {model_id}")
+        return model_id
+    except Exception as e:
+        logger.warning(f"Failed to query /v1/models ({e}), using fallback: {fallback}")
+        return fallback
+
+
 def get_deploy_cache():
     # the cache is initialized when by docker_control is imported
     def get_all_records():
@@ -45,13 +92,32 @@ def health_check(url, json_data, timeout=5):
     try:
         headers = {"Authorization": f"Bearer {encoded_jwt}"}
-        response = requests.get(url, json=json_data, headers=headers, timeout=5)
-        response.raise_for_status()
-        logger.info(f"Health check passed: {response.status_code}")
-        return True, response.json() if response.content else {}
+        response = requests.get(url, json=json_data, headers=headers, timeout=timeout)  # honour the caller's timeout
+    except requests.exceptions.ConnectionError as e:
+        # Port not yet listening — container is still starting up
+        logger.info(f"Health check: connection refused (starting): {e}")
+        return None, str(e)
     except requests.RequestException as e:
-        logger.error(f"Health check failed: {str(e)}")
+        logger.error(f"Health check failed (network error): {str(e)}")
         return False, str(e)
 
+    # Parse the body once; a non-JSON or empty body must not raise out of the check.
+    try:
+        body = response.json() if response.content else {}
+    except ValueError:
+        body = {}
+    if response.status_code == 200:
+        logger.info(f"Health check passed: {response.status_code}")
+        return True, body
+
+    # 503 with "not ready" means model is still loading (media-server models)
+    detail = body.get("detail", "") if isinstance(body, dict) else ""
+    if response.status_code == 503 and isinstance(detail, str) and "not ready" in detail.lower():
+        logger.info(f"Health check: model not ready yet (starting): {detail}")
+        return None, detail
+
+    logger.error(f"Health check failed: {response.status_code} {response.text[:200]}")
+    return False, response.text[:200]
+
 def stream_response_from_agent_api(url, json_data):
     logger.info('[TRACE_FLOW_STEP_3_BACKEND_TO_AGENT] stream_response_from_agent_api called', extra={'url': url, 'json_data': json_data})
     try:
@@ -173,7 +239,7 @@ def stream_to_cloud_model(url, json_data):
         json_data["top_k"] = int(top_k) if top_k is not None else 20
         json_data["top_p"] = float(top_p) if top_p is not None else 0.9
         json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512
-        json_data["stream_options"] = {"include_usage": True, "continuous_usage_stats": True}
+        json_data["stream_options"] = {"include_usage": True}
 
         # Log final parameters being used
         logger.info("=== Final Model Parameters ===")
@@ -231,7 +297,7 @@ def stream_to_cloud_model(url, json_data):
                                     chunk_dict = json.loads(sub_chunk)
                                     logger.info(f"Successfully parsed JSON: {chunk_dict}")
 
-                                    usage = chunk_dict.get("usage", {})
+                                    usage = chunk_dict.get("usage") or {}
                                     completion_tokens = usage.get("completion_tokens", 0)
                                     prompt_tokens = usage.get("prompt_tokens", 0)
                                     logger.info(f"Usage info: {usage}, completion tokens: {completion_tokens}")
@@ -314,7 +380,7 @@ def stream_response_from_external_api(url, json_data):
     json_data["top_k"] = int(top_k) if top_k is not None else 20
     json_data["top_p"] = float(top_p) if top_p is not None else 0.9
     json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512
-    json_data["stream_options"] = {"include_usage": True, "continuous_usage_stats": True}
+    json_data["stream_options"] = {"include_usage": True}
 
     # Log final parameters being used
     logger.info("=== Final Model Parameters ===")
@@ -366,23 +432,30 @@ def stream_response_from_external_api(url, json_data):
 
                     elif new_chunk != "":
                         chunk_dict = json.loads(new_chunk)
-                        usage = chunk_dict.get("usage", {})
-                        completion_tokens = usage.get("completion_tokens", 0)
-                        prompt_tokens = usage.get("prompt_tokens", 0)
 
-                        # Record token arrival using metrics tracker
-                        if completion_tokens > 0:
-                            tracker.record_token(
-                                completion_tokens=completion_tokens,
-                                prompt_tokens=prompt_tokens
-                            )
-                            logger.info(f"Recorded token: completion={completion_tokens}, TTFT={tracker.get_ttft():.4f}s, TPOT={tracker.get_tpot():.4f}s")
+                        # Track TTFT/TPOT from content delta chunks (accurate per-token timing)
+                        choices = chunk_dict.get("choices") or []
+                        if choices:
+                            delta_content = choices[0].get("delta", {}).get("content", "")
+                            if delta_content:
+                                tracker.record_content_token()
+                                logger.info(f"Recorded token: count={tracker.num_tokens}, TTFT={tracker.get_ttft():.4f}s, TPOT={tracker.get_tpot():.4f}s")
+
+                        # Capture prompt_tokens from usage chunk at the end
+                        usage = chunk_dict.get("usage") or {}
+                        prompt_tokens = usage.get("prompt_tokens", 0)
+                        if prompt_tokens > 0:
+                            tracker.set_prompt_tokens(prompt_tokens)
 
                     # Yield the current chunk
                     yield chunk
 
             logger.info("stream_response_from_external done")
 
+    except requests.exceptions.HTTPError as e:
+        http_status, body = (e.response.status_code, e.response.text) if e.response is not None else ("n/a", "(no body)")
+        logger.error(f"HTTPError {http_status}: {body}")  # e.response may be None; both fields share the guard above
+        yield f"error: {str(e)}"
     except requests.RequestException as e:
         logger.error(f"RequestException: {str(e)}")
         yield f"error: {str(e)}"
diff --git a/app/backend/model_control/pipeline_views.py b/app/backend/model_control/pipeline_views.py
new file mode 100644
index 00000000..d2265dad
--- /dev/null
+++ b/app/backend/model_control/pipeline_views.py
@@ -0,0 +1,227 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+"""
+Voice pipeline view: Whisper STT → LLM → TTS (optional).
+Accepts multipart/form-data and streams SSE events to the client.
+"""
+
+import base64
+import json
+import time
+
+import requests
+from django.http import StreamingHttpResponse
+from rest_framework import status
+from rest_framework.response import Response
+from rest_framework.views import APIView
+
+from model_control.model_utils import (
+    encoded_jwt,
+    get_deploy_cache,
+    stream_response_from_external_api,
+)
+from shared_config.logger_config import get_logger
+
+logger = get_logger(__name__)
+
+# Maximum number of one-second status polls for an enqueue-style TTS task.
+TTS_POLL_ATTEMPTS = 120
+
+
+def _sse(event_type, **fields):
+    """Format one SSE ``data:`` frame whose JSON object starts with ``type``."""
+    return f"data: {json.dumps({'type': event_type, **fields})}\n\n"
+
+
+def _audio_event(audio_resp):
+    """Wrap an audio HTTP response body in an ``audio_url`` event (base64 data URI)."""
+    audio_b64 = base64.b64encode(audio_resp.content).decode("utf-8")
+    content_type = audio_resp.headers.get("Content-Type", "audio/wav")
+    return _sse("audio_url", url=f"data:{content_type};base64,{audio_b64}")
+
+
+def _llm_text_from_chunk(chunk):
+    """Return the assistant text carried by one chunk of the LLM stream.
+
+    NOTE(review): the streaming helper appears to yield raw OpenAI-style SSE
+    frames ("data: {json}") plus control frames — confirm against its code.
+    SSE frames contribute only ``choices[0].delta.content``; a chunk that is
+    not SSE-framed is returned unchanged.
+    """
+    if not chunk.lstrip().startswith("data:"):
+        return chunk
+    pieces = []
+    for line in chunk.splitlines():
+        line = line.strip()
+        if not line.startswith("data:"):
+            continue
+        try:
+            delta = json.loads(line[len("data:"):])["choices"][0].get("delta") or {}
+            pieces.append(delta.get("content") or "")
+        except (ValueError, LookupError, TypeError, AttributeError):
+            # Control or stats frame (e.g. "[DONE]"): carries no assistant text.
+            continue
+    return "".join(pieces)
+
+
+def _synthesize_speech(tts_url, text, model_name, headers):
+    """Synthesize ``text`` and return the HTTP response whose body is the audio.
+
+    URLs containing ``/v1/audio/speech`` are called directly. Any other URL is
+    treated as an enqueue endpoint: submit, poll ``/status/<task_id>`` until it
+    reports "Completed", then download ``/fetch_audio/<task_id>``. A 404 from an
+    ``/enqueue`` URL falls back to ``/v1/audio/speech`` on the same host.
+
+    Raises:
+        requests.RequestException: on transport errors or non-2xx replies.
+        ValueError: if the enqueue reply carries no ``task_id``.
+        TimeoutError: if the task is not completed within the polling budget.
+    """
+    direct_payload = {"model": model_name, "text": text, "voice": "default"}
+    if "/v1/audio/speech" in tts_url:
+        resp = requests.post(tts_url, json=direct_payload, headers=headers, timeout=120)
+        resp.raise_for_status()
+        return resp
+
+    resp = requests.post(tts_url, json={"text": text}, headers=headers, timeout=30)
+    if resp.status_code == 404 and "/enqueue" in tts_url:
+        logger.info(f"Pipeline TTS 404 on {tts_url}, trying /v1/audio/speech")
+        fallback_url = tts_url.replace("/enqueue", "/v1/audio/speech")
+        resp = requests.post(fallback_url, json=direct_payload, headers=headers, timeout=120)
+        resp.raise_for_status()
+        return resp
+    resp.raise_for_status()
+
+    task_id = resp.json().get("task_id")
+    if not task_id:
+        raise ValueError("TTS enqueue response did not include a task_id")
+    status_url = tts_url.replace("/enqueue", f"/status/{task_id}")
+    for _ in range(TTS_POLL_ATTEMPTS):
+        poll = requests.get(status_url, headers=headers, timeout=10)
+        if poll.status_code != 404 and poll.json().get("status") == "Completed":
+            break
+        time.sleep(1)
+    else:
+        # Never fetch audio for a task that was not reported as completed.
+        raise TimeoutError(f"TTS task {task_id} not completed after {TTS_POLL_ATTEMPTS} polls")
+
+    audio_url = tts_url.replace("/enqueue", f"/fetch_audio/{task_id}")
+    audio_resp = requests.get(audio_url, headers=headers, timeout=30)
+    audio_resp.raise_for_status()
+    return audio_resp
+
+
+class VoicePipelineView(APIView):
+    """
+    POST /models-api/pipeline/voice/
+
+    Multipart fields:
+        audio_file        – audio blob
+        whisper_deploy_id – deploy_id of running Whisper
+        llm_deploy_id     – deploy_id of running LLM
+        tts_deploy_id     – (optional) deploy_id of running speecht5_tts
+        system_prompt     – (optional) string
+
+    SSE events (JSON objects with a ``type`` key): transcript, llm_chunk,
+    audio_url, error (with ``stage`` and ``message``) and a final done.
+    """
+
+    def post(self, request, *args, **kwargs):
+        """Validate the form fields, then stream the pipeline as SSE events."""
+        audio_file = request.FILES.get("audio_file")
+        whisper_deploy_id = request.data.get("whisper_deploy_id")
+        llm_deploy_id = request.data.get("llm_deploy_id")
+        tts_deploy_id = request.data.get("tts_deploy_id")
+        system_prompt = request.data.get(
+            "system_prompt",
+            "You are a helpful assistant. Be concise.",
+        )
+
+        if not audio_file:
+            return Response(
+                {"error": "audio_file is required"},
+                status=status.HTTP_400_BAD_REQUEST,
+            )
+        if not whisper_deploy_id or not llm_deploy_id:
+            return Response(
+                {"error": "whisper_deploy_id and llm_deploy_id are required"},
+                status=status.HTTP_400_BAD_REQUEST,
+            )
+
+        def event_stream():
+            headers = {"Authorization": f"Bearer {encoded_jwt}"}
+            deploy_cache = get_deploy_cache()
+
+            # Step 1: STT (Whisper)
+            try:
+                whisper_deploy = deploy_cache[whisper_deploy_id]
+                whisper_url = "http://" + whisper_deploy["internal_url"]
+                file_payload = {
+                    "file": (audio_file.name, audio_file, audio_file.content_type)
+                }
+                stt_resp = requests.post(
+                    whisper_url, files=file_payload, headers=headers, timeout=60
+                )
+                stt_resp.raise_for_status()
+                transcript = stt_resp.json().get("text", "")
+            except Exception as exc:
+                logger.error(f"STT step failed: {exc}")
+                yield _sse("error", stage="stt", message=str(exc))
+                return
+
+            yield _sse("transcript", text=transcript)
+            if not transcript:
+                yield _sse("error", stage="stt", message="Empty transcript")
+                return
+
+            # Step 2: LLM streaming. The deployment lookup lives inside the try so a
+            # stale llm_deploy_id yields an error event instead of killing the stream.
+            llm_text_parts = []
+            try:
+                llm_deploy = deploy_cache[llm_deploy_id]
+                llm_url = "http://" + llm_deploy["internal_url"]
+                messages = []
+                if system_prompt:
+                    messages.append({"role": "system", "content": system_prompt})
+                messages.append({"role": "user", "content": transcript})
+                llm_payload = {
+                    "model": llm_deploy["model_impl"].hf_model_id,
+                    "messages": messages,
+                    "stream": True,
+                    "max_tokens": 512,
+                }
+                for chunk in stream_response_from_external_api(llm_url, llm_payload):
+                    if isinstance(chunk, bytes):
+                        chunk = chunk.decode("utf-8")
+                    # The helper reports upstream failures in-band as "error: ..." chunks.
+                    if chunk.startswith("error: "):
+                        raise RuntimeError(chunk[len("error: "):])
+                    llm_text_parts.append(_llm_text_from_chunk(chunk))
+                    yield _sse("llm_chunk", text=chunk)
+            except Exception as exc:
+                logger.error(f"LLM step failed: {exc}")
+                yield _sse("error", stage="llm", message=str(exc))
+                return
+
+            # Step 3: TTS (optional)
+            speech_text = "".join(llm_text_parts).strip()
+            if tts_deploy_id and speech_text:
+                try:
+                    tts_deploy = deploy_cache[tts_deploy_id]
+                    tts_url = "http://" + tts_deploy["internal_url"]
+                    model_name = getattr(tts_deploy.get("model_impl"), "model_name", None)
+                    yield _audio_event(
+                        _synthesize_speech(tts_url, speech_text, model_name, headers)
+                    )
+                except Exception as exc:
+                    logger.error(f"TTS step failed: {exc}")
+                    yield _sse("error", stage="tts", message=str(exc))
+                    # Don't abort — transcript and LLM response were already sent
+
+            yield _sse("done")
+
+        response = StreamingHttpResponse(event_stream(), content_type="text/event-stream")
+        response["Cache-Control"] = "no-cache"
+        response["X-Accel-Buffering"] = "no"
+        return response
diff --git a/app/backend/model_control/test_tts_fallback.py b/app/backend/model_control/test_tts_fallback.py
new file mode 100644
index 00000000..a46da61d
--- /dev/null
+++ b/app/backend/model_control/test_tts_fallback.py
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+"""
+Tests for TTS inference view fallback behavior.
+"""
+
+import pytest
+from unittest.mock import Mock, patch, MagicMock
+from rest_framework.test import APIRequestFactory
+from rest_framework import status as http_status
+
+from model_control.views import TtsInferenceView, OpenAIAudioSpeechView
+
+
+def _media_tts_cache(deploy_id, internal_url):
+    """Build a deploy cache holding one media-engine speecht5_tts deployment."""
+    impl = Mock()
+    impl.model_name = "speecht5_tts"
+    impl.inference_engine = "media"
+    return {deploy_id: {"internal_url": internal_url, "model_impl": impl}}
+
+
+def _not_found_response():
+    """Mocked upstream reply with HTTP status 404."""
+    resp = Mock()
+    resp.status_code = 404
+    return resp
+
+
+def _audio_response():
+    """Mocked upstream reply with HTTP status 200 and a fake WAV body."""
+    resp = Mock()
+    resp.status_code = 200
+    resp.headers = {"Content-Type": "audio/wav"}
+    resp.content = b"fake_audio_data"
+    return resp
+
+
+def _post_json(view_cls, path, payload):
+    """POST ``payload`` as JSON to ``view_cls`` and return the view's response."""
+    request = APIRequestFactory().post(path, payload, format='json')
+    return view_cls.as_view()(request)
+
+
+class TestTtsInferenceFallback:
+    """Test TTS inference view with fallback to /v1/audio/speech on 404."""
+
+    @patch('model_control.views.get_deploy_cache')
+    @patch('model_control.views.requests.post')
+    def test_tts_fallback_on_404_from_enqueue(self, mock_post, mock_cache):
+        """When /enqueue returns 404 for TTS media model, should retry with /v1/audio/speech."""
+        mock_cache.return_value = _media_tts_cache("test_deploy_id", "speecht5_tts:7000/enqueue")
+        # First call returns 404, second call succeeds
+        mock_post.side_effect = [_not_found_response(), _audio_response()]
+
+        response = _post_json(
+            TtsInferenceView, '/models-api/tts/', {'deploy_id': 'test_deploy_id', 'text': 'Hello world'}
+        )
+
+        # Verify fallback was attempted
+        assert mock_post.call_count == 2
+        first_call_url, second_call_url = (call[0][0] for call in mock_post.call_args_list)
+        assert "enqueue" in first_call_url
+        assert "/v1/audio/speech" in second_call_url
+        assert response.status_code == 200
+
+    @patch('model_control.views.get_deploy_cache')
+    @patch('model_control.views.requests.post')
+    def test_tts_success_without_fallback(self, mock_post, mock_cache):
+        """When initial request succeeds, should not retry."""
+        mock_cache.return_value = _media_tts_cache("test_deploy_id", "speecht5_tts:7000/v1/audio/speech")
+        mock_post.return_value = _audio_response()
+
+        response = _post_json(
+            TtsInferenceView, '/models-api/tts/', {'deploy_id': 'test_deploy_id', 'text': 'Hello world'}
+        )
+
+        # Verify no fallback was needed
+        assert mock_post.call_count == 1
+        assert response.status_code == 200
+
+
+class TestOpenAIAudioSpeechFallback:
+    """Test OpenAI audio/speech view with fallback to /v1/audio/speech on 404."""
+
+    @patch('model_control.views.get_deploy_cache')
+    @patch('model_control.views.requests.post')
+    def test_openai_audio_fallback_on_404(self, mock_post, mock_cache):
+        """OpenAI endpoint should also retry with /v1/audio/speech on 404."""
+        mock_cache.return_value = _media_tts_cache("deploy_1", "speecht5_tts:7000/enqueue")
+        # First call returns 404, second call succeeds
+        mock_post.side_effect = [_not_found_response(), _audio_response()]
+
+        response = _post_json(
+            OpenAIAudioSpeechView, '/v1/audio/speech', {'model': 'speecht5_tts', 'input': 'Hello world'}
+        )
+
+        # Verify fallback was attempted
+        assert mock_post.call_count == 2
+        second_call_url = mock_post.call_args_list[1][0][0]
+        assert "/v1/audio/speech" in second_call_url
+        assert response.status_code == 200
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/app/backend/model_control/urls.py b/app/backend/model_control/urls.py
index 158dfde3..74590b91 100644
--- a/app/backend/model_control/urls.py
+++ b/app/backend/model_control/urls.py
@@ -5,6 +5,7 @@
 # model_control/urls.py
 from django.urls import path
 from . import views
+from .pipeline_views import VoicePipelineView
 
 urlpatterns = [
     path("inference/", views.InferenceView.as_view()),
@@ -18,6 +19,8 @@
     path("object-detection-cloud/", views.ObjectDetectionInferenceCloudView.as_view()),
     path("speech-recognition/", views.SpeechRecognitionInferenceView.as_view()),
     path("speech-recognition-cloud/", views.SpeechRecognitionInferenceCloudView.as_view()),
+    path("tts/", views.TtsInferenceView.as_view()),
+    path("pipeline/voice/", VoicePipelineView.as_view()),
     path("health/", views.ModelHealthView.as_view()),
     path("inference_cloud/", views.InferenceCloudView.as_view()),
     path("logs/<str:container_id>/", views.ContainerLogsView.as_view(), name="container-logs"),
diff --git a/app/backend/model_control/views.py b/app/backend/model_control/views.py
index 64bdc46b..7c03c671 100644
--- a/app/backend/model_control/views.py
+++ b/app/backend/model_control/views.py
@@ -42,6 +42,8 @@ def select_renderer(self, request, renderers, format_suffix):
 from model_control.model_utils import (
     encoded_jwt,
     get_deploy_cache,
+    get_model_name_from_container,
+    messages_to_prompt,
     stream_response_from_external_api,
     stream_response_from_agent_api,
     health_check,
@@ -57,6 +59,7 @@ def select_renderer(self, request, renderers, format_suffix):
 
 
 
+TTS_API_KEY = os.environ.get("TTS_API_KEY", "")
 CLOUD_CHAT_UI_URL =os.environ.get("CLOUD_CHAT_UI_URL")
 CLOUD_YOLOV4_API_URL = os.environ.get("CLOUD_YOLOV4_API_URL")
 CLOUD_YOLOV4_API_AUTH_TOKEN = os.environ.get("CLOUD_YOLOV4_API_AUTH_TOKEN")
@@ -85,8 +88,18 @@ def post(self, request, *args, **kwargs):
             internal_url = "http://" + deploy["internal_url"]
             logger.info(f"internal_url:= {internal_url}")
             logger.info(f"using vllm model:= {deploy["model_impl"].model_name}")
-            data["model"] = deploy["model_impl"].hf_model_id
-            
+            data["model"] = get_model_name_from_container(
+                deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id
+            )
+
+            # Route base/completion models to /v1/completions with a plain prompt
+            service_route = deploy["model_impl"].service_route
+            logger.info(f"service_route:= {service_route}")
+            if service_route == "/v1/completions":
+                messages = data.pop("messages", [])
+                data["prompt"] = messages_to_prompt(messages)
+                data.pop("stream_options", None)
+
             # Create a generator that can be cancelled
             def generate_response():
                 try:
@@ -116,7 +129,9 @@ def post(self, request, *agrs, **kwargs):
         if deploy_id and deploy_id in deploy_cache:
             deploy = deploy_cache[deploy_id]
             logger.info(f"using vllm model:= {deploy['model_impl'].model_name}")
-            data["model"] = deploy["model_impl"].hf_model_id
+            data["model"] = get_model_name_from_container(
+                deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id
+            )
         else:
             logger.info("No valid deployment found, proceeding with agent-only mode (cloud LLM)")
             # Remove deploy_id from data since it's not needed for agent
@@ -195,9 +210,12 @@ def get(self, request, *args, **kwargs):
             deploy = get_deploy_cache()[deploy_id]
             health_url = "http://" + deploy["health_url"]
             check_passed, health_content = health_check(health_url, json_data=None)
-            if check_passed:
+            if check_passed is True:
                 ret_status = status.HTTP_200_OK
                 content = {"message": "Healthy", "details": health_content}
+            elif check_passed is None:
+                ret_status = status.HTTP_202_ACCEPTED
+                content = {"message": "Starting", "details": health_content}
             else:
                 ret_status = status.HTTP_503_SERVICE_UNAVAILABLE
                 content = {"message": "Unavailable", "details": health_content}
@@ -615,10 +633,111 @@ def post(self, request, *args, **kwargs):
 
         return Response(inference_data.json(), status=status.HTTP_200_OK)
 
+class TtsInferenceView(APIView):
+    """Text-to-speech inference: supports both OpenAI-style and enqueue-style endpoints."""
+    def post(self, request, *args, **kwargs):
+        """Proxy a TTS request to the deployment named by ``deploy_id`` and return its audio."""
+        data = request.data
+        logger.info(f"{self.__class__.__name__} data:={data}")
+        serializer = InferenceSerializer(data=data)
+        if not serializer.is_valid():
+            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
+        text = data.get("text") or data.get("prompt")
+        if not text:
+            return Response({"error": "text is required"}, status=status.HTTP_400_BAD_REQUEST)
+        deploy_id = data.get("deploy_id")
+        # .get() so an unknown or stale deploy_id yields a 404 instead of an unhandled KeyError.
+        deploy = get_deploy_cache().get(deploy_id)
+        if deploy is None:
+            return Response({"error": f"No running deployment found for deploy_id '{deploy_id}'"}, status=status.HTTP_404_NOT_FOUND)
+        internal_url = "http://" + deploy["internal_url"]
+        model_impl = deploy.get("model_impl")
+        model_name = getattr(model_impl, "model_name", None)
+        inference_engine = getattr(model_impl, "inference_engine", None)
+        # The "media" engine gets the TTS API key and a "text" field; any other
+        # engine gets the backend JWT and the OpenAI-style "input" field.
+        if inference_engine == "media":
+            headers = {"Authorization": f"Bearer {TTS_API_KEY}"}
+            payload = {"model": model_name, "text": text, "voice": "default"}
+        else:
+            headers = {"Authorization": f"Bearer {encoded_jwt}"}
+            payload = {"model": model_name, "input": text, "voice": "default"}
+        try:
+            audio_resp = requests.post(internal_url, json=payload, headers=headers, timeout=120)
+            # If 404 on /enqueue for TTS media model, retry with /v1/audio/speech
+            if audio_resp.status_code == 404 and inference_engine == "media" and "/enqueue" in internal_url:
+                logger.info(f"TTS 404 on {internal_url}, retrying with /v1/audio/speech")
+                fallback_url = internal_url.replace("/enqueue", "/v1/audio/speech")
+                audio_resp = requests.post(fallback_url, json=payload, headers=headers, timeout=120)
+            audio_resp.raise_for_status()
+        except requests.exceptions.RequestException as req_err:
+            # RequestException also covers connection failures and timeouts, not just HTTP status errors.
+            logger.error(f"TTS HTTP error: {req_err}")
+            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
+        content_type = audio_resp.headers.get("Content-Type", "audio/wav")
+        django_response = HttpResponse(audio_resp.content, content_type=content_type)
+        django_response["Content-Disposition"] = "attachment; filename=tts_output.wav"
+        return django_response
+
+
+class OpenAIAudioSpeechView(APIView):
+    """OpenAI-compatible POST /v1/audio/speech — looks up deployed TTS model by name."""
+    def post(self, request, *args, **kwargs):
+        """Forward the text to the deployment whose model_name equals ``model`` and return its audio."""
+        data = request.data
+        model_name = data.get("model")
+        text = data.get("input") or data.get("text")
+        if not model_name:
+            return Response({"error": "model is required"}, status=status.HTTP_400_BAD_REQUEST)
+        if not text:
+            return Response({"error": "input is required"}, status=status.HTTP_400_BAD_REQUEST)
+
+        # Find a running TTS deployment matching the requested model name
+        deploy = None
+        for entry in get_deploy_cache().values():
+            impl = entry.get("model_impl")
+            if impl and getattr(impl, "model_name", None) == model_name:
+                deploy = entry
+                break
+        if deploy is None:
+            return Response(
+                {"error": f"No running deployment found for model '{model_name}'"},
+                status=status.HTTP_404_NOT_FOUND,
+            )
+
+        internal_url = "http://" + deploy["internal_url"]
+        inference_engine = getattr(deploy.get("model_impl"), "inference_engine", None)
+        voice = data.get("voice", "default")
+        # The "media" engine gets the TTS API key and a "text" field; any other
+        # engine gets the backend JWT and the OpenAI-style "input" field.
+        if inference_engine == "media":
+            headers = {"Authorization": f"Bearer {TTS_API_KEY}"}
+            payload = {"model": model_name, "text": text, "voice": voice}
+        else:
+            headers = {"Authorization": f"Bearer {encoded_jwt}"}
+            payload = {"model": model_name, "input": text, "voice": voice}
+
+        try:
+            audio_resp = requests.post(internal_url, json=payload, headers=headers, timeout=120)
+            # If 404 on /enqueue for TTS media model, retry with /v1/audio/speech
+            if audio_resp.status_code == 404 and inference_engine == "media" and "/enqueue" in internal_url:
+                logger.info(f"OpenAI audio/speech 404 on {internal_url}, retrying with /v1/audio/speech")
+                fallback_url = internal_url.replace("/enqueue", "/v1/audio/speech")
+                audio_resp = requests.post(fallback_url, json=payload, headers=headers, timeout=120)
+            audio_resp.raise_for_status()
+        except requests.exceptions.RequestException as req_err:
+            # RequestException also covers connection failures and timeouts, not just HTTP status errors.
+            logger.error(f"OpenAI audio/speech HTTP error: {req_err}")
+            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+        content_type = audio_resp.headers.get("Content-Type", "audio/wav")
+        return HttpResponse(audio_resp.content, content_type=content_type)
+
+
 class ContainerLogsView(View):
     # Define event detection configuration before the get method
     SIMPLE_EVENT_KEYWORDS = [
-        '[ERROR]', '[FATAL]', '[CRITICAL]', 
+        '[ERROR]', '[FATAL]', '[CRITICAL]',
         '[WARN]', '[WARNING]',
         'RESPONSE_Q OUT OF SYNC',
         'ABORTED', 'CORE DUMPED',
diff --git a/app/backend/shared_config/model_config.py b/app/backend/shared_config/model_config.py
index 9d9a7e93..21545fda 100644
--- a/app/backend/shared_config/model_config.py
+++ b/app/backend/shared_config/model_config.py
@@ -2,6 +2,7 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
+import json
 import os
 from dataclasses import dataclass, asdict
 from typing import Set, Dict, Any, Union
@@ -11,7 +12,6 @@
 from shared_config.backend_config import backend_config
 from shared_config.setup_config import SetupTypes
 from shared_config.model_type_config import ModelTypes
-from shared_config.model_type_config import ModelTypes
 from shared_config.logger_config import get_logger
 
 logger = get_logger(__name__)
@@ -62,6 +62,8 @@ class ModelImpl:
     service_port: int = 7000
     env_file: str = ""
     health_route: str = "/health"
+    display_model_type: str = "LLM"
+    inference_engine: str = "vllm"
 
     def __post_init__(self):
         # _init methods compute values that are dependent on other values
@@ -216,54 +218,113 @@ def base_docker_config():
     }
 
 
-# model_ids are unique strings to define a model, they could be uuids but
-# using friendly strings prefixed with id_ is more helpful for debugging
+# ---------------------------------------------------------------------------
+# JSON-based model loader
+# ---------------------------------------------------------------------------
+
+CATALOG_JSON = Path(__file__).parent / "models_from_inference_server.json"
+
+# device_type strings in the catalog → DeviceConfigurations member names
+# (only names that actually exist in the enum; others are skipped)
+_CATALOG_DEVICE_MAP = {
+    "N150": "N150",
+    "N300": "N300",
+    "T3K": "T3K",
+    "N150X4": "N150X4",
+    "P100": "P100",
+    "P150": "P150",
+    "P150X4": "P150X4",
+    "P150X8": "P150X8",
+    "GALAXY": "GALAXY",
+    "GALAXY_T3K": "GALAXY_T3K",
+}
+
+
+def load_model_implementations_from_json(json_path: Path) -> list:
+    """Build one ModelImpl per catalog entry; raises KeyError if "model_name"/"service_route" is missing."""
+    with open(json_path, encoding="utf-8") as f:
+        catalog = json.load(f)
+    impls = []
+    for entry in catalog["models"]:
+        docker_image = entry.get("docker_image") or ""
+        # Only a ":" after the last "/" starts a tag; an earlier one is a registry host:port.
+        repo, sep, tag = docker_image.rpartition(":")
+        if sep and "/" not in tag:
+            image_name, image_tag = repo, tag
+        else:
+            image_name, image_tag = docker_image, "latest"
+
+        device_configs = {
+            DeviceConfigurations[_CATALOG_DEVICE_MAP[d]]
+            for d in entry.get("device_configurations", [])
+            if d in _CATALOG_DEVICE_MAP
+        }
+        try:
+            model_type = ModelTypes[entry["model_type"]]
+        except KeyError:
+            model_type = ModelTypes.CHAT
+        try:
+            setup_type = SetupTypes[entry["setup_type"]]
+        except KeyError:
+            setup_type = SetupTypes.TT_INFERENCE_SERVER
+
+        cfg = base_docker_config()
+        cfg["environment"].update(entry.get("env_vars") or {})
+        impl = ModelImpl(
+            model_name=entry["model_name"],
+            hf_model_id=entry.get("hf_model_id"),
+            image_name=image_name,
+            image_tag=image_tag,
+            device_configurations=device_configs,
+            docker_config=cfg,
+            service_route=entry["service_route"],
+            setup_type=setup_type,
+            model_type=model_type,
+            version=entry.get("version", "0.0.1"),
+            shm_size=entry.get("shm_size", "32G"),
+            display_model_type=entry.get("display_model_type", "LLM"),
+            inference_engine=entry.get("inference_engine", "vllm"),
+        )
+        impls.append(impl)
+    return impls
 
-# Helper device configuration sets for easier management
-N150_N300 = {DeviceConfigurations.N150, DeviceConfigurations.N150_WH_ARCH_YAML, DeviceConfigurations.N300, DeviceConfigurations.N300_WH_ARCH_YAML}
-ALL_BOARDS = {DeviceConfigurations.N150, DeviceConfigurations.N150_WH_ARCH_YAML, DeviceConfigurations.N300, DeviceConfigurations.N300_WH_ARCH_YAML, DeviceConfigurations.N300x4, DeviceConfigurations.N300x4_WH_ARCH_YAML}
-T3000_ONLY = {DeviceConfigurations.N300x4, DeviceConfigurations.N300x4_WH_ARCH_YAML}
 
-model_implmentations_list = [
-    # Speech Recognition - Can run on N150 and N300
+# ---------------------------------------------------------------------------
+# Hardcoded models NOT present in tt-inference-server catalog
+# ---------------------------------------------------------------------------
+
+_ALL_WH_BOARDS = {
+    DeviceConfigurations.N150,
+    DeviceConfigurations.N150_WH_ARCH_YAML,
+    DeviceConfigurations.N300,
+    DeviceConfigurations.N300_WH_ARCH_YAML,
+    DeviceConfigurations.N300x4,
+    DeviceConfigurations.N300x4_WH_ARCH_YAML,
+}
+
+_hardcoded_impls = [
+    # Object Detection - legacy YOLOv4 (not in tt-inference-server catalog)
     ModelImpl(
-        model_name="Whisper-Distil-Large-v3",
-        model_id="id_whisper_distil_large_v3_v0.1.0",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-whisper-distil-large-v3-dev",
-        image_tag="v0.0.1-tt-metal-1a1a9e2bb102",
-        device_configurations=ALL_BOARDS,  # Can run on N150 and N300
+        model_name="YOLOv4",
+        model_id="id_yolov4v0.0.1",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-yolov4-src-base",
+        image_tag="v0.0.1-tt-metal-65d246482b3f",
+        device_configurations=_ALL_WH_BOARDS,
         docker_config=base_docker_config(),
         shm_size="32G",
         service_port=7000,
-        service_route="/inference",
-        health_route="/",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.SPEECH_RECOGNITION,
+        service_route="/objdetection_v2",
+        setup_type=SetupTypes.NO_SETUP,
+        model_type=ModelTypes.OBJECT_DETECTION,
+        display_model_type="CNN",
     ),
-    # TODO: add this model back in when its in tt-inference-server-main branch
-    # Image Generation - Can run on N150 and N300
-    # ModelImpl(
-    #     model_name="Stable-Diffusion-3.5-medium",
-    #     model_id="id_stable_diffusion_3.5_mediumv0.1.0",
-    #     image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-stable-diffusion-3.5-src-base",
-    #     image_tag="v0.0.1-tt-metal-a0560feb3eed",
-    #     device_configurations=ALL_BOARDS,  # Can run on N150 and N300
-    #     docker_config=base_docker_config(),
-    #     shm_size="32G",
-    #     service_port=7000,
-    #     service_route="/enqueue",
-    #     health_route="/",
-    #     setup_type=SetupTypes.TT_INFERENCE_SERVER,
-    #     model_type=ModelTypes.IMAGE_GENERATION,
-    # ),
-
-    # Image Generation - Can run on N150 and N300
+    # Legacy Stable-Diffusion-1.4 (not in tt-inference-server catalog)
     ModelImpl(
         model_name="Stable-Diffusion-1.4",
         model_id="id_stable_diffusionv0.1.0",
         image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-stable-diffusion-1.4-src-base",
         image_tag="v0.0.1-tt-metal-cc8b4e1dac99",
-        device_configurations=ALL_BOARDS,  # Can run on N150 and N300
+        device_configurations=_ALL_WH_BOARDS,
         docker_config=base_docker_config(),
         shm_size="32G",
         service_port=7000,
@@ -271,148 +332,96 @@ def base_docker_config():
         health_route="/",
         setup_type=SetupTypes.TT_INFERENCE_SERVER,
         model_type=ModelTypes.IMAGE_GENERATION,
+        display_model_type="IMAGE",
     ),
+]
 
-    # Object Detection - Can run on all boards
-    ModelImpl(
-        model_name="YOLOv4",
-        model_id="id_yolov4v0.0.1",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-yolov4-src-base",
-        image_tag="v0.0.1-tt-metal-65d246482b3f",
-        device_configurations=ALL_BOARDS,  # Can run on all boards
-        docker_config=base_docker_config(),
-        shm_size="32G",
-        service_port=7000,
-        service_route="/objdetection_v2",
-        setup_type=SetupTypes.NO_SETUP,
-        model_type=ModelTypes.OBJECT_DETECTION
-    ),
 
-    # Mock Chat 
-    # TODO: currently not working.
-    # remove this model for now until its in tt-inference-server-main branch
-    #  TODO: add / make a new mock model
-    # ModelImpl(
-    #     hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
-    #     model_name="Mock-Llama-3.1-70B-Instruct",
-    #     model_id="id_mock_vllm_modelv0.0.1",
-    #     image_name="ghcr.io/tenstorrent/tt-inference-server/mock.vllm.openai.api",
-    #     image_tag="v0.0.1-tt-metal-385904186f81-384f1790c3be",
-    #     device_configurations={DeviceConfigurations.CPU},
-    #     docker_config=base_docker_config(),
-    #     shm_size="1G",
-    #     service_port=7000,
-    #     service_route="/v1/chat/completions",
-    #     setup_type=SetupTypes.MAKE_VOLUMES,
-    #     model_type=ModelTypes.MOCK
-    # ),
-
-    # --- Chat Models ---
-
-    # 1B, 3B, 8B, 11B models - Can run on all boards
-    ModelImpl(
-        hf_model_id="meta-llama/Llama-3.2-1B-Instruct",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations=ALL_BOARDS,  # Can run on all boards
-        docker_config=base_docker_config(),
-        service_route="/v1/chat/completions",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
+def validate_model_implemenation_config(impl):
+    """Assert that impl.model_id contains no "/" (model_id will be used in path names)."""
+    assert "/" not in impl.model_id, f"model_id must not contain '/': {impl.model_id!r}"
 
-    ),
-    ModelImpl(
-        hf_model_id="meta-llama/Llama-3.2-3B-Instruct",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations=ALL_BOARDS,  # Can run on all boards
-        docker_config=base_docker_config(),
-        service_route="/v1/chat/completions",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
-  
-    ),
-    ModelImpl(
-        hf_model_id="meta-llama/Llama-3.1-8B-Instruct",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations=ALL_BOARDS | {DeviceConfigurations.P300Cx2},
-        docker_config=base_docker_config(),
-        service_route="/v1/chat/completions",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
 
-    ),
-    # TODO: add this model back in when its in tt-inference-server-main branch
-    # ModelImpl(
-    #     hf_model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",
-    #     image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-    #     image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-    #     device_configurations=ALL_BOARDS,  # Can run on all boards
-    #     docker_config=base_docker_config(),
-    #     service_route="/v1/chat/completions",
-    #     setup_type=SetupTypes.TT_INFERENCE_SERVER,
-    #     model_type=ModelTypes.CHAT
- 
-    # ),
-
-    # 32B models - T3000 and P300Cx2
-    ModelImpl(
-        hf_model_id="Qwen/Qwen3-32B",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations={DeviceConfigurations.N300x4, DeviceConfigurations.N300x4_WH_ARCH_YAML, DeviceConfigurations.P300Cx2},
-        docker_config=base_docker_config(),
-        service_route="/v1/chat/completions",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
-    ),
+# ---------------------------------------------------------------------------
+# Build final model_implmentations dict
+# ---------------------------------------------------------------------------
 
-    # 70B models - Only T3000
+_json_impls = load_model_implementations_from_json(CATALOG_JSON)
 
-    ModelImpl(
-        hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations=T3000_ONLY,  # Only T3000
-        docker_config=base_docker_config(),
-        shm_size="32G",
-        service_port=7000,
-        service_route="/v1/chat/completions",
-        env_file=os.environ.get("VLLM_LLAMA31_ENV_FILE"),
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
-    ),
-    # ModelImpl(
-    #     hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
-    #     image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-    #     image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-    #     device_configurations=T3000_ONLY,  # Only T3000
-    #     docker_config=base_docker_config(),
-    #     service_route="/v1/chat/completions",
-    #     setup_type=SetupTypes.TT_INFERENCE_SERVER,
-    #     model_type=ModelTypes.CHAT
-    # ),
-    ModelImpl(
-        hf_model_id="meta-llama/Llama-3.3-70B-Instruct",
-        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64",
-        image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc",
-        device_configurations=T3000_ONLY | {DeviceConfigurations.P300Cx2},
-        docker_config=base_docker_config(),
-        service_route="/v1/chat/completions",
-        setup_type=SetupTypes.TT_INFERENCE_SERVER,
-        model_type=ModelTypes.CHAT
-    ),
-    #! Add new model vLLM model implementations here
-]
+model_implmentations = {}
+for impl in _json_impls + _hardcoded_impls:
+    validate_model_implemenation_config(impl)
+    model_implmentations[impl.model_id] = impl
 
-def validate_model_implemenation_config(impl):
-    # no / in model_id strings, model_id will be used in path names
-    assert not "/" in impl.model_id
 
+# ---------------------------------------------------------------------------
+# Chip Requirement Inference
+# ---------------------------------------------------------------------------
 
-# build and validate the model_implmentations config
-model_implmentations = {}
-for impl in model_implmentations_list:
-    validate_model_implemenation_config(impl)
-    model_implmentations[impl.model_id] = impl
\ No newline at end of file
+# Board type classifications for chip allocation
+SINGLE_CHIP_BOARDS_STR = {"N150", "N300", "E150", "P100", "P150", "P300c"}
+MULTI_CHIP_ONLY_BOARDS_STR = {
+    "T3K", "GALAXY", "GALAXY_T3K", "P150X4", "P150X8",
+    "N150X4", "N300x4", "P300Cx2", "P300Cx4"
+}
+
+
+def infer_chips_required(device_configurations: Set[DeviceConfigurations]) -> int:
+    """
+    Infer chip requirements from device_configurations set.
+
+    Logic:
+    - If model supports ANY single-chip board → requires 1 chip
+    - If model ONLY supports multi-chip boards → requires 4 chips
+    - Default to 1 chip for unknown configurations
+
+    Member names are compared without a trailing "_WH_ARCH_YAML" (N150_WH_ARCH_YAML counts as N150).
+
+    Args:
+        device_configurations: Set of DeviceConfigurations enum values
+
+    Returns:
+        Number of chips required (1 or 4)
+
+    Examples:
+        infer_chips_required({DeviceConfigurations.N150, DeviceConfigurations.T3K}) → 1
+        infer_chips_required({DeviceConfigurations.T3K, DeviceConfigurations.GALAXY}) → 4
+    """
+    if not device_configurations:
+        return 1  # Default to single chip
+
+    # Reduce each member to its base board name so the *_WH_ARCH_YAML variants are recognised.
+    suffix = "_WH_ARCH_YAML"
+    config_names = {
+        cfg.name[: -len(suffix)] if cfg.name.endswith(suffix) else cfg.name
+        for cfg in device_configurations
+    }
+
+    # If ANY single-chip board is supported → 1 chip
+    if config_names & SINGLE_CHIP_BOARDS_STR:
+        return 1
+
+    # ONLY multi-chip boards supported → 4 chips; unknown boards default to single chip
+    return 4 if config_names & MULTI_CHIP_ONLY_BOARDS_STR else 1
+
+
+def get_model_chip_requirement(model_name: str) -> int:
+    """
+    Look up a model by name and report how many chips it needs.
+
+    The first entry of model_implmentations whose model_name matches has its
+    device_configurations passed to infer_chips_required.
+
+    Args:
+        model_name: Name of the model (e.g., "Llama-3.1-70B-Instruct")
+
+    Returns:
+        Result of infer_chips_required for the match, or 1 when nothing matches
+    """
+    candidates = (m for m in model_implmentations.values() if m.model_name == model_name)
+    found = next(candidates, None)
+    if found is not None:
+        return infer_chips_required(found.device_configurations)
+    # Unknown model: warn and fall back to a single chip
+    logger.warning(f"Model {model_name} not found in model_implmentations, defaulting to 1 chip")
+    return 1
\ No newline at end of file
diff --git a/app/backend/shared_config/model_type_config.py b/app/backend/shared_config/model_type_config.py
index 769754c1..a5d0e584 100644
--- a/app/backend/shared_config/model_type_config.py
+++ b/app/backend/shared_config/model_type_config.py
@@ -9,4 +9,9 @@ class ModelTypes(Enum):
     CHAT = "chat"
     OBJECT_DETECTION = "object_detection"
     IMAGE_GENERATION = "image_generation"
-    SPEECH_RECOGNITION = "speech_recognition"
\ No newline at end of file
+    SPEECH_RECOGNITION = "speech_recognition"
+    VLM = "vlm"
+    TTS = "tts"
+    VIDEO = "video_generation"
+    EMBEDDING = "embedding"
+    CNN = "cnn"
\ No newline at end of file
diff --git a/app/backend/shared_config/models_from_inference_server.json b/app/backend/shared_config/models_from_inference_server.json
new file mode 100644
index 00000000..b228e375
--- /dev/null
+++ b/app/backend/shared_config/models_from_inference_server.json
@@ -0,0 +1,1740 @@
+{
+  "source": {
+    "artifact_version": "0.9.0",
+    "generated_at": "2026-03-03T15:24:14.263161+00:00"
+  },
+  "total_models": 60,
+  "models": [
+    {
+      "model_name": "DeepSeek-R1-Distill-Llama-70B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 70
+    },
+    {
+      "model_name": "distil-large-v3",
+      "model_type": "SPEECH_RECOGNITION",
+      "display_model_type": "AUDIO",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "T3K"
+      ],
+      "hf_model_id": "distil-whisper/distil-large-v3",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "FLUX.1-dev",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "black-forest-labs/FLUX.1-dev",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "FLUX.1-schnell",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "black-forest-labs/FLUX.1-schnell",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Llama-3.1-70B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.1-70B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 70
+    },
+    {
+      "model_name": "Llama-3.1-70B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.1-70B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 70
+    },
+    {
+      "model_name": "Llama-3.1-8B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "N150",
+        "N300",
+        "P100",
+        "P150",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.1-8B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 8
+    },
+    {
+      "model_name": "Llama-3.1-8B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "N150",
+        "N300",
+        "P100",
+        "P150",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.1-8B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 8
+    },
+    {
+      "model_name": "Llama-3.3-70B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "P150X4",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.3-70B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 70
+    },
+    {
+      "model_name": "Mistral-7B-Instruct-v0.3",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 7
+    },
+    {
+      "model_name": "mochi-1-preview",
+      "model_type": "VIDEO",
+      "display_model_type": "VIDEO",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "genmo/mochi-1-preview",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Motif-Image-6B-Preview",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "Motif-Technologies/Motif-Image-6B-Preview",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 6
+    },
+    {
+      "model_name": "Qwen3-32B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "P150X8",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen3-32B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 32
+    },
+    {
+      "model_name": "speecht5_tts",
+      "model_type": "TTS",
+      "display_model_type": "TEXT_TO_SPEECH",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "microsoft/speecht5_tts",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-a9b09e0",
+      "service_route": "/v1/audio/speech",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "stable-diffusion-3.5-large",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "stabilityai/stable-diffusion-3.5-large",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "stable-diffusion-xl-1.0-inpainting-0.1",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.5.0-fbbbd2da8cfab49ddf43d28dd9c0813a3c3ee2bd",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "stable-diffusion-xl-base-1.0",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "stable-diffusion-xl-base-1.0-img-2-img",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0-img-2-img",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Wan2.2-T2V-A14B-Diffusers",
+      "model_type": "VIDEO",
+      "display_model_type": "VIDEO",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 14
+    },
+    {
+      "model_name": "whisper-large-v3",
+      "model_type": "SPEECH_RECOGNITION",
+      "display_model_type": "AUDIO",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "T3K"
+      ],
+      "hf_model_id": "openai/whisper-large-v3",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "COMPLETE",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Llama-3.2-11B-Vision",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-11B-Vision",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 11
+    },
+    {
+      "model_name": "Llama-3.2-11B-Vision-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-11B-Vision-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 11
+    },
+    {
+      "model_name": "Llama-3.2-1B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-1B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 1
+    },
+    {
+      "model_name": "Llama-3.2-1B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-1B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 1
+    },
+    {
+      "model_name": "Llama-3.2-3B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-3B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 3
+    },
+    {
+      "model_name": "Llama-3.2-3B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-3B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 3
+    },
+    {
+      "model_name": "Llama-3.2-90B-Vision",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-90B-Vision",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "MAX_PREFILL_CHUNK_SIZE": 16
+      },
+      "param_count": 90
+    },
+    {
+      "model_name": "Llama-3.2-90B-Vision-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "T3K"
+      ],
+      "hf_model_id": "meta-llama/Llama-3.2-90B-Vision-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "MAX_PREFILL_CHUNK_SIZE": 16
+      },
+      "param_count": 90
+    },
+    {
+      "model_name": "Qwen-Image",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen-Image",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Qwen-Image-2512",
+      "model_type": "IMAGE_GENERATION",
+      "display_model_type": "IMAGE",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen-Image-2512",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Qwen2.5-72B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-72B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+        "MAX_PREFILL_CHUNK_SIZE": "16"
+      },
+      "param_count": 72
+    },
+    {
+      "model_name": "Qwen2.5-72B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-72B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+        "MAX_PREFILL_CHUNK_SIZE": "16"
+      },
+      "param_count": 72
+    },
+    {
+      "model_name": "Qwen2.5-VL-72B-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-VL-72B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 72
+    },
+    {
+      "model_name": "Qwen3-8B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen3-8B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 8
+    },
+    {
+      "model_name": "QwQ-32B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/QwQ-32B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "FUNCTIONAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 32
+    },
+    {
+      "model_name": "AFM-4.5B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "arcee-ai/AFM-4.5B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-ae65ee5-35f023f",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": 4
+    },
+    {
+      "model_name": "bge-large-en-v1.5",
+      "model_type": "EMBEDDING",
+      "display_model_type": "EMBEDDING",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "BAAI/bge-large-en-v1.5",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM__MAX_NUM_BATCHED_TOKENS": "3072",
+        "VLLM__MAX_MODEL_LENGTH": "384",
+        "VLLM__MIN_CONTEXT_LENGTH": "32",
+        "VLLM__MAX_NUM_SEQS": "8",
+        "MAX_BATCH_SIZE": "8",
+        "DEFAULT_THROTTLE_LEVEL": "0"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "DeepSeek-R1-0528",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY"
+      ],
+      "hf_model_id": "deepseek-ai/DeepSeek-R1-0528",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e3d97e5-a186bf4",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "efficientnet",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "efficientnet",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "gemma-3-1b-it",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150"
+      ],
+      "hf_model_id": "google/gemma-3-1b-it",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 1
+    },
+    {
+      "model_name": "gemma-3-27b-it",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "google/gemma-3-27b-it",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 27
+    },
+    {
+      "model_name": "gemma-3-4b-it",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "google/gemma-3-4b-it",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 4
+    },
+    {
+      "model_name": "gpt-oss-120b",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "T3K"
+      ],
+      "hf_model_id": "openai/gpt-oss-120b",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 120
+    },
+    {
+      "model_name": "gpt-oss-20b",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "openai/gpt-oss-20b",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-60ffb199-3499ffa1",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 20
+    },
+    {
+      "model_name": "medgemma-27b-it",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "GALAXY",
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "google/medgemma-27b-it",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 27
+    },
+    {
+      "model_name": "medgemma-4b-it",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "google/medgemma-4b-it",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_USE_V1": "1"
+      },
+      "param_count": 4
+    },
+    {
+      "model_name": "mobilenetv2",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "mobilenetv2",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "Qwen2.5-7B",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150X4",
+        "N300"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-7B",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff",
+      "service_route": "/v1/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 7
+    },
+    {
+      "model_name": "Qwen2.5-7B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "N150X4",
+        "N300"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-7B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 7
+    },
+    {
+      "model_name": "Qwen2.5-Coder-32B-Instruct",
+      "model_type": "CHAT",
+      "display_model_type": "LLM",
+      "device_configurations": [
+        "GALAXY_T3K",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-Coder-32B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-17a5973-aa4ae1e",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 32
+    },
+    {
+      "model_name": "Qwen2.5-VL-32B-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-VL-32B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 32
+    },
+    {
+      "model_name": "Qwen2.5-VL-3B-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-VL-3B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 3
+    },
+    {
+      "model_name": "Qwen2.5-VL-7B-Instruct",
+      "model_type": "VLM",
+      "display_model_type": "VLM",
+      "device_configurations": [
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen2.5-VL-7B-Instruct",
+      "inference_engine": "vLLM",
+      "supported_modalities": [
+        "text",
+        "image"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1
+      },
+      "param_count": 7
+    },
+    {
+      "model_name": "Qwen3-Embedding-4B",
+      "model_type": "EMBEDDING",
+      "display_model_type": "EMBEDDING",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen3-Embedding-4B",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM__MAX_NUM_BATCHED_TOKENS": "1024",
+        "VLLM__MAX_MODEL_LENGTH": "1024",
+        "VLLM__MIN_CONTEXT_LENGTH": "32",
+        "VLLM__MAX_NUM_SEQS": "1",
+        "MAX_BATCH_SIZE": "1",
+        "DEFAULT_THROTTLE_LEVEL": "0"
+      },
+      "param_count": 4
+    },
+    {
+      "model_name": "Qwen3-Embedding-8B",
+      "model_type": "EMBEDDING",
+      "display_model_type": "EMBEDDING",
+      "device_configurations": [
+        "GALAXY",
+        "N150",
+        "N300",
+        "T3K"
+      ],
+      "hf_model_id": "Qwen/Qwen3-Embedding-8B",
+      "inference_engine": "media",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756",
+      "service_route": "/enqueue",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1",
+        "VLLM__MAX_NUM_BATCHED_TOKENS": "1024",
+        "VLLM__MAX_MODEL_LENGTH": "1024",
+        "VLLM__MIN_CONTEXT_LENGTH": "32",
+        "VLLM__MAX_NUM_SEQS": "1"
+      },
+      "param_count": 8
+    },
+    {
+      "model_name": "resnet-50",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "resnet-50",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "segformer",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "segformer",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "unet",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "unet",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "vit",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "vit",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    },
+    {
+      "model_name": "vovnet",
+      "model_type": "CNN",
+      "display_model_type": "CNN",
+      "device_configurations": [
+        "N150",
+        "N300"
+      ],
+      "hf_model_id": "vovnet",
+      "inference_engine": "forge",
+      "supported_modalities": [
+        "text"
+      ],
+      "status": "EXPERIMENTAL",
+      "version": "0.9.0",
+      "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673",
+      "service_route": "/v1/chat/completions",
+      "shm_size": "32G",
+      "setup_type": "TT_INFERENCE_SERVER",
+      "env_vars": {
+        "VLLM_CONFIGURE_LOGGING": "1",
+        "VLLM_RPC_TIMEOUT": "900000",
+        "VLLM_TARGET_DEVICE": "tt",
+        "TORCHDYNAMO_DISABLE": "1"
+      },
+      "param_count": null
+    }
+  ]
+}
diff --git a/app/backend/shared_config/sync_models_from_inference_server.py b/app/backend/shared_config/sync_models_from_inference_server.py
new file mode 100644
index 00000000..7fce24c0
--- /dev/null
+++ b/app/backend/shared_config/sync_models_from_inference_server.py
@@ -0,0 +1,267 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
+
+"""
+Sync script: reads ../../tt-inference-server/model_specs_output.json and
+normalizes it into models_from_inference_server.json (co-located with this script).
+
+Run from any directory:
+    python app/backend/shared_config/sync_models_from_inference_server.py
+"""
+
+import json
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+SCRIPT_DIR = Path(__file__).parent  # directory containing this script
+OUTPUT_JSON = SCRIPT_DIR / "models_from_inference_server.json"  # written by main()
+
+# Source JSON resolution order (implemented by resolve_source_json):
+#   1. Explicit --source CLI argument
+#   2. TT_INFERENCE_ARTIFACT_PATH env var (set by run.py after artifact download)
+#   3. .artifacts/tt-inference-server/ under the repo root (artifact default location)
+#   4. tt-inference-server/ under the repo root (legacy submodule path)
+_REPO_ROOT = SCRIPT_DIR / "../../.."  # left unresolved here; users call .resolve()
+_CANDIDATE_SOURCES = [
+    _REPO_ROOT / ".artifacts/tt-inference-server/model_specs_output.json",
+    _REPO_ROOT / "tt-inference-server/model_specs_output.json",
+]
+
+
+def resolve_source_json(override: str | None = None) -> Path:
+    """Locate model_specs_output.json and return its resolved path.
+
+    Order: ``override`` (the --source value), then
+    $TT_INFERENCE_ARTIFACT_PATH/model_specs_output.json, then each entry of
+    ``_CANDIDATE_SOURCES``. Raises FileNotFoundError when ``override`` does
+    not exist or no candidate exists; the message lists every path probed.
+    """
+    if override:
+        explicit = Path(override)
+        if not explicit.exists():
+            raise FileNotFoundError(f"--source path not found: {explicit}")
+        return explicit.resolve()
+
+    env_dir = os.environ.get("TT_INFERENCE_ARTIFACT_PATH")
+    env_hit = [Path(env_dir) / "model_specs_output.json"] if env_dir else []
+    tried = env_hit + list(_CANDIDATE_SOURCES)
+    for candidate in tried:
+        if candidate.exists():
+            return candidate.resolve()
+    raise FileNotFoundError(
+        "Cannot find model_specs_output.json. Tried:\n"
+        + "\n".join(f"  {c.resolve()}" for c in tried)
+    )
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+DEVICE_SPECIFIC_ENV_KEYS = {"WH_ARCH_YAML", "MESH_DEVICE", "ARCH_NAME"}  # removed by filter_env_vars()
+
+STATUS_ORDER = {"COMPLETE": 3, "FUNCTIONAL": 2, "EXPERIMENTAL": 1}  # larger value = higher priority
+
+# device_type string (from tt-inference-server) → DeviceConfigurations member name
+# Only include device_types that exist in DeviceConfigurations enum
+DEVICE_TYPE_TO_CONFIG = {  # normalize() skips device_types that are absent here
+    "N150": "N150",
+    "N300": "N300",
+    "T3K": "T3K",
+    "N150X4": "N150X4",
+    "P100": "P100",
+    "P150": "P150",
+    "P150X4": "P150X4",
+    "P150X8": "P150X8",
+    "GALAXY": "GALAXY",
+    "GALAXY_T3K": "GALAXY_T3K",
+}
+
+
+def map_model_type(raw_model_type: str, inference_engine: str) -> str:
+    """Translate a tt-inference-server model_type into a tt-studio ModelTypes name.
+
+    Only CNN depends on ``inference_engine``: "media" gives IMAGE_GENERATION
+    (FLUX, Motif, etc.) and "forge" gives CNN (resnet, vit, etc.). Anything
+    else, LLM included, comes back as "CHAT".
+    """
+    engine_independent = (
+        ("VLM", "VLM"),
+        ("IMAGE", "IMAGE_GENERATION"),
+        ("AUDIO", "SPEECH_RECOGNITION"),
+        ("TEXT_TO_SPEECH", "TTS"),
+        ("TTS", "TTS"),
+        ("VIDEO", "VIDEO"),
+        ("EMBEDDING", "EMBEDDING"),
+    )
+    for upstream, studio in engine_independent:
+        if raw_model_type == upstream:
+            return studio
+    # CNN is the one type whose result is decided by the serving engine.
+    if raw_model_type == "CNN" and inference_engine in ("media", "forge"):
+        return "IMAGE_GENERATION" if inference_engine == "media" else "CNN"
+    return "CHAT"
+
+
+CHAT_CAPABLE_PATTERNS = [
+    "instruct", "-chat", "chat-", "-it-", "-it", "assistant",
+    "deepseek-r1", "qwq", "qwen3", "gpt-oss",  # reasoning models with chat templates
+]
+
+
+def is_chat_capable(hf_model_id: str | None) -> bool:
+    """Return True if the id contains a CHAT_CAPABLE_PATTERNS entry, ignoring case."""
+    lowered = (hf_model_id or "").lower()  # a missing (None) id counts as not chat-capable
+    return any(pattern in lowered for pattern in CHAT_CAPABLE_PATTERNS)
+
+
+def map_service_route(inference_engine: str, hf_model_id: str = "", raw_model_type: str = "") -> str:
+    """Choose the HTTP route a deployed model is served on.
+
+    Args:
+        inference_engine: Serving engine name ("vLLM", "media", "forge", ...).
+        hf_model_id: HuggingFace model id; consulted only for vLLM, where
+            chat-capable ids get the chat endpoint and others plain completions.
+        raw_model_type: Upstream model type; consulted only for media, where
+            TEXT_TO_SPEECH/TTS get the OpenAI-style speech endpoint.
+
+    Returns:
+        The route path; forge and unknown engines use "/v1/chat/completions".
+    """
+    if inference_engine == "media":
+        is_tts = raw_model_type in ("TEXT_TO_SPEECH", "TTS")
+        return "/v1/audio/speech" if is_tts else "/enqueue"
+    if inference_engine == "vLLM" and not is_chat_capable(hf_model_id):
+        return "/v1/completions"
+    return "/v1/chat/completions"
+
+
+def filter_env_vars(env_vars: dict) -> dict:
+    """Return a new dict without DEVICE_SPECIFIC_ENV_KEYS (presumably set by ModelImpl.__post_init__)."""
+    return {key: env_vars[key] for key in env_vars if key not in DEVICE_SPECIFIC_ENV_KEYS}
+
+
+def pick_higher_status(current: str | None, candidate: str) -> str:
+    """Return the status ranking higher in STATUS_ORDER; ``current`` wins ties, unknowns rank 0."""
+    if current is not None and STATUS_ORDER.get(current, 0) >= STATUS_ORDER.get(candidate, 0):
+        return current
+    return candidate
+
+
+def normalize(source_path: Path) -> list[dict]:
+    """Collapse per-device specs from ``source_path`` into one record per model.
+
+    GPU entries are dropped; model-level fields come from a model's first entry.
+    """
+    with open(source_path, encoding="utf-8") as f:  # JSON text is UTF-8, not locale-encoded
+        raw = json.load(f)
+
+    # group by model_name, skipping GPU entries
+    by_model: dict[str, list[dict]] = {}
+    for entry in raw.values():
+        if entry.get("device_type") == "GPU":
+            continue
+        by_model.setdefault(entry["model_name"], []).append(entry)
+
+    models = []
+    for model_name, entries in by_model.items():
+        # Use first entry as the canonical source for model-level fields
+        first = entries[0]
+
+        # Aggregate device_types, ignoring any not in DEVICE_TYPE_TO_CONFIG
+        device_configurations = sorted({
+            DEVICE_TYPE_TO_CONFIG[e["device_type"]]
+            for e in entries if e.get("device_type") in DEVICE_TYPE_TO_CONFIG
+        })
+
+        # Pick highest status
+        status = None
+        for e in entries:
+            status = pick_higher_status(status, e.get("status", "EXPERIMENTAL"))
+
+        # Model-level env_vars (from first entry, strip device-specific keys)
+        env_vars = filter_env_vars(first.get("env_vars") or {})
+        inference_engine = first.get("inference_engine", "vLLM")
+        raw_model_type = first.get("model_type", "LLM")
+        route_model_id = first.get("hf_model_repo") or ""  # null id -> "" so route detection can't crash
+
+        models.append({
+            "model_name": model_name,
+            "model_type": map_model_type(raw_model_type, inference_engine),
+            "display_model_type": raw_model_type,
+            "device_configurations": device_configurations,
+            "hf_model_id": first.get("hf_model_repo"),
+            "inference_engine": inference_engine,
+            "supported_modalities": first.get("supported_modalities", ["text"]),
+            "status": status,
+            "version": first.get("version", "0.0.1"),
+            "docker_image": first.get("docker_image"),
+            "service_route": map_service_route(inference_engine, hf_model_id=route_model_id, raw_model_type=raw_model_type),
+            "shm_size": "32G",
+            "setup_type": "TT_INFERENCE_SERVER",
+            "env_vars": env_vars,
+            "param_count": first.get("param_count"),
+        })
+
+    # Sort: by status (highest first), then alphabetically by model_name
+    models.sort(key=lambda m: (-STATUS_ORDER.get(m["status"], 0), m["model_name"].lower()))
+    return models
+
+
+def main():
+    """CLI entry point: rebuild models_from_inference_server.json and print a summary."""
+    import argparse
+    from collections import Counter
+
+    cli = argparse.ArgumentParser(description="Sync model catalog from tt-inference-server")
+    cli.add_argument("--source", default=None, help="Path to model_specs_output.json (overrides auto-detection)")
+    args = cli.parse_args()
+
+    source_path = resolve_source_json(args.source)
+    print(f"Reading: {source_path}")
+
+    if not source_path.exists():
+        raise FileNotFoundError(f"Source not found: {source_path}")
+
+    models = normalize(source_path)
+
+    # Version label: the VERSION file beside the source JSON wins, then the two
+    # env vars, then "unknown" (a label is recorded rather than an absolute path).
+    version_file = source_path.parent / "VERSION"
+    artifact_version = version_file.read_text().strip() if version_file.exists() else None
+    if not artifact_version:
+        artifact_version = os.environ.get("TT_INFERENCE_ARTIFACT_VERSION")
+    if not artifact_version:
+        artifact_version = os.environ.get("TT_INFERENCE_ARTIFACT_BRANCH") or "unknown"
+
+    source_info = {
+        "artifact_version": artifact_version,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+    }
+    catalog = {
+        "source": source_info,
+        "total_models": len(models),
+        "models": models,
+    }
+
+    out_path = OUTPUT_JSON.resolve()
+    with open(out_path, "w") as out_file:
+        json.dump(catalog, out_file, indent=2)
+        out_file.write("\n")
+
+    print(f"Written {len(models)} models → {out_path}")
+
+    # Summarize how the catalog breaks down by status and by type.
+    by_status = Counter(m["status"] for m in models)
+    by_type = Counter(m["model_type"] for m in models)
+    by_display_type = Counter(m["display_model_type"] for m in models)
+    print(f"  Status distribution:       {dict(by_status)}")
+    print(f"  Type distribution:         {dict(by_type)}")
+    print(f"  Display type distribution: {dict(by_display_type)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/app/backend/shared_config/test_sync_models.py b/app/backend/shared_config/test_sync_models.py
new file mode 100644
index 00000000..e7a18802
--- /dev/null
+++ b/app/backend/shared_config/test_sync_models.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+"""
+Tests for sync_models_from_inference_server.py route derivation logic.
+"""
+
+import pytest
+from sync_models_from_inference_server import map_service_route
+
+
+class TestServiceRouteMapping:
+    """Pin the route map_service_route derives for each engine / model-type pairing."""
+
+    def test_vllm_chat_capable_models(self):
+        """Chat-capable vLLM ids are served on /v1/chat/completions."""
+        for model_id in (
+            "meta-llama/Llama-3.1-8B-Instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "Qwen/QwQ-32B",
+        ):
+            assert map_service_route("vLLM", model_id, "") == "/v1/chat/completions"
+
+    def test_vllm_base_models(self):
+        """Base (non-chat) vLLM ids are served on /v1/completions."""
+        for model_id in ("meta-llama/Llama-3.1-70B", "meta-llama/Llama-3.2-1B"):
+            assert map_service_route("vLLM", model_id, "") == "/v1/completions"
+
+    def test_tts_media_models_use_openai_endpoint(self):
+        """Media-engine TTS types are served on /v1/audio/speech (OpenAI-compatible)."""
+        for tts_type in ("TEXT_TO_SPEECH", "TTS"):
+            assert map_service_route("media", "", tts_type) == "/v1/audio/speech"
+
+    def test_non_tts_media_models_use_enqueue(self):
+        """Every other media-engine type is served on /enqueue."""
+        for other_type in ("IMAGE", "AUDIO", "VIDEO", "CNN", "EMBEDDING"):
+            assert map_service_route("media", "", other_type) == "/enqueue"
+
+    def test_forge_models_use_chat_completions(self):
+        """Forge-engine models are served on /v1/chat/completions."""
+        for model_type in ("", "CNN"):
+            assert map_service_route("forge", "", model_type) == "/v1/chat/completions"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/app/docker-compose.dev-mode.yml b/app/docker-compose.dev-mode.yml
index 9cfccb6b..6fd24a51 100644
--- a/app/docker-compose.dev-mode.yml
+++ b/app/docker-compose.dev-mode.yml
@@ -10,7 +10,8 @@ services:
     volumes:
       # Mount the local api directory for live code changes
       - ./backend:/backend
-    command: python ./manage.py runserver 0.0.0.0:8000
+    command: >
+      python manage.py runserver 0.0.0.0:8000
     environment:
       - DEBUG=True
     # Allow container to access host services (docker-control-service)
diff --git a/app/docker-compose.yml b/app/docker-compose.yml
index 19f07ba6..74bbedae 100644
--- a/app/docker-compose.yml
+++ b/app/docker-compose.yml
@@ -20,7 +20,8 @@ services:
       - tt_studio_network
     ports:
       - "8000:8000"
-    command: gunicorn --workers 3 --bind 0.0.0.0:8000 --preload --timeout 1200 api.wsgi:application
+    command: >
+      gunicorn --workers 3 --bind 0.0.0.0:8000 --preload --timeout 1200 api.wsgi:application
     depends_on:
       tt_studio_chroma:
         condition: service_healthy
@@ -35,6 +36,7 @@ services:
       - INTERNAL_PERSISTENT_STORAGE_VOLUME
       - BACKEND_API_HOSTNAME
       - JWT_SECRET
+      - TTS_API_KEY
       - TAVILY_API_KEY
       - CLOUD_CHAT_UI_URL
       - CLOUD_CHAT_UI_AUTH_TOKEN
@@ -58,7 +60,7 @@ services:
       # Mount the local api directory for live code changes
       - ./backend:/backend
       # Mount tt-inference-server workflow logs for viewing deployment logs
-      - ${TT_STUDIO_ROOT}/tt-inference-server/workflow_logs:${TT_STUDIO_ROOT}/tt-inference-server/workflow_logs:ro
+      - ${TT_STUDIO_ROOT}/.artifacts/tt-inference-server/workflow_logs:${TT_STUDIO_ROOT}/.artifacts/tt-inference-server/workflow_logs:ro
 
     healthcheck:
       # On first application load resources for transformers/etc
diff --git a/app/frontend/src/api/modelsDeployedApis.ts b/app/frontend/src/api/modelsDeployedApis.ts
index 0bc1cd28..d9e77830 100644
--- a/app/frontend/src/api/modelsDeployedApis.ts
+++ b/app/frontend/src/api/modelsDeployedApis.ts
@@ -30,6 +30,7 @@ interface ContainerData {
   image_name: string;
   port_bindings: { [key: string]: PortBinding[] };
   networks: { [key: string]: Network };
+  device_id?: number | null;
 }
 
 interface StopResponse {
@@ -48,19 +49,55 @@ interface DeployedModelInfo {
   id: string;
   modelName: string;
   status: string;
+  model_type?: string;
   internal_url?: string;
   health_url?: string;
   model_impl?: {
     model_name?: string;
     hf_model_id?: string;
+    model_type?: string;
   };
 }
 
 export const ModelType = {
   ChatModel: "ChatModel",
+  VLM: "VLM",
   ImageGeneration: "ImageGeneration",
+  VideoGeneration: "VideoGeneration",
   ObjectDetectionModel: "ObjectDetectionModel",
   SpeechRecognitionModel: "SpeechRecognitionModel",
+  TTS: "TTS",
+  Embedding: "Embedding",
+  CNN: "CNN",
+};
+
+/**
+ * Map backend model_type strings (from catalog/API) to frontend ModelType constants.
+ * Matching ignores case; unknown or missing types fall back to ChatModel.
+ */
+export const getModelTypeFromBackendType = (backendType: string): string => {
+  switch ((backendType ?? "").toLowerCase()) {
+    case "chat":
+      return ModelType.ChatModel;
+    case "vlm":
+      return ModelType.VLM;
+    case "image_generation":
+      return ModelType.ImageGeneration;
+    case "video_generation":
+      return ModelType.VideoGeneration;
+    case "object_detection":
+      return ModelType.ObjectDetectionModel;
+    case "speech_recognition":
+      return ModelType.SpeechRecognitionModel;
+    case "tts":
+      return ModelType.TTS;
+    case "embedding":
+      return ModelType.Embedding;
+    case "cnn":
+      return ModelType.CNN;
+    default:
+      return ModelType.ChatModel;
+  }
 };
 
 export const fetchModels = async (): Promise<Model[]> => {
@@ -114,6 +151,7 @@ export const fetchModels = async (): Promise<Model[]> => {
         health: container.health || "unknown",
         ports: portMapping,
         name: container.name || "Unnamed container",
+        device_id: container.device_id ?? null,
       };
     });
 
@@ -214,12 +252,13 @@ export const handleRedeploy = (modelName: string): void => {
 export const handleModelNavigationClick = (
   modelID: string,
   modelName: string,
-  navigate: NavigateFunction
+  navigate: NavigateFunction,
+  modelType?: string
 ): void => {
-  const modelType = getModelTypeFromName(modelName);
-  const destination = getDestinationFromModelType(modelType);
-  console.log(`${modelType} button clicked for model: ${modelID}`);
-  console.log(`Opening ${modelType} for model: ${modelName}`);
+  const resolvedModelType = modelType ?? getModelTypeFromName(modelName);
+  const destination = getDestinationFromModelType(resolvedModelType);
+  console.log(`${resolvedModelType} button clicked for model: ${modelID}`);
+  console.log(`Opening ${resolvedModelType} for model: ${modelName}`);
   customToast.success(`${destination.slice(1)} page opened!`);
 
   navigate(destination, {
@@ -233,14 +272,125 @@ export const getDestinationFromModelType = (modelType: string): string => {
   switch (modelType) {
     case ModelType.ChatModel:
       return "/chat";
+    case ModelType.VLM:
+      return "/chat"; // VLM reuses the chat UI (supports image content)
     case ModelType.ImageGeneration:
       return "/image-generation";
+    case ModelType.VideoGeneration:
+      return "/chat"; // placeholder until video UI exists
     case ModelType.ObjectDetectionModel:
       return "/object-detection";
     case ModelType.SpeechRecognitionModel:
       return "/speech-to-text";
+    case ModelType.TTS:
+      return "/tts";
+    case ModelType.Embedding:
+      return "/chat"; // placeholder
+    case ModelType.CNN:
+      return "/object-detection"; // CNN reuses object detection UI
     default:
-      return "/chat"; // /chat is the default
+      return "/chat";
+  }
+};
+
+// ----- deployModel with device_id support -----
+export const deployModel = async (
+  modelId: string,
+  weightsId: string,
+  deviceId: number = 0,
+): Promise<{ job_id?: string; status?: string; message?: string }> => {
+  // The parsed JSON body is returned as-is; the HTTP status is not inspected here.
+  const res = await fetch("/docker-api/deploy/", {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      model_id: modelId,
+      weights_id: weightsId,
+      device_id: deviceId,
+    }),
+  });
+  return res.json();
+};
+
+// ----- TTS Inference -----
+export const runTTSInference = async (
+  deployId: string,
+  text: string,
+): Promise<Blob> => {
+  const requestBody = JSON.stringify({ deploy_id: deployId, text });
+  const ttsResponse = await fetch("/models-api/tts/", {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: requestBody,
+  });
+  // Success path first: hand back the response body as a Blob; otherwise throw.
+  if (ttsResponse.ok) return ttsResponse.blob();
+  throw new Error(`TTS request failed: HTTP ${ttsResponse.status}`);
+};
+
+// ----- Voice Pipeline -----
+export interface VoicePipelineRequest { // inputs runVoicePipeline turns into form fields
+  audioFile: File; // sent as the "audio_file" form field
+  whisperDeployId: string; // sent as "whisper_deploy_id"
+  llmDeployId: string; // sent as "llm_deploy_id"
+  ttsDeployId?: string; // "tts_deploy_id" is only sent when this is truthy
+  systemPrompt?: string; // "system_prompt" is only sent when this is truthy
+}
+
+/**
+ * POSTs the recording to the voice pipeline endpoint, reads the streamed
+ * `data: <json>` lines itself and forwards each event to the matching callback.
+ * Resolves once the stream ends; a non-OK or body-less response goes to onError.
+ */
+export const runVoicePipeline = async (
+  req: VoicePipelineRequest,
+  onTranscript: (text: string) => void,
+  onLlmChunk: (text: string) => void,
+  onAudio: (dataUrl: string) => void,
+  onError: (stage: string, message: string) => void,
+  onDone: () => void,
+): Promise<void> => {
+  const form = new FormData();
+  form.append("audio_file", req.audioFile);
+  form.append("whisper_deploy_id", req.whisperDeployId);
+  form.append("llm_deploy_id", req.llmDeployId);
+  if (req.ttsDeployId) form.append("tts_deploy_id", req.ttsDeployId);
+  if (req.systemPrompt) form.append("system_prompt", req.systemPrompt);
+
+  const response = await fetch("/models-api/pipeline/voice/", {
+    method: "POST",
+    body: form,
+  });
+  if (!response.ok || !response.body) {
+    onError("pipeline", `HTTP ${response.status}`);
+    return;
+  }
+
+  const reader = response.body.getReader();
+  const decoder = new TextDecoder();
+  let buffer = "";
+
+  while (true) {
+    const { done, value } = await reader.read();
+    // Final read has no value; decoding without `stream` flushes held-back bytes.
+    buffer += decoder.decode(value, { stream: !done });
+    const lines = buffer.split("\n");
+    // Mid-stream the last piece may be partial, so keep it; at end it is a full line.
+    buffer = done ? "" : (lines.pop() ?? "");
+    for (const line of lines) {
+      if (!line.startsWith("data: ")) continue;
+      try {
+        const evt = JSON.parse(line.slice(6));
+        if (evt.type === "transcript") onTranscript(evt.text);
+        else if (evt.type === "llm_chunk") onLlmChunk(evt.text);
+        else if (evt.type === "audio_url") onAudio(evt.url);
+        else if (evt.type === "error") onError(evt.stage ?? "unknown", evt.message);
+        else if (evt.type === "done") onDone();
+      } catch {
+        // skip malformed lines
+      }
+    }
+    if (done) break;
   }
 };
 
@@ -252,6 +402,8 @@ export const getModelTypeFromName = (modelName: string): string => {
     modelType = ModelType.ImageGeneration;
   } else if (modelName.toLowerCase().includes("whisper")) {
     modelType = ModelType.SpeechRecognitionModel;
+  } else if (modelName.toLowerCase().includes("tts")) {
+    modelType = ModelType.TTS;
   } else {
     modelType = ModelType.ChatModel;
   }
@@ -298,6 +450,7 @@ export const fetchDeployedModelsInfo = async (): Promise<
           modelData.model_impl?.hf_model_id ||
           "Unknown Model",
         status: "deployed",
+        model_type: modelData.model_impl?.model_type,
         internal_url: modelData.internal_url,
         health_url: modelData.health_url,
         model_impl: modelData.model_impl,
diff --git a/app/frontend/src/components/ChipConfigStep.tsx b/app/frontend/src/components/ChipConfigStep.tsx
new file mode 100644
index 00000000..a98d32b0
--- /dev/null
+++ b/app/frontend/src/components/ChipConfigStep.tsx
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { useState, useEffect } from "react";
+import axios from "axios";
+import { Cpu, Layers } from "lucide-react";
+import { useStepper } from "./ui/stepper";
+import { ChipStatusDisplay } from "./ChipStatusDisplay";
+import { Button } from "./ui/button";
+
+interface ChipSlot {
+  slot_id: number;
+  status: "available" | "occupied";
+  model_name?: string;
+  deployment_id?: number;
+  is_multi_chip?: boolean;
+}
+
+interface ChipStatus {
+  board_type: string;
+  total_slots: number;
+  slots: ChipSlot[];
+}
+
+interface ChipConfigStepProps {
+  onConfirm: (mode: "single" | "multi", slotId: number) => void;
+}
+
+export function ChipConfigStep({ onConfirm }: ChipConfigStepProps) {
+  const { nextStep } = useStepper();
+  const [selectedMode, setSelectedMode] = useState<"single" | "multi" | null>(
+    null
+  );
+  const [selectedSlot, setSelectedSlot] = useState<number | null>(null);
+  const [chipStatus, setChipStatus] = useState<ChipStatus | null>(null);
+
+  // Fetch chip status on mount and poll every 7 minutes
+  useEffect(() => {
+    const fetchChipStatus = async () => {
+      try {
+        const response = await axios.get("/docker-api/chip-status/");
+        setChipStatus(response.data);
+      } catch (error) {
+        console.error("Error fetching chip status:", error);
+      }
+    };
+
+    fetchChipStatus();
+    const interval = setInterval(fetchChipStatus, 7 * 60 * 1000);
+    return () => clearInterval(interval);
+  }, []);
+
+  const handleModeSelect = (mode: "single" | "multi") => {
+    setSelectedMode(mode);
+    setSelectedSlot(null); // reset slot when mode changes
+  };
+
+  const needsSlotPicker =
+    selectedMode === "single" &&
+    chipStatus !== null &&
+    chipStatus.total_slots > 1;
+
+  const isConfirmDisabled =
+    !selectedMode || (needsSlotPicker && selectedSlot === null);
+
+  const handleConfirm = () => {
+    if (isConfirmDisabled || !selectedMode) return;
+    // Multi-chip always uses device_id 0; single uses the chosen slot
+    const slotId =
+      selectedMode === "multi" ? 0 : (selectedSlot ?? 0);
+    onConfirm(selectedMode, slotId);
+    nextStep();
+  };
+
+  return (
+    <div className="w-full px-8 py-6 space-y-8">
+      {/* Header */}
+      <div>
+        <h2 className="text-xl font-semibold text-gray-900 dark:text-white mb-1">
+          Choose Chip Configuration
+        </h2>
+        <p className="text-sm text-gray-500 dark:text-gray-400">
+          Select how many chips to use. This determines which models are
+          available in the next step.
+        </p>
+      </div>
+
+      {/* Mode selection cards */}
+      <div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
+        {/* 1 Chip card */}
+        <button
+          type="button"
+          onClick={() => handleModeSelect("single")}
+          className={`
+            relative text-left p-6 rounded-xl border-2 transition-all duration-200 cursor-pointer
+            ${
+              selectedMode === "single"
+                ? "border-TT-purple-accent bg-TT-purple-shade/30 shadow-[0_0_20px_rgba(124,104,250,0.25)]"
+                : "border-gray-700 bg-[#0d1117] hover:border-TT-purple-accent/60 hover:bg-TT-purple-shade/10"
+            }
+          `}
+        >
+          {selectedMode === "single" && (
+            <div className="absolute top-3 right-3 w-3 h-3 rounded-full bg-TT-purple-accent shadow-[0_0_8px_rgba(124,104,250,0.8)]" />
+          )}
+          <div className="flex items-center gap-3 mb-3">
+            <div
+              className={`p-2 rounded-lg ${selectedMode === "single" ? "bg-TT-purple-shade/60" : "bg-gray-800"}`}
+            >
+              <Cpu
+                className={`w-6 h-6 ${selectedMode === "single" ? "text-TT-purple-accent" : "text-gray-400"}`}
+              />
+            </div>
+            <div>
+              <div
+                className={`font-mono font-bold text-base ${selectedMode === "single" ? "text-TT-purple" : "text-gray-200"}`}
+              >
+                1 Chip
+              </div>
+              <div className="text-xs text-gray-500 font-mono">
+                N150 / N300
+              </div>
+            </div>
+          </div>
+          <p className="text-sm text-gray-400 leading-relaxed">
+            Deploy on a single chip. Best for 8B–13B parameter models.
+          </p>
+          <div className="mt-3 flex flex-wrap gap-1">
+            {["Llama-3.1-8B", "Mistral-7B", "Qwen-2.5"].map((tag) => (
+              <span
+                key={tag}
+                className="text-xs px-2 py-0.5 bg-gray-800 text-gray-400 rounded font-mono"
+              >
+                {tag}
+              </span>
+            ))}
+          </div>
+        </button>
+
+        {/* All Chips / T3K card */}
+        <button
+          type="button"
+          onClick={() => handleModeSelect("multi")}
+          className={`
+            relative text-left p-6 rounded-xl border-2 transition-all duration-200 cursor-pointer
+            ${
+              selectedMode === "multi"
+                ? "border-TT-purple-accent bg-TT-purple-shade/30 shadow-[0_0_20px_rgba(124,104,250,0.25)]"
+                : "border-gray-700 bg-[#0d1117] hover:border-TT-purple-accent/60 hover:bg-TT-purple-shade/10"
+            }
+          `}
+        >
+          {selectedMode === "multi" && (
+            <div className="absolute top-3 right-3 w-3 h-3 rounded-full bg-TT-purple-accent shadow-[0_0_8px_rgba(124,104,250,0.8)]" />
+          )}
+          <div className="flex items-center gap-3 mb-3">
+            <div
+              className={`p-2 rounded-lg ${selectedMode === "multi" ? "bg-TT-purple-shade/60" : "bg-gray-800"}`}
+            >
+              <Layers
+                className={`w-6 h-6 ${selectedMode === "multi" ? "text-TT-purple-accent" : "text-gray-400"}`}
+              />
+            </div>
+            <div>
+              <div
+                className={`font-mono font-bold text-base ${selectedMode === "multi" ? "text-TT-purple" : "text-gray-200"}`}
+              >
+                All Chips (T3K)
+              </div>
+              <div className="text-xs text-gray-500 font-mono">4 × chips</div>
+            </div>
+          </div>
+          <p className="text-sm text-gray-400 leading-relaxed">
+            Deploy across all 4 chips. Required for 70B+ large models.
+          </p>
+          <div className="mt-3 flex flex-wrap gap-1">
+            {["Llama-3.1-70B", "DeepSeek-R1-70B", "FLUX.1"].map((tag) => (
+              <span
+                key={tag}
+                className="text-xs px-2 py-0.5 bg-gray-800 text-gray-400 rounded font-mono"
+              >
+                {tag}
+              </span>
+            ))}
+          </div>
+        </button>
+      </div>
+
+      {/* Slot picker — only shown when "1 Chip" is selected on a multi-slot board */}
+      {needsSlotPicker && chipStatus && (
+        <div>
+          <h3 className="text-sm font-mono font-semibold text-gray-400 uppercase tracking-widest mb-3">
+            Select Chip Slot
+          </h3>
+          <div className="flex flex-row justify-center gap-3 flex-wrap">
+            {chipStatus.slots.map((slot) => {
+              const isAvailable = slot.status === "available";
+              const isSelected = selectedSlot === slot.slot_id;
+              return (
+                <button
+                  key={slot.slot_id}
+                  type="button"
+                  disabled={!isAvailable}
+                  onClick={() => setSelectedSlot(slot.slot_id)}
+                  className={`
+                    flex flex-col items-center px-5 py-4 rounded-lg border-2 transition-all duration-200 min-w-[90px]
+                    ${
+                      isSelected
+                        ? "border-TT-purple-accent bg-TT-purple-shade/40 shadow-[0_0_14px_rgba(124,104,250,0.3)]"
+                        : isAvailable
+                          ? "border-gray-700 bg-[#0d1117] hover:border-TT-purple-accent/50 hover:bg-TT-purple-shade/10 cursor-pointer"
+                          : "border-gray-800 bg-[#0a0e14] opacity-40 cursor-not-allowed"
+                    }
+                  `}
+                >
+                  <Cpu
+                    className={`w-6 h-6 mb-1 ${isSelected ? "text-TT-purple-accent" : isAvailable ? "text-gray-400" : "text-gray-700"}`}
+                    strokeWidth={1.4}
+                  />
+                  <span
+                    className={`text-xs font-mono font-bold tracking-wider ${isSelected ? "text-TT-purple" : "text-gray-400"}`}
+                  >
+                    SLOT {String(slot.slot_id).padStart(2, "0")}
+                  </span>
+                  <span
+                    className={`text-[10px] font-mono mt-0.5 ${
+                      isSelected
+                        ? "text-TT-purple-accent"
+                        : isAvailable
+                          ? "text-gray-500"
+                          : "text-gray-700"
+                    }`}
+                  >
+                    {isAvailable ? "IDLE" : "IN USE"}
+                  </span>
+                </button>
+              );
+            })}
+          </div>
+          {selectedSlot !== null && (
+            <p className="mt-2 text-xs font-mono text-TT-purple-accent">
+              ✓ Slot {selectedSlot} selected — model will run on{" "}
+              <code className="bg-gray-800 px-1 rounded">
+                /dev/tenstorrent/{selectedSlot}
+              </code>
+            </p>
+          )}
+        </div>
+      )}
+
+      {/* Chip slot status */}
+      <div>
+        <h3 className="text-sm font-mono font-semibold text-gray-400 uppercase tracking-widest mb-3">
+          Current Slot Status
+        </h3>
+        {chipStatus ? (
+          <ChipStatusDisplay
+            boardType={chipStatus.board_type}
+            totalSlots={chipStatus.total_slots}
+            slots={chipStatus.slots}
+          />
+        ) : (
+          <div className="p-4 rounded-lg border border-gray-700 bg-[#0d1117] text-gray-500 text-sm font-mono animate-pulse">
+            Fetching hardware status...
+          </div>
+        )}
+      </div>
+
+      {/* Confirm button */}
+      <div className="flex justify-end pt-2">
+        <Button
+          type="button"
+          onClick={handleConfirm}
+          disabled={isConfirmDisabled}
+          className={`
+            px-6 py-2 font-mono font-semibold transition-all duration-200
+            ${
+              !isConfirmDisabled
+                ? "bg-TT-purple-accent hover:bg-TT-purple text-white shadow-[0_0_12px_rgba(124,104,250,0.3)]"
+                : "bg-gray-800 text-gray-600 cursor-not-allowed"
+            }
+          `}
+        >
+          Continue →
+        </Button>
+      </div>
+    </div>
+  );
+}
diff --git a/app/frontend/src/components/ChipStatusDisplay.tsx b/app/frontend/src/components/ChipStatusDisplay.tsx
new file mode 100644
index 00000000..581e5d0e
--- /dev/null
+++ b/app/frontend/src/components/ChipStatusDisplay.tsx
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import React from "react";
+import { Cpu } from "lucide-react";
+
+interface ChipSlot {
+  slot_id: number;
+  status: "available" | "occupied";
+  model_name?: string;
+  deployment_id?: number;
+  is_multi_chip?: boolean;
+}
+
+interface ChipStatusDisplayProps {
+  boardType: string;
+  totalSlots: number;
+  slots: ChipSlot[];
+  onStopModel?: (deploymentId: number) => void;
+  className?: string;
+}
+
+/**
+ * Renders the board header (type, slot count, idle count) and one card per slot.
+ * Occupied slots show their model name plus, when `onStopModel` is supplied and
+ * the slot has a deployment id, a STOP button that calls it with that id.
+ */
+export function ChipStatusDisplay({
+  boardType,
+  totalSlots,
+  slots,
+  onStopModel,
+  className = "",
+}: ChipStatusDisplayProps) {
+  const availableCount = slots.filter((s) => s.status === "available").length;
+
+  // Connector line: this slot and its right-hand neighbour are both occupied multi-chip slots
+  const hasConnector = (index: number): boolean => {
+    if (index >= slots.length - 1) return false;
+    const pair = [slots[index], slots[index + 1]];
+    return pair.every((s) => s.status === "occupied" && !!s.is_multi_chip);
+  };
+
+  return (
+    <div
+      className={`p-4 rounded-lg border border-gray-700/50 bg-[#0a0e14] ${className}`}
+    >
+      {/* Header row */}
+      <div className="flex items-center justify-between mb-4">
+        <div className="flex items-center gap-2">
+          <span className="text-xs font-mono font-bold text-gray-400 uppercase tracking-widest">
+            {boardType}
+          </span>
+          <span className="text-[10px] px-1.5 py-0.5 rounded bg-gray-800 text-gray-500 font-mono">
+            {totalSlots} SLOTS
+          </span>
+        </div>
+        <span className="text-xs font-mono text-gray-500">
+          {availableCount}/{totalSlots} IDLE
+        </span>
+      </div>
+
+      {/* Slot cards row */}
+      <div className="flex flex-row gap-3 flex-wrap">
+        {slots.map((slot, index) => {
+          const isOccupied = slot.status === "occupied";
+          const showConnector = hasConnector(index);
+
+          return (
+            <React.Fragment key={slot.slot_id}>
+              {/* Slot card */}
+              <div
+                className={`
+                  relative flex flex-col items-center p-4 rounded-lg min-w-[110px] flex-1
+                  border transition-all duration-300
+                  ${
+                    isOccupied
+                      ? "bg-[#0d1117] border-TT-purple-accent/70 shadow-[0_0_16px_rgba(124,104,250,0.3)]"
+                      : "bg-[#0d1117] border-TT-purple/30 shadow-[0_0_12px_rgba(188,179,247,0.15)]"
+                  }
+                `}
+              >
+                {/* Slot header row */}
+                <div className="flex items-center justify-between w-full mb-3">
+                  <span className="text-xs font-mono font-bold text-gray-400 tracking-wider">
+                    SLOT {String(slot.slot_id).padStart(2, "0")}
+                  </span>
+                  <span
+                    className={`
+                      text-[10px] font-mono font-semibold px-2 py-0.5 rounded-full
+                      ${
+                        isOccupied
+                          ? "bg-TT-purple-shade/60 text-TT-purple border border-TT-purple-accent/40"
+                          : "bg-gray-800/80 text-gray-400 border border-gray-600/40"
+                      }
+                    `}
+                  >
+                    {isOccupied ? "IN USE" : "IDLE"}
+                  </span>
+                </div>
+
+                {/* Chip icon */}
+                <div
+                  className={`
+                    my-2 p-2 rounded-lg
+                    ${isOccupied ? "bg-TT-purple-shade/50" : "bg-gray-800/50"}
+                  `}
+                >
+                  <Cpu
+                    className={`w-12 h-12 ${isOccupied ? "text-TT-purple-accent" : "text-gray-600"}`}
+                    strokeWidth={1.2}
+                  />
+                </div>
+
+                {/* Model name (only if occupied) */}
+                {isOccupied && slot.model_name && (
+                  <div className="mt-2 w-full text-center">
+                    <span
+                      className="text-[10px] font-mono text-TT-purple/80 truncate block px-1"
+                      title={slot.model_name}
+                    >
+                      {slot.model_name}
+                    </span>
+                  </div>
+                )}
+
+                {/* Stop button (optional); typeof check keeps a deployment id of 0 usable and unrendered */}
+                {isOccupied && onStopModel && typeof slot.deployment_id === "number" && (
+                  <button
+                    type="button"
+                    onClick={() => onStopModel(slot.deployment_id!)}
+                    className="mt-2 text-[10px] font-mono text-red-400 hover:text-red-300 underline"
+                  >
+                    STOP
+                  </button>
+                )}
+              </div>
+
+              {/* Connector between adjacent multi-chip slots */}
+              {showConnector && (
+                <div className="flex items-center self-center flex-shrink-0">
+                  <div className="w-2 h-2 rounded-sm border border-TT-purple-accent/60 bg-TT-purple-shade/40" />
+                  <div className="w-4 h-px bg-TT-purple-accent/40" />
+                  <div className="w-2 h-2 rounded-sm border border-TT-purple-accent/60 bg-TT-purple-shade/40" />
+                </div>
+              )}
+            </React.Fragment>
+          );
+        })}
+      </div>
+    </div>
+  );
+}
diff --git a/app/frontend/src/components/DeployModelStep.tsx b/app/frontend/src/components/DeployModelStep.tsx
index 17e3e553..acbb53dd 100644
--- a/app/frontend/src/components/DeployModelStep.tsx
+++ b/app/frontend/src/components/DeployModelStep.tsx
@@ -8,8 +8,7 @@ import { useStepper } from "./ui/stepper";
 import { StepperFormActions } from "./StepperFormActions";
 import { useModels } from "../hooks/useModels";
 import { useRefresh } from "../hooks/useRefresh";
-import { Cpu, AlertTriangle, ExternalLink } from "lucide-react";
-import { checkCurrentlyDeployedModels } from "../api/modelsDeployedApis";
+import { Cpu, AlertTriangle, ExternalLink, Info } from "lucide-react";
 import { Button } from "./ui/button";
 import { useNavigate } from "react-router-dom";
 import axios from "axios";
@@ -17,23 +16,25 @@ import axios from "axios";
 export function DeployModelStep({
   handleDeploy,
   selectedModel,
+  selectedDeviceId,
 }: {
   selectedModel: string | null;
   handleDeploy: () => Promise<{ success: boolean; job_id?: string }>;
+  selectedDeviceId?: number;
 }) {
   const { nextStep, isLastStep } = useStepper();
   const { refreshModels } = useModels();
   const { triggerRefresh, triggerHardwareRefresh } = useRefresh();
   const navigate = useNavigate();
   const [modelName, setModelName] = useState<string | null>(null);
-  const [deployedInfo, setDeployedInfo] = useState<{
-    hasDeployedModels: boolean;
-    count: number;
-    modelNames: string[];
+  const [slotInfo, setSlotInfo] = useState<{
+    totalSlots: number;
+    availableSlots: number;
+    occupiedDetails: { slot_id: number; model_name: string; port?: number }[];
   }>({
-    hasDeployedModels: false,
-    count: 0,
-    modelNames: [],
+    totalSlots: 0,
+    availableSlots: 0,
+    occupiedDetails: [],
   });
 
   // Track deployment error state that persists even after deployment stops
@@ -157,38 +158,53 @@ export function DeployModelStep({
   }, [selectedModel]);
 
   useEffect(() => {
-    // Don't check for deployed models while deployment is in progress
+    // Don't check slot status while deployment is in progress
     // This prevents the blocking UI from showing immediately after a successful deployment
     if (isDeploymentInProgress) {
       return;
     }
 
-    const checkDeployedModels = async () => {
+    const fetchSlotStatus = async () => {
       try {
-        const info = await checkCurrentlyDeployedModels();
-        setDeployedInfo(info);
+        const response = await axios.get("/docker-api/chip-status/");
+        const data = response.data as {
+          total_slots: number;
+          slots: { slot_id: number; status: string; model_name?: string; port?: number }[];
+        };
+        const occupied = data.slots.filter((s) => s.status === "occupied");
+        setSlotInfo({
+          totalSlots: data.total_slots,
+          availableSlots: data.total_slots - occupied.length,
+          occupiedDetails: occupied.map((s) => ({
+            slot_id: s.slot_id,
+            model_name: s.model_name || "Unknown",
+            port: s.port,
+          })),
+        });
       } catch (error) {
-        console.error("Error checking deployed models:", error);
+        console.error("Error fetching chip status:", error);
       }
     };
 
-    checkDeployedModels();
+    fetchSlotStatus();
   }, [isDeploymentInProgress]);
 
+  const allSlotsOccupied = slotInfo.totalSlots > 0 && slotInfo.availableSlots === 0;
+
   const deployButtonText = useMemo(() => {
-    if (deployedInfo.hasDeployedModels) {
-      return "Delete Existing Models First";
+    if (allSlotsOccupied) {
+      return "All Slots Occupied";
     }
     if (!selectedModel) return "Select a Model";
     return "Deploy Model";
   }, [
     selectedModel,
-    deployedInfo.hasDeployedModels,
+    allSlotsOccupied,
   ]);
 
   const isDeployDisabled =
     !selectedModel ||
-    deployedInfo.hasDeployedModels;
+    allSlotsOccupied;
 
   const onDeploy = useCallback(async () => {
     if (isDeployDisabled) return { success: false };
@@ -196,13 +212,11 @@ export function DeployModelStep({
     // Mark deployment as in progress to prevent blocking UI
     setIsDeploymentInProgress(true);
     
-    // Clear deployed info to prevent blocking UI from showing during deployment
-    // This ensures users see the "working" state instead of the error message
-    setDeployedInfo({
-      hasDeployedModels: false,
-      count: 0,
-      modelNames: [],
-    });
+    // Optimistically count one slot as taken while this deployment is in flight
+    setSlotInfo((prev) => ({
+      ...prev,
+      availableSlots: Math.max(0, prev.availableSlots - 1),
+    }));
 
     // Reset error state and polling flag when starting a new deployment
     setDeploymentError({
@@ -263,9 +277,10 @@ export function DeployModelStep({
     // Note: The AnimatedDeployButton will reset its state when onDeploy is called again
   };
 
-  // Show a warning banner if models are deployed, but don't block the entire UI
-  // The deploy button will be disabled, providing a better UX than the blocking error
-  const showDeployedWarning = deployedInfo.hasDeployedModels && !isDeploymentInProgress;
+  // Show blocking warning only when ALL slots are occupied
+  const showSlotsFullWarning = allSlotsOccupied && !isDeploymentInProgress;
+  // Show informational status when some slots are in use but others are available
+  const showSlotInfo = !allSlotsOccupied && slotInfo.occupiedDetails.length > 0 && !isDeploymentInProgress;
 
   return (
     <>
@@ -273,21 +288,24 @@ export function DeployModelStep({
         className="flex flex-col items-center justify-center p-6 overflow-hidden"
         style={{ minHeight: "200px" }}
       >
-        {/* Show warning banner when models are already deployed */}
-        {showDeployedWarning && (
+        {/* Show blocking warning when ALL chip slots are occupied */}
+        {showSlotsFullWarning && (
           <div className="w-full max-w-2xl mb-6">
             <div className="bg-yellow-50 dark:bg-yellow-900/20 border border-yellow-200 dark:border-yellow-800 rounded-lg p-4">
               <div className="flex items-start gap-3">
                 <AlertTriangle className="h-5 w-5 text-yellow-600 dark:text-yellow-400 mt-0.5 flex-shrink-0" />
                 <div className="flex-1">
                   <h4 className="text-sm font-semibold text-yellow-800 dark:text-yellow-200 mb-1">
-                    Model Already Deployed
+                    All Chip Slots Occupied
                   </h4>
                   <p className="text-sm text-yellow-700 dark:text-yellow-300">
-                    {deployedInfo.count} model{deployedInfo.count > 1 ? "s are" : " is"} currently deployed: {deployedInfo.modelNames.join(", ")}
+                    All {slotInfo.totalSlots} slots are in use:{" "}
+                    {slotInfo.occupiedDetails
+                      .map((s) => `${s.model_name} (slot ${s.slot_id}${s.port ? ` :${s.port}` : ""})`)
+                      .join(", ")}
                   </p>
                   <p className="text-sm text-yellow-700 dark:text-yellow-300 mt-1">
-                    Delete existing model{deployedInfo.count > 1 ? "s" : ""} before deploying a new one.
+                    Free up a slot before deploying a new model.
                   </p>
                   <Button
                     onClick={handleGoToDeployedModels}
@@ -304,6 +322,22 @@ export function DeployModelStep({
           </div>
         )}
 
+        {/* Informational slot status when some slots are in use but more are available */}
+        {showSlotInfo && (
+          <div className="w-full max-w-2xl mb-4">
+            <div className="bg-blue-50 dark:bg-blue-900/15 border border-blue-200 dark:border-blue-800/50 rounded-lg px-4 py-3">
+              <div className="flex items-center gap-2">
+                <Info className="h-4 w-4 text-blue-500 dark:text-blue-400 flex-shrink-0" />
+                <span className="text-sm text-blue-700 dark:text-blue-300">
+                  {slotInfo.occupiedDetails.length}/{slotInfo.totalSlots} slot{slotInfo.occupiedDetails.length > 1 ? "s" : ""} in use
+                  {" \u2014 "}
+                  {slotInfo.availableSlots} available
+                </span>
+              </div>
+            </div>
+          </div>
+        )}
+
         {/* Show prominent error alert when deployment fails */}
         {deploymentError.hasError && (
           <div className="w-full max-w-2xl mb-6">
@@ -362,7 +396,7 @@ export function DeployModelStep({
           disabled={isDeployDisabled}
           onDeploymentComplete={onDeploymentComplete}
         />
-        <div className="mt-6 flex flex-col items-start justify-center space-y-4">
+        <div className="mt-6 flex flex-col items-start justify-center space-y-2">
           {modelName && (
             <div className="flex items-center space-x-2">
               <Cpu className="text-TT-purple-accent" />
@@ -374,6 +408,17 @@ export function DeployModelStep({
               </span>
             </div>
           )}
+          {selectedDeviceId !== undefined && (
+            <div className="flex items-center space-x-2">
+              <Cpu className="text-TT-purple-accent" />
+              <span className="text-sm text-gray-800 dark:text-gray-400">
+                Slot:
+              </span>
+              <span className="text-sm font-medium text-gray-900 dark:text-gray-200">
+                {selectedDeviceId}
+              </span>
+            </div>
+          )}
         </div>
       </div>
       <StepperFormActions removeDynamicSteps={() => {}} />
diff --git a/app/frontend/src/components/FirstStepForm.tsx b/app/frontend/src/components/FirstStepForm.tsx
index 0dbe346f..627f7d94 100644
--- a/app/frontend/src/components/FirstStepForm.tsx
+++ b/app/frontend/src/components/FirstStepForm.tsx
@@ -8,15 +8,10 @@ import axios from "axios";
 import { useEffect, useState } from "react";
 import {
   Bot,
-  // Cpu,
-  // CheckCircle,
   XCircle,
-  MessageSquare,
-  // Image,
-  Eye,
-  Mic,
-  Palette,
-  // Camera,
+  CheckCircle2,
+  Zap,
+  FlaskConical,
 } from "lucide-react";
 import {
   Tooltip,
@@ -48,59 +43,62 @@ import BoardBadge from "./BoardBadge";
 import { DeployedModelsWarning } from "./DeployedModelsWarning";
 import { useModels } from "../hooks/useModels";
 
-// Model type configuration with icons and labels
-const MODEL_TYPE_CONFIG = {
-  chat: {
-    label: "Chat & Language Models",
-    icon: MessageSquare,
+// Per-status display configuration (label, icon, and color classes) used when rendering the status sub-headers in the model list
+const STATUS_CONFIG = {
+  COMPLETE: {
+    label: "Complete",
+    icon: CheckCircle2,
+    color: "text-green-600",
+    bgColor: "bg-green-50 dark:bg-green-900/20",
+    borderColor: "border-green-200 dark:border-green-800",
+  },
+  FUNCTIONAL: {
+    label: "Functional",
+    icon: Zap,
     color: "text-blue-500",
     bgColor: "bg-blue-50 dark:bg-blue-900/20",
     borderColor: "border-blue-200 dark:border-blue-800",
   },
-  image_generation: {
-    label: "Image Generation",
-    icon: Palette,
-    color: "text-purple-500",
-    bgColor: "bg-purple-50 dark:bg-purple-900/20",
-    borderColor: "border-purple-200 dark:border-purple-800",
-  },
-  object_detection: {
-    label: "Object Detection",
-    icon: Eye,
-    color: "text-emerald-500",
-    bgColor: "bg-emerald-50 dark:bg-emerald-900/20",
-    borderColor: "border-emerald-200 dark:border-emerald-800",
-  },
-  speech_recognition: {
-    label: "Speech Recognition",
-    icon: Mic,
-    color: "text-orange-500",
-    bgColor: "bg-orange-50 dark:bg-orange-900/20",
-    borderColor: "border-orange-200 dark:border-orange-800",
-  },
-  mock: {
-    label: "Test Models",
-    icon: Bot,
-    color: "text-gray-500",
-    bgColor: "bg-gray-50 dark:bg-gray-900/20",
-    borderColor: "border-gray-200 dark:border-gray-800",
+  EXPERIMENTAL: {
+    label: "Experimental",
+    icon: FlaskConical,
+    color: "text-amber-500",
+    bgColor: "bg-amber-50 dark:bg-amber-900/20",
+    borderColor: "border-amber-200 dark:border-amber-800",
   },
 };
 
+// Model type configuration for grouping by inference server type: `label` is the group header text, `order` the ascending sort position (types not listed here fall back to order 99 and a "<type> Models" label)
+const TYPE_CONFIG: Record<string, { label: string; order: number }> = {
+  LLM:            { label: "LLM Models",       order: 1 },
+  VLM:            { label: "VLM Models",        order: 2 },
+  VIDEO:          { label: "Video Models",      order: 3 },
+  IMAGE:          { label: "Image Models",      order: 4 },
+  AUDIO:          { label: "Audio Models",      order: 5 },
+  TEXT_TO_SPEECH: { label: "TTS Models",        order: 6 },
+  EMBEDDING:      { label: "Embedding Models",  order: 7 },
+  CNN:            { label: "CNN Models",         order: 8 },
+};
+
 const FirstFormSchema = z.object({
   model: z.string().nonempty("Please select a model."),
 });
 
+
 export function FirstStepForm({
   setSelectedModel,
   setFormError,
+  setSelectedDeviceId,
   autoDeployModel,
   isAutoDeploying,
+  chipMode,
 }: {
   setSelectedModel: (model: string) => void;
   setFormError: (hasError: boolean) => void;
+  setSelectedDeviceId?: (deviceId: number) => void;
   autoDeployModel?: string | null;
   isAutoDeploying?: boolean;
+  chipMode?: "single" | "multi";
 }) {
   const { nextStep } = useStepper();
   const {
@@ -185,7 +183,7 @@ export function FirstStepForm({
 
         console.log(
           "📝 FirstStepForm: Setting selectedModel to:",
-          selectedModel.id
+          selectedModel.id,
         );
         setSelectedModel(selectedModel.id);
         console.log(
@@ -235,33 +233,44 @@ export function FirstStepForm({
     }
   }, [autoDeployModel, models, isAutoDeploying, form, onSubmit]);
 
-  // Get current board info and group models by type and compatibility
+  // Get current board info and group models by status and compatibility
   const currentBoard = models[0]?.current_board || "unknown";
 
-  // Group models by type and compatibility
+  // Status priority for sorting sub-groups: a higher value is listed first; statuses not in this table are treated as 0
+  const STATUS_ORDER: Record<string, number> = {
+    COMPLETE: 3,
+    FUNCTIONAL: 2,
+    EXPERIMENTAL: 1,
+  };
+
+  // Filter models by chip mode: "single" keeps models needing exactly one chip, "multi" keeps models needing more than one (a missing chips_required counts as 1); with no chipMode all models are kept
+  const filteredModels = chipMode
+    ? models.filter((m) =>
+        chipMode === "single"
+          ? (m.chips_required ?? 1) === 1
+          : (m.chips_required ?? 1) > 1
+      )
+    : models;
+
+  // Group models by display type, then by status, then by hardware compatibility
+  type CompatibilityGroup = { compatible: Model[]; incompatible: Model[]; unknown: Model[] };
   const groupModelsByType = () => {
-    const grouped: Record<
-      string,
-      {
-        compatible: Model[];
-        incompatible: Model[];
-        unknown: Model[];
-      }
-    > = {};
+    const grouped: Record<string, Record<string, CompatibilityGroup>> = {};
 
-    models.forEach((model) => {
-      const modelType = model.model_type || "unknown";
+    filteredModels.forEach((model) => {
+      const displayType = model.display_model_type || "LLM";
+      const modelStatus = model.status || "EXPERIMENTAL";
 
-      if (!grouped[modelType]) {
-        grouped[modelType] = { compatible: [], incompatible: [], unknown: [] };
-      }
+      if (!grouped[displayType]) grouped[displayType] = {};
+      if (!grouped[displayType][modelStatus])
+        grouped[displayType][modelStatus] = { compatible: [], incompatible: [], unknown: [] };
 
       if (model.is_compatible === true) {
-        grouped[modelType].compatible.push(model);
+        grouped[displayType][modelStatus].compatible.push(model);
       } else if (model.is_compatible === false) {
-        grouped[modelType].incompatible.push(model);
+        grouped[displayType][modelStatus].incompatible.push(model);
       } else {
-        grouped[modelType].unknown.push(model);
+        grouped[displayType][modelStatus].unknown.push(model);
       }
     });
 
@@ -270,7 +279,7 @@ export function FirstStepForm({
 
   const groupedModels = groupModelsByType();
   const allModelsUnknown =
-    models.length > 0 && models.every((model) => model.is_compatible === null);
+    filteredModels.length > 0 && filteredModels.every((model) => model.is_compatible === null);
 
   return (
     <Form {...form}>
@@ -340,111 +349,118 @@ export function FirstStepForm({
                     </div>
                   )}
 
-                  {/* Render models grouped by type */}
-                  {Object.entries(groupedModels).map(
-                    ([modelType, modelsByCompatibility], typeIndex) => {
-                      const typeConfig =
-                        MODEL_TYPE_CONFIG[
-                          modelType as keyof typeof MODEL_TYPE_CONFIG
-                        ];
-                      const hasModels =
-                        modelsByCompatibility.compatible.length +
-                          modelsByCompatibility.incompatible.length +
-                          modelsByCompatibility.unknown.length >
-                        0;
-
-                      if (!hasModels) return null;
-
-                      const IconComponent = typeConfig?.icon || Bot;
+                  {/* Render models grouped by type, then by status */}
+                  {Object.entries(groupedModels)
+                    .sort(([a], [b]) => {
+                      const orderA = TYPE_CONFIG[a]?.order ?? 99;
+                      const orderB = TYPE_CONFIG[b]?.order ?? 99;
+                      return orderA - orderB;
+                    })
+                    .map(([displayType, statusGroups], typeIndex) => {
+                      const typeConfig = TYPE_CONFIG[displayType];
+                      const typeLabel = typeConfig?.label || `${displayType} Models`;
 
                       return (
-                        <div key={modelType}>
-                          {/* Model Type Header */}
+                        <div key={displayType}>
+                          {/* Type Group Header */}
                           {typeIndex > 0 && (
-                            <div className="h-px bg-gray-200 dark:bg-gray-700 my-2" />
+                            <div className="h-[2px] bg-gray-300 dark:bg-gray-600 my-2" />
                           )}
-                          <div
-                            className={`flex items-center gap-2 px-2 py-2 text-xs font-semibold ${typeConfig?.color || "text-gray-600"} ${typeConfig?.bgColor || "bg-gray-50 dark:bg-gray-900/20"}`}
-                          >
-                            <IconComponent className="w-4 h-4" />
-                            <span>{typeConfig?.label || modelType}</span>
+                          <div className="flex items-center gap-2 px-2 py-2 text-sm font-bold text-gray-800 dark:text-gray-200 bg-gray-100 dark:bg-gray-800/50">
+                            <span>{typeLabel}</span>
                           </div>
 
-                          {/* Compatible Models */}
-                          {modelsByCompatibility.compatible.map((model) => (
-                            <SelectItem
-                              key={model.id}
-                              value={model.name}
-                              className="pl-6 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
-                            >
-                              <div className="flex items-center w-full">
-                                <span className="text-green-500 mr-2 text-xs">
-                                  ●
-                                </span>
-                                <span className="flex-1">{model.name}</span>
-                                <span className="text-xs text-green-600 ml-2">
-                                  Compatible
-                                </span>
-                              </div>
-                            </SelectItem>
-                          ))}
-
-                          {/* Incompatible Models */}
-                          {modelsByCompatibility.incompatible.map((model) => (
-                            <SelectItem
-                              key={model.id}
-                              value={model.name}
-                              disabled={true}
-                              className="pl-6 opacity-50 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
-                            >
-                              <div className="flex items-center w-full">
-                                <span className="text-red-500 mr-2 text-xs">
-                                  ●
-                                </span>
-                                <span className="text-gray-500 flex-1">
-                                  {model.name}
-                                </span>
-                                <span className="text-xs text-red-500 ml-2">
-                                  Incompatible
-                                </span>
-                              </div>
-                            </SelectItem>
-                          ))}
-
-                          {/* Unknown Compatibility Models */}
-                          {modelsByCompatibility.unknown.map((model) => (
-                            <SelectItem
-                              key={model.id}
-                              value={model.name}
-                              className="pl-6 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
-                            >
-                              <div className="flex items-center w-full">
-                                <span className="text-yellow-500 mr-2 text-xs">
-                                  ●
-                                </span>
-                                <span className="flex-1">{model.name}</span>
-                                <span className="text-xs text-yellow-600 ml-2">
-                                  Unknown
-                                </span>
-                              </div>
-                            </SelectItem>
-                          ))}
+                          {/* Status sub-groups within this type */}
+                          {Object.entries(statusGroups)
+                            .sort(
+                              ([a], [b]) =>
+                                (STATUS_ORDER[b] ?? 0) - (STATUS_ORDER[a] ?? 0)
+                            )
+                            .map(([modelStatus, modelsByCompatibility]) => {
+                              const statusConfig =
+                                STATUS_CONFIG[modelStatus as keyof typeof STATUS_CONFIG];
+                              const hasModels =
+                                modelsByCompatibility.compatible.length +
+                                modelsByCompatibility.incompatible.length +
+                                modelsByCompatibility.unknown.length > 0;
+
+                              if (!hasModels) return null;
+
+                              const IconComponent = statusConfig?.icon || Bot;
+
+                              return (
+                                <div key={`${displayType}-${modelStatus}`}>
+                                  {/* Status Sub-Header */}
+                                  <div
+                                    className={`flex items-center gap-2 px-3 py-1.5 text-xs font-semibold ${statusConfig?.color || "text-gray-600"} ${statusConfig?.bgColor || "bg-gray-50 dark:bg-gray-900/20"}`}
+                                  >
+                                    <IconComponent className="w-3 h-3" />
+                                    <span>{statusConfig?.label || modelStatus}</span>
+                                  </div>
+
+                                  {/* Compatible Models */}
+                                  {modelsByCompatibility.compatible.map((model: Model) => (
+                                    <SelectItem
+                                      key={model.id}
+                                      value={model.name}
+                                      className="pl-8 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
+                                    >
+                                      <div className="flex items-center w-full">
+                                        <span className="text-green-500 mr-2 text-xs">●</span>
+                                        <span className="flex-1">{model.name}</span>
+                                        <span className="text-xs text-green-600 ml-2">Compatible</span>
+                                      </div>
+                                    </SelectItem>
+                                  ))}
+
+                                  {/* Incompatible Models */}
+                                  {modelsByCompatibility.incompatible.map((model: Model) => (
+                                    <SelectItem
+                                      key={model.id}
+                                      value={model.name}
+                                      disabled={true}
+                                      className="pl-8 opacity-50 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
+                                    >
+                                      <div className="flex items-center w-full">
+                                        <span className="text-red-500 mr-2 text-xs">●</span>
+                                        <span className="text-gray-500 flex-1">{model.name}</span>
+                                        <span className="text-xs text-red-500 ml-2">Incompatible</span>
+                                      </div>
+                                    </SelectItem>
+                                  ))}
+
+                                  {/* Unknown Compatibility Models */}
+                                  {modelsByCompatibility.unknown.map((model: Model) => (
+                                    <SelectItem
+                                      key={model.id}
+                                      value={model.name}
+                                      className="pl-8 [&>*:first-child]:hidden [&_svg]:hidden [&_[data-radix-select-item-indicator]]:hidden"
+                                    >
+                                      <div className="flex items-center w-full">
+                                        <span className="text-yellow-500 mr-2 text-xs">●</span>
+                                        <span className="flex-1">{model.name}</span>
+                                        <span className="text-xs text-yellow-600 ml-2">Unknown</span>
+                                      </div>
+                                    </SelectItem>
+                                  ))}
+                                </div>
+                              );
+                            })}
                         </div>
                       );
-                    }
-                  )}
+                    })}
 
                   {/* If no models loaded yet */}
-                  {models.length === 0 && !isLoading && (
+                  {filteredModels.length === 0 && !isLoading && (
                     <div className="px-2 py-4 text-center text-gray-500">
-                      No models available
+                      {models.length === 0 ? "No models available" : "No models available for selected chip mode"}
                     </div>
                   )}
                 </SelectContent>
               </Select>
 
               {/* Summary info */}
-              {models.length > 0 && !isLoading && (
+              {filteredModels.length > 0 && !isLoading && (
                 <div className="mt-4 p-4 rounded-lg border-2 border-stone-200 bg-white text-stone-950 shadow-sm dark:border-stone-800 dark:bg-stone-950 dark:text-stone-50 hover:border-stone-400 dark:hover:border-stone-700 hover:shadow-md transition-all duration-200">
                   <div className="flex items-center justify-between text-sm mb-3">
                     <span className="text-gray-600 dark:text-gray-300">
@@ -496,7 +512,7 @@ export function FirstStepForm({
                               <span className="text-green-500 text-xs">●</span>
                               <span className="text-gray-700 dark:text-gray-200">
                                 {
-                                  models.filter(
+                                  filteredModels.filter(
                                     (model) => model.is_compatible === true
                                   ).length
                                 }{" "}
@@ -507,7 +523,7 @@ export function FirstStepForm({
                               <span className="text-red-500 text-xs">●</span>
                               <span className="text-gray-700 dark:text-gray-200">
                                 {
-                                  models.filter(
+                                  filteredModels.filter(
                                     (model) => model.is_compatible === false
                                   ).length
                                 }{" "}
@@ -518,7 +534,7 @@ export function FirstStepForm({
                               <span className="text-yellow-500 text-xs">●</span>
                               <span className="text-gray-700 dark:text-gray-200">
                                 {
-                                  models.filter(
+                                  filteredModels.filter(
                                     (model) => model.is_compatible === null
                                   ).length
                                 }{" "}
diff --git a/app/frontend/src/components/Footer.tsx b/app/frontend/src/components/Footer.tsx
index 513e9fe3..f60aa5eb 100644
--- a/app/frontend/src/components/Footer.tsx
+++ b/app/frontend/src/components/Footer.tsx
@@ -7,6 +7,7 @@ import { Badge } from "./ui/badge";
 import { useTheme } from "../hooks/useTheme";
 import { useNavigate, useLocation } from "react-router-dom";
 import { useModels } from "../hooks/useModels";
+import { useDeviceState } from "../hooks/useDeviceState";
 import {
   Tooltip,
   TooltipContent,
@@ -31,34 +32,19 @@ interface FooterProps {
   className?: string;
 }
 
-interface SystemStatus {
+interface SystemResources {
   cpuUsage: number;
   memoryUsage: number;
   memoryTotal: string;
-  boardName: string;
-  temperature: number;
-  devices: Array<{
-    index: number;
-    board_type: string;
-    temperature: number;
-    power: number;
-    voltage: number;
-  }>;
-  hardware_status?: "healthy" | "error" | "unknown";
-  hardware_error?: string;
-  error?: string;
 }
 
 const REFRESH_COOLDOWN_MS = 2 * 60 * 1000; // 2 minutes cooldown between manual refreshes
 
 const Footer: React.FC<FooterProps> = ({ className }) => {
-  const [systemStatus, setSystemStatus] = useState<SystemStatus>({
+  const [systemResources, setSystemResources] = useState<SystemResources>({
     cpuUsage: 0,
     memoryUsage: 0,
     memoryTotal: "0 GB",
-    boardName: "Unknown",
-    temperature: 0,
-    devices: [],
   });
   const [loading, setLoading] = useState(true);
   const [error, setError] = useState<string | null>(null);
@@ -67,6 +53,7 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
   const [showTTStudioModal, setShowTTStudioModal] = useState(false);
   const [bugReportLoading, setBugReportLoading] = useState(false);
   const { models } = useModels();
+  const { deviceState, refresh: refreshDeviceState } = useDeviceState();
   const navigate = useNavigate();
   const location = useLocation();
   const { theme } = useTheme();
@@ -82,8 +69,8 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
   // Check if we should hide the footer
   const shouldHideFooter = location.pathname === "/chat";
 
-  // Fetch system status from API
-  const fetchSystemStatus = async () => {
+  // Fetch only CPU/memory resources (board info comes from DeviceStateContext)
+  const fetchSystemResources = async () => {
     try {
       const response = await fetch("/board-api/footer-data/");
       if (!response.ok) {
@@ -96,18 +83,15 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
       }
 
       const data = await response.json();
-      setSystemStatus(data);
+      setSystemResources({
+        cpuUsage: data.cpuUsage ?? 0,
+        memoryUsage: data.memoryUsage ?? 0,
+        memoryTotal: data.memoryTotal ?? "0 GB",
+      });
       setError(null);
     } catch (err) {
-      console.error("Failed to fetch system status:", err);
+      console.error("Failed to fetch system resources:", err);
       setError(err instanceof Error ? err.message : "Unknown error");
-      // Keep previous data or use fallback
-      setSystemStatus((prev) => ({
-        ...prev,
-        boardName: prev.hardware_status === "error" ? prev.boardName : "Error",
-        hardware_status: prev.hardware_status === "error" ? "error" : "unknown",
-        error: err instanceof Error ? err.message : "Unknown error",
-      }));
     } finally {
       setLoading(false);
     }
@@ -129,18 +113,8 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
 
     try {
       setRefreshing(true);
-      const response = await fetch("/board-api/refresh-cache/", {
-        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-        },
-      });
-
-      if (!response.ok) {
-        throw new Error(`HTTP error! status: ${response.status}`);
-      }
-
-      await fetchSystemStatus();
+      // Trigger an immediate re-poll of device state via context. NOTE(review): the call is not awaited, so the catch below only sees synchronous throws — confirm refresh() cannot reject
+      refreshDeviceState();
     } catch (err) {
       console.error("Failed to refresh board detection:", err);
       setError(err instanceof Error ? err.message : "Unknown error");
@@ -151,26 +125,57 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
   };
 
   useEffect(() => {
-    // Initial fetch on mount only
-    fetchSystemStatus();
-
-    // No more timer-based polling - will refresh on model deployment events
+    // Fetch CPU/memory once on mount (board info is handled by DeviceStateContext)
+    fetchSystemResources();
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, []);
 
   const textColor = theme === "dark" ? "text-zinc-300" : "text-gray-700";
   const borderColor = theme === "dark" ? "border-zinc-700" : "border-gray-200";
   const bgColor = theme === "dark" ? "bg-zinc-900/95" : "bg-white/95";
   const mutedTextColor = theme === "dark" ? "text-zinc-400" : "text-gray-500";
-  const normalizedBoardName = systemStatus.boardName?.toLowerCase();
+
+  // Derive board info from DeviceStateContext; defaults apply when deviceState (or a field) is null/undefined, and avgTemperature is the per-device mean rounded to one decimal (a missing reading counts as 0)
+  const boardName = deviceState?.board_name ?? "Unknown";
+  const deviceStateName = deviceState?.state ?? "UNKNOWN";
+  const devices = deviceState?.devices ?? [];
+  const avgTemperature =
+    devices.length > 0
+      ? Math.round(
+          (devices.reduce((sum, d) => sum + (d.temperature ?? 0), 0) /
+            devices.length) *
+            10
+        ) / 10
+      : 0;
+  const isHardwareHealthy = deviceStateName === "HEALTHY";
+  const isHardwareError =
+    deviceStateName === "BAD_STATE" || deviceStateName === "NOT_PRESENT";
+  const normalizedBoardName = boardName.toLowerCase();
   const isBoardDetectionIssue =
-    systemStatus.hardware_status === "error" ||
+    isHardwareError ||
     !!error ||
     normalizedBoardName === "error" ||
-    normalizedBoardName === "unknown";
+    normalizedBoardName === "unknown" ||
+    normalizedBoardName === "not present" ||
+    normalizedBoardName === "bad state";
   const remainingCooldownMs = getRemainingCooldownMs();
   const isInCooldown = remainingCooldownMs > 0;
   const cooldownSeconds = Math.ceil(remainingCooldownMs / 1000);
 
+  // Map the device state onto the legacy status/error values consumed by the bug report and the render below
+  const hardwareStatus: "healthy" | "error" | "unknown" =
+    isHardwareHealthy
+      ? "healthy"
+      : isHardwareError
+        ? "error"
+        : "unknown";
+  const hardwareError =
+    deviceStateName === "NOT_PRESENT"
+      ? "No Tenstorrent device detected. Check hardware connection."
+      : deviceStateName === "BAD_STATE"
+        ? "Board is in a bad state (unresponsive). Reset recommended."
+        : null;
+
   // Handle click on deployed models section
   const handleDeployedModelsClick = () => {
     navigate("/models-deployed");
@@ -302,19 +307,19 @@ const Footer: React.FC<FooterProps> = ({ className }) => {
 **Time:** ${new Date().toLocaleTimeString()}
 
 ### System Information
-- **Board:** ${systemStatus.boardName}
-- **Hardware Status:** ${systemStatus.hardware_status || "unknown"}
-- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}%
-- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal})
-- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C
-- **Devices:** ${systemStatus.devices.length} device(s)
+- **Board:** ${boardName}
+- **Hardware Status:** ${hardwareStatus || "unknown"}
+- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}%
+- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal})
+- **Temperature:** ${avgTemperature.toFixed(1)}°C
+- **Devices:** ${devices.length} device(s)
 - **Current URL:** ${currentUrl}
 - **User Agent:** ${userAgent}
 
 ### Hardware Details
 ${
-  systemStatus.devices.length > 0
-    ? systemStatus.devices
+  devices.length > 0
+    ? devices
         .map(
           (device, index) =>
             `**Device ${index + 1}:**
@@ -332,7 +337,7 @@ ${models.length > 0 ? models.map((model) => `- ${model.name} (${model.status})`)
 
 ### Error Information
 ${error ? `**System Error:** ${error}` : "No system errors detected"}
-${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"}
+${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"}
 
 ### FastAPI Logs
 ${fastapiLogs}
@@ -379,15 +384,15 @@ Add any other context about the problem here.
         : text;
     };
     const limitDevicesList = (maxDevices: number) => {
-      if (systemStatus.devices.length <= maxDevices) return undefined;
-      const blocks = systemStatus.devices
+      if (devices.length <= maxDevices) return undefined;
+      const blocks = devices
         .map(
           (device, index) =>
             `**Device ${index + 1}:**\n- Board Type: ${device.board_type}\n- Temperature: ${device.temperature.toFixed(1)}°C\n- Power: ${device.power.toFixed(2)}W\n- Voltage: ${device.voltage.toFixed(2)}V`
         )
         .slice(0, maxDevices)
         .join("\n\n");
-      return `${blocks}\n\n... (${systemStatus.devices.length - maxDevices} more device entries truncated)`;
+      return `${blocks}\n\n... (${devices.length - maxDevices} more device entries truncated)`;
     };
 
     const MAX_URL_LENGTH = 7000; // conservative safety limit for GitHub new-issue URL
@@ -406,15 +411,15 @@ Add any other context about the problem here.
 **Time:** ${new Date().toLocaleTimeString()}
 
 ### System Information
-- **Board:** ${systemStatus.boardName}
-- **Hardware Status:** ${systemStatus.hardware_status || "unknown"}
-- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}%
-- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal})
-- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C
-- **Devices:** ${systemStatus.devices.length} device(s)
+- **Board:** ${boardName}
+- **Hardware Status:** ${hardwareStatus || "unknown"}
+- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}%
+- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal})
+- **Temperature:** ${avgTemperature.toFixed(1)}°C
+- **Devices:** ${devices.length} device(s)
 
 ### Hardware Details (truncated)
-${devicesLimited ?? (systemStatus.devices.length ? "(within limit)" : "No hardware devices detected")}
+${devicesLimited ?? (devices.length ? "(within limit)" : "No hardware devices detected")}
 
 ### Deployed Models
 ${
@@ -428,7 +433,7 @@ ${
 
 ### Error Information
 ${error ? `**System Error:** ${error}` : "No system errors detected"}
-${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"}
+${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"}
 
 ### FastAPI Logs (truncated)
 ${truncatedFastapi}
@@ -530,16 +535,16 @@ Full logs have been copied to your clipboard and downloaded as a file. Please pa
 **Time:** ${new Date().toLocaleTimeString()}
 
 ### System Information
-- **Board:** ${systemStatus.boardName}
-- **Hardware Status:** ${systemStatus.hardware_status || "unknown"}
-- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}%
-- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal})
-- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C
-- **Devices:** ${systemStatus.devices.length} device(s)
+- **Board:** ${boardName}
+- **Hardware Status:** ${hardwareStatus || "unknown"}
+- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}%
+- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal})
+- **Temperature:** ${avgTemperature.toFixed(1)}°C
+- **Devices:** ${devices.length} device(s)
 
 ### Error Information
 ${error ? `**System Error:** ${error}` : "No system errors detected"}
-${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"}
+${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"}
 
 ### FastAPI Logs
 ${fallbackFastapiLogs}
@@ -622,7 +627,7 @@ Add any other context about the problem here.
               <span>TT Studio 2.0.1</span>
               <Github className="h-3.5 w-3.5" />
             </div>
-            {systemStatus.boardName?.toLowerCase().includes("t3k") ? (
+            {boardName?.toLowerCase().includes("t3k") ? (
               <div
                 className="flex items-center gap-2 px-3 py-1.5 bg-TT-purple-accent/10 dark:bg-TT-purple-accent/30 rounded-full cursor-pointer transition-all duration-200 hover:bg-TT-purple-accent/20 dark:hover:bg-TT-purple-accent/40 hover:scale-105"
                 title="Hardware status - Click to learn more"
@@ -635,10 +640,10 @@ Add any other context about the problem here.
               >
                 <HardwareIcon type="loudbox" className="h-4 w-4" />
                 <span className="text-sm font-medium text-TT-purple-accent">
-                  {systemStatus.boardName}
+                  {boardName}
                 </span>
               </div>
-            ) : systemStatus.boardName?.toLowerCase().includes("n300") ? (
+            ) : boardName?.toLowerCase().includes("n300") ? (
               <div
                 className="flex items-center gap-2 px-3 py-1.5 bg-TT-purple-accent/10 dark:bg-TT-purple-accent/30 rounded-full cursor-pointer transition-all duration-200 hover:bg-TT-purple-accent/20 dark:hover:bg-TT-purple-accent/40 hover:scale-105"
                 title="Hardware status - Click to learn more"
@@ -662,14 +667,14 @@ Add any other context about the problem here.
                   />
                 </svg>
                 <span className="text-sm font-medium text-TT-purple-accent">
-                  {systemStatus.boardName}
+                  {boardName}
                 </span>
               </div>
             ) : (
               <div className="flex items-center gap-1.5">
                 <Badge
                   variant={
-                    systemStatus.hardware_status === "error"
+                    hardwareStatus === "error"
                       ? "destructive"
                       : error
                         ? "destructive"
@@ -677,7 +682,7 @@ Add any other context about the problem here.
                   }
                   className={`text-xs ${textColor} cursor-pointer transition-all duration-200 hover:scale-105 hover:bg-opacity-80`}
                   title={
-                    systemStatus.hardware_error ||
+                    hardwareError ||
                     error ||
                     "Hardware status - Click to learn more"
                   }
@@ -685,8 +690,8 @@ Add any other context about the problem here.
                     window.open("https://www.tenstorrent.com/hardware", "_blank");
                   }}
                 >
-                  {systemStatus.boardName}
-                  {systemStatus.hardware_status === "error" && " ⚠️"}
+                  {boardName}
+                  {hardwareStatus === "error" && " ⚠️"}
                 </Badge>
                 {isBoardDetectionIssue && (
                   <TooltipProvider>
@@ -724,10 +729,10 @@ Add any other context about the problem here.
                 )}
               </div>
             )}
-            {(error || systemStatus.hardware_error) && (
+            {(error || hardwareError) && (
               <span
                 className={`text-xs text-red-500`}
-                title={systemStatus.hardware_error || error || "System error"}
+                title={hardwareError || error || "System error"}
               >
                 ⚠️
               </span>
@@ -794,23 +799,23 @@ Add any other context about the problem here.
               SYSTEM RESOURCES USAGE:
             </span>
             <span className={`text-sm ${textColor}`}>
-              RAM: {systemStatus.memoryUsage.toFixed(1)}% (
-              {systemStatus.memoryTotal}) | CPU:{" "}
-              {systemStatus.cpuUsage.toFixed(2)}%
-              {systemStatus.hardware_status === "healthy" && (
-                <> | TEMP: {systemStatus.temperature.toFixed(1)}°C</>
+              RAM: {systemResources.memoryUsage.toFixed(1)}% (
+              {systemResources.memoryTotal}) | CPU:{" "}
+              {systemResources.cpuUsage.toFixed(2)}%
+              {hardwareStatus === "healthy" && (
+                <> | TEMP: {avgTemperature.toFixed(1)}°C</>
               )}
-              {systemStatus.hardware_status === "error" && (
+              {hardwareStatus === "error" && (
                 <> | TT HARDWARE: UNAVAILABLE</>
               )}
-              {systemStatus.hardware_status === "unknown" && (
+              {hardwareStatus === "unknown" && (
                 <> | TT HARDWARE: CHECKING...</>
               )}
             </span>
-            {systemStatus.devices.length > 1 &&
-              systemStatus.hardware_status === "healthy" && (
+            {devices.length > 1 &&
+              hardwareStatus === "healthy" && (
                 <span className={`text-xs ${mutedTextColor}`}>
-                  ({systemStatus.devices.length} devices)
+                  ({devices.length} devices)
                 </span>
               )}
           </div>
diff --git a/app/frontend/src/components/HealthBadge.tsx b/app/frontend/src/components/HealthBadge.tsx
index 7fe28698..8f17a898 100644
--- a/app/frontend/src/components/HealthBadge.tsx
+++ b/app/frontend/src/components/HealthBadge.tsx
@@ -24,7 +24,7 @@ export interface HealthBadgeRef {
   refreshHealth: () => Promise<void>;
 }
 
-type HealthStatus = "healthy" | "unavailable" | "unhealthy" | "unknown";
+type HealthStatus = "healthy" | "starting" | "unavailable" | "unhealthy" | "unknown";
 
 const HealthBadge = forwardRef<HealthBadgeRef, HealthBadgeProps>(
   ({ deployId, onHealthChange }, ref) => {
@@ -43,6 +43,8 @@ const HealthBadge = forwardRef<HealthBadgeRef, HealthBadgeProps>(
 
         if (response.status === 200) {
           setHealth("healthy");
+        } else if (response.status === 202) {
+          setHealth("starting");
         } else if (response.status === 503) {
           setHealth("unavailable");
         } else {
@@ -154,7 +156,7 @@ const HealthBadge = forwardRef<HealthBadgeRef, HealthBadgeProps>(
               style={{ minHeight: 28 }}
             >
               <div
-                className={`w-2 h-2 rounded-full mr-2 ${getDotColor()} ${health === "healthy" ? "animate-pulse" : ""}`}
+                className={`w-2 h-2 rounded-full mr-2 ${getDotColor()} ${health === "healthy" || health === "starting" ? "animate-pulse" : ""}`}
               />
               {isLoading ? "Loading..." : health}
             </div>
diff --git a/app/frontend/src/components/NavBar.tsx b/app/frontend/src/components/NavBar.tsx
index 5c9dda0e..e3951c21 100644
--- a/app/frontend/src/components/NavBar.tsx
+++ b/app/frontend/src/components/NavBar.tsx
@@ -13,6 +13,8 @@ import {
   Image,
   Eye,
   AudioLines,
+  Mic,
+  Volume2,
   ChevronRight,
   ChevronLeft,
   type LucideIcon,
@@ -46,6 +48,7 @@ import {
   getDestinationFromModelType,
   ModelType,
   getModelTypeFromName,
+  getModelTypeFromBackendType,
 } from "../api/modelsDeployedApis";
 import { useHeroSection } from "../hooks/useHeroSection";
 
@@ -386,7 +389,12 @@ export default function NavBar() {
     if (models.length > 0) {
       const firstModel = models[0];
       if (firstModel.id && firstModel.name) {
-        handleModelNavigationClick(firstModel.id, firstModel.name, navigate);
+        handleModelNavigationClick(
+          firstModel.id,
+          firstModel.name,
+          navigate,
+          firstModel.model_type
+        );
       } else {
         console.error("Model ID or name is undefined");
       }
@@ -467,6 +475,20 @@ export default function NavBar() {
       label: "Logs",
       tooltip: "View system logs",
     },
+    {
+      type: "link",
+      to: "/voice-pipeline",
+      icon: Mic,
+      label: "Voice Pipeline",
+      tooltip: "End-to-end voice demo (Whisper → LLM → TTS)",
+    },
+    {
+      type: "link",
+      to: "/tts",
+      icon: Volume2,
+      label: "Text to Speech",
+      tooltip: "Convert text to audio with TTS model",
+    },
   ];
 
   // Define model-based navigation items (shown only when isDeployedEnabled is true)
@@ -484,7 +506,9 @@ export default function NavBar() {
       if (models.length > 0) {
         // Show navigation items for each deployed model
         return models.map((model) => {
-          const modelType = getModelTypeFromName(model.name);
+          const modelType = model.model_type
+            ? getModelTypeFromBackendType(model.model_type)
+            : getModelTypeFromName(model.name);
           console.log(`Model: ${model.name}, Type: ${modelType}`);
           return {
             type: "button",
@@ -545,7 +569,9 @@ export default function NavBar() {
       // In TT-Studio mode, show only deployed models
       console.log("TT-Studio mode - creating navigation for deployed models");
       return models.map((model) => {
-        const modelType = getModelTypeFromName(model.name);
+        const modelType = model.model_type
+          ? getModelTypeFromBackendType(model.model_type)
+          : getModelTypeFromName(model.name);
         console.log(`TT-Studio Model: ${model.name}, Type: ${modelType}`);
         return {
           type: "button",
diff --git a/app/frontend/src/components/ResetIcon.tsx b/app/frontend/src/components/ResetIcon.tsx
index 8497ea06..9d27e3e2 100644
--- a/app/frontend/src/components/ResetIcon.tsx
+++ b/app/frontend/src/components/ResetIcon.tsx
@@ -1,11 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
 
-import React, { useState, useEffect } from "react";
+import React, { useState } from "react";
 import axios from "axios";
-import { Cpu, CheckCircle, AlertTriangle } from "lucide-react";
+import {
+  Cpu,
+  CheckCircle,
+  XCircle,
+  AlertTriangle,
+  Loader2,
+  Trash2,
+  RotateCcw,
+  ChevronDown,
+} from "lucide-react";
 import { Spinner } from "./ui/spinner";
-import { customToast } from "./CustomToaster";
 import { useTheme } from "../hooks/useTheme";
 import { Button } from "./ui/button";
 import {
@@ -15,369 +23,542 @@ import {
   DialogHeader,
   DialogTitle,
   DialogTrigger,
-  DialogDescription,
 } from "./ui/dialog";
-import {
-  Accordion,
-  AccordionContent,
-  AccordionItem,
-  AccordionTrigger,
-} from "./ui/accordion";
 import { ScrollArea } from "./ui/scroll-area";
 import { fetchModels, deleteModel } from "../api/modelsDeployedApis";
 import { useModels } from "../hooks/useModels";
+import { useDeviceState } from "../hooks/useDeviceState";
 import BoardBadge from "./BoardBadge";
 
+type ResetStep = "deleting" | "resetting" | "done" | "failed" | null;
+
 interface ResetIconProps {
   onReset?: () => void;
 }
 
-// Board info interface
-interface BoardInfo {
-  type: string;
-  name: string;
+// ── Shared step-row (mirrors DeleteModelDialog): renders one progress step; `state` selects the colors, the leading icon and the status line ──
+function StepRow({
+  number,
+  icon,
+  label,
+  sublabel,
+  state,
+}: {
+  number: number; // step number, shown in the grey circle only while the step is pending
+  icon: React.ReactNode; // small icon rendered immediately before the label text
+  label: string; // main title of the step
+  sublabel?: string; // optional progress hint, rendered only while state is "active"
+  state: "pending" | "active" | "done" | "skipped"; // drives border/background, left-hand icon and footer text
+}) {
+  return (
+    <div
+      className={`flex items-start gap-3 p-3 rounded-lg border transition-all duration-300 ${
+        state === "active"
+          ? "bg-blue-900/30 border-blue-500/40"
+          : state === "done"
+            ? "bg-green-900/20 border-green-600/30"
+            : state === "skipped"
+              ? "bg-stone-800/30 border-stone-700/30"
+              : "bg-stone-800/50 border-stone-700/40"
+      }`}
+    >
+      <div className="w-7 h-7 flex items-center justify-center shrink-0 mt-0.5">
+        {state === "active" ? (
+          <Loader2 className="w-5 h-5 text-blue-400 animate-spin" />
+        ) : state === "done" ? (
+          <CheckCircle className="w-5 h-5 text-green-400" />
+        ) : state === "skipped" ? (
+          <CheckCircle className="w-5 h-5 text-stone-500" />
+        ) : (
+          <div className="w-6 h-6 rounded-full bg-stone-600 flex items-center justify-center text-xs font-bold text-stone-300">
+            {number}
+          </div>
+        )}
+      </div>
+      <div className="flex-1 min-w-0">
+        <div
+          className={`font-medium text-sm inline-flex items-center gap-1.5 ${
+            state === "pending" || state === "skipped"
+              ? "text-stone-400"
+              : "text-white"
+          }`}
+        >
+          {icon}
+          {label}
+        </div>
+        {sublabel && state === "active" && (
+          <div className="text-xs text-blue-300 mt-1">{sublabel}</div>
+        )}
+        {state === "done" && (
+          <div className="text-xs text-green-400 mt-0.5">Completed</div>
+        )}
+        {state === "skipped" && (
+          <div className="text-xs text-stone-500 mt-0.5">
+            No models deployed — skipped
+          </div>
+        )}
+      </div>
+    </div>
+  );
+}
+
+// ── Board status banner: one colored notice for BAD_STATE, NOT_PRESENT or HEALTHY (known board); nothing otherwise ──
+function BoardStatusBanner({
+  state,
+  boardType,
+}: {
+  state: string; // device state name; only "BAD_STATE", "NOT_PRESENT" and "HEALTHY" produce a banner
+  boardType: string; // board type; the value "unknown" suppresses the healthy banner
+}) {
+  if (state === "BAD_STATE") {
+    return (
+      <div className="flex items-start gap-3 p-3 bg-orange-900/30 border border-orange-500/40 rounded-lg text-orange-200 text-sm">
+        <AlertTriangle className="h-4 w-4 text-orange-400 mt-0.5 shrink-0" />
+        <div>
+          <strong className="text-orange-300">Board unresponsive</strong>
+          <p className="mt-0.5 text-orange-200/80">
+            The board is present but not responding. A reset is strongly
+            recommended.
+          </p>
+        </div>
+      </div>
+    );
+  }
+  if (state === "NOT_PRESENT") {
+    return (
+      <div className="flex items-start gap-3 p-3 bg-red-900/30 border border-red-500/40 rounded-lg text-red-200 text-sm">
+        <AlertTriangle className="h-4 w-4 text-red-400 mt-0.5 shrink-0" />
+        <div>
+          <strong className="text-red-300">No device detected</strong>
+          <p className="mt-0.5 text-red-200/80">
+            <code className="bg-red-900/50 px-1 rounded">/dev/tenstorrent</code>{" "}
+            not found. Check your hardware connection.
+          </p>
+        </div>
+      </div>
+    );
+  }
+  if (state === "HEALTHY" && boardType !== "unknown") {
+    return (
+      <div className="flex items-center gap-2 p-3 bg-green-900/20 border border-green-600/30 rounded-lg text-green-200 text-sm">
+        <CheckCircle className="h-4 w-4 text-green-400 shrink-0" />
+        <span>
+          Board is <strong className="text-green-300">healthy</strong> — reset
+          is available if needed.
+        </span>
+      </div>
+    );
+  }
+  return null; // any other state, or HEALTHY with an "unknown" board, renders nothing
 }
 
+// ── Main component ────────────────────────────────────────────────────────────
 const ResetIcon: React.FC<ResetIconProps> = ({ onReset }) => {
   const { theme } = useTheme();
-  const { refreshModels } = useModels();
-  const [isLoading, setIsLoading] = useState(false);
-  const [isCompleted, setIsCompleted] = useState(false);
+  const { models, refreshModels } = useModels();
+  const { deviceState, refresh: refreshDeviceState } = useDeviceState();
+
   const [isDialogOpen, setIsDialogOpen] = useState(false);
+  const [resetStep, setResetStep] = useState<ResetStep>(null);
   const [errorMessage, setErrorMessage] = useState<string | null>(null);
+  const [cmdOutput, setCmdOutput] = useState<string | null>(null);
+  const [showOutput, setShowOutput] = useState(false);
   const [resetHistory, setResetHistory] = useState<Date[]>([]);
-  const [fullOutput, setFullOutput] = useState<string | null>(null);
-  const [boardInfo, setBoardInfo] = useState<BoardInfo | null>(null);
-  const [boardLoading, setBoardLoading] = useState(false);
-
-  // Fetch board information when dialog opens
-  useEffect(() => {
-    if (isDialogOpen && !boardInfo) {
-      fetchBoardInfo();
-    }
-  }, [isDialogOpen]);
 
-  const fetchBoardInfo = async () => {
-    setBoardLoading(true);
-    try {
-      const response = await axios.get<{ type: string; name: string }>(
-        "/docker-api/board-info/"
-      );
-      setBoardInfo(response.data);
-    } catch (error) {
-      console.error("Error fetching board info:", error);
-      // Set default values if detection fails
-      setBoardInfo({ type: "unknown", name: "Unknown Board" });
-    } finally {
-      setBoardLoading(false);
-    }
-  };
+  const isLoading =
+    resetStep === "deleting" || resetStep === "resetting";
+  const isCompleted = resetStep === "done";
+  const isFailed = resetStep === "failed";
 
-  const iconColor = theme === "dark" ? "text-zinc-200" : "text-black";
-  const hoverIconColor =
-    theme === "dark" ? "hover:text-zinc-300" : "hover:text-gray-700";
-  const buttonBackgroundColor = theme === "dark" ? "bg-zinc-900" : "bg-white";
-  const hoverButtonBackgroundColor =
-    theme === "dark" ? "hover:bg-zinc-700" : "hover:bg-gray-200";
+  const boardType = deviceState?.board_type ?? "unknown";
+  const deviceStateName = deviceState?.state ?? "UNKNOWN";
+  const isBadState = deviceStateName === "BAD_STATE";
+  const isNotPresent = deviceStateName === "NOT_PRESENT";
+  const isResettingContext = deviceStateName === "RESETTING";
+  const deployedCount = models.length;
+
+  // Step states for the progress rows
+  const step1State: "pending" | "active" | "done" | "skipped" =
+    resetStep === "deleting"
+      ? "active"
+      : resetStep === "resetting" || resetStep === "done" || resetStep === "failed"
+        ? deployedCount === 0
+          ? "skipped"
+          : "done"
+        : "pending";
+
+  const step2State: "pending" | "active" | "done" | "skipped" =
+    resetStep === "resetting"
+      ? "active"
+      : resetStep === "done"
+        ? "done"
+        : "pending";
+
+  // ── Reset execution ─────────────────────────────────────────────────────────
+  const executeReset = async () => {
+    setErrorMessage(null);
+    setCmdOutput(null);
+    setShowOutput(false);
 
-  // Function to delete all deployed models
-  const deleteAllModels = async (): Promise<void> => {
     try {
-      const models = await fetchModels(); // Fetch all deployed models
-      console.log("Models to delete:", models);
-      for (const model of models) {
-        await customToast.promise(deleteModel(model.id), {
-          loading: `Deleting Model ID: ${model.id.substring(0, 4)}...`,
-          success: `Model ID: ${model.id.substring(0, 4)} deleted successfully.`,
-          error: `Failed to delete Model ID: ${model.id.substring(0, 4)}.`,
-        });
+      // Step 1: delete deployed models
+      setResetStep("deleting");
+      const currentModels = await fetchModels();
+      for (const model of currentModels) {
+        await deleteModel(model.id);
       }
-
-      // Refresh the ModelsContext to sync with backend
       await refreshModels();
-    } catch (error) {
-      console.error("Error deleting models:", error);
-      throw new Error("Failed to delete all models.");
-    }
-  };
 
-  const resetBoardAsync = async (): Promise<void> => {
-    const response = await axios.post<Blob>("/docker-api/reset_board/", null, {
-      responseType: "blob",
-    });
-
-    const reader = response.data.stream().getReader();
-    const decoder = new TextDecoder();
-    let output = "";
-    let success = true;
-    const statusCode = response.status;
-
-    while (true) {
-      const { done, value } = await reader.read();
-      if (done) break;
-
-      const chunk = decoder.decode(value, { stream: true });
-      output += chunk;
-
-      // Check for failure in each chunk
-      if (
-        chunk.includes("Command failed") ||
-        chunk.includes("No Tenstorrent devices detected") ||
-        chunk.includes("Exiting") ||
-        chunk.includes("Error")
-      ) {
-        success = false;
-      }
-    }
+      // Step 2: run board reset
+      setResetStep("resetting");
+      const response = await axios.post<Blob>("/docker-api/reset_board/", null, {
+        responseType: "blob",
+      });
 
-    const finalChunk = decoder.decode();
-    if (finalChunk) {
-      output += finalChunk;
-      if (
-        finalChunk.includes("Command failed") ||
-        finalChunk.includes("No Tenstorrent devices detected") ||
-        finalChunk.includes("Exiting") ||
-        finalChunk.includes("Error")
-      ) {
-        success = false;
-      }
-    }
+      const reader = response.data.stream().getReader();
+      const decoder = new TextDecoder();
+      let output = "";
+      let success = true;
 
-    const styledOutput = success
-      ? `
-        <span style="color: green;">Board Reset Successfully</span>
-        -----------------------
-        <pre style="color: yellow; white-space: pre-wrap;">${output}</pre>
-      `
-      : `
-        <span style="color: red;">Board Reset Failed</span>
-        -----------------------
-        <pre style="color: yellow; white-space: pre-wrap;">${output}</pre>
-      `;
-
-    setFullOutput(styledOutput);
-
-    if (!success) {
-      if (statusCode === 501) {
-        throw new Error(
-          "No Tenstorrent devices detected. Please check your hardware connection and try again."
-        );
-      } else {
-        // Parse the error message from the output
-        const errorLines = output
-          .split("\n")
-          .filter(
-            (line) =>
-              line.includes("tt-smi reset failed") ||
-              line.includes("Please check if:") ||
-              line.includes("1.") ||
-              line.includes("2.") ||
-              line.includes("3.") ||
-              line.includes("4.")
-          );
-        if (errorLines.length > 0) {
-          throw new Error(errorLines.join("\n"));
-        } else {
-          throw new Error(
-            "Board reset failed. Please check the command output for details."
-          );
+      // eslint-disable-next-line no-constant-condition
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        const chunk = decoder.decode(value, { stream: true });
+        output += chunk;
+        if (
+          chunk.includes("Command failed") ||
+          chunk.includes("No Tenstorrent devices detected") ||
+          chunk.includes("Error")
+        ) {
+          success = false;
+        }
+      }
+      const tail = decoder.decode();
+      if (tail) {
+        output += tail;
+        if (
+          tail.includes("Command failed") ||
+          tail.includes("No Tenstorrent devices detected") ||
+          tail.includes("Error")
+        ) {
+          success = false;
         }
       }
-    }
-
-    setIsCompleted(true);
-    setResetHistory((prevHistory) => [...prevHistory, new Date()]);
-    setTimeout(() => setIsCompleted(false), 5000);
-  };
-
-  const resetBoard = async (): Promise<void> => {
-    setIsLoading(true);
-    setIsCompleted(false);
-    setErrorMessage(null);
-    setIsDialogOpen(false);
 
-    try {
-      await deleteAllModels();
+      setCmdOutput(output);
 
-      await customToast.promise(resetBoardAsync(), {
-        loading: "Resetting board...",
-        success: "Board reset successfully!",
-        error: "Failed to reset board.",
-      });
-
-      if (onReset) {
-        console.log("Calling onReset prop function");
-        onReset();
-      }
-    } catch (error) {
-      console.error("Error resetting board:", error);
-
-      if (error instanceof Error) {
-        const errorOutput = `
-          <span style="color: red;">Error Resetting Board</span>
-          -----------------------
-          <pre style="color: red; white-space: pre-wrap;">${error.message}</pre>
-        `;
-        setFullOutput(errorOutput);
-        setErrorMessage(error.message);
-      } else {
-        setErrorMessage("An unknown error occurred");
+      if (!success) {
+        throw new Error(
+          response.status === 501
+            ? "No Tenstorrent devices detected. Check hardware connection."
+            : "Board reset failed. See command output for details."
+        );
       }
 
-      setIsDialogOpen(true);
-    } finally {
-      setIsLoading(false);
+      setResetStep("done");
+      setResetHistory((prev) => [...prev, new Date()]);
+      refreshDeviceState();
+      if (onReset) onReset();
+    } catch (err) {
+      setErrorMessage(
+        err instanceof Error ? err.message : "An unknown error occurred."
+      );
+      setResetStep("failed");
     }
   };
 
-  const handleDialogOpenChange = (isOpen: boolean) => {
-    setIsDialogOpen(isOpen);
-    if (isOpen) {
+  const handleOpen = () => {
+    setIsDialogOpen(true);
+    // Only reset state when there's nothing in progress — otherwise re-show current progress
+    if (!isLoading) {
+      setResetStep(null);
       setErrorMessage(null);
+      setCmdOutput(null);
+      setShowOutput(false);
     }
   };
 
+  const handleClose = () => {
+    setIsDialogOpen(false);
+    // Do NOT reset state — any in-progress reset continues in the background.
+    // State is only cleared on the next fresh open (see handleOpen above).
+  };
+
+  // ── Navbar trigger button ───────────────────────────────────────────────────
+  const iconColor = theme === "dark" ? "text-zinc-200" : "text-black";
+  const hoverIconColor =
+    theme === "dark" ? "hover:text-zinc-300" : "hover:text-gray-700";
+  const btnBg = theme === "dark" ? "bg-zinc-900" : "bg-white";
+  const btnHover =
+    theme === "dark" ? "hover:bg-zinc-700" : "hover:bg-gray-200";
+
   return (
-    <Dialog open={isDialogOpen} onOpenChange={handleDialogOpenChange}>
+    <Dialog
+      open={isDialogOpen}
+      onOpenChange={(open) => (open ? handleOpen() : handleClose())}
+    >
       <DialogTrigger asChild>
         <Button
           variant="navbar"
           size="icon"
-          className={`relative inline-flex items-center justify-center p-2 rounded-full transition-all duration-300 ease-in-out ${buttonBackgroundColor} ${hoverButtonBackgroundColor}`}
-          onClick={() => setIsDialogOpen(true)}
+          className={`relative inline-flex items-center justify-center p-2 rounded-full transition-all duration-300 ease-in-out ${btnBg} ${btnHover}`}
+          onClick={handleOpen}
         >
           {isLoading ? (
             <Spinner />
           ) : isCompleted ? (
-            <CheckCircle className={`w-5 h-5 ${iconColor} ${hoverIconColor}`} />
+            <CheckCircle className={`w-5 h-5 text-green-500`} />
           ) : (
-            <Cpu className={`w-5 h-5 ${iconColor} ${hoverIconColor}`} />
+            <>
+              <Cpu className={`w-5 h-5 ${iconColor} ${hoverIconColor}`} />
+              {/* Red dot if board is unhealthy */}
+              {(isBadState || isNotPresent) && (
+                <span className="absolute top-1 right-1 w-2 h-2 rounded-full bg-red-500" />
+              )}
+            </>
           )}
         </Button>
       </DialogTrigger>
+
       <DialogContent
-        className={`sm:max-w-md p-6 rounded-lg shadow-lg ${
-          theme === "dark" ? "bg-zinc-900 text-white" : "bg-white text-black"
-        }`}
+        className="sm:max-w-md p-6 rounded-xl shadow-2xl bg-stone-900 text-white border border-stone-700 backdrop-blur-md"
       >
+        {/* ── HEADER ── */}
         <DialogHeader>
-          <div className="flex items-center justify-between mb-4">
-            <div className="flex items-center">
-              <AlertTriangle className="h-8 w-8 text-yellow-500 mr-2" />
-              <DialogTitle className="text-lg font-semibold">
-                Reset Card
-              </DialogTitle>
-            </div>
-            {boardInfo && boardInfo.type !== "unknown" && (
-              <BoardBadge boardName={boardInfo.type} />
-            )}
-            {boardLoading && (
-              <div className="flex items-center gap-2 px-3 py-1.5 bg-gray-100 dark:bg-gray-800 rounded-full">
-                <Spinner />
-                <span className="text-sm text-gray-600 dark:text-gray-400">
-                  Detecting...
-                </span>
+          <div className="flex items-center justify-between mb-1">
+            <div className="flex items-center gap-3">
+              {isLoading ? (
+                <div className="w-9 h-9 rounded-full bg-blue-900/50 flex items-center justify-center">
+                  <Loader2 className="h-5 w-5 text-blue-400 animate-spin" />
+                </div>
+              ) : isCompleted ? (
+                <div className="w-9 h-9 rounded-full bg-green-900/50 flex items-center justify-center">
+                  <CheckCircle className="h-5 w-5 text-green-400" />
+                </div>
+              ) : isFailed ? (
+                <div className="w-9 h-9 rounded-full bg-red-900/50 flex items-center justify-center">
+                  <XCircle className="h-5 w-5 text-red-400" />
+                </div>
+              ) : (
+                <div className="w-9 h-9 rounded-full bg-yellow-900/50 flex items-center justify-center">
+                  <RotateCcw className="h-5 w-5 text-yellow-400" />
+                </div>
+              )}
+              <div>
+                <DialogTitle className="text-base font-semibold text-white leading-tight">
+                  {isLoading
+                    ? resetStep === "deleting"
+                      ? "Removing deployed models…"
+                      : "Resetting board…"
+                    : isCompleted
+                      ? "Reset complete"
+                      : isFailed
+                        ? "Reset failed"
+                        : "Reset Card"}
+                </DialogTitle>
+                {isLoading && (
+                  <p className="text-xs text-stone-400 mt-0.5">
+                    Step {resetStep === "deleting" ? "1" : "2"} of 2 — do not
+                    close this window
+                  </p>
+                )}
               </div>
+            </div>
+            {/* Board badge — only when idle */}
+            {!isLoading && !isCompleted && !isFailed && boardType !== "unknown" && (
+              <BoardBadge boardName={boardType} />
             )}
           </div>
-          <DialogDescription className="text-left">
-            Are you sure you want to reset the card?
-          </DialogDescription>
         </DialogHeader>
-        {boardInfo && boardInfo.type === "unknown" && (
-          <div className="mb-4 p-4 bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300 rounded-md flex items-start">
-            <AlertTriangle className="h-5 w-5 text-red-700 dark:text-red-300 mr-2 mt-1 shrink-0" />
-            <div>
-              <div className="font-bold mb-1">
-                No Tenstorrent device detected
-              </div>
-              <div className="text-sm">
-                Device <code>/dev/tenstorrent</code> not found. Please check
-                your hardware connection and ensure the device is properly
-                installed.
+
+        <div className="space-y-3 mt-3">
+          {/* ── IDLE: board status + step overview ── */}
+          {!isLoading && !isCompleted && !isFailed && (
+            <>
+              <BoardStatusBanner
+                state={deviceStateName}
+                boardType={boardType}
+              />
+
+              {isResettingContext && (
+                <div className="flex items-center gap-3 p-3 bg-blue-900/30 border border-blue-500/40 rounded-lg text-blue-200 text-sm">
+                  <Loader2 className="h-4 w-4 text-blue-400 animate-spin shrink-0" />
+                  <span>Board is already resetting…</span>
+                </div>
+              )}
+
+              {/* Step overview */}
+              <StepRow
+                number={1}
+                icon={<Trash2 className="w-3.5 h-3.5" />}
+                label={
+                  deployedCount > 0
+                    ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}`
+                    : "Stop deployed models"
+                }
+                state="pending"
+              />
+              <StepRow
+                number={2}
+                icon={<RotateCcw className="w-3.5 h-3.5" />}
+                label="Reset the board (tt-smi -r)"
+                state="pending"
+              />
+
+              {/* Warning */}
+              <div className="flex items-start gap-2 p-3 bg-red-950/40 border border-red-500/25 rounded-lg text-red-200 text-sm">
+                <AlertTriangle className="h-4 w-4 text-red-400 mt-0.5 shrink-0" />
+                <span>
+                  <strong className="text-red-300">Warning:</strong> This will
+                  interrupt any ongoing processes on the card.
+                  {resetHistory.length > 0 && (
+                    <span className="block mt-1 text-red-300/70">
+                      Last reset:{" "}
+                      {resetHistory[resetHistory.length - 1].toLocaleTimeString()}
+                    </span>
+                  )}
+                </span>
               </div>
-            </div>
-          </div>
-        )}
-        <div
-          className={`mb-4 ${theme === "dark" ? "text-gray-400" : "text-gray-500"}`}
-        >
-          <div className="border-l-4 border-red-600 pl-2">
-            <div className="font-bold">
-              Warning! This action will stop all deployed models and might
-              interrupt ongoing processes.
-            </div>
-            {resetHistory.length > 0 && (
-              <div className="mt-2">
-                Note: This card was reset in the last 5 minutes. Frequent resets
-                may cause issues. Please wait before resetting again.
+            </>
+          )}
+
+          {/* ── LOADING: step progress ── */}
+          {isLoading && (
+            <>
+              <StepRow
+                number={1}
+                icon={<Trash2 className="w-3.5 h-3.5" />}
+                label={
+                  deployedCount > 0
+                    ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}`
+                    : "Stop deployed models"
+                }
+                sublabel="Sending stop signal to all containers…"
+                state={step1State}
+              />
+              <StepRow
+                number={2}
+                icon={<RotateCcw className="w-3.5 h-3.5" />}
+                label="Reset the board"
+                sublabel="Running tt-smi -r, this may take 10–30 seconds…"
+                state={step2State}
+              />
+            </>
+          )}
+
+          {/* ── COMPLETED ── */}
+          {isCompleted && (
+            <>
+              <StepRow
+                number={1}
+                icon={<Trash2 className="w-3.5 h-3.5" />}
+                label="Deployed models removed"
+                state={deployedCount === 0 ? "skipped" : "done"}
+              />
+              <StepRow
+                number={2}
+                icon={<RotateCcw className="w-3.5 h-3.5" />}
+                label="Board reset"
+                state="done"
+              />
+              {cmdOutput && (
+                <button
+                  type="button"
+                  onClick={() => setShowOutput((v) => !v)}
+                  className="flex items-center gap-1 text-xs text-stone-400 hover:text-stone-200 transition-colors"
+                >
+                  <ChevronDown
+                    className={`w-3.5 h-3.5 transition-transform ${showOutput ? "rotate-180" : ""}`}
+                  />
+                  {showOutput ? "Hide" : "Show"} command output
+                </button>
+              )}
+              {showOutput && cmdOutput && (
+                <ScrollArea className="h-36 rounded-lg border border-stone-700">
+                  <pre className="p-3 text-xs text-green-400 whitespace-pre-wrap font-mono bg-stone-950">
+                    {cmdOutput}
+                  </pre>
+                </ScrollArea>
+              )}
+            </>
+          )}
+
+          {/* ── FAILED ── */}
+          {isFailed && (
+            <>
+              <div className="flex items-start gap-3 p-3 bg-red-900/30 border border-red-500/40 rounded-lg">
+                <XCircle className="h-5 w-5 text-red-400 mt-0.5 shrink-0" />
+                <div>
+                  <p className="text-sm font-medium text-red-200">
+                    {errorMessage}
+                  </p>
+                  {cmdOutput && (
+                    <button
+                      type="button"
+                      onClick={() => setShowOutput((v) => !v)}
+                      className="flex items-center gap-1 text-xs text-stone-400 hover:text-stone-200 mt-2 transition-colors"
+                    >
+                      <ChevronDown
+                        className={`w-3.5 h-3.5 transition-transform ${showOutput ? "rotate-180" : ""}`}
+                      />
+                      {showOutput ? "Hide" : "Show"} command output
+                    </button>
+                  )}
+                </div>
               </div>
-            )}
-          </div>
+              {showOutput && cmdOutput && (
+                <ScrollArea className="h-36 rounded-lg border border-stone-700">
+                  <pre className="p-3 text-xs text-red-300 whitespace-pre-wrap font-mono bg-stone-950">
+                    {cmdOutput}
+                  </pre>
+                </ScrollArea>
+              )}
+            </>
+          )}
         </div>
-        {errorMessage && (
-          <div className="mt-4 p-4 bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300 rounded-md">
-            <div className="flex items-start">
-              <AlertTriangle className="h-5 w-5 text-red-700 dark:text-red-300 mr-2 mt-1 shrink-0" />
-              <div className="flex-1">
-                <div className="font-medium mb-2">Error:</div>
-                <pre className="whitespace-pre-wrap text-sm">
-                  {errorMessage}
-                </pre>
-              </div>
-            </div>
-          </div>
-        )}
-        <Accordion type="single" collapsible className="mt-4">
-          <AccordionItem value="history">
-            <AccordionTrigger className="text-md font-semibold">
-              Reset History
-            </AccordionTrigger>
-            <AccordionContent>
-              <ul className="list-disc pl-5 mt-2 text-sm">
-                {resetHistory.length > 0 ? (
-                  resetHistory.map((resetTime, index) => (
-                    <li key={index}>{resetTime.toLocaleString()}</li>
-                  ))
+
+        {/* ── FOOTER ── */}
+        <DialogFooter className="mt-5 flex justify-end gap-2">
+          {(isCompleted || isFailed) ? (
+            <Button
+              variant="outline"
+              onClick={handleClose}
+              className="border-stone-600 text-stone-300 hover:bg-stone-800"
+            >
+              Close
+            </Button>
+          ) : (
+            <>
+              <Button
+                variant="outline"
+                onClick={handleClose}
+                className="border-stone-600 text-stone-300 hover:bg-stone-800"
+              >
+                {isLoading ? "Minimize" : "Cancel"}
+              </Button>
+              <Button
+                onClick={executeReset}
+                disabled={isLoading || isResettingContext || isNotPresent}
+                className={`min-w-[120px] border ${
+                  isBadState
+                    ? "bg-orange-600 hover:bg-orange-700 border-orange-500/40 text-white"
+                    : "bg-red-600 hover:bg-red-700 border-red-500/30 text-white"
+                }`}
+              >
+                {isLoading ? (
+                  <span className="flex items-center gap-2">
+                    <Loader2 className="w-4 h-4 animate-spin" />
+                    Processing…
+                  </span>
+                ) : isBadState ? (
+                  "Reset (Recommended)"
                 ) : (
-                  <li>No resets yet.</li>
+                  "Reset Card"
                 )}
-              </ul>
-            </AccordionContent>
-          </AccordionItem>
-          {fullOutput && (
-            <AccordionItem value="output">
-              <AccordionTrigger className="text-md font-semibold">
-                Command Output
-              </AccordionTrigger>
-              <AccordionContent>
-                <ScrollArea className="h-48 w-full overflow-auto rounded-md border">
-                  <div
-                    className="text-sm mt-2 px-2 py-1 whitespace-pre-wrap bg-zinc-800 text-green-500 rounded-md"
-                    dangerouslySetInnerHTML={{ __html: fullOutput }}
-                  />
-                </ScrollArea>
-              </AccordionContent>
-            </AccordionItem>
+              </Button>
+            </>
           )}
-        </Accordion>
-        <DialogFooter className="mt-4 flex justify-end space-x-2">
-          <Button
-            type="button"
-            variant="outline"
-            onClick={() => setIsDialogOpen(false)}
-            className={`${theme === "dark" ? "text-white" : "text-black"}`}
-          >
-            Cancel
-          </Button>
-          <Button
-            type="button"
-            variant="outline"
-            className="bg-red-600 text-white hover:bg-red-700"
-            onClick={resetBoard}
-          >
-            Yes, Reset
-          </Button>
         </DialogFooter>
       </DialogContent>
     </Dialog>
diff --git a/app/frontend/src/components/SelectionSteps.tsx b/app/frontend/src/components/SelectionSteps.tsx
index 07f8211d..948b5a40 100644
--- a/app/frontend/src/components/SelectionSteps.tsx
+++ b/app/frontend/src/components/SelectionSteps.tsx
@@ -10,6 +10,7 @@ import { customToast } from "./CustomToaster";
 import StepperFooter from "./StepperFooter";
 import { DeployModelStep } from "./DeployModelStep";
 import { FirstStepForm } from "./FirstStepForm";
+import { ChipConfigStep } from "./ChipConfigStep";
 
 const dockerAPIURL = "/docker-api/";
 const deployUrl = `${dockerAPIURL}deploy/`;
@@ -22,6 +23,9 @@ export interface Model {
   compatible_boards: string[]; // List of boards this model can run on
   model_type: string; // Type of model (e.g., CHAT, IMAGE_GENERATION, etc.)
   current_board: string; // The detected board type
+  status?: "EXPERIMENTAL" | "FUNCTIONAL" | "COMPLETE" | null;
+  display_model_type?: string;
+  chips_required?: number; // Number of chips required (1 or 4)
 }
 
 export default function StepperDemo() {
@@ -29,17 +33,52 @@ export default function StepperDemo() {
   const navigate = useNavigate();
   const autoDeployModel = searchParams.get("auto-deploy");
 
-  const steps = [
-    { label: "Step 1", description: "Model Selection" },
-    { label: "Final Step", description: "Deploy Model" },
-  ];
+  const [chipStatus, setChipStatus] = useState<{
+    board_type: string;
+    total_slots: number;
+    slots: { slot_id: number; status: string; model_name?: string; deployment_id?: number; is_multi_chip?: boolean }[];
+  } | null>(null);
+  const [totalSlots, setTotalSlots] = useState<number | null>(null);
+  const isMultiChipBoard = totalSlots !== null && totalSlots > 1;
+
+  // Fetch chip status on mount and poll every 7 minutes
+  useEffect(() => {
+    const fetchChipStatus = () => {
+      axios
+        .get("/docker-api/chip-status/")
+        .then((res) => {
+          setChipStatus(res.data);
+          setTotalSlots(res.data.total_slots ?? 1);
+        })
+        .catch(() => {
+          setChipStatus(null);
+          setTotalSlots(1); // safe fallback to single-chip
+        });
+    };
+    fetchChipStatus();
+    const interval = setInterval(fetchChipStatus, 7 * 60 * 1000);
+    return () => clearInterval(interval);
+  }, []);
+
+  const steps = isMultiChipBoard
+    ? [
+        { label: "Step 1", description: "Hardware Configuration" },
+        { label: "Step 2", description: "Model Selection" },
+        { label: "Final Step", description: "Deploy Model" },
+      ]
+    : [
+        { label: "Step 1", description: "Model Selection" },
+        { label: "Final Step", description: "Deploy Model" },
+      ];
 
   // No-op function for removing dynamic steps (no dynamic steps in this component)
   const removeDynamicSteps = () => {
     // This component uses static steps, so no action needed
   };
 
+  const [chipMode, setChipMode] = useState<"single" | "multi" | null>(null);
   const [selectedModel, setSelectedModel] = useState<string | null>(null);
+  const [selectedDeviceId, setSelectedDeviceId] = useState<number>(0);
   const [loading, setLoading] = useState(false);
   const [formError, setFormError] = useState(false);
   const [isAutoDeploying, setIsAutoDeploying] = useState(false);
@@ -72,9 +111,11 @@ export default function StepperDemo() {
       console.log("Found model for auto-deploy:", model);
 
       // Deploy with default weights
+      const deviceIdParam = parseInt(searchParams.get("device-id") ?? "0", 10);
       const deployPayload = {
         model_id: model.id,
         weights_id: "", // Empty string for default weights
+        device_id: isNaN(deviceIdParam) ? 0 : deviceIdParam,
       };
 
       console.log("Auto-deploy payload:", deployPayload);
@@ -137,6 +178,7 @@ export default function StepperDemo() {
     const payload = JSON.stringify({
       model_id,
       weights_id,
+      device_id: selectedDeviceId,
     });
 
     console.log("📦 Deploying with default weights:", { model_id, weights_id });
@@ -171,6 +213,48 @@ export default function StepperDemo() {
       };
     } catch (error) {
       console.error("Error during deployment:", error);
+
+      // Check if this is a chip allocation conflict error
+      if (axios.isAxiosError(error) && error.response?.status === 409) {
+        const errorData = error.response.data;
+        const errorType = errorData?.error_type;
+
+        if (errorType === 'multi_chip_conflict') {
+          // Multi-chip conflict with detailed information
+          const conflicts = errorData?.conflicts || [];
+          const message = errorData?.message || 'Multi-chip model requires all slots to be free';
+
+          customToast.error(
+            <div className="max-w-md">
+              <p className="font-bold mb-2">Multi-chip Deployment Conflict</p>
+              <p className="text-sm mb-2">{message}</p>
+
+              {conflicts.length > 0 && (
+                <div className="mt-3 p-2 bg-red-100 dark:bg-red-900/30 rounded">
+                  <p className="text-xs font-semibold mb-1">Stop these models first:</p>
+                  <ul className="text-xs space-y-1">
+                    {conflicts.map((c: any, i: number) => (
+                      <li key={i} className="flex items-center justify-between">
+                        <span>• {c.model} (slot {c.slot})</span>
+                      </li>
+                    ))}
+                  </ul>
+                  <p className="text-xs mt-2 italic">Go to Models Deployed page to stop models.</p>
+                </div>
+              )}
+            </div>,
+            { duration: 15000 }
+          );
+
+          return { success: false };
+        } else if (errorType === 'allocation_failed') {
+          // General allocation failure (all slots occupied)
+          const message = errorData?.message || 'All chip slots are occupied';
+          customToast.error(`Chip Allocation Failed: ${message}`, { duration: 10000 });
+          return { success: false };
+        }
+      }
+
       // Extract error message and job_id from response if available
       const errorMessage =
         axios.isAxiosError(error) && error.response?.data?.message
@@ -186,6 +270,17 @@ export default function StepperDemo() {
     }
   };
 
+  // Wait until we know total_slots to avoid re-mounting Stepper mid-render
+  if (totalSlots === null) {
+    return (
+      <div className="flex flex-col gap-4 w-full max-w-6xl mx-auto px-6 md:px-8 lg:px-12 pt-8 pb-4 md:pt-12 md:pb-8">
+        <div className="p-8 text-sm text-gray-500 font-mono animate-pulse">
+          Detecting hardware...
+        </div>
+      </div>
+    );
+  }
+
   return (
     <div className="flex flex-col gap-4 w-full max-w-6xl mx-auto px-6 md:px-8 lg:px-12 pt-8 pb-4 md:pt-12 md:pb-8">
       <ElevatedCard
@@ -207,21 +302,48 @@ export default function StepperDemo() {
               description={step.description}
               className="mb-4"
             >
-              {step.label === "Step 1" && (
+              {/* Multi-chip flow: Step 1 = Hardware Config */}
+              {isMultiChipBoard && step.label === "Step 1" && (
+                <ChipConfigStep
+                  onConfirm={(mode, slotId) => {
+                    setChipMode(mode);
+                    setSelectedDeviceId(slotId);
+                  }}
+                />
+              )}
+              {/* Multi-chip flow: Step 2 = Model Selection (with chipMode filter) */}
+              {isMultiChipBoard && step.label === "Step 2" && (
+                <FirstStepForm
+                  setSelectedModel={(modelId: string) => {
+                    console.log("🔄 setSelectedModel called with:", modelId);
+                    setSelectedModel(modelId);
+                  }}
+                  setSelectedDeviceId={setSelectedDeviceId}
+                  setFormError={setFormError}
+                  autoDeployModel={autoDeployModel}
+                  isAutoDeploying={isAutoDeploying}
+                  chipMode={chipMode ?? undefined}
+                />
+              )}
+              {/* Single-chip flow: Step 1 = Model Selection (no chipMode filter) */}
+              {!isMultiChipBoard && step.label === "Step 1" && (
                 <FirstStepForm
                   setSelectedModel={(modelId: string) => {
                     console.log("🔄 setSelectedModel called with:", modelId);
                     setSelectedModel(modelId);
                   }}
+                  setSelectedDeviceId={setSelectedDeviceId}
                   setFormError={setFormError}
                   autoDeployModel={autoDeployModel}
                   isAutoDeploying={isAutoDeploying}
                 />
               )}
+              {/* Both flows: Final Step = Deploy */}
               {step.label === "Final Step" && (
                 <DeployModelStep
                   selectedModel={selectedModel}
                   handleDeploy={handleDeploy}
+                  selectedDeviceId={isMultiChipBoard ? selectedDeviceId : undefined}
                 />
               )}
             </Step>
diff --git a/app/frontend/src/components/chatui/runInference.ts b/app/frontend/src/components/chatui/runInference.ts
index 251c7cd0..32a8abdc 100644
--- a/app/frontend/src/components/chatui/runInference.ts
+++ b/app/frontend/src/components/chatui/runInference.ts
@@ -309,7 +309,7 @@ export const runInference = async (
                 const jsonData = JSON.parse(trimmedLine.slice(5));
 
                 // Handle final statistics from backend (after [DONE])
-                if (!isAgentSelected && jsonData.ttft && jsonData.tpot) {
+                if (!isAgentSelected && jsonData.tokens_decoded !== undefined) {
                   const backendStats: InferenceStats = {
                     user_ttft_s: jsonData.ttft,
                     user_tpot: jsonData.tpot,
@@ -331,8 +331,8 @@ export const runInference = async (
                   metricsTracker.recordUsage(usage);
                 }
 
-                // Handle generated text content
-                const content = jsonData.choices[0]?.delta?.content || "";
+                // Text delta: chat completions use delta.content, text completions use text; chunks without `choices` yield "".
+                const content = jsonData.choices?.[0]?.delta?.content ?? jsonData.choices?.[0]?.text ?? "";
                 if (content) {
                   // Record first token arrival
                   metricsTracker.recordFirstToken();
diff --git a/app/frontend/src/components/models/DeleteModelDialog.tsx b/app/frontend/src/components/models/DeleteModelDialog.tsx
index 887436f2..e8948286 100644
--- a/app/frontend/src/components/models/DeleteModelDialog.tsx
+++ b/app/frontend/src/components/models/DeleteModelDialog.tsx
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
 
-// React import not needed for modern JSX transform
+import type { ReactNode } from "react";
+import { AlertTriangle, CheckCircle, Loader2, Trash2, RotateCcw } from "lucide-react";
 import {
   Dialog,
   DialogContent,
@@ -10,70 +11,176 @@ import {
   DialogTitle,
 } from "../ui/dialog";
 import { Button } from "../ui/button";
-import { AlertTriangle } from "lucide-react";
+
+export type DeleteStep = "deleting" | "resetting" | null; // phase of the delete flow shown by the dialog; null when no step is running
 
 interface Props {
   open: boolean;
   modelId: string;
   isLoading: boolean;
+  deleteStep: DeleteStep; // phase of the in-flight delete; selects which step row is active/done and the dialog title
   onConfirm: () => void;
   onCancel: () => void;
 }
 
+function StepRow({ // Renders one row of the step list: a leading status indicator, then icon + label and optional status text.
+  number,
+  icon,
+  label,
+  sublabel,
+  state,
+}: {
+  number: number; // step index; shown in the grey badge only while the step is pending
+  icon: ReactNode; // small glyph rendered inline immediately before the label
+  label: string; // main step text
+  sublabel?: string; // extra detail line, rendered only while state === "active"
+  state: "pending" | "active" | "done"; // selects row colours and the leading badge / spinner / check mark
+}) {
+  return (
+    <div
+      className={`flex items-start gap-3 p-3 rounded-lg border transition-all duration-300 ${
+        state === "active"
+          ? "bg-blue-900/30 border-blue-500/40"
+          : state === "done"
+            ? "bg-green-900/20 border-green-600/30"
+            : "bg-stone-800/50 border-stone-700/40"
+      }`}
+    >
+      <div className="w-7 h-7 flex items-center justify-center shrink-0 mt-0.5">
+        {state === "active" ? (
+          <Loader2 className="w-5 h-5 text-blue-400 animate-spin" />
+        ) : state === "done" ? (
+          <CheckCircle className="w-5 h-5 text-green-400" />
+        ) : (
+          <div className="w-6 h-6 rounded-full bg-stone-600 flex items-center justify-center text-xs font-bold text-stone-300">
+            {number}
+          </div>
+        )}
+      </div>
+      <div className="flex-1 min-w-0">
+        <div
+          className={`font-medium text-sm ${
+            state === "pending" ? "text-stone-400" : "text-white"
+          }`}
+        >
+          <span className="inline-flex items-center gap-1.5">
+            {icon}
+            {label}
+          </span>
+        </div>
+        {sublabel && state === "active" && (
+          <div className="text-xs text-blue-300 mt-1">{sublabel}</div>
+        )}
+        {state === "done" && (
+          <div className="text-xs text-green-400 mt-0.5">Completed</div>
+        )}
+      </div>
+    </div>
+  );
+}
+
 export default function DeleteModelDialog({
   open,
-  modelId: _modelId, // Marked as intentionally unused for now
+  modelId: _modelId,
   isLoading,
+  deleteStep,
   onConfirm,
   onCancel,
 }: Props) {
+  const step1State =
+    deleteStep === "deleting"
+      ? "active"
+      : deleteStep === "resetting"
+        ? "done"
+        : "pending";
+
+  const step2State =
+    deleteStep === "resetting" ? "active" : "pending";
+
   return (
-    <Dialog open={open} onOpenChange={(v) => !v && onCancel()}>
-      <DialogContent className="sm:max-w-md p-6 rounded-xl shadow-2xl bg-stone-900/95 text-white border-2 border-yellow-500/50 backdrop-blur-md">
+    <Dialog open={open} onOpenChange={(v) => !v && !isLoading && onCancel()}>
+      <DialogContent className="sm:max-w-md p-6 rounded-xl shadow-2xl bg-stone-900 text-white border border-stone-700 backdrop-blur-md">
         <DialogHeader>
-          <div className="flex items-center justify-between mb-4">
-            <div className="flex items-center">
-              <AlertTriangle className="h-8 w-8 text-yellow-500 mr-2" />
-              <DialogTitle className="text-lg font-semibold text-white">
-                Delete Model & Reset Card
+          <div className="flex items-center gap-3 mb-1">
+            {isLoading ? (
+              <div className="w-9 h-9 rounded-full bg-blue-900/50 flex items-center justify-center">
+                <Loader2 className="h-5 w-5 text-blue-400 animate-spin" />
+              </div>
+            ) : (
+              <div className="w-9 h-9 rounded-full bg-yellow-900/50 flex items-center justify-center">
+                <AlertTriangle className="h-5 w-5 text-yellow-400" />
+              </div>
+            )}
+            <div>
+              <DialogTitle className="text-base font-semibold text-white leading-tight">
+                {isLoading
+                  ? deleteStep === "deleting"
+                    ? "Removing model…"
+                    : "Resetting board…"
+                  : "Delete Model & Reset Card"}
               </DialogTitle>
+              {isLoading && (
+                <p className="text-xs text-stone-400 mt-0.5">
+                  Step {deleteStep === "deleting" ? "1" : "2"} of 2 — do not close this window
+                </p>
+              )}
             </div>
           </div>
         </DialogHeader>
-        <div className="mb-4 p-4 bg-yellow-900/30 text-yellow-100 rounded-lg border border-yellow-500/30 backdrop-blur-sm flex items-start">
-          <AlertTriangle className="h-5 w-5 text-yellow-400 mr-2 mt-1 shrink-0" />
-          <div>
-            <div className="font-bold mb-1 text-yellow-100">
-              Warning! This action will stop and remove the model, then reset
-              the card.
-            </div>
-            <div className="text-sm text-yellow-200">
-              Deleting a model will attempt to stop and remove the model
-              container.
-              <br />
-              After deletion, the card will automatically be reset using{" "}
-              <code>tt-smi reset</code>.
-              <br />
-              <span className="font-bold text-yellow-300">
-                This may interrupt any ongoing processes on the card.
-              </span>
-            </div>
-          </div>
+
+        <div className="space-y-2 mt-2">
+          {/* Step 1 */}
+          <StepRow
+            number={1}
+            icon={<Trash2 className="w-3.5 h-3.5" />}
+            label="Stop & remove model container"
+            sublabel="Sending stop signal to the container…"
+            state={step1State}
+          />
+
+          {/* Step 2 */}
+          <StepRow
+            number={2}
+            icon={<RotateCcw className="w-3.5 h-3.5" />}
+            label="Reset the board"
+            sublabel="Running tt-smi -r, this may take 10–30 seconds…"
+            state={step2State}
+          />
         </div>
-        <DialogFooter className="mt-4 flex justify-end space-x-2">
+
+        {/* Warning — only shown when idle */}
+        {!isLoading && (
+          <div className="mt-4 flex items-start gap-2 p-3 bg-red-950/40 rounded-lg border border-red-500/25 text-red-200 text-sm">
+            <AlertTriangle className="h-4 w-4 text-red-400 mt-0.5 shrink-0" />
+            <span>
+              <strong className="text-red-300">Warning:</strong> This will
+              interrupt any ongoing processes on the card and cannot be undone.
+            </span>
+          </div>
+        )}
+
+        <DialogFooter className="mt-5 flex justify-end gap-2">
           <Button
+            variant="outline"
             onClick={onCancel}
             disabled={isLoading}
-            className="hover:shadow-lg hover:shadow-stone-200/20 transition-all duration-300 hover:-translate-y-0.5 active:translate-y-0 rounded-lg"
+            className="border-stone-600 text-stone-300 hover:bg-stone-800"
           >
             Cancel
           </Button>
           <Button
             onClick={onConfirm}
-            className="bg-red-600 text-white hover:bg-red-700 hover:shadow-lg hover:shadow-red-500/30 transition-all duration-300 hover:-translate-y-0.5 active:translate-y-0 rounded-lg border border-red-500/30"
             disabled={isLoading}
+            className="bg-red-600 text-white hover:bg-red-700 border border-red-500/30 min-w-[130px]"
           >
-            {isLoading ? "Processing..." : `Yes, Delete & Reset`}
+            {isLoading ? (
+              <span className="flex items-center gap-2">
+                <Loader2 className="w-4 h-4 animate-spin" />
+                Processing…
+              </span>
+            ) : (
+              "Delete & Reset"
+            )}
           </Button>
         </DialogFooter>
       </DialogContent>
diff --git a/app/frontend/src/components/models/ModelsDeployedCard.tsx b/app/frontend/src/components/models/ModelsDeployedCard.tsx
index 2f68ca20..e9e5e696 100644
--- a/app/frontend/src/components/models/ModelsDeployedCard.tsx
+++ b/app/frontend/src/components/models/ModelsDeployedCard.tsx
@@ -22,6 +22,8 @@ import {
   handleRedeploy,
   handleModelNavigationClick,
   fetchModels,
+  fetchDeployedModelsInfo,
+  getModelTypeFromBackendType,
 } from "../../api/modelsDeployedApis";
 import type {
   ColumnVisibilityMap,
@@ -30,10 +32,12 @@ import type {
 } from "../../types/models";
 import ModelsToolbar from "./ModelsToolbar.tsx";
 import ModelsTable from "./ModelsTable.tsx";
-import DeleteModelDialog from "./DeleteModelDialog.tsx";
+import DeleteModelDialog, { type DeleteStep } from "./DeleteModelDialog.tsx";
 import LogStreamDialog from "./Logs/LogStreamDialog.tsx";
 import { useNavigate } from "react-router-dom";
 import { useTablePrefs } from "../../hooks/useTablePrefs";
+import axios from "axios";
+import { ChipStatusDisplay } from "../ChipStatusDisplay";
 
 export default function ModelsDeployedCard(): JSX.Element {
   const { models, setModels, refreshModels } = useModels();
@@ -43,6 +47,27 @@ export default function ModelsDeployedCard(): JSX.Element {
   const [loading, setLoading] = useState(true);
   const [loadError, setLoadError] = useState<string | null>(null);
 
+  // Chip slot status for multi-chip boards
+  const [chipStatus, setChipStatus] = useState<{
+    board_type: string;
+    total_slots: number;
+    slots: { slot_id: number; status: string; model_name?: string; deployment_id?: number; is_multi_chip?: boolean }[];
+  } | null>(null);
+
+  useEffect(() => {
+    const fetchChipStatus = () => {
+      axios
+        .get("/docker-api/chip-status/")
+        .then((res) => setChipStatus(res.data))
+        .catch(() => setChipStatus(null));
+    };
+    fetchChipStatus();
+    const interval = setInterval(fetchChipStatus, 7 * 60 * 1000);
+    return () => clearInterval(interval);
+  }, [refreshTrigger]);
+
+  const isMultiChipBoard = chipStatus !== null && chipStatus.total_slots > 1;
+
   const { isRefreshing, refreshAllHealth, register } = useHealthRefresh();
   const {
     value: columns,
@@ -59,7 +84,10 @@ export default function ModelsDeployedCard(): JSX.Element {
     setLoadError(null);
     try {
       const fetched = await fetchModels();
-      setModels(fetched);
+      const deployedInfo = await fetchDeployedModelsInfo();
+      const typeById = Object.fromEntries(deployedInfo.map(d => [d.id, d.model_type]));
+      const enriched = fetched.map(m => ({ ...m, model_type: m.model_type ?? typeById[m.id] }));
+      setModels(enriched);
       if (fetched.length === 0) {
         triggerRefresh();
       }
@@ -131,6 +159,7 @@ export default function ModelsDeployedCard(): JSX.Element {
   const [showDeleteModal, setShowDeleteModal] = useState(false);
   const [deleteTargetId, setDeleteTargetId] = useState<string | null>(null);
   const [isProcessingDelete, setIsProcessingDelete] = useState(false);
+  const [deleteStep, setDeleteStep] = useState<DeleteStep>(null);
 
   useEffect(() => {
     loadModels();
@@ -150,30 +179,28 @@ export default function ModelsDeployedCard(): JSX.Element {
     setIsProcessingDelete(true);
     const truncatedModelId = deleteTargetId.substring(0, 4);
     try {
+      // Step 1: stop & remove the model (backend also runs tt-smi -r internally)
+      setDeleteStep("deleting");
       await customToast.promise(deleteModel(deleteTargetId), {
-        loading: `Attempting to delete Model ID: ${truncatedModelId}...`,
-        success: `Model ID: ${truncatedModelId} has been deleted.`,
-        error: `Failed to delete Model ID: ${truncatedModelId}.`,
+        loading: `Stopping model ${truncatedModelId}…`,
+        success: `Model ${truncatedModelId} stopped.`,
+        error: `Failed to stop model ${truncatedModelId}.`,
       });
-      // Simulate resetCard same as original placeholder
-      await customToast.promise(
-        new Promise((resolve) => window.setTimeout(resolve, 2000)),
-        {
-          loading: "Resetting card (tt-smi reset)...",
-          success: "Card reset successfully!",
-          error: "Failed to reset card.",
-        }
-      );
+
+      // Step 2: board reset is handled by the stop API, show progress while cleanup settles
+      setDeleteStep("resetting");
+      await new Promise((resolve) => window.setTimeout(resolve, 2000));
+
       await refreshModels();
       triggerHardwareRefresh();
       setShowDeleteModal(false);
       setDeleteTargetId(null);
-      // Slight delay then refresh health
       window.setTimeout(() => {
         refreshAllHealth();
       }, 1000);
     } finally {
       setIsProcessingDelete(false);
+      setDeleteStep(null);
     }
   }, [deleteTargetId, refreshModels, triggerHardwareRefresh, refreshAllHealth]);
 
@@ -301,6 +328,18 @@ export default function ModelsDeployedCard(): JSX.Element {
           />
         </div>
       </CardHeader>
+
+      {/* Chip slot visualization for multi-chip boards */}
+      {isMultiChipBoard && chipStatus && (
+        <div className="px-6 pb-4">
+          <ChipStatusDisplay
+            boardType={chipStatus.board_type}
+            totalSlots={chipStatus.total_slots}
+            slots={chipStatus.slots as any}
+          />
+        </div>
+      )}
+
       <div
         className={`${selectedContainerId ? "blur-sm backdrop-blur-sm" : ""} transition-all duration-200`}
       >
@@ -317,9 +356,13 @@ export default function ModelsDeployedCard(): JSX.Element {
                   setShowDeleteModal(true);
                 }}
                 onRedeploy={(image?: string) => image && handleRedeploy(image)}
-                onNavigateToModel={(id: string, name: string) =>
-                  handleModelNavigationClick(id, name, navigate)
-                }
+                onNavigateToModel={(id: string, name: string) => {
+                  const row = rows.find((r) => r.id === id);
+                  const frontendType = row?.model_type
+                    ? getModelTypeFromBackendType(row.model_type)
+                    : undefined;
+                  handleModelNavigationClick(id, name, navigate, frontendType);
+                }}
                 onOpenApi={(id: string) => {
                   const encoded = encodeURIComponent(id);
                   window.location.href = `/api-info/${encoded}`;
@@ -353,8 +396,9 @@ export default function ModelsDeployedCard(): JSX.Element {
         open={showDeleteModal}
         modelId={deleteTargetId || ""}
         isLoading={isProcessingDelete}
+        deleteStep={deleteStep}
         onConfirm={handleConfirmDelete}
-        onCancel={() => setShowDeleteModal(false)}
+        onCancel={() => !isProcessingDelete && setShowDeleteModal(false)}
       />
     </ElevatedCard>
   );
diff --git a/app/frontend/src/components/models/ModelsTable.tsx b/app/frontend/src/components/models/ModelsTable.tsx
index d02e3a70..aa7242eb 100644
--- a/app/frontend/src/components/models/ModelsTable.tsx
+++ b/app/frontend/src/components/models/ModelsTable.tsx
@@ -12,6 +12,7 @@ import {
 } from "../ui/table";
 import {
   Activity,
+  Cpu,
   Heart,
   Network,
   // Settings,
@@ -129,6 +130,13 @@ export default function ModelsTable({
             />
             Model Name
           </TableHead>
+          <TableHead className="text-right font-semibold">
+            <Cpu
+              className="inline-block mr-2 text-TT-purple-accent"
+              size={16}
+            />
+            Chip
+          </TableHead>
           {image && (
             <TableHead className="text-right font-semibold">
               <div className="flex items-center">
@@ -178,6 +186,7 @@ export default function ModelsTable({
           const isExpanded = !!expanded[row.id];
           const colCount =
             1 /* name */ +
+            1 /* chip */ +
             1 /* status */ +
             1 /* health */ +
             1 /* manage */ +
@@ -206,6 +215,16 @@ export default function ModelsTable({
                     <ModelNameCell name={row.name} />
                   </button>
                 </TableCell>
+                <TableCell className="text-right">
+                  {row.device_id != null ? (
+                    <span className="inline-flex items-center gap-1.5 text-xs font-mono px-2 py-1 rounded-full bg-TT-purple-shade/40 text-TT-purple border border-TT-purple-accent/30">
+                      <Cpu className="w-3 h-3" />
+                      Slot {String(row.device_id).padStart(2, "0")}
+                    </span>
+                  ) : (
+                    <span className="text-xs text-gray-500">—</span>
+                  )}
+                </TableCell>
                 {image ? (
                   <TableCell className="text-right">
                     <ImageCell image={row.image} />
@@ -231,6 +250,7 @@ export default function ModelsTable({
                     id={row.id}
                     name={row.name}
                     image={row.image}
+                    model_type={row.model_type}
                     health={healthMap[row.id]}
                     onDelete={onDelete}
                     onRedeploy={onRedeploy}
@@ -259,6 +279,10 @@ export default function ModelsTable({
                         </div>
                         <CopyableText text={row.image ?? ""} />
                       </div>
+                      <div className="min-w-0">
+                        <div className="text-xs text-stone-500 mb-1">Chip Slot</div>
+                        <CopyableText text={row.device_id != null ? `Slot ${row.device_id}` : "N/A"} />
+                      </div>
                       <div className="min-w-0">
                         <div className="text-xs text-stone-500 mb-1">Ports</div>
                         <CopyableText text={row.ports ?? ""} />
diff --git a/app/frontend/src/components/models/row-cells/ManageCell.tsx b/app/frontend/src/components/models/row-cells/ManageCell.tsx
index 5e863c6f..4804a75e 100644
--- a/app/frontend/src/components/models/row-cells/ManageCell.tsx
+++ b/app/frontend/src/components/models/row-cells/ManageCell.tsx
@@ -11,10 +11,12 @@ import {
   Image as ImageIcon,
   Crosshair,
   Mic,
+  Volume2,
 } from "lucide-react";
 import type { HealthStatus } from "../../../types/models";
 import {
   getModelTypeFromName,
+  getModelTypeFromBackendType,
   ModelType,
 } from "../../../api/modelsDeployedApis";
 
@@ -22,6 +24,7 @@ interface Props {
   id: string;
   name?: string;
   image?: string;
+  model_type?: string;
   health?: HealthStatus;
   onDelete: (id: string) => void;
   onRedeploy: (image?: string) => void;
@@ -33,6 +36,7 @@ export default React.memo(function ManageCell({
   id,
   name,
   image: _image,
+  model_type,
   health,
   onDelete,
   onRedeploy: _onRedeploy,
@@ -48,7 +52,9 @@ export default React.memo(function ManageCell({
   const dangerBtn =
     "!border-red-400/70 !text-red-300 !bg-red-600/20 hover:!bg-red-600/30 shadow-[0_8px_24px_rgba(255,0,0,0.15)]";
 
-  const modelType = getModelTypeFromName(name ?? "");
+  const modelType = model_type
+    ? getModelTypeFromBackendType(model_type)
+    : getModelTypeFromName(name ?? "");
   const openLabel =
     modelType === ModelType.ImageGeneration
       ? "Image Gen"
@@ -56,7 +62,9 @@ export default React.memo(function ManageCell({
         ? "Object Detect"
         : modelType === ModelType.SpeechRecognitionModel
           ? "Speech"
-          : "Chat";
+          : modelType === ModelType.TTS
+            ? "TTS"
+            : "Chat";
   const OpenIcon =
     modelType === ModelType.ImageGeneration
       ? ImageIcon
@@ -64,7 +72,9 @@ export default React.memo(function ManageCell({
         ? Crosshair
         : modelType === ModelType.SpeechRecognitionModel
           ? Mic
-          : MessageSquareText;
+          : modelType === ModelType.TTS
+            ? Volume2
+            : MessageSquareText;
 
   return (
     <div className="relative flex items-center justify-center gap-2 flex-wrap">
diff --git a/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx
new file mode 100644
index 00000000..fcdf11dc
--- /dev/null
+++ b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { useEffect, useRef, useState } from "react";
+import { Mic, Square, Volume2, CheckCircle, Loader2, Circle } from "lucide-react";
+import { Button } from "../ui/button";
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "../ui/select";
+import { runVoicePipeline } from "../../api/modelsDeployedApis";
+import { customToast } from "../CustomToaster";
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+interface DeployedModelInfo { // dropdown entry shape produced by fetchDeployedByType below
+  id: string; // key of the entry in the /models-api/deployed/ response object
+  modelName: string; // model_impl.model_name, else hf_model_id, else "Unknown"
+  model_type?: string; // model_impl.model_type; fetchDeployedByType filters on it
+}
+
+type PipelineStage = "idle" | "recording" | "stt" | "llm" | "tts" | "done"; // UI progress states; their ordering lives in STAGE_ORDER
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Returns the deployments whose model_impl.model_type equals `modelType`.
+// A non-OK HTTP status yields an empty list.
+async function fetchDeployedByType(modelType: string): Promise<DeployedModelInfo[]> {
+  try {
+    const response = await fetch("/models-api/deployed/");
+    if (!response.ok) return [];
+    const payload = await response.json();
+    const matches: DeployedModelInfo[] = [];
+    for (const [id, info] of Object.entries(payload) as [string, any][]) {
+      const impl = info.model_impl;
+      if (impl?.model_type !== modelType) continue;
+      // Display name preference: model_name, then hf_model_id, then "Unknown".
+      const modelName = impl?.model_name || impl?.hf_model_id || "Unknown";
+      matches.push({ id, modelName, model_type: impl?.model_type });
+    }
+    return matches;
+  } catch {
+    // Request and parse failures are reported as "nothing deployed".
+    return [];
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Stage indicator
+// ---------------------------------------------------------------------------
+
+const STAGES: { key: PipelineStage; label: string }[] = [ // markers rendered by StageIndicator, in display order
+  { key: "recording", label: "Mic" },
+  { key: "stt", label: "Whisper" },
+  { key: "llm", label: "LLM" },
+  { key: "tts", label: "TTS" },
+];
+
+const STAGE_ORDER: Record<PipelineStage, number> = { // rank per stage; StageIndicator shows a marker as done when the current rank is greater
+  idle: -1, // lower than every marker, so none shows as done
+  recording: 0,
+  stt: 1,
+  llm: 2,
+  tts: 3,
+  done: 4, // higher than every marker, so all show as done
+};
+
+// Draws one marker per STAGES entry: a check once `current` has moved past it,
+// a spinner while it is `current`, and a hollow circle otherwise.
+function StageIndicator({ current }: { current: PipelineStage }) {
+  const currentRank = STAGE_ORDER[current];
+  const pickIcon = (passed: boolean, active: boolean) => {
+    if (passed) return <CheckCircle className="w-5 h-5 text-green-500" />;
+    if (active) return <Loader2 className="w-5 h-5 text-TT-purple-accent animate-spin" />;
+    return <Circle className="w-5 h-5 text-gray-400" />;
+  };
+
+  return (
+    <div className="flex items-center gap-2">
+      {STAGES.map(({ key, label }, position) => {
+        const passed = currentRank > STAGE_ORDER[key];
+        // The bar leading into a marker turns green once that marker is done.
+        return (
+          <div key={key} className="flex items-center gap-2">
+            {position > 0 ? (
+              <div
+                className={`h-0.5 w-8 ${passed ? "bg-green-500" : "bg-gray-300 dark:bg-gray-600"}`}
+              />
+            ) : null}
+            <div className="flex flex-col items-center gap-1">
+              {pickIcon(passed, current === key)}
+              <span className="text-xs text-gray-500 dark:text-gray-400">
+                {label}
+              </span>
+            </div>
+          </div>
+        );
+      })}
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Main component
+// ---------------------------------------------------------------------------
+
+export default function VoicePipelineDemo() {
+  // Model dropdowns
+  const [sttModels, setSttModels] = useState<DeployedModelInfo[]>([]);
+  const [llmModels, setLlmModels] = useState<DeployedModelInfo[]>([]);
+  const [ttsModels, setTtsModels] = useState<DeployedModelInfo[]>([]);
+
+  const [whisperDeployId, setWhisperDeployId] = useState("");
+  const [llmDeployId, setLlmDeployId] = useState("");
+  const [ttsDeployId, setTtsDeployId] = useState("");
+
+  // Recording
+  const [isRecording, setIsRecording] = useState(false);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+
+  // Pipeline state
+  const [stage, setStage] = useState<PipelineStage>("idle");
+  const [transcript, setTranscript] = useState("");
+  const [llmResponse, setLlmResponse] = useState("");
+  const [audioUrl, setAudioUrl] = useState<string | null>(null);
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+
+  // Fetch deployed models on mount
+  useEffect(() => {
+    Promise.all([
+      fetchDeployedByType("speech_recognition"),
+      fetchDeployedByType("chat"),
+      fetchDeployedByType("tts"),
+    ]).then(([stt, llm, tts]) => {
+      setSttModels(stt);
+      setLlmModels(llm);
+      setTtsModels(tts);
+      if (stt.length > 0) setWhisperDeployId(stt[0].id);
+      if (llm.length > 0) setLlmDeployId(llm[0].id);
+      if (tts.length > 0) setTtsDeployId(tts[0].id);
+    });
+  }, []);
+
+  const startRecording = async () => { // requests the mic, starts a MediaRecorder, and clears the previous run's outputs
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const mr = new MediaRecorder(stream);
+      chunksRef.current = []; // discard chunks left over from an earlier take
+      mr.ondataavailable = (e) => chunksRef.current.push(e.data);
+      mr.start();
+      mediaRecorderRef.current = mr; // stopRecording reads the recorder back from this ref
+      setIsRecording(true);
+      setStage("recording");
+      setTranscript("");
+      setLlmResponse("");
+      setAudioUrl(null);
+    } catch (err) {
+      customToast.error("Microphone access denied"); // NOTE(review): also shown when MediaRecorder setup throws, not only on permission denial
+    }
+  };
+
+  // Stops the recorder and the mic tracks; once stopped, the chunks go through the pipeline.
+  const stopRecording = () => {
+    const recorder = mediaRecorderRef.current;
+    if (!recorder) return;
+    recorder.onstop = async () => {
+      const recording = new Blob(chunksRef.current, { type: "audio/webm" });
+      await runPipeline(new File([recording], "recording.webm", { type: "audio/webm" }));
+    };
+    recorder.stop();
+    for (const track of recorder.stream.getTracks()) track.stop();
+    setIsRecording(false);
+  };
+
+  // Sends one recording through runVoicePipeline and mirrors its callbacks into component state.
+  const runPipeline = async (audioFile: File) => {
+    if (!whisperDeployId || !llmDeployId) {
+      customToast.error("Please select STT and LLM models");
+      setStage("idle");
+      return;
+    }
+
+    setStage("stt");
+    try {
+      await runVoicePipeline(
+        {
+          audioFile,
+          whisperDeployId,
+          llmDeployId,
+          ttsDeployId: ttsDeployId || undefined, // an empty selection is sent as undefined
+        },
+        // onTranscript
+        (text) => {
+          setTranscript(text);
+          setStage("llm");
+        },
+        // onLlmChunk: append streamed text as it arrives
+        (chunk) => setLlmResponse((prev) => prev + chunk),
+        // onAudio
+        (url) => {
+          setAudioUrl(url);
+          setStage("tts");
+          // Auto-play shortly after the URL has been stored
+          setTimeout(() => {
+            if (audioRef.current) {
+              audioRef.current.src = url;
+              audioRef.current.play().catch(() => {});
+            }
+          }, 100);
+        },
+        // onError (named failedStage so it does not shadow the `stage` state)
+        (failedStage, message) => {
+          customToast.error(`Pipeline error (${failedStage}): ${message}`);
+          setStage("idle");
+        },
+        // onDone
+        () => setStage("done")
+      );
+    } catch (err) {
+      // Without this, a rejected request leaves the stage at "stt" and the record button disabled.
+      customToast.error(`Pipeline error: ${err instanceof Error ? err.message : "Unknown error"}`);
+      setStage("idle");
+    }
+  };
+
+  return (
+    <div className="flex flex-col gap-6 max-w-3xl mx-auto px-4 py-8">
+      <h1 className="text-2xl font-bold text-gray-900 dark:text-white">
+        Voice Pipeline Demo
+      </h1>
+      <p className="text-sm text-gray-500 dark:text-gray-400">
+        Mic → Whisper STT → LLM → TTS → Speaker
+      </p>
+
+      {/* Model selectors */}
+      <div className="grid grid-cols-1 sm:grid-cols-3 gap-4">
+        <div className="flex flex-col gap-1">
+          <label className="text-xs font-semibold text-gray-600 dark:text-gray-300">
+            STT (Whisper)
+          </label>
+          <Select value={whisperDeployId} onValueChange={setWhisperDeployId}>
+            <SelectTrigger>
+              <SelectValue
+                placeholder={
+                  sttModels.length === 0 ? "No STT deployed" : "Select STT"
+                }
+              />
+            </SelectTrigger>
+            <SelectContent>
+              {sttModels.map((m) => (
+                <SelectItem key={m.id} value={m.id}>
+                  {m.modelName}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+
+        <div className="flex flex-col gap-1">
+          <label className="text-xs font-semibold text-gray-600 dark:text-gray-300">
+            LLM
+          </label>
+          <Select value={llmDeployId} onValueChange={setLlmDeployId}>
+            <SelectTrigger>
+              <SelectValue
+                placeholder={
+                  llmModels.length === 0 ? "No LLM deployed" : "Select LLM"
+                }
+              />
+            </SelectTrigger>
+            <SelectContent>
+              {llmModels.map((m) => (
+                <SelectItem key={m.id} value={m.id}>
+                  {m.modelName}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+
+        <div className="flex flex-col gap-1">
+          <label className="text-xs font-semibold text-gray-600 dark:text-gray-300">
+            TTS (optional)
+          </label>
+          <Select
+            value={ttsDeployId}
+            onValueChange={setTtsDeployId}
+          >
+            <SelectTrigger>
+              <SelectValue placeholder="None (skip TTS)" />
+            </SelectTrigger>
+            <SelectContent>
+              <SelectItem value="">None</SelectItem>
+              {ttsModels.map((m) => (
+                <SelectItem key={m.id} value={m.id}>
+                  {m.modelName}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        </div>
+      </div>
+
+      {/* Stage indicator */}
+      <div className="flex justify-center py-2">
+        <StageIndicator current={stage} />
+      </div>
+
+      {/* Record button */}
+      <div className="flex justify-center">
+        {isRecording ? (
+          <Button
+            variant="destructive"
+            size="lg"
+            className="flex items-center gap-2 px-8"
+            onClick={stopRecording}
+          >
+            <Square className="w-5 h-5" />
+            Stop Recording
+          </Button>
+        ) : (
+          <Button
+            size="lg"
+            className="flex items-center gap-2 px-8 bg-TT-purple-accent hover:bg-TT-purple text-white"
+            onClick={startRecording}
+            disabled={stage !== "idle" && stage !== "done"}
+          >
+            <Mic className="w-5 h-5" />
+            {stage === "idle" || stage === "done"
+              ? "Start Recording"
+              : "Processing…"}
+          </Button>
+        )}
+      </div>
+
+      {/* Outputs */}
+      {transcript && (
+        <div className="rounded-lg border border-gray-200 dark:border-gray-700 p-4 bg-white dark:bg-gray-900">
+          <p className="text-xs font-semibold text-gray-500 dark:text-gray-400 mb-1">
+            Transcript
+          </p>
+          <p className="text-sm text-gray-800 dark:text-gray-100">
+            {transcript}
+          </p>
+        </div>
+      )}
+
+      {llmResponse && (
+        <div className="rounded-lg border border-gray-200 dark:border-gray-700 p-4 bg-white dark:bg-gray-900">
+          <p className="text-xs font-semibold text-gray-500 dark:text-gray-400 mb-1">
+            LLM Response
+          </p>
+          <p className="text-sm text-gray-800 dark:text-gray-100 whitespace-pre-wrap">
+            {llmResponse}
+          </p>
+        </div>
+      )}
+
+      {audioUrl && (
+        <div className="rounded-lg border border-gray-200 dark:border-gray-700 p-4 bg-white dark:bg-gray-900 flex items-center gap-4">
+          <Volume2 className="w-5 h-5 text-TT-purple-accent" />
+          <audio ref={audioRef} controls src={audioUrl} className="flex-1" />
+        </div>
+      )}
+
+      {/* Hidden audio element for autoplay */}
+      {!audioUrl && <audio ref={audioRef} className="hidden" />}
+    </div>
+  );
+}
diff --git a/app/frontend/src/components/tts/TTSDemo.tsx b/app/frontend/src/components/tts/TTSDemo.tsx
new file mode 100644
index 00000000..3d9c6cbc
--- /dev/null
+++ b/app/frontend/src/components/tts/TTSDemo.tsx
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { useEffect, useRef, useState } from "react";
+import { Volume2, Loader2, Download } from "lucide-react";
+import { motion } from "framer-motion";
+import { Button } from "../ui/button";
+import { Textarea } from "../ui/textarea";
+import { Card } from "../ui/card";
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "../ui/select";
+import { runTTSInference } from "../../api/modelsDeployedApis";
+import { customToast } from "../CustomToaster";
+
+interface DeployedModelInfo { // dropdown entry shape produced by fetchTTSModels below
+  id: string; // key of the entry in the /models-api/deployed/ response object
+  modelName: string; // model_impl.model_name, else hf_model_id, else "Unknown"
+  model_type?: string; // model_impl.model_type; fetchTTSModels keeps only "tts"
+}
+
+// Returns the deployments whose model_impl.model_type is "tts".
+async function fetchTTSModels(): Promise<DeployedModelInfo[]> {
+  try {
+    const response = await fetch("/models-api/deployed/");
+    if (!response.ok) return [];
+    const payload = await response.json();
+    const ttsOnly: DeployedModelInfo[] = [];
+    for (const [id, info] of Object.entries(payload) as [string, any][]) {
+      const impl = info.model_impl;
+      if (impl?.model_type !== "tts") continue;
+      const modelName = impl?.model_name || impl?.hf_model_id || "Unknown";
+      ttsOnly.push({ id, modelName, model_type: impl?.model_type });
+    }
+    return ttsOnly;
+  } catch {
+    // Request and parse failures are reported as "no models".
+    return [];
+  }
+}
+
+export default function TTSDemo() {
+  const [ttsModels, setTtsModels] = useState<DeployedModelInfo[]>([]);
+  const [selectedDeployId, setSelectedDeployId] = useState("");
+  const [text, setText] = useState("");
+  const [audioUrl, setAudioUrl] = useState<string | null>(null);
+  const [isLoading, setIsLoading] = useState(false);
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+
+  useEffect(() => {
+    fetchTTSModels().then((models) => {
+      setTtsModels(models);
+      if (models.length > 0) setSelectedDeployId(models[0].id);
+    });
+  }, []);
+
+  // Revoke previous object URL to avoid memory leaks
+  useEffect(() => {
+    return () => {
+      if (audioUrl) URL.revokeObjectURL(audioUrl);
+    };
+  }, [audioUrl]);
+
+  // Validates the inputs, requests audio for the trimmed text, then shows and auto-plays it.
+  const handleGenerate = async () => {
+    if (!selectedDeployId) {
+      customToast.error("Please select a TTS model");
+      return;
+    }
+    const prompt = text.trim();
+    if (!prompt) {
+      customToast.error("Please enter some text to synthesize");
+      return;
+    }
+
+    setIsLoading(true);
+    if (audioUrl) {
+      URL.revokeObjectURL(audioUrl);
+      setAudioUrl(null);
+    }
+
+    try {
+      const audioBlob = await runTTSInference(selectedDeployId, prompt);
+      const objectUrl = URL.createObjectURL(audioBlob);
+      setAudioUrl(objectUrl);
+      setTimeout(() => {
+        const player = audioRef.current;
+        if (!player) return;
+        player.src = objectUrl;
+        player.play().catch(() => {});
+      }, 100);
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : "Unknown error";
+      customToast.error(`TTS generation failed: ${reason}`);
+    } finally {
+      setIsLoading(false);
+    }
+  };
+
+  // Enter submits; Shift+Enter and an Enter that confirms IME composition stay with the textarea.
+  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
+    if (e.key !== "Enter" || e.shiftKey || e.nativeEvent.isComposing) return;
+    e.preventDefault();
+    if (!isLoading && ttsModels.length > 0 && text.trim()) {
+      handleGenerate();
+    }
+  };
+
+  // Saves the current clip by clicking a throwaway <a download> that points at audioUrl.
+  const handleDownload = () => {
+    if (!audioUrl) return;
+    const link = document.createElement("a");
+    Object.assign(link, { href: audioUrl, download: "tts-output.wav" });
+    link.click();
+  };
+
+  return (
+    <Card className="flex flex-col w-full max-w-3xl max-h-[85vh] overflow-hidden shadow-xl bg-white dark:bg-black border-gray-200 dark:border-[#7C68FA]/20 rounded-2xl">
+      <div className="flex-1 overflow-auto flex items-center justify-center">
+        <div className="w-full max-w-3xl px-6 py-8 flex flex-col gap-6">
+            {/* Header */}
+            <motion.div
+              initial={{ opacity: 0, y: -10 }}
+              animate={{ opacity: 1, y: 0 }}
+              transition={{ duration: 0.3 }}
+              className="text-center"
+            >
+              <h1 className="text-4xl font-bold text-gray-900 dark:text-white">
+                Text to Speech Demo
+              </h1>
+              <p className="mt-2 text-base text-gray-600 dark:text-gray-300">
+                Type text below and generate audio using a deployed TTS model.
+              </p>
+            </motion.div>
+
+            {/* Model selector */}
+            <motion.div
+              initial={{ opacity: 0, y: 10 }}
+              animate={{ opacity: 1, y: 0 }}
+              transition={{ duration: 0.3, delay: 0.1 }}
+              className="flex flex-col gap-2"
+            >
+              <label className="text-sm font-semibold text-gray-700 dark:text-gray-200">
+                TTS Model
+              </label>
+              {ttsModels.length === 0 ? (
+                <div className="text-sm text-amber-600 dark:text-amber-400 border-2 border-amber-400 dark:border-amber-600 rounded-lg px-4 py-3 bg-amber-50 dark:bg-amber-950">
+                  No TTS models are currently deployed. Deploy a TTS model to
+                  get started.
+                </div>
+              ) : (
+                <Select
+                  value={selectedDeployId}
+                  onValueChange={setSelectedDeployId}
+                >
+                  <SelectTrigger className="h-12 text-base border-2">
+                    <SelectValue placeholder="Select TTS model" />
+                  </SelectTrigger>
+                  <SelectContent>
+                    {ttsModels.map((m) => (
+                      <SelectItem key={m.id} value={m.id}>
+                        {m.modelName}
+                      </SelectItem>
+                    ))}
+                  </SelectContent>
+                </Select>
+              )}
+            </motion.div>
+
+            {/* Text input */}
+            <motion.div
+              initial={{ opacity: 0, y: 10 }}
+              animate={{ opacity: 1, y: 0 }}
+              transition={{ duration: 0.3, delay: 0.2 }}
+              className="flex flex-col gap-2"
+            >
+              <label className="text-sm font-semibold text-gray-700 dark:text-gray-200">
+                Text to synthesize
+              </label>
+              <Textarea
+                rows={6}
+                placeholder="Enter text here…"
+                value={text}
+                onChange={(e) => setText(e.target.value)}
+                onKeyDown={handleKeyDown}
+                className="resize-none focus-visible:ring-2 focus-visible:ring-TT-purple-accent text-base border-2"
+                disabled={isLoading}
+              />
+              <p className="text-xs text-gray-500 dark:text-gray-400 flex items-center gap-2 flex-wrap">
+                <span>Press</span>
+                <kbd className="px-2 py-1 rounded font-mono text-[11px] bg-TT-purple-accent/20 dark:bg-TT-purple-accent/30 text-TT-purple-accent dark:text-TT-purple-tint1 border border-TT-purple-accent/40 dark:border-TT-purple-accent/50">
+                  Enter
+                </kbd>
+                <span>to generate</span>
+                <span className="text-gray-400 dark:text-gray-600">•</span>
+                <kbd className="px-2 py-1 rounded font-mono text-[11px] bg-TT-purple-accent/20 dark:bg-TT-purple-accent/30 text-TT-purple-accent dark:text-TT-purple-tint1 border border-TT-purple-accent/40 dark:border-TT-purple-accent/50">
+                  Shift+Enter
+                </kbd>
+                <span>for new line</span>
+              </p>
+            </motion.div>
+
+            {/* Generate button */}
+            <motion.div
+              initial={{ opacity: 0, y: 10 }}
+              animate={{ opacity: 1, y: 0 }}
+              transition={{ duration: 0.3, delay: 0.3 }}
+              className="flex justify-center"
+            >
+              <Button
+                size="lg"
+                className="flex items-center gap-2 px-12 h-14 text-lg bg-TT-purple-accent hover:bg-TT-purple text-white font-semibold transition-all duration-200 hover:shadow-xl hover:scale-105 disabled:hover:scale-100 disabled:hover:shadow-none"
+                onClick={handleGenerate}
+                disabled={isLoading || ttsModels.length === 0 || !text.trim()}
+              >
+                {isLoading ? (
+                  <>
+                    <Loader2 className="w-6 h-6 animate-spin" />
+                    Generating…
+                  </>
+                ) : (
+                  <>
+                    <Volume2 className="w-6 h-6" />
+                    Generate Audio
+                  </>
+                )}
+              </Button>
+            </motion.div>
+
+            {/* Audio player */}
+            {audioUrl && (
+              <motion.div
+                initial={{ opacity: 0, scale: 0.95 }}
+                animate={{ opacity: 1, scale: 1 }}
+                transition={{ duration: 0.3 }}
+                className="rounded-xl border-2 border-TT-purple-accent/30 dark:border-[#7C68FA]/40 p-6 bg-gradient-to-br from-purple-50 to-white dark:from-purple-950/20 dark:to-gray-900/50 flex flex-col gap-4 shadow-xl"
+              >
+                <div className="flex items-center justify-between">
+                  <div className="flex items-center gap-3">
+                    <div className="p-2.5 rounded-lg bg-TT-purple-accent/20 dark:bg-TT-purple-accent/30">
+                      <Volume2 className="w-5 h-5 text-TT-purple-accent" />
+                    </div>
+                    <span className="text-base font-semibold text-gray-800 dark:text-gray-100">
+                      Generated Audio
+                    </span>
+                  </div>
+                  <Button
+                    variant="ghost"
+                    size="sm"
+                    className="flex items-center gap-1.5 text-gray-600 hover:text-TT-purple-accent dark:text-gray-300 dark:hover:text-TT-purple-accent hover:bg-TT-purple-accent/10"
+                    onClick={handleDownload}
+                  >
+                    <Download className="w-4 h-4" />
+                    Download
+                  </Button>
+                </div>
+                <audio
+                  ref={audioRef}
+                  controls
+                  src={audioUrl}
+                  className="w-full"
+                />
+              </motion.div>
+            )}
+
+            {/* Hidden audio element before URL is set */}
+            {!audioUrl && <audio ref={audioRef} className="hidden" />}
+        </div>
+      </div>
+    </Card>
+  );
+}
diff --git a/app/frontend/src/contexts/DeviceStateContext.ts b/app/frontend/src/contexts/DeviceStateContext.ts
new file mode 100644
index 00000000..a9768597
--- /dev/null
+++ b/app/frontend/src/contexts/DeviceStateContext.ts
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { createContext } from "react";
+
+export type DeviceState = // value set of DeviceStateData.state
+  | "HEALTHY"
+  | "BAD_STATE"
+  | "RESETTING"
+  | "NOT_PRESENT"
+  | "UNKNOWN"; // NOTE(review): the meaning of each state is defined by the backend — confirm there
+
+export interface DeviceInfo { // element type of DeviceStateData.devices
+  index: number; // presumably the device's position on the host — TODO confirm
+  board_type: string;
+  bus_id: string; // presumably the PCI bus address — TODO confirm
+  temperature: number; // units are not stated here — TODO confirm with the backend
+  power: number; // units are not stated here — TODO confirm
+  voltage: number; // units are not stated here — TODO confirm
+}
+
+export interface DeviceStateData { // JSON body the provider reads from /board-api/device-state/
+  state: DeviceState; // also selects the provider's poll interval
+  board_type: string;
+  board_name: string;
+  devices: DeviceInfo[];
+  last_updated: string; // presumably a timestamp — TODO confirm the format
+  reset_suggested: boolean; // NOTE(review): presumably a hint to offer a board reset — confirm
+}
+
+export interface DeviceStateContextType { // value exposed by DeviceStateContext
+  deviceState: DeviceStateData | null; // null until a poll has succeeded
+  loading: boolean; // true until the first poll settles
+  error: string | null; // message from the latest failed poll; null after a success
+  /** Immediately re-fetch device state and reschedule polling. */
+  refresh: () => void;
+}
+
+export const DeviceStateContext = createContext<
+  DeviceStateContextType | undefined
+>(undefined); // undefined default is how useDeviceState detects a missing provider
diff --git a/app/frontend/src/contexts/ModelsContext.ts b/app/frontend/src/contexts/ModelsContext.ts
index 57dd2202..be4850a2 100644
--- a/app/frontend/src/contexts/ModelsContext.ts
+++ b/app/frontend/src/contexts/ModelsContext.ts
@@ -11,6 +11,8 @@ export interface Model {
   status: string;
   health: string;
   ports: string;
+  model_type?: string;
+  device_id?: number | null;
 }
 
 export interface ModelsContextType {
diff --git a/app/frontend/src/hooks/useDeviceState.ts b/app/frontend/src/hooks/useDeviceState.ts
new file mode 100644
index 00000000..af0eee89
--- /dev/null
+++ b/app/frontend/src/hooks/useDeviceState.ts
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { useContext } from "react";
+import { DeviceStateContext } from "../contexts/DeviceStateContext";
+
+// Returns the DeviceStateContext value; throws if no provider is mounted above.
+export const useDeviceState = () => {
+  const ctx = useContext(DeviceStateContext);
+  // `undefined` is the context default, i.e. no DeviceStateProvider ancestor.
+  if (ctx !== undefined) return ctx;
+  throw new Error("useDeviceState must be used within a DeviceStateProvider");
+};
diff --git a/app/frontend/src/pages/TTSPage.tsx b/app/frontend/src/pages/TTSPage.tsx
new file mode 100644
index 00000000..7c5163cf
--- /dev/null
+++ b/app/frontend/src/pages/TTSPage.tsx
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import TTSDemo from "../components/tts/TTSDemo";
+
+// Full-viewport TTS page: grid background, masked overlay, centered <TTSDemo />.
+export default function TTSPage() {
+  const overlayMask = {
+    maskImage:
+      "radial-gradient(ellipse at center, transparent 95%, black 100%)",
+  };
+  return (
+    <div className="fixed inset-0 w-full dark:bg-black bg-white dark:bg-grid-white/[0.2] bg-grid-black/[0.2]">
+      <div
+        className="absolute pointer-events-none inset-0 flex items-center justify-center dark:bg-black bg-white"
+        style={overlayMask}
+      />
+      <div className="w-full h-screen flex items-center justify-center pl-[4.5rem] lg:pl-32 pb-20 p-4">
+        <TTSDemo />
+      </div>
+    </div>
+  );
+}
diff --git a/app/frontend/src/pages/VoicePipelinePage.tsx b/app/frontend/src/pages/VoicePipelinePage.tsx
new file mode 100644
index 00000000..de5c2b54
--- /dev/null
+++ b/app/frontend/src/pages/VoicePipelinePage.tsx
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import VoicePipelineDemo from "../components/pipeline/VoicePipelineDemo";
+
+export default function VoicePipelinePage() {
+  return <VoicePipelineDemo />; // page-level wrapper: renders VoicePipelineDemo and nothing else
+}
diff --git a/app/frontend/src/providers/DeviceStateContext.tsx b/app/frontend/src/providers/DeviceStateContext.tsx
new file mode 100644
index 00000000..9da7d048
--- /dev/null
+++ b/app/frontend/src/providers/DeviceStateContext.tsx
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import React, { useState, useCallback, useEffect, useMemo, useRef } from "react";
+import {
+  DeviceStateContext,
+  type DeviceStateData,
+} from "../contexts/DeviceStateContext";
+
+/**
+ * Adaptive poll intervals by device state.
+ * Fast polling during recovery states so the UI updates promptly.
+ */
+const POLL_INTERVALS: Record<string, number> = {
+  HEALTHY: 30_000,
+  BAD_STATE: 5_000,
+  RESETTING: 2_000,
+  NOT_PRESENT: 30_000,
+  UNKNOWN: 10_000,
+};
+
+/**
+ * Polls /board-api/device-state/ and publishes the result through
+ * DeviceStateContext. The delay before each poll comes from POLL_INTERVALS,
+ * keyed by the most recently reported state (10 s for unlisted states).
+ *
+ * Only one request is live at a time: starting a poll aborts the previous
+ * one, and unmounting aborts the request and stops rescheduling.
+ */
+export const DeviceStateProvider: React.FC<{ children: React.ReactNode }> = ({
+  children,
+}) => {
+  const [deviceState, setDeviceState] = useState<DeviceStateData | null>(null);
+  const [loading, setLoading] = useState(true);
+  const [error, setError] = useState<string | null>(null);
+
+  // Latest reported state; read when choosing the next poll interval.
+  const stateRef = useRef<string>("UNKNOWN");
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  // Lets refresh() reach the poll function that is defined inside the effect.
+  const pollRef = useRef<() => Promise<void>>(async () => {});
+
+  useEffect(() => {
+    let active = true; // cleared on unmount so late responses are ignored
+    let inFlight: AbortController | null = null;
+
+    const poll = async () => {
+      // A new poll supersedes the pending timer and any request in flight.
+      if (timerRef.current) clearTimeout(timerRef.current);
+      inFlight?.abort();
+      const controller = new AbortController();
+      inFlight = controller;
+      const isCurrent = () => active && inFlight === controller;
+      try {
+        const response = await fetch("/board-api/device-state/", {
+          signal: controller.signal,
+        });
+        if (!response.ok)
+          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+        const data: DeviceStateData = await response.json();
+        if (!isCurrent()) return;
+        stateRef.current = data.state;
+        setDeviceState(data);
+        setError(null);
+      } catch (err) {
+        // Aborted or superseded polls must not surface an error.
+        if (isCurrent())
+          setError(err instanceof Error ? err.message : "Unknown error");
+      } finally {
+        // Only the current poll of a mounted provider schedules the next one.
+        if (isCurrent()) {
+          setLoading(false);
+          const interval = POLL_INTERVALS[stateRef.current] ?? 10_000;
+          timerRef.current = setTimeout(() => pollRef.current(), interval);
+        }
+      }
+    };
+
+    pollRef.current = poll;
+    poll();
+
+    return () => {
+      active = false;
+      inFlight?.abort();
+      if (timerRef.current) clearTimeout(timerRef.current);
+      pollRef.current = async () => {}; // refresh() after unmount is a no-op
+    };
+  }, []);
+
+  // poll() cancels the pending timer itself, so refresh just triggers it.
+  const refresh = useCallback(() => {
+    pollRef.current();
+  }, []);
+
+  // Memoized so consumers re-render only when one of these fields changes.
+  const value = useMemo(
+    () => ({ deviceState, loading, error, refresh }),
+    [deviceState, loading, error, refresh],
+  );
+
+  return (
+    <DeviceStateContext.Provider value={value}>
+      {children}
+    </DeviceStateContext.Provider>
+  );
+};
diff --git a/app/frontend/src/providers/ModelsContext.tsx b/app/frontend/src/providers/ModelsContext.tsx
index 0c02fef9..ce2ce1bf 100644
--- a/app/frontend/src/providers/ModelsContext.tsx
+++ b/app/frontend/src/providers/ModelsContext.tsx
@@ -39,6 +39,7 @@ export const ModelsProvider: React.FC<{ children: React.ReactNode }> = ({
             status: dockerModel?.status || "deployed",
             health: dockerModel?.health || "unknown",
             ports: dockerModel?.ports || "No ports",
+            model_type: deployedModel.model_type,
           };
         });
 
diff --git a/app/frontend/src/routes/index.tsx b/app/frontend/src/routes/index.tsx
index 7627de6f..0bd5b45e 100644
--- a/app/frontend/src/routes/index.tsx
+++ b/app/frontend/src/routes/index.tsx
@@ -4,6 +4,7 @@
 import { BrowserRouter as Router, Routes, Route } from "react-router-dom";
 import { RefreshProvider } from "../providers/RefreshContext";
 import { ModelsProvider } from "../providers/ModelsContext";
+import { DeviceStateProvider } from "../providers/DeviceStateContext";
 import { getRoutes } from "./route-config";
 import { MainLayout } from "../layouts/MainLayout";
 
@@ -18,23 +19,25 @@ const AppRouter = () => {
   );
 
   return (
-    <RefreshProvider>
-      <ModelsProvider>
-        <Router>
-          <Routes>
-            {routes
-              .filter((route) => route.condition !== false)
-              .map((route) => (
-                <Route
-                  key={route.path}
-                  path={route.path}
-                  element={<MainLayout>{route.element}</MainLayout>}
-                />
-              ))}
-          </Routes>
-        </Router>
-      </ModelsProvider>
-    </RefreshProvider>
+    <DeviceStateProvider>
+      <RefreshProvider>
+        <ModelsProvider>
+          <Router>
+            <Routes>
+              {routes
+                .filter((route) => route.condition !== false)
+                .map((route) => (
+                  <Route
+                    key={route.path}
+                    path={route.path}
+                    element={<MainLayout>{route.element}</MainLayout>}
+                  />
+                ))}
+            </Routes>
+          </Router>
+        </ModelsProvider>
+      </RefreshProvider>
+    </DeviceStateProvider>
   );
 };
 
diff --git a/app/frontend/src/routes/route-config.tsx b/app/frontend/src/routes/route-config.tsx
index 7225f053..f9493429 100644
--- a/app/frontend/src/routes/route-config.tsx
+++ b/app/frontend/src/routes/route-config.tsx
@@ -52,6 +52,8 @@ import ImageGenPage from "../pages/ImageGenPage";
 import AudioDetectionPage from "../pages/AudioDetectionPage";
 import ApiInfoPage from "../pages/ApiInfoPage";
 import DeploymentHistoryPage from "../pages/DeploymentHistoryPage";
+import VoicePipelinePage from "../pages/VoicePipelinePage";
+import TTSPage from "../pages/TTSPage";
 
 // Define route configuration type
 export interface RouteConfig {
@@ -123,6 +125,16 @@ export const getRoutes = (): RouteConfig[] => {
       element: <DeploymentHistoryPage />,
       condition: true,
     },
+    {
+      path: "/voice-pipeline",
+      element: <VoicePipelinePage />,
+      condition: true,
+    },
+    {
+      path: "/tts",
+      element: <TTSPage />,
+      condition: true,
+    },
     {
       // catch all for all other routes
       path: "*",
diff --git a/app/frontend/src/types/models.ts b/app/frontend/src/types/models.ts
index 29f9effb..bcee92cd 100644
--- a/app/frontend/src/types/models.ts
+++ b/app/frontend/src/types/models.ts
@@ -11,6 +11,8 @@ export interface ModelRow {
   image?: string;
   status?: string;
   ports?: string;
+  model_type?: string;
+  device_id?: number | null;
 }
 
 export interface ColumnVisibilityMap {
diff --git a/inference-api/api.py b/inference-api/api.py
index e6d65d42..34bc4760 100644
--- a/inference-api/api.py
+++ b/inference-api/api.py
@@ -703,7 +703,7 @@ async def run_inference(request: RunRequest):
             "AUTOMATIC_HOST_SETUP": "True",
             "TT_PROGRESS_DEBUG": "1",  # Enable structured progress emission
             "TT_PROGRESS_SSE": "1",     # Enable SSE endpoint for real-time progress
-            "SERVICE_PORT": "7000"      # Set SERVICE_PORT to match --service-port argument
+            "SERVICE_PORT": request.service_port or "7000"  # Use requested port (per-slot)
         }
         
         # Handle secrets - use from request if provided and not already in environment
@@ -742,7 +742,7 @@ async def run_inference(request: RunRequest):
         # Skip system software validation if requested (handles prerelease versions like '2.6.0-rc1')
         if request.skip_system_sw_validation:
             sys.argv.extend(["--skip-system-sw-validation"])
-        sys.argv.extend(["--service-port", "7000"])
+        sys.argv.extend(["--service-port", request.service_port or "7000"])
         
         # Add optional arguments if they are set
         if request.impl:
@@ -757,9 +757,8 @@ async def run_inference(request: RunRequest):
             sys.argv.append("--disable-trace-capture")
         if request.override_docker_image:
             sys.argv.extend(["--override-docker-image", request.override_docker_image])
-        # TODO: Uncomment this for dev branch
-        # if request.device_id:
-        #     sys.argv.extend(["--device-id", request.device_id])
+        if request.device_id:
+            sys.argv.extend(["--device-id", request.device_id])
         if request.override_tt_config:
             sys.argv.extend(["--override-tt-config", request.override_tt_config])
         if request.vllm_override_args:
diff --git a/run.py b/run.py
index 4b22f218..8604dba0 100644
--- a/run.py
+++ b/run.py
@@ -1248,27 +1248,28 @@ def wait_for_all_services(skip_fastapi=False, is_deployed_mode=False):
         print("\n⚠️  Some services may not be fully ready, but main app may still be accessible.")
     return all_healthy
 
-def wait_for_frontend_and_open_browser(host="localhost", port=3000, timeout=60, auto_deploy_model=None):
+def wait_for_frontend_and_open_browser(host="localhost", port=3000, timeout=60, auto_deploy_model=None, device_id=0):
     """
     Wait for frontend service to be healthy before opening browser.
-    
+
     Args:
         host: Frontend host
         port: Frontend port
         timeout: Timeout in seconds
         auto_deploy_model: Model name to auto-deploy (optional)
-    
+        device_id: Chip slot index for auto-deploy (default 0)
+
     Returns:
         bool: True if browser opened successfully, False otherwise
     """
     base_url = f"http://{host}:{port}/"
-    
+
     # Add auto-deploy parameter if specified
     if auto_deploy_model:
         from urllib.parse import urlencode
-        params = urlencode({"auto-deploy": auto_deploy_model})
+        params = urlencode({"auto-deploy": auto_deploy_model, "device-id": device_id})
         frontend_url = f"{base_url}?{params}"
-        print(f"\n🤖 Auto-deploying model: {auto_deploy_model}")
+        print(f"\n🤖 Auto-deploying model: {auto_deploy_model} on chip {device_id}")
     else:
         frontend_url = base_url
     
@@ -1654,7 +1655,7 @@ def validate_artifact_structure(artifact_dir):
     return True
 
 
-def setup_tt_inference_server():
+def setup_tt_inference_server(pull_branch=False):
     """Set up TT Inference Server by downloading/extracting artifact from GitHub release or branch."""
     print(f"\n{C_TT_PURPLE}{C_BOLD}====================================================={C_RESET}")
     print(f"{C_TT_PURPLE}{C_BOLD}         🔧 Setting up TT Inference Server (Artifact){C_RESET}")
@@ -1729,8 +1730,13 @@ def setup_tt_inference_server():
                     print(f"{C_YELLOW}⚠️  Artifact metadata missing - will re-download branch '{artifact_branch}'{C_RESET}")
                 
                 if not branch_mismatch:
-                    # For branches, we can't easily verify without git, so just show what's configured
-                    print(f"✅ TT Inference Server configuration already exists at {INFERENCE_ARTIFACT_DIR}{branch_str}")
+                    if pull_branch:
+                        # --pull-branch flag: force re-download to pick up new commits on the branch
+                        branch_mismatch = True
+                        print(f"🔄 --pull-branch: re-fetching latest '{artifact_branch}' from remote...")
+                    else:
+                        # For branches, we can't easily verify without git, so just show what's configured
+                        print(f"✅ TT Inference Server configuration already exists at {INFERENCE_ARTIFACT_DIR}{branch_str}")
             elif artifact_version and artifact_version != "latest" and version:
                 req = artifact_version.lstrip("v").strip()
                 cur = version.lstrip("v").strip()
@@ -2220,6 +2226,33 @@ def handle_remove_readonly(func, path, exc):
         print(f"   See: https://github.com/tenstorrent/tt-inference-server/releases")
         return False
 
+def _sync_model_catalog():
+    """Run the shared_config sync script to rebuild models_from_inference_server.json.
+
+    Best-effort: a missing script, non-zero exit, or any exception only prints a warning.
+    """
+    script_path = os.path.join(TT_STUDIO_ROOT, "app", "backend", "shared_config", "sync_models_from_inference_server.py")
+    if not os.path.exists(script_path):
+        print(f"{C_YELLOW}⚠️  Model catalog sync script not found at {script_path}, skipping.{C_RESET}")
+        return
+    print(f"\n{C_CYAN}🔄 Syncing model catalog from artifact...{C_RESET}")
+    try:
+        proc = subprocess.run([sys.executable, script_path], capture_output=True, text=True, timeout=30)
+        captured_out, captured_err = proc.stdout.strip(), proc.stderr.strip()
+        if proc.returncode != 0:
+            print(f"{C_YELLOW}⚠️  Model catalog sync exited with code {proc.returncode}:{C_RESET}")
+            if captured_err:
+                print(captured_err[:500])
+            return
+        print(f"{C_GREEN}✅ Model catalog synced.{C_RESET}")
+        for out_line in captured_out.splitlines():  # echo script output, indented
+            print(f"   {out_line}")
+        print(f"{C_YELLOW}💡 Reminder: commit app/backend/shared_config/models_from_inference_server.json")
+        print(f"   so CI/CD Docker image builds use the updated catalog.{C_RESET}")
+    except Exception as e:
+        print(f"{C_YELLOW}⚠️  Model catalog sync failed: {e}{C_RESET}")
+
+
 def setup_fastapi_environment():
     """Set up the inference-api FastAPI environment."""
     print(f"🔧 Setting up inference-api environment...")
@@ -3467,6 +3500,8 @@ def main():
                            help="🔄 Reset preferences and reconfigure all options")
         parser.add_argument("--reconfigure-inference-server", action="store_true",
                            help="🔄 Reconfigure TT Inference Server artifact (branch/version)")
+        parser.add_argument("--pull-branch", action="store_true",
+                           help="🔄 Re-download the inference server artifact from the configured branch to pick up new commits")
         parser.add_argument("--skip-fastapi", action="store_true",
                            help="⏭️  Skip TT Inference Server FastAPI setup (auto-skipped in AI Playground mode)")
         parser.add_argument("--skip-docker-control", action="store_true",
@@ -3485,6 +3520,8 @@ def main():
                    help="🔍 Check for missing SPDX license headers without adding them")
         parser.add_argument("--auto-deploy", type=str, metavar="MODEL_NAME",
                    help="🤖 Automatically deploy the specified model after startup (e.g., 'Llama-3.2-1B-Instruct')")
+        parser.add_argument("--device-id", type=int, default=0, metavar="CHIP_ID",
+                   help="🔌 Chip slot index (0-7) to use when auto-deploying a model (default: 0)")
         parser.add_argument("--fix-docker", action="store_true",
                    help="🔧 Automatically fix Docker service and permission issues")
         parser.add_argument("--easy", action="store_true",
@@ -3793,9 +3830,11 @@ def main():
             # The --no-sudo flag is kept for backward compatibility
             try:
                 # Setup TT Inference Server
-                if not setup_tt_inference_server():
+                if not setup_tt_inference_server(pull_branch=args.pull_branch):
                     print(f"{C_RED}⛔ Failed to setup TT Inference Server. Continuing without FastAPI server.{C_RESET}")
                 else:
+                    # Sync model catalog from the newly downloaded artifact
+                    _sync_model_catalog()
                     # Setup FastAPI environment
                     if not setup_fastapi_environment():
                         print(f"{C_RED}⛔ Failed to setup FastAPI environment. Continuing without FastAPI server.{C_RESET}")
@@ -3899,12 +3938,14 @@ def main():
             host, port, timeout = get_frontend_config()
             
             # Use the new function that reuses existing infrastructure
-            if not wait_for_frontend_and_open_browser(host, port, timeout, args.auto_deploy):
-                auto_deploy_param = f"?auto-deploy={args.auto_deploy}" if args.auto_deploy else ""
+            device_id_val = getattr(args, "device_id", 0)
+            if not wait_for_frontend_and_open_browser(host, port, timeout, args.auto_deploy, device_id=device_id_val):
+                auto_deploy_param = f"?auto-deploy={args.auto_deploy}&device-id={device_id_val}" if args.auto_deploy else ""
                 print(f"{C_YELLOW}⚠️  Browser opening failed. Please manually navigate to http://{host}:{port}{auto_deploy_param}{C_RESET}")
         else:
             host, port, _ = get_frontend_config()
-            auto_deploy_param = f"?auto-deploy={args.auto_deploy}" if args.auto_deploy else ""
+            device_id_val = getattr(args, "device_id", 0)
+            auto_deploy_param = f"?auto-deploy={args.auto_deploy}&device-id={device_id_val}" if args.auto_deploy else ""
             print(f"{C_BLUE}🌐 Automatic browser opening disabled. Access TT-Studio at: {C_CYAN}http://{host}:{port}{auto_deploy_param}{C_RESET}")
         
         # If in dev mode, show logs similar to startup.sh
diff --git a/tt-inference-server b/tt-inference-server
new file mode 160000
index 00000000..ac1892b7
--- /dev/null
+++ b/tt-inference-server
@@ -0,0 +1 @@
+Subproject commit ac1892b7e69f08e7020031dab3f9a30a0dcbe269