20 changes: 16 additions & 4 deletions apps/polyphemus/Dockerfile
@@ -14,16 +14,21 @@
# ============================================================================
# STAGE 1: Builder - Install dependencies
# ============================================================================
FROM mirror.gcr.io/library/python:3.10.17-slim AS builder
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS builder
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /app

# Install build dependencies
# Install Python 3.10 and build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-venv \
python3.10-dev \
python3-pip \
build-essential \
git \
curl \
&& ln -s /usr/bin/python3.10 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/*

# Copy only the dependency files for polyphemus
@@ -70,13 +75,16 @@ RUN mkdir -p /app/models
# ============================================================================
# STAGE 2: Runtime - Create minimal runtime image
# ============================================================================
FROM mirror.gcr.io/library/python:3.10.17-slim AS runtime
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

WORKDIR /app

# Install only runtime dependencies (no build tools)
# Install Python 3.10 and runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-venv \
curl \
&& ln -s /usr/bin/python3.10 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/*

# Create non-root user first
@@ -114,6 +122,10 @@ ENV PYTHONUNBUFFERED=1 \
# Expose the port the app will run on
EXPOSE 8080

# Add health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1

# Start the FastAPI app using Gunicorn with optimized settings
# Module path is rhesis.polyphemus.main:app (package structure is rhesis.polyphemus)
CMD ["gunicorn", "--bind", "0.0.0.0:8080", "--workers", "1", "--timeout", "3600", "--worker-class", "uvicorn.workers.UvicornWorker", "rhesis.polyphemus.main:app", "--preload"]
29 changes: 22 additions & 7 deletions apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py
@@ -209,18 +209,20 @@ def load_model(self) -> BaseLLM:
# Try to optimize with BetterTransformer (PyTorch 2.0+ optimization)
# This can provide 1.5-2x speedup for inference
# Requires: pip install optimum
if hasattr(self._internal_model, "model") and hasattr(
self._internal_model.model, "to_bettertransformer"
):
if hasattr(self._internal_model, "model"):
try:
from optimum.bettertransformer import BetterTransformer

logger.info("Applying BetterTransformer optimization...")
self._internal_model.model = self._internal_model.model.to_bettertransformer()
self._internal_model.model = BetterTransformer.transform(
self._internal_model.model, keep_original_model=False
)
self.model = self._internal_model.model
logger.info("✅ BetterTransformer applied successfully (1.5-2x speedup)")
except ImportError as import_error:
except ImportError:
logger.info(
f"⚠️ BetterTransformer not available (optional): {import_error}. "
f"Install 'optimum' package for 1.5-2x inference speedup."
"⚠️ BetterTransformer not available (optional). "
"Install 'optimum' package for 1.5-2x inference speedup."
)
except Exception as bt_error:
logger.warning(
@@ -251,6 +253,19 @@ def load_model(self) -> BaseLLM:
# Log GPU name
gpu_name = torch.cuda.get_device_name(0)
logger.info(f"✅ GPU: {gpu_name}")

# VERIFY: Test GPU computation
try:
test_tensor = torch.randn(1000, 1000, device=device)
result = torch.matmul(test_tensor, test_tensor)
logger.info(
f"✅ GPU Computation Test: PASSED "
f"(result device: {result.device})"
)
del test_tensor, result
torch.cuda.empty_cache()
except Exception as compute_error:
logger.error(f"❌ GPU Computation Test: FAILED - {compute_error}")
else:
logger.warning("⚠️ Model has no parameters to check device")
else:
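For context, the optimization path switched here can be exercised in isolation. A minimal sketch, assuming the optimum and transformers packages are installed; the model name is illustrative only, not the one Polyphemus loads:

# bettertransformer_sketch.py - standalone illustration of BetterTransformer.transform
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model

try:
    from optimum.bettertransformer import BetterTransformer

    # Same call used in load_model(): transform the model, drop the original copy
    model = BetterTransformer.transform(model, keep_original_model=False)
    print("BetterTransformer applied")
except ImportError:
    # optimum is optional; generation still works without the optimization
    print("optimum not installed; using the stock model")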
4 changes: 2 additions & 2 deletions apps/polyphemus/uv.lock

Some generated files are not rendered by default.

20 changes: 20 additions & 0 deletions sdk/src/rhesis/sdk/models/providers/huggingface.py
@@ -300,6 +300,12 @@ def generate(
# Move inputs to the model's device
inputs = {k: v.to(self.device) for k, v in inputs.items()}

# VERIFY: Log input device placement
if torch.cuda.is_available():
first_input_device = next(iter(inputs.values())).device
print(f"🔍 GPU Debug - Input tensors on device: {first_input_device}")
print(f"🔍 GPU Debug - Model device: {self.device}")

# Capture essential metrics
input_tokens = inputs["input_ids"].shape[1]
model_memory_gb = self._get_model_memory_gb()
@@ -308,6 +314,11 @@
# Set default max_new_tokens (HuggingFace defaults to only 20 tokens)
kwargs.setdefault("max_new_tokens", 2048)

# VERIFY: Check GPU utilization before generation
if torch.cuda.is_available():
pre_gen_allocated = torch.cuda.memory_allocated() / 1e9
print(f"🔍 GPU Debug - Pre-generation GPU memory: {pre_gen_allocated:.2f}GB")

# generate response
output_ids = self.model.generate(
**inputs,
@@ -319,6 +330,15 @@
end_time = time.time()
generation_time = end_time - start_time

# VERIFY: Check GPU utilization after generation
if torch.cuda.is_available():
post_gen_allocated = torch.cuda.memory_allocated() / 1e9
print(
f"🔍 GPU Debug - Post-generation GPU memory: {post_gen_allocated:.2f}GB "
f"(+{post_gen_allocated - pre_gen_allocated:.2f}GB peak during generation)"
)
print(f"🔍 GPU Debug - Generation took {generation_time:.2f}s for inference")

completion = self.tokenizer.decode(
output_ids[0][inputs["input_ids"].shape[1] :], # only take the newly generated content
skip_special_tokens=True,
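Note that torch.cuda.memory_allocated() reports memory currently held, so the post-minus-pre delta above reflects what remains allocated after generation rather than the transient peak. To capture the true peak, torch.cuda.max_memory_allocated() can be sampled instead; a minimal sketch (the measure_peak_gpu_memory helper is illustrative, not part of the SDK):

# peak_memory_sketch.py - measuring true peak GPU memory around a call
import torch

def measure_peak_gpu_memory(fn):
    """Run fn() and report the peak GPU memory allocated while it executed."""
    if not torch.cuda.is_available():
        return fn()
    torch.cuda.reset_peak_memory_stats()
    baseline_gb = torch.cuda.memory_allocated() / 1e9
    result = fn()
    peak_gb = torch.cuda.max_memory_allocated() / 1e9
    print(f"GPU memory: baseline {baseline_gb:.2f}GB, peak during call {peak_gb:.2f}GB")
    return result

# Usage (illustrative): output_ids = measure_peak_gpu_memory(lambda: self.model.generate(**inputs, **kwargs))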