diff --git a/apps/polyphemus/Dockerfile b/apps/polyphemus/Dockerfile
index bf2333432..e6b632be3 100644
--- a/apps/polyphemus/Dockerfile
+++ b/apps/polyphemus/Dockerfile
@@ -14,16 +14,21 @@
 # ============================================================================
 # STAGE 1: Builder - Install dependencies
 # ============================================================================
-FROM mirror.gcr.io/library/python:3.10.17-slim AS builder
+FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS builder
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
 WORKDIR /app
 
-# Install build dependencies
+# Install Python 3.10 and build dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.10 \
+    python3.10-venv \
+    python3.10-dev \
+    python3-pip \
     build-essential \
     git \
     curl \
+    && ln -s /usr/bin/python3.10 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/*
 
 # Copy only the dependency files for polyphemus
@@ -70,13 +75,16 @@ RUN mkdir -p /app/models
 # ============================================================================
 # STAGE 2: Runtime - Create minimal runtime image
 # ============================================================================
-FROM mirror.gcr.io/library/python:3.10.17-slim AS runtime
+FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime
 
 WORKDIR /app
 
-# Install only runtime dependencies (no build tools)
+# Install Python 3.10 and runtime dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.10 \
+    python3.10-venv \
     curl \
+    && ln -s /usr/bin/python3.10 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/*
 
 # Create non-root user first
@@ -114,6 +122,10 @@ ENV PYTHONUNBUFFERED=1 \
 # Expose the port the app will run on
 EXPOSE 8080
 
+# Add health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8080/health || exit 1
+
 # Start the FastAPI app using Gunicorn with optimized settings
 # Module path is rhesis.polyphemus.main:app (package structure is rhesis.polyphemus)
 CMD ["gunicorn", "--bind", "0.0.0.0:8080", "--workers", "1", "--timeout", "3600", "--worker-class", "uvicorn.workers.UvicornWorker", "rhesis.polyphemus.main:app", "--preload"]
\ No newline at end of file
diff --git a/apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py b/apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py
index 3f4eba6f0..bc818d299 100644
--- a/apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py
+++ b/apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py
@@ -209,18 +209,20 @@ def load_model(self) -> BaseLLM:
         # Try to optimize with BetterTransformer (PyTorch 2.0+ optimization)
         # This can provide 1.5-2x speedup for inference
         # Requires: pip install optimum
-        if hasattr(self._internal_model, "model") and hasattr(
-            self._internal_model.model, "to_bettertransformer"
-        ):
+        if hasattr(self._internal_model, "model"):
             try:
+                from optimum.bettertransformer import BetterTransformer
+
                 logger.info("Applying BetterTransformer optimization...")
-                self._internal_model.model = self._internal_model.model.to_bettertransformer()
+                self._internal_model.model = BetterTransformer.transform(
+                    self._internal_model.model, keep_original_model=False
+                )
                 self.model = self._internal_model.model
                 logger.info("✅ BetterTransformer applied successfully (1.5-2x speedup)")
-            except ImportError as import_error:
+            except ImportError:
                 logger.info(
-                    f"⚠️ BetterTransformer not available (optional): {import_error}. "
-                    f"Install 'optimum' package for 1.5-2x inference speedup."
+                    "⚠️ BetterTransformer not available (optional). "
+                    "Install 'optimum' package for 1.5-2x inference speedup."
                 )
             except Exception as bt_error:
                 logger.warning(
@@ -251,6 +253,19 @@ def load_model(self) -> BaseLLM:
                 # Log GPU name
                 gpu_name = torch.cuda.get_device_name(0)
                 logger.info(f"✅ GPU: {gpu_name}")
+
+                # VERIFY: Test GPU computation
+                try:
+                    test_tensor = torch.randn(1000, 1000, device=device)
+                    result = torch.matmul(test_tensor, test_tensor)
+                    logger.info(
+                        f"✅ GPU Computation Test: PASSED "
+                        f"(result device: {result.device})"
+                    )
+                    del test_tensor, result
+                    torch.cuda.empty_cache()
+                except Exception as compute_error:
+                    logger.error(f"❌ GPU Computation Test: FAILED - {compute_error}")
             else:
                 logger.warning("⚠️ Model has no parameters to check device")
         else:
diff --git a/apps/polyphemus/uv.lock b/apps/polyphemus/uv.lock
index b40c3fd01..a92b76dfd 100644
--- a/apps/polyphemus/uv.lock
+++ b/apps/polyphemus/uv.lock
@@ -6769,7 +6769,7 @@ wheels = [
 
 [[package]]
 name = "rhesis-backend"
-version = "0.6.1"
+version = "0.6.2"
 source = { editable = "../backend" }
 dependencies = [
     { name = "alembic" },
@@ -6959,7 +6959,7 @@ langgraph = [
 
 [[package]]
 name = "rhesis-sdk"
-version = "0.6.2"
+version = "0.6.3"
 source = { editable = "../../sdk" }
 dependencies = [
     { name = "deepeval" },
diff --git a/sdk/src/rhesis/sdk/models/providers/huggingface.py b/sdk/src/rhesis/sdk/models/providers/huggingface.py
index 2e36fa16f..5e8228546 100644
--- a/sdk/src/rhesis/sdk/models/providers/huggingface.py
+++ b/sdk/src/rhesis/sdk/models/providers/huggingface.py
@@ -300,6 +300,12 @@ def generate(
         # Move inputs to the model's device
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
+        # VERIFY: Log input device placement
+        if torch.cuda.is_available():
+            first_input_device = next(iter(inputs.values())).device
+            print(f"🔍 GPU Debug - Input tensors on device: {first_input_device}")
+            print(f"🔍 GPU Debug - Model device: {self.device}")
+
         # Capture essential metrics
         input_tokens = inputs["input_ids"].shape[1]
         model_memory_gb = self._get_model_memory_gb()
@@ -308,6 +314,11 @@ def generate(
         # Set default max_new_tokens (HuggingFace defaults to only 20 tokens)
         kwargs.setdefault("max_new_tokens", 2048)
 
+        # VERIFY: Check GPU utilization before generation
+        if torch.cuda.is_available():
+            pre_gen_allocated = torch.cuda.memory_allocated() / 1e9
+            print(f"🔍 GPU Debug - Pre-generation GPU memory: {pre_gen_allocated:.2f}GB")
+
         # generate response
         output_ids = self.model.generate(
             **inputs,
@@ -319,6 +330,15 @@ def generate(
         end_time = time.time()
         generation_time = end_time - start_time
 
+        # VERIFY: Check GPU utilization after generation
+        if torch.cuda.is_available():
+            post_gen_allocated = torch.cuda.memory_allocated() / 1e9
+            print(
+                f"🔍 GPU Debug - Post-generation GPU memory: {post_gen_allocated:.2f}GB "
+                f"(+{post_gen_allocated - pre_gen_allocated:.2f}GB during generation)"
+            )
+            print(f"🔍 GPU Debug - Generation took {generation_time:.2f}s for inference")
+
         completion = self.tokenizer.decode(
             output_ids[0][inputs["input_ids"].shape[1] :],  # only take the newly generated content
             skip_special_tokens=True,