20 changes: 16 additions & 4 deletions apps/polyphemus/Dockerfile
@@ -14,16 +14,21 @@
# ============================================================================
# STAGE 1: Builder - Install dependencies
# ============================================================================
FROM mirror.gcr.io/library/python:3.10.17-slim AS builder
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS builder
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /app

# Install build dependencies
# Install Python 3.10 and build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-venv \
python3.10-dev \
python3-pip \
build-essential \
git \
curl \
&& ln -s /usr/bin/python3.10 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/*

# Copy only the dependency files for polyphemus
@@ -70,13 +75,16 @@ RUN mkdir -p /app/models
# ============================================================================
# STAGE 2: Runtime - Create minimal runtime image
# ============================================================================
FROM mirror.gcr.io/library/python:3.10.17-slim AS runtime
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 AS runtime

WORKDIR /app

# Install only runtime dependencies (no build tools)
# Install Python 3.10 and runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3.10 \
python3.10-venv \
curl \
&& ln -s /usr/bin/python3.10 /usr/bin/python \
&& rm -rf /var/lib/apt/lists/*

# Create non-root user first
@@ -114,6 +122,10 @@ ENV PYTHONUNBUFFERED=1 \
# Expose the port the app will run on
EXPOSE 8080

# Add health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1

# Start the FastAPI app using Gunicorn with optimized settings
# Module path is rhesis.polyphemus.main:app (package structure is rhesis.polyphemus)
CMD ["gunicorn", "--bind", "0.0.0.0:8080", "--workers", "1", "--timeout", "3600", "--worker-class", "uvicorn.workers.UvicornWorker", "rhesis.polyphemus.main:app", "--preload"]
29 changes: 22 additions & 7 deletions apps/polyphemus/src/rhesis/polyphemus/models/model_loader.py
@@ -209,18 +209,20 @@ def load_model(self) -> BaseLLM:
# Try to optimize with BetterTransformer (PyTorch 2.0+ optimization)
# This can provide 1.5-2x speedup for inference
# Requires: pip install optimum
if hasattr(self._internal_model, "model") and hasattr(
self._internal_model.model, "to_bettertransformer"
):
if hasattr(self._internal_model, "model"):
try:
from optimum.bettertransformer import BetterTransformer

logger.info("Applying BetterTransformer optimization...")
self._internal_model.model = self._internal_model.model.to_bettertransformer()
self._internal_model.model = BetterTransformer.transform(
self._internal_model.model, keep_original_model=False
)
self.model = self._internal_model.model
logger.info("✅ BetterTransformer applied successfully (1.5-2x speedup)")
except ImportError as import_error:
except ImportError:
logger.info(
f"⚠️ BetterTransformer not available (optional): {import_error}. "
f"Install 'optimum' package for 1.5-2x inference speedup."
"⚠️ BetterTransformer not available (optional). "
"Install 'optimum' package for 1.5-2x inference speedup."
)
except Exception as bt_error:
logger.warning(
@@ -251,6 +253,19 @@ def load_model(self) -> BaseLLM:
# Log GPU name
gpu_name = torch.cuda.get_device_name(0)
logger.info(f"✅ GPU: {gpu_name}")

# VERIFY: Test GPU computation
try:
test_tensor = torch.randn(1000, 1000, device=device)
result = torch.matmul(test_tensor, test_tensor)
logger.info(
f"✅ GPU Computation Test: PASSED "
f"(result device: {result.device})"
)
del test_tensor, result
torch.cuda.empty_cache()
except Exception as compute_error:
logger.error(f"❌ GPU Computation Test: FAILED - {compute_error}")
else:
logger.warning("⚠️ Model has no parameters to check device")
else:
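For context, the optimization path switched here can be exercised in isolation. A minimal sketch, assuming the optimum and transformers packages are installed; the model name is illustrative only, not the one Polyphemus loads:

# bettertransformer_sketch.py - standalone illustration of BetterTransformer.transform
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model

try:
    from optimum.bettertransformer import BetterTransformer

    # Same call used in load_model(): transform the model, drop the original copy
    model = BetterTransformer.transform(model, keep_original_model=False)
    print("BetterTransformer applied")
except ImportError:
    # optimum is optional; generation still works without the optimization
    print("optimum not installed; using the stock model")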
4 changes: 2 additions & 2 deletions apps/polyphemus/uv.lock

Some generated files are not rendered by default.

20 changes: 20 additions & 0 deletions sdk/src/rhesis/sdk/models/providers/huggingface.py
@@ -300,6 +300,12 @@ def generate(
# Move inputs to the model's device
inputs = {k: v.to(self.device) for k, v in inputs.items()}

# VERIFY: Log input device placement
if torch.cuda.is_available():
first_input_device = next(iter(inputs.values())).device
print(f"🔍 GPU Debug - Input tensors on device: {first_input_device}")
print(f"🔍 GPU Debug - Model device: {self.device}")

# Capture essential metrics
input_tokens = inputs["input_ids"].shape[1]
model_memory_gb = self._get_model_memory_gb()
@@ -308,6 +314,11 @@
# Set default max_new_tokens (HuggingFace defaults to only 20 tokens)
kwargs.setdefault("max_new_tokens", 2048)

# VERIFY: Check GPU utilization before generation
if torch.cuda.is_available():
pre_gen_allocated = torch.cuda.memory_allocated() / 1e9
print(f"🔍 GPU Debug - Pre-generation GPU memory: {pre_gen_allocated:.2f}GB")

# generate response
output_ids = self.model.generate(
**inputs,
@@ -319,6 +330,15 @@
end_time = time.time()
generation_time = end_time - start_time

# VERIFY: Check GPU utilization after generation
if torch.cuda.is_available():
post_gen_allocated = torch.cuda.memory_allocated() / 1e9
print(
f"🔍 GPU Debug - Post-generation GPU memory: {post_gen_allocated:.2f}GB "
f"(+{post_gen_allocated - pre_gen_allocated:.2f}GB peak during generation)"
)
print(f"🔍 GPU Debug - Generation took {generation_time:.2f}s for inference")

completion = self.tokenizer.decode(
output_ids[0][inputs["input_ids"].shape[1] :], # only take the newly generated content
skip_special_tokens=True,
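Note that torch.cuda.memory_allocated() reports memory currently held, so the post-minus-pre delta above reflects what remains allocated after generation rather than the transient peak. To capture the true peak, torch.cuda.max_memory_allocated() can be sampled instead; a minimal sketch (the measure_peak_gpu_memory helper is illustrative, not part of the SDK):

# peak_memory_sketch.py - measuring true peak GPU memory around a call
import torch

def measure_peak_gpu_memory(fn):
    """Run fn() and report the peak GPU memory allocated while it executed."""
    if not torch.cuda.is_available():
        return fn()
    torch.cuda.reset_peak_memory_stats()
    baseline_gb = torch.cuda.memory_allocated() / 1e9
    result = fn()
    peak_gb = torch.cuda.max_memory_allocated() / 1e9
    print(f"GPU memory: baseline {baseline_gb:.2f}GB, peak during call {peak_gb:.2f}GB")
    return result

# Usage (illustrative): output_ids = measure_peak_gpu_memory(lambda: self.model.generate(**inputs, **kwargs))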