Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Secrets — fill these in before running (see Readme); never commit real values.
SAGE_USER=
SAGE_PASS=
HF_TOKEN=

# weaviate
# https://weaviate.io/developers/weaviate/config-refs/env-vars
BIND_INFERENCE_API=http://multi2vec-bind:8080
RERANKER_INFERENCE_API=http://reranker-transformers:8080
QUERY_DEFAULTS_LIMIT=25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true'
PERSISTENCE_DATA_PATH='/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE='multi2vec-bind'
ENABLE_MODULES='multi2vec-bind,reranker-transformers,backup-filesystem'
BACKUP_FILESYSTEM_PATH='/tmp/backups'
CLUSTER_HOSTNAME=node1
# https://weaviate.io/developers/weaviate/concepts/vector-index#asynchronous-indexing
ASYNC_INDEXING=true
# https://weaviate.io/blog/weaviate-1-18-release#improvements-to-bm25-and-hybrid-search
USE_BLOCKMAX_WAND=true
USE_INVERTED_SEARCHABLE=true
LIMIT_RESOURCES=true
# default is info
# LOG_LEVEL: 'debug'
# https://weaviate.io/developers/weaviate/configuration/monitoring
# PROMETHEUS_MONITORING_ENABLED: true
# PROMETHEUS_MONITORING_PORT: 2112

# triton
# Ensure this path matches the model repository directory baked into the image.
MODEL_REPOSITORY=/app/models
CLIP_MODEL_PATH=/models/DFN5B-CLIP-ViT-H-14-378
CLIP_MODEL_VERSION=419d1f8f6a96aabaf5913c526d059facda50c24b
GEMMA_MODEL_PATH=/models/gemma-3-4b-it
GEMMA_MODEL_VERSION=093f9f388b31de276ce2de164bdc2081324b9767

# weavloader
TRITON_HOST=triton
TRITON_PORT=8001
WEAVIATE_HOST=weaviate
WEAVIATE_PORT=8080
# NOTE(review): docker-compose.yml defines no redis service, and inside a
# container "localhost" is the container itself — confirm how Celery is meant
# to reach this broker (separate redis container? host network?).
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
UNALLOWED_NODES="W042,N001,V012,W015,W01C,W01E,W024,W026,W02C,W02D,W02E,W02F,W031,W040,W046,W047,W048,W049,W04A,W051,W055,W059,W05A,W05B,W05C,W05D,W05E,W05F,W060,W061,W062,W063,W064,W065,W066,W06E,W072,W073,W074,W075,W076,W077,W078,W079,W07A,W07B,W07D,W07E,W07F,W080,W081,W086,W088,W089,W08A,W08B,W08D,W08E,W08F,W090,W091,W092,W094,W096,W099,W09B,W09E,W0A0,W0A1,W0BB,W0BC"
LOG_LEVEL='INFO'
MONITOR_DATA_STREAM_INTERVAL=60
MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE=5

# gradio-ui (matches the `gradio-ui` service name in docker-compose.yml)
# WEAVIATE_HOST/WEAVIATE_PORT repeat the identical values from the weavloader
# section above; dotenv keeps a single value per key, so this is harmless and
# kept only for per-service grouping.
WEAVIATE_HOST=weaviate
WEAVIATE_PORT=8080
WEAVIATE_GRPC_PORT=50051
CLUSTER_FLAG=True
45 changes: 45 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,51 @@ This repository includes a GitHub Action that builds and pushes Docker images fo

---

## Docker compose
envs:
```
cp .env.example .env
```
Make sure to fill in the secrets (top three env vars)

Run:
```
docker compose up -d --build
```

Clean up:
```
docker compose down
```

All together:
```
docker compose down && docker compose up -d --build
```

Clean up (volumes):
```
docker compose down --volumes
```

Notes:
- Triton might fail to load one of the models (CLIP or Gemma 3), or raise OSErrors while loading the model weights. As a workaround, download the models to a local directory and then copy them into the container:
```
source .env #assumes that HF_TOKEN is set
cd triton
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
huggingface-cli download --local-dir DFN5B-CLIP-ViT-H-14-378 --revision "$CLIP_MODEL_VERSION" apple/DFN5B-CLIP-ViT-H-14-378

huggingface-cli download --local-dir gemma-3-4b-it --revision "$GEMMA_MODEL_VERSION" google/gemma-3-4b-it

docker cp DFN5B-CLIP-ViT-H-14-378 sage-nrp-image-search-triton-1:/models/
docker cp gemma-3-4b-it sage-nrp-image-search-triton-1:/models/
```

---

## Kubernetes
Developed and tested with these versions of k8s and kustomize:
```
Expand Down
118 changes: 118 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# NOTE: the top-level `version` key is obsolete under the Compose
# Specification (Compose V2 ignores it with a warning); kept for
# compatibility with older docker-compose binaries.
version: '3.4'

services:
  weaviate:
    # Serve plain HTTP on :8080, bound to all interfaces.
    command:
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --scheme
      - http
    # https://weaviate.io/developers/weaviate/release-notes#weaviate-core-and-client-releases
    image: semitechnologies/weaviate:1.32.0
    # Port mappings are quoted — unquoted host:container pairs can be
    # misparsed as base-60 integers by YAML 1.1 loaders.
    ports:
      - "8080:8080"    # REST (WEAVIATE_PORT in .env)
      - "50051:50051"  # gRPC (WEAVIATE_GRPC_PORT in .env)
    restart: on-failure
    # Bare names pass values through from the host env / .env file.
    environment:
      - BIND_INFERENCE_API
      - RERANKER_INFERENCE_API
      - QUERY_DEFAULTS_LIMIT
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
      - PERSISTENCE_DATA_PATH
      - DEFAULT_VECTORIZER_MODULE
      - ENABLE_MODULES
      - BACKUP_FILESYSTEM_PATH
      - CLUSTER_HOSTNAME
      - ASYNC_INDEXING
      - USE_BLOCKMAX_WAND
      - USE_INVERTED_SEARCHABLE
      - LIMIT_RESOURCES
      # - LOG_LEVEL
      # - PROMETHEUS_MONITORING_ENABLED
      # - PROMETHEUS_MONITORING_PORT
    # NOTE(review): assumes `wget` exists inside the weaviate image — confirm,
    # otherwise the healthcheck will always fail.
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:8080/v1/.well-known/ready"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - weaviate:/var/lib/weaviate

  multi2vec-bind:
    image: semitechnologies/multi2vec-bind:imagebind

  reranker-transformers:
    image: semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2

  triton:
    build:
      context: ./triton
    platform: "linux/amd64"
    ports:
      - "8000:8000"
      - "8001:8001"  # TRITON_PORT in .env
      - "8002:8002"
    shm_size: '500MB'  # shared memory size
    restart: on-failure
    environment:
      - MODEL_REPOSITORY
      - CLIP_MODEL_PATH
      - CLIP_MODEL_VERSION
      - GEMMA_MODEL_PATH
      - GEMMA_MODEL_VERSION
      - HF_TOKEN
    volumes:
      - triton:/models

  weavmanage:
    build:
      context: ./weavmanage
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
    depends_on:
      - weaviate

  weavloader:
    build:
      context: ./weavloader
    ports:
      - "8081:8080"
      - "5555:5555"
    restart: on-failure
    environment:
      - TRITON_HOST
      - TRITON_PORT
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - SAGE_USER
      - SAGE_PASS
      - CELERY_BROKER_URL
      - CELERY_RESULT_BACKEND
      - UNALLOWED_NODES
      - LOG_LEVEL
      - MONITOR_DATA_STREAM_INTERVAL
      - MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE
    depends_on:
      - weaviate
      - weavmanage
      - triton

  gradio-ui:
    build:
      context: ./app
    ports:
      - "7860:7860"
    restart: on-failure
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
      - CLUSTER_FLAG
      - UNALLOWED_NODES

# Named volumes persist Weaviate data and downloaded Triton models across
# container recreation; removed only by `docker compose down --volumes`.
volumes:
  triton:
  weaviate:
110 changes: 5 additions & 105 deletions triton/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,41 +1,5 @@
# 1. Builder stage: compile OpenAI Triton on aarch64 for Gemma3, enable this step for H100
# --------------------------------------------------------------------------
# FROM python:3.10-slim AS builder
FROM nvcr.io/nvidia/tritonserver:24.06-pyt-python-py3

# # Install build dependencies
# RUN apt-get update && \
# apt-get install -y --no-install-recommends \
# build-essential cmake ninja-build git python3-dev libnuma-dev && \
# rm -rf /var/lib/apt/lists/*

# WORKDIR /tmp

# # Clone and build Triton wheel
# RUN git clone --depth 1 --branch v3.0.0 https://github.com/openai/triton.git triton && \
# cd triton/python && \
# pip wheel --no-cache-dir . -w /tmp/wheels && \
# cd /tmp && rm -rf triton

# 2. Runtime stage: Nvidia Triton server
# Use Triton's base image with Python 3.11
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
# --------------------------------------------------------------------------
# built for NVIDIA Driver Release 510 or later (Sage Blades, V033)
# FROM nvcr.io/nvidia/tritonserver:22.04-py3 <-- used this when using florence 2 base model, switched to 23.12 for Qwen2.5-VL-72B-Instruct
# FROM nvcr.io/nvidia/tritonserver:23.12-py3

# built for NVIDIA Driver Release 545 or later (Sage H100)
# FROM nvcr.io/nvidia/tritonserver:24.06-py3 switching to below to try to trim image size
FROM nvcr.io/nvidia/tritonserver:24.06-pyt-python-py3

# Set the Hugging Face token as a build argument and environment variable
# ARG HF_TOKEN
# ENV HF_TOKEN=${HF_TOKEN}

# Fix missing GPG key error
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC

# Install system dependencies
RUN apt-get update \
&& apt-get install -y \
wget \
Expand All @@ -44,80 +8,16 @@ RUN apt-get update \
libglib2.0-0 \
git

# Set working directory
WORKDIR /app

# Copy the requirements.txt into the container
COPY requirements.txt .
COPY torch_requirements.txt .
#COPY flash_requirements.txt . enable for flash attention; flash attention requires CUDA 11.7 or above

# Install dependencies using pip
RUN pip install --no-cache-dir --force-reinstall -r torch_requirements.txt
RUN pip install --no-cache-dir -r torch_requirements.txt
RUN pip install --no-cache-dir --no-build-isolation -r requirements.txt
# RUN pip install --no-cache-dir --no-build-isolation -r flash_requirements.txt enable for flash attention; flash attention requires CUDA 11.7 or above

# Copy and install the Triton wheel from builder for Gemma3, enable this step for H100
# COPY --from=builder /tmp/wheels/triton-*.whl /tmp/
# RUN pip install --no-cache-dir /tmp/triton-*.whl && rm /tmp/triton-*.whl

# Set environment variables
ENV MODEL_PATH=/app/Florence-2-base
ENV MODEL_VERSION=ee1f1f163f352801f3b7af6b2b96e4baaa6ff2ff
ENV COLBERT_MODEL_PATH=/app/colbertv2.0
ENV COLBERT_MODEL_VERSION=c1e84128e85ef755c096a95bdb06b47793b13acf
ENV ALIGN_MODEL_PATH=/app/align-base
ENV ALIGN_MODEL_VERSION=e96a37facc7b1f59090ece82293226b817afd6ba
ENV CLIP_MODEL_PATH=/app/DFN5B-CLIP-ViT-H-14-378
# ENV CLIP_MODEL_VERSION=01b771ed0d1395ca5ffdd279897d665ebe00dfd2
ENV CLIP_MODEL_VERSION=419d1f8f6a96aabaf5913c526d059facda50c24b
ENV QWEN_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct
ENV QWEN_MODEL_VERSION=cc594898137f460bfe9f0759e9844b3ce807cfb5
ENV GEMMA_MODEL_PATH=/app/gemma-3-4b-it
ENV GEMMA_MODEL_VERSION=093f9f388b31de276ce2de164bdc2081324b9767

# Download Florence 2 model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $MODEL_PATH \
# --revision $MODEL_VERSION \
# microsoft/Florence-2-base

# Download ColBERT model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $COLBERT_MODEL_PATH \
# --revision $COLBERT_MODEL_VERSION \
# colbert-ir/colbertv2.0

# Download align model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $ALIGN_MODEL_PATH \
# --revision $ALIGN_MODEL_VERSION \
# kakaobrain/align-base

# Download CLIP model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $CLIP_MODEL_PATH \
# --revision $CLIP_MODEL_VERSION \
# apple/DFN5B-CLIP-ViT-H-14-378

# Download Qwen model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $QWEN_MODEL_PATH \
# --revision $QWEN_MODEL_VERSION \
# Qwen/Qwen2.5-VL-7B-Instruct

# Download Gemma model from Hugging Face
# RUN huggingface-cli login --token "$HF_TOKEN" \
# && huggingface-cli download \
# --local-dir $GEMMA_MODEL_PATH \
# --revision $GEMMA_MODEL_VERSION \
# google/gemma-3-4b-it

# Copy the application code into the container
COPY . .

# Expose Triton server ports
EXPOSE 8000 8001 8002
COPY models models
COPY HyperParameters.py HyperParameters.py
COPY entrypoint.sh entrypoint.sh

# Start the Triton Inference Server with the Python Backend
ENTRYPOINT ["./entrypoint.sh"]
10 changes: 5 additions & 5 deletions triton/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#!/bin/bash
#Test

set -e

# Download CLIP model if not already present and check if directory is empty
#Download CLIP model if not already present and check if directory is empty
if [ ! -d "$CLIP_MODEL_PATH" ] || [ -z "$(ls -A "$CLIP_MODEL_PATH" 2>/dev/null)" ]; then
echo "Downloading CLIP model..."
HF_TOKEN= huggingface-cli download \
--local-dir "$CLIP_MODEL_PATH" \
--revision "$CLIP_MODEL_VERSION" \
apple/DFN5B-CLIP-ViT-H-14-378
else
else
echo "CLIP model already present. Skipping download."
fi

Expand All @@ -23,12 +23,12 @@ if [ -n "$HF_TOKEN" ]; then
--local-dir "$GEMMA_MODEL_PATH" \
--revision "$GEMMA_MODEL_VERSION" \
google/gemma-3-4b-it
else
else
echo "Gemma model already present. Skipping download."
fi
else
echo "HF_TOKEN not provided. Skipping Hugging Face model downloads."
fi

# Start Triton Inference Server
exec tritonserver --model-repository=/app/models "$@"
exec tritonserver --model-repository=$MODEL_REPOSITORY "$@"
1 change: 0 additions & 1 deletion triton/models/clip/1/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import triton_python_backend_utils as pb_utils
import torch
from transformers import CLIPProcessor, CLIPModel
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

MODEL_PATH = os.environ.get("CLIP_MODEL_PATH")

Expand Down
Loading