Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Template for the Docker Compose deployment. Copy to `.env` and fill in the
# secrets; docker-compose.yml passes these variables through to the services.

# Secrets (leave empty here; set real values only in your local `.env`)
SAGE_USER=
SAGE_PASS=
HF_TOKEN=

# weaviate
# https://weaviate.io/developers/weaviate/config-refs/env-vars
BIND_INFERENCE_API=http://multi2vec-bind:8080
RERANKER_INFERENCE_API=http://reranker-transformers:8080
QUERY_DEFAULTS_LIMIT=25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true'
PERSISTENCE_DATA_PATH='/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE='multi2vec-bind'
ENABLE_MODULES='multi2vec-bind,reranker-transformers,backup-filesystem'
BACKUP_FILESYSTEM_PATH='/tmp/backups'
CLUSTER_HOSTNAME=node1
# https://weaviate.io/developers/weaviate/concepts/vector-index#asynchronous-indexing
ASYNC_INDEXING=true
# https://weaviate.io/blog/weaviate-1-18-release#improvements-to-bm25-and-hybrid-search
USE_BLOCKMAX_WAND=true
USE_INVERTED_SEARCHABLE=true
LIMIT_RESOURCES=true
# Weaviate log level (default is info).
# NOTE: weavloader also reads LOG_LEVEL (set in the weavloader section below).
# An env file holds a single value per name, so enabling this line and passing
# LOG_LEVEL to weaviate means both services share one setting.
# LOG_LEVEL=debug
# https://weaviate.io/developers/weaviate/configuration/monitoring
# PROMETHEUS_MONITORING_ENABLED=true
# PROMETHEUS_MONITORING_PORT=2112

# triton
# Ensure this path matches the model repository directory
MODEL_REPOSITORY=/app/models
CLIP_MODEL_PATH=/models/DFN5B-CLIP-ViT-H-14-378
CLIP_MODEL_VERSION=419d1f8f6a96aabaf5913c526d059facda50c24b
GEMMA_MODEL_PATH=/models/gemma-3-4b-it
GEMMA_MODEL_VERSION=093f9f388b31de276ce2de164bdc2081324b9767

# weavloader
TRITON_HOST=triton
TRITON_PORT=8001
# WEAVIATE_HOST and WEAVIATE_PORT are also read by weavmanage and gradio-ui.
WEAVIATE_HOST=weaviate
WEAVIATE_PORT=8080
# NOTE(review): these point at localhost, and docker-compose.yml defines no
# separate redis service; presumably redis runs inside the weavloader
# container. Confirm before changing.
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
UNALLOWED_NODES="W042,N001,V012,W015,W01C,W01E,W024,W026,W02C,W02D,W02E,W02F,W031,W040,W046,W047,W048,W049,W04A,W051,W055,W059,W05A,W05B,W05C,W05D,W05E,W05F,W060,W061,W062,W063,W064,W065,W066,W06E,W072,W073,W074,W075,W076,W077,W078,W079,W07A,W07B,W07D,W07E,W07F,W080,W081,W086,W088,W089,W08A,W08B,W08D,W08E,W08F,W090,W091,W092,W094,W096,W099,W09B,W09E,W0A0,W0A1,W0BB,W0BC"
LOG_LEVEL='INFO'
MONITOR_DATA_STREAM_INTERVAL=60
MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE=5

# gradio-ui
# Uses WEAVIATE_HOST and WEAVIATE_PORT from the weavloader section above;
# they are intentionally not repeated here to avoid duplicate definitions.
WEAVIATE_GRPC_PORT=50051
CLUSTER_FLAG=True
45 changes: 45 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,51 @@ This repository includes a GitHub Action that builds and pushes Docker images fo

---

## Docker compose
envs:
```
cp .env.example .env
```
Make sure to fill in the secrets (top three env vars)

Run:
```
docker compose up -d --build
```

Clean up:
```
docker compose down
```

All together:
```
docker compose down && docker compose up -d --build
```

Clean up (volumes):
```
docker compose down --volumes
```

Notes:
- Triton might not be able to load one or both of the models (CLIP and Gemma 3), or may hit OSErrors while loading the model weights. As a workaround, download the models to your local directory and then copy them into the container:
```
source .env #assumes that HF_TOKEN is set
cd triton
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
huggingface-cli download --local-dir DFN5B-CLIP-ViT-H-14-378 --revision "$CLIP_MODEL_VERSION" apple/DFN5B-CLIP-ViT-H-14-378

huggingface-cli download --local-dir gemma-3-4b-it --revision "$GEMMA_MODEL_VERSION" google/gemma-3-4b-it

docker cp DFN5B-CLIP-ViT-H-14-378 sage-nrp-image-search-triton-1:/models/
docker cp gemma-3-4b-it sage-nrp-image-search-triton-1:/models/
```

---

## Kubernetes
Developed and tested with these versions of k8s and kustomize:
```
Expand Down
118 changes: 118 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Local Docker Compose stack for the image-search services.
#
# Bare names under `environment:` are pass-through variables: Compose takes
# their values from the shell or from the `.env` file next to this manifest
# (create it with `cp .env.example .env`).
#
# The top-level `version` key is intentionally omitted; it is obsolete in the
# Compose Specification and only produces a warning with `docker compose`.
services:
  weaviate:
    # https://weaviate.io/developers/weaviate/release-notes#weaviate-core-and-client-releases
    image: semitechnologies/weaviate:1.32.0
    command:
      - --host
      - 0.0.0.0
      - --port
      - '8080'
      - --scheme
      - http
    ports:
      # Port mappings are quoted so YAML always reads them as strings.
      - '8080:8080' # REST
      - '50051:50051' # gRPC
    restart: on-failure
    environment:
      - BIND_INFERENCE_API
      - RERANKER_INFERENCE_API
      - QUERY_DEFAULTS_LIMIT
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
      - PERSISTENCE_DATA_PATH
      - DEFAULT_VECTORIZER_MODULE
      - ENABLE_MODULES
      - BACKUP_FILESYSTEM_PATH
      - CLUSTER_HOSTNAME
      - ASYNC_INDEXING
      - USE_BLOCKMAX_WAND
      - USE_INVERTED_SEARCHABLE
      - LIMIT_RESOURCES
      # Optional settings; LOG_LEVEL is also consumed by weavloader, so
      # enabling it here shares one value between both services.
      # - LOG_LEVEL
      # - PROMETHEUS_MONITORING_ENABLED
      # - PROMETHEUS_MONITORING_PORT
    healthcheck:
      # Polls Weaviate's readiness endpoint from inside the container.
      test: ["CMD", "wget", "-qO-", "http://localhost:8080/v1/.well-known/ready"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - weaviate:/var/lib/weaviate

  # Inference container referenced by BIND_INFERENCE_API.
  multi2vec-bind:
    image: semitechnologies/multi2vec-bind:imagebind

  # Inference container referenced by RERANKER_INFERENCE_API.
  reranker-transformers:
    image: semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2

  triton:
    build:
      context: ./triton
    platform: 'linux/amd64'
    ports:
      - '8000:8000'
      - '8001:8001' # matches TRITON_PORT used by weavloader
      - '8002:8002'
    shm_size: '500MB' # shared memory size
    restart: on-failure
    environment:
      - MODEL_REPOSITORY
      - CLIP_MODEL_PATH
      - CLIP_MODEL_VERSION
      - GEMMA_MODEL_PATH
      - GEMMA_MODEL_VERSION
      - HF_TOKEN
    volumes:
      # Named volume so model files under /models survive container re-creation.
      - triton:/models

  weavmanage:
    build:
      context: ./weavmanage
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
    depends_on:
      - weaviate

  weavloader:
    build:
      context: ./weavloader
    ports:
      - '8081:8080' # Prometheus metrics endpoint
      - '5555:5555' # Flower endpoint
    restart: on-failure
    environment:
      - TRITON_HOST
      - TRITON_PORT
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - SAGE_USER
      - SAGE_PASS
      # NOTE(review): .env.example points these at redis://localhost and no
      # redis service is defined in this manifest; presumably redis runs
      # inside the weavloader container. Confirm.
      - CELERY_BROKER_URL
      - CELERY_RESULT_BACKEND
      - UNALLOWED_NODES
      - LOG_LEVEL
      - MONITOR_DATA_STREAM_INTERVAL
      - MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE
    depends_on:
      - weaviate
      - weavmanage
      - triton

  gradio-ui:
    build:
      context: ./app
    ports:
      - '7860:7860'
    restart: on-failure
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
      - CLUSTER_FLAG
      - UNALLOWED_NODES
    # The UI is configured with the Weaviate host/ports above, so start it
    # after weaviate, consistent with weavmanage and weavloader.
    depends_on:
      - weaviate

volumes:
  triton:
  weaviate:
65 changes: 64 additions & 1 deletion kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ Copy the template and fill in your HuggingFace token (base64-encoded):
cp base/huggingface-secret.template.yaml base/._huggingface-secret.yaml
```

To generate the base64-encoded Hugging Face token:
```
echo -n "your_hf_token_here" | base64
```

### 2. Sage User Secret

Copy the Sage user secret template and add your Sage account name and password:
Expand All @@ -41,7 +46,12 @@ Copy the Sage user secret template and add your Sage account name and password:
cp base/sage-user-secret.template.yaml base/._sage-user-secret.yaml
```

- Encode username and password values as above.
To generate the base64-encoded `SAGE_USER` and `SAGE_PASS` values:
```
echo -n "your_username_here" | base64
echo -n "your_password_here" | base64
```

- Update the `SAGE_USER` and `SAGE_PASS` fields.

> **Important:**
Expand All @@ -66,6 +76,59 @@ Or, using kubectl (if it supports native kustomize):
kubectl apply -k base/
```

Deploy all services:
```
kubectl kustomize nrp-dev | kubectl apply -f -
kubectl kustomize nrp-prod | kubectl apply -f -
```

Delete all services:
```
kubectl kustomize nrp-dev | kubectl delete -f -
kubectl kustomize nrp-prod | kubectl delete -f -
```

Debugging - output to yaml:
```
kubectl kustomize nrp-dev -o hybrid-search-dev.yaml
kubectl kustomize nrp-prod -o hybrid-search-prod.yaml
```

## Testing a Pull Request
For testing a Pull Request (PR), the overlay [prs](/kubernetes/prs/) is provided. GitHub Actions is set up to build an image for each PR so that we can manually (or, in the future, automatically) test an instance of the image search deployed on k8s.

The following manual steps are required for now:
- [kubernetes/prs/kustomization.yaml](/kubernetes/prs/kustomization.yaml)
- change the `namePrefix` to the name of the PR
- change `commonLabels.env` to the name of the PR
- change the `newTag` to the name of the PR for each service that needs it
- port-forwarding for any of the services to test out (update `pr`):
- `kubectl port-forward svc/pr-triton 8001:8001`: triton endpoint to call the LLM models locally
- `kubectl port-forward svc/pr-gradio-ui 7860:7860`: Search UI
- `kubectl port-forward svc/pr-weaviate 8080:8080`: Weaviate REST endpoint
- `kubectl port-forward svc/pr-weaviate 50051:50051`: Weaviate GRPC endpoint
- `kubectl port-forward svc/pr-weavloader-metrics 5555:5555`: Weavloader Flower endpoint
- `kubectl port-forward svc/pr-weavloader-metrics 8081:8080`: Weavloader Prometheus endpoint

Deploy:
```
kubectl kustomize prs | kubectl apply -f -
```

Delete all services:
```
kubectl kustomize prs | kubectl delete -f -
```

Debugging - output to yaml:
```
kubectl kustomize prs -o hybrid-search-pr.yaml
```

Notes:
- Make sure that your PR is up-to-date with `main` so that the services that were not modified are reflected in the `latest` tag. This can also be checked with the [docker-compose](/docker-compose.yml) local deployment (after the PR is up-to-date with `main`) to see whether the changes in the PR work with the rest of the services that were not modified.
- Users can combine this overlay with their local docker compose instance in order to use a triton instance that has an NVIDIA GPU. This involves commenting out the triton ports in the docker compose manifest file and doing the kubectl port-forwarding described above.

## Managing and Customizing

You can extend or patch this `base/` deployment using kustomize overlays for different environments, resource limits, or development setups. See included overlays (such as those in benchmark subfolders) for example usage.
Expand Down
20 changes: 16 additions & 4 deletions kubernetes/base/triton.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ spec:
value: "info"
- name: TORCHDYNAMO_DISABLE
value: "1"
- name: MODEL_REPOSITORY
value: "/app/models"
- name: CLIP_MODEL_PATH
value: "/models/DFN5B-CLIP-ViT-H-14-378"
- name: CLIP_MODEL_VERSION
value: "419d1f8f6a96aabaf5913c526d059facda50c24b"
- name: GEMMA_MODEL_PATH
value: "/models/gemma-3-4b-it"
- name: GEMMA_MODEL_VERSION
value: "093f9f388b31de276ce2de164bdc2081324b9767"
- name: HF_TOKEN
valueFrom:
secretKeyRef:
Expand All @@ -32,10 +42,12 @@ spec:
cpu: 1
memory: 8Gi
nvidia.com/gpu: 1
ephemeral-storage: 50Gi
requests:
cpu: 1
memory: 8Gi
nvidia.com/gpu: 1
ephemeral-storage: 50Gi
ports:
- name: http
containerPort: 8000
Expand All @@ -46,10 +58,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
# - mountPath: /app/models/gemma-3-4b-it
# name: models
# - mountPath: /app/models/DFN5B-CLIP-ViT-H-14-378
# name: models
- mountPath: /models/gemma-3-4b-it
name: models
- mountPath: /models/DFN5B-CLIP-ViT-H-14-378
name: models
volumes:
- name: dshm
emptyDir:
Expand Down
4 changes: 4 additions & 0 deletions kubernetes/base/weavloader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,8 @@ spec:
- name: metrics
port: 8080
targetPort: 8080
protocol: TCP
- name: flower
port: 5555
targetPort: 5555
protocol: TCP
4 changes: 0 additions & 4 deletions kubernetes/nrp-dev/gpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ spec:
operator: In
values:
- NVIDIA-A10
- key: nautilus.io/reservation
operator: In
values:
- sage
tolerations:
- key: nautilus.io/reservation
operator: Equal
Expand Down
4 changes: 0 additions & 4 deletions kubernetes/nrp-prod/gpus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ spec:
operator: In
values:
- NVIDIA-A10
- key: nautilus.io/reservation
operator: In
values:
- sage
tolerations:
- key: nautilus.io/reservation
operator: Equal
Expand Down
Loading