Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Secrets — fill these in before running (see Readme); never commit real values.
SAGE_USER=
SAGE_PASS=
HF_TOKEN=

# weaviate
# https://weaviate.io/developers/weaviate/config-refs/env-vars
BIND_INFERENCE_API=http://multi2vec-bind:8080
RERANKER_INFERENCE_API=http://reranker-transformers:8080
QUERY_DEFAULTS_LIMIT=25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true'
PERSISTENCE_DATA_PATH='/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE='multi2vec-bind'
ENABLE_MODULES='multi2vec-bind,reranker-transformers,backup-filesystem'
BACKUP_FILESYSTEM_PATH='/tmp/backups'
CLUSTER_HOSTNAME=node1
# https://weaviate.io/developers/weaviate/concepts/vector-index#asynchronous-indexing
ASYNC_INDEXING=true
# https://weaviate.io/blog/weaviate-1-18-release#improvements-to-bm25-and-hybrid-search
USE_BLOCKMAX_WAND=true
USE_INVERTED_SEARCHABLE=true
LIMIT_RESOURCES=true
# default is info
# LOG_LEVEL: 'debug'
# https://weaviate.io/developers/weaviate/configuration/monitoring
# PROMETHEUS_MONITORING_ENABLED: true
# PROMETHEUS_MONITORING_PORT: 2112

# triton
# Ensure this path matches the model repository directory baked into the image.
MODEL_REPOSITORY=/app/models
CLIP_MODEL_PATH=/models/DFN5B-CLIP-ViT-H-14-378
CLIP_MODEL_VERSION=419d1f8f6a96aabaf5913c526d059facda50c24b
GEMMA_MODEL_PATH=/models/gemma-3-4b-it
GEMMA_MODEL_VERSION=093f9f388b31de276ce2de164bdc2081324b9767

# weavloader
TRITON_HOST=triton
TRITON_PORT=8001
WEAVIATE_HOST=weaviate
WEAVIATE_PORT=8080
# NOTE(review): docker-compose.yml defines no redis service, and inside a
# container "localhost" is the container itself — confirm how Celery is meant
# to reach this broker (separate redis container? host network?).
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
UNALLOWED_NODES="W042,N001,V012,W015,W01C,W01E,W024,W026,W02C,W02D,W02E,W02F,W031,W040,W046,W047,W048,W049,W04A,W051,W055,W059,W05A,W05B,W05C,W05D,W05E,W05F,W060,W061,W062,W063,W064,W065,W066,W06E,W072,W073,W074,W075,W076,W077,W078,W079,W07A,W07B,W07D,W07E,W07F,W080,W081,W086,W088,W089,W08A,W08B,W08D,W08E,W08F,W090,W091,W092,W094,W096,W099,W09B,W09E,W0A0,W0A1,W0BB,W0BC"
LOG_LEVEL='INFO'
MONITOR_DATA_STREAM_INTERVAL=60
MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE=5

# gradio-ui (matches the `gradio-ui` service name in docker-compose.yml)
# WEAVIATE_HOST/WEAVIATE_PORT repeat the identical values from the weavloader
# section above; dotenv keeps a single value per key, so this is harmless and
# kept only for per-service grouping.
WEAVIATE_HOST=weaviate
WEAVIATE_PORT=8080
WEAVIATE_GRPC_PORT=50051
CLUSTER_FLAG=True
45 changes: 45 additions & 0 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,51 @@ This repository includes a GitHub Action that builds and pushes Docker images fo

---

## Docker compose
envs:
```
cp .env.example .env
```
Make sure to fill in the secrets (top three env vars)

Run:
```
docker compose up -d --build
```

Clean up:
```
docker compose down
```

All together:
```
docker compose down && docker compose up -d --build
```

Clean up (volumes):
```
docker compose down --volumes
```

Notes:
- Triton might fail to load one of the models (CLIP or Gemma 3), or raise OSErrors while loading the model weights. As a workaround, download the models to a local directory and then copy them into the container:
```
source .env #assumes that HF_TOKEN is set
cd triton
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
huggingface-cli download --local-dir DFN5B-CLIP-ViT-H-14-378 --revision "$CLIP_MODEL_VERSION" apple/DFN5B-CLIP-ViT-H-14-378

huggingface-cli download --local-dir gemma-3-4b-it --revision "$GEMMA_MODEL_VERSION" google/gemma-3-4b-it

docker cp DFN5B-CLIP-ViT-H-14-378 sage-nrp-image-search-triton-1:/models/
docker cp gemma-3-4b-it sage-nrp-image-search-triton-1:/models/
```

---

## Kubernetes
Developed and tested with these versions of k8s and kustomize:
```
Expand Down
118 changes: 118 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# NOTE: the top-level `version` key is obsolete under the Compose
# Specification (Compose V2 ignores it with a warning); kept for
# compatibility with older docker-compose binaries.
version: '3.4'

services:
  weaviate:
    # Serve plain HTTP on :8080, bound to all interfaces.
    command:
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --scheme
      - http
    # https://weaviate.io/developers/weaviate/release-notes#weaviate-core-and-client-releases
    image: semitechnologies/weaviate:1.32.0
    # Port mappings are quoted — unquoted host:container pairs can be
    # misparsed as base-60 integers by YAML 1.1 loaders.
    ports:
      - "8080:8080"    # REST (WEAVIATE_PORT in .env)
      - "50051:50051"  # gRPC (WEAVIATE_GRPC_PORT in .env)
    restart: on-failure
    # Bare names pass values through from the host env / .env file.
    environment:
      - BIND_INFERENCE_API
      - RERANKER_INFERENCE_API
      - QUERY_DEFAULTS_LIMIT
      - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
      - PERSISTENCE_DATA_PATH
      - DEFAULT_VECTORIZER_MODULE
      - ENABLE_MODULES
      - BACKUP_FILESYSTEM_PATH
      - CLUSTER_HOSTNAME
      - ASYNC_INDEXING
      - USE_BLOCKMAX_WAND
      - USE_INVERTED_SEARCHABLE
      - LIMIT_RESOURCES
      # - LOG_LEVEL
      # - PROMETHEUS_MONITORING_ENABLED
      # - PROMETHEUS_MONITORING_PORT
    # NOTE(review): assumes `wget` exists inside the weaviate image — confirm,
    # otherwise the healthcheck will always fail.
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:8080/v1/.well-known/ready"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    volumes:
      - weaviate:/var/lib/weaviate

  multi2vec-bind:
    image: semitechnologies/multi2vec-bind:imagebind

  reranker-transformers:
    image: semitechnologies/reranker-transformers:cross-encoder-ms-marco-MiniLM-L-6-v2

  triton:
    build:
      context: ./triton
    platform: "linux/amd64"
    ports:
      - "8000:8000"
      - "8001:8001"  # TRITON_PORT in .env
      - "8002:8002"
    shm_size: '500MB'  # shared memory size
    restart: on-failure
    environment:
      - MODEL_REPOSITORY
      - CLIP_MODEL_PATH
      - CLIP_MODEL_VERSION
      - GEMMA_MODEL_PATH
      - GEMMA_MODEL_VERSION
      - HF_TOKEN
    volumes:
      - triton:/models

  weavmanage:
    build:
      context: ./weavmanage
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
    depends_on:
      - weaviate

  weavloader:
    build:
      context: ./weavloader
    ports:
      - "8081:8080"
      - "5555:5555"
    restart: on-failure
    environment:
      - TRITON_HOST
      - TRITON_PORT
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - SAGE_USER
      - SAGE_PASS
      - CELERY_BROKER_URL
      - CELERY_RESULT_BACKEND
      - UNALLOWED_NODES
      - LOG_LEVEL
      - MONITOR_DATA_STREAM_INTERVAL
      - MONITOR_DATA_STREAM_QUERY_DELAY_MINUTE
    depends_on:
      - weaviate
      - weavmanage
      - triton

  gradio-ui:
    build:
      context: ./app
    ports:
      - "7860:7860"
    restart: on-failure
    environment:
      - WEAVIATE_HOST
      - WEAVIATE_PORT
      - WEAVIATE_GRPC_PORT
      - CLUSTER_FLAG
      - UNALLOWED_NODES

# Named volumes persist Weaviate data and downloaded Triton models across
# container recreation; removed only by `docker compose down --volumes`.
volumes:
  triton:
  weaviate:
110 changes: 5 additions & 105 deletions triton/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,41 +1,5 @@
# 1. Builder stage: compile OpenAI Triton on aarch64 for Gemma3, enable this step for H100
# --------------------------------------------------------------------------
# FROM python:3.10-slim AS builder
FROM nvcr.io/nvidia/tritonserver:24.06-pyt-python-py3

# # Install build dependencies
# RUN apt-get update && \
# apt-get install -y --no-install-recommends \
# build-essential cmake ninja-build git python3-dev libnuma-dev && \
# rm -rf /var/lib/apt/lists/*

# WORKDIR /tmp

# # Clone and build Triton wheel
# RUN git clone --depth 1 --branch v3.0.0 https://github.com/openai/triton.git triton && \
# cd triton/python && \
# pip wheel --no-cache-dir . -w /tmp/wheels && \
# cd /tmp && rm -rf triton

# 2. Runtime stage: Nvidia Triton server
# Use Triton's base image with Python 3.11
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
# --------------------------------------------------------------------------
# built for NVIDIA Driver Release 510 or later (Sage Blades, V033)
# FROM nvcr.io/nvidia/tritonserver:22.04-py3 <-- used this when using florence 2 base model, switched to 23.12 for Qwen2.5-VL-72B-Instruct
# FROM nvcr.io/nvidia/tritonserver:23.12-py3

# built for NVIDIA Driver Release 545 or later (Sage H100)
# FROM nvcr.io/nvidia/tritonserver:24.06-py3 switching to below to try to trim image size
FROM nvcr.io/nvidia/tritonserver:24.06-pyt-python-py3

# Set the Hugging Face token as a build argument and environment variable
# ARG HF_TOKEN
# ENV HF_TOKEN=${HF_TOKEN}

# Fix missing GPG key error
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC

# Install system dependencies
RUN apt-get update \
&& apt-get install -y \
wget \
Expand All @@ -44,80 +8,16 @@ RUN apt-get update \
libglib2.0-0 \
git

# Set working directory
WORKDIR /app

# Copy the requirements.txt into the container
COPY requirements.txt .
COPY torch_requirements.txt .
#COPY flash_requirements.txt . enable for flash attention; flash attention requires CUDA 11.7 or above

# Install dependencies using pip
RUN pip install --no-cache-dir --force-reinstall -r torch_requirements.txt
RUN pip install --no-cache-dir -r torch_requirements.txt
RUN pip install --no-cache-dir --no-build-isolation -r requirements.txt
# RUN pip install --no-cache-dir --no-build-isolation -r flash_requirements.txt enable for flash attention; flash attention requires CUDA 11.7 or above

# Copy and install the Triton wheel from builder for Gemma3, enable this step for H100
# COPY --from=builder /tmp/wheels/triton-*.whl /tmp/
# RUN pip install --no-cache-dir /tmp/triton-*.whl && rm /tmp/triton-*.whl

# Set environment variables
ENV MODEL_PATH=/app/Florence-2-base
ENV MODEL_VERSION=ee1f1f163f352801f3b7af6b2b96e4baaa6ff2ff
ENV COLBERT_MODEL_PATH=/app/colbertv2.0
ENV COLBERT_MODEL_VERSION=c1e84128e85ef755c096a95bdb06b47793b13acf
ENV ALIGN_MODEL_PATH=/app/align-base
ENV ALIGN_MODEL_VERSION=e96a37facc7b1f59090ece82293226b817afd6ba
ENV CLIP_MODEL_PATH=/app/DFN5B-CLIP-ViT-H-14-378
# ENV CLIP_MODEL_VERSION=01b771ed0d1395ca5ffdd279897d665ebe00dfd2
ENV CLIP_MODEL_VERSION=419d1f8f6a96aabaf5913c526d059facda50c24b
ENV QWEN_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct
ENV QWEN_MODEL_VERSION=cc594898137f460bfe9f0759e9844b3ce807cfb5
ENV GEMMA_MODEL_PATH=/app/gemma-3-4b-it
ENV GEMMA_MODEL_VERSION=093f9f388b31de276ce2de164bdc2081324b9767

# Download Florence 2 model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $MODEL_PATH \
# --revision $MODEL_VERSION \
# microsoft/Florence-2-base

# Download ColBERT model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $COLBERT_MODEL_PATH \
# --revision $COLBERT_MODEL_VERSION \
# colbert-ir/colbertv2.0

# Download align model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $ALIGN_MODEL_PATH \
# --revision $ALIGN_MODEL_VERSION \
# kakaobrain/align-base

# Download CLIP model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $CLIP_MODEL_PATH \
# --revision $CLIP_MODEL_VERSION \
# apple/DFN5B-CLIP-ViT-H-14-378

# Download Qwen model from Hugging Face
# RUN huggingface-cli download \
# --local-dir $QWEN_MODEL_PATH \
# --revision $QWEN_MODEL_VERSION \
# Qwen/Qwen2.5-VL-7B-Instruct

# Download Gemma model from Hugging Face
# RUN huggingface-cli login --token "$HF_TOKEN" \
# && huggingface-cli download \
# --local-dir $GEMMA_MODEL_PATH \
# --revision $GEMMA_MODEL_VERSION \
# google/gemma-3-4b-it

# Copy the application code into the container
COPY . .

# Expose Triton server ports
EXPOSE 8000 8001 8002
COPY models models
COPY HyperParameters.py HyperParameters.py
COPY entrypoint.sh entrypoint.sh

# Start the Triton Inference Server with the Python Backend
ENTRYPOINT ["./entrypoint.sh"]
10 changes: 5 additions & 5 deletions triton/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#!/bin/bash
#Test

set -e

# Download CLIP model if not already present and check if directory is empty
#Download CLIP model if not already present and check if directory is empty
if [ ! -d "$CLIP_MODEL_PATH" ] || [ -z "$(ls -A "$CLIP_MODEL_PATH" 2>/dev/null)" ]; then
echo "Downloading CLIP model..."
HF_TOKEN= huggingface-cli download \
--local-dir "$CLIP_MODEL_PATH" \
--revision "$CLIP_MODEL_VERSION" \
apple/DFN5B-CLIP-ViT-H-14-378
else
else
echo "CLIP model already present. Skipping download."
fi

Expand All @@ -23,12 +23,12 @@ if [ -n "$HF_TOKEN" ]; then
--local-dir "$GEMMA_MODEL_PATH" \
--revision "$GEMMA_MODEL_VERSION" \
google/gemma-3-4b-it
else
else
echo "Gemma model already present. Skipping download."
fi
else
echo "HF_TOKEN not provided. Skipping Hugging Face model downloads."
fi

# Start Triton Inference Server
exec tritonserver --model-repository=/app/models "$@"
exec tritonserver --model-repository=$MODEL_REPOSITORY "$@"
1 change: 0 additions & 1 deletion triton/models/clip/1/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import triton_python_backend_utils as pb_utils
import torch
from transformers import CLIPProcessor, CLIPModel
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

MODEL_PATH = os.environ.get("CLIP_MODEL_PATH")

Expand Down
Loading