4 files changed: +91 -1 lines changed. The new files are added under stack/overlays/vllm-remote-inference-model.

@@ -24,7 +24,7 @@ oc new-project rag-stack
### 3. Deploy the Stack

- The project offers three deployment options:
+ The project offers multiple deployment options:

#### Option A: Default Setup (KServe vLLM + Llama 3.2)
```bash
@@ -42,6 +42,27 @@ oc apply -k stack/overlays/vllm-standalone-llama3.2
oc patch secret hf-token-secret --type='merge' -p='{"data":{"HF_TOKEN":"'$(echo -n "hf_your_token" | base64)'"}}'
```

+ #### Option D: Setup Using a Remotely Deployed Inference Model
+
+ Note: do not use `VLLM_TLS_VERIFY=false` in production environments.
+ ```bash
+ # Create the secret llama-stack-remote-inference-model-secret providing the remote model info
+ export INFERENCE_MODEL=llama-3-2-3b
+ export VLLM_URL=https://llama-3-2-3b.apps.remote-cluster.com:443/v1
+ export VLLM_TLS_VERIFY=false
+ export VLLM_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXX
+
+ oc create secret generic llama-stack-remote-inference-model-secret \
+   --from-literal INFERENCE_MODEL=$INFERENCE_MODEL \
+   --from-literal VLLM_URL=$VLLM_URL \
+   --from-literal VLLM_TLS_VERIFY=$VLLM_TLS_VERIFY \
+   --from-literal VLLM_API_TOKEN=$VLLM_API_TOKEN
+
+ # Deploy the LlamaStackDistribution
+ oc apply -k stack/overlays/vllm-remote-inference-model
+ ```
+

### 4. Verify Deployment

Check if all pods are running:
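The Option D steps above create the secret and then apply the overlay; a minimal verification sketch is shown below. It assumes the rag-stack project from the earlier oc new-project step, the resource names used in this change, and that the operator's LlamaStackDistribution CRD is installed; it is a sanity check, not part of the documented flow.

```bash
# Hypothetical check for the Option D path (names taken from this change).
oc project rag-stack

# The secret has to exist before the LlamaStackDistribution can wire up its env vars.
oc get secret llama-stack-remote-inference-model-secret

# The custom resource applied by the overlay, plus whatever pods the operator created for it.
oc get llamastackdistribution lsd-llama-milvus
oc get pods
```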
stack/overlays/vllm-remote-inference-model/kustomization.yaml (new file):

+ apiVersion: kustomize.config.k8s.io/v1beta1
+ kind: Kustomization
+
+ resources:
+ - llama-stack-distribution.yaml
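Since the overlay is consumed with oc apply -k, it can also be rendered client-side first to confirm that it only pulls in llama-stack-distribution.yaml; a small sketch using the kustomize support built into oc:

```bash
# Render the overlay locally without touching the cluster.
oc kustomize stack/overlays/vllm-remote-inference-model

# Or run a client-side dry run of the actual apply.
oc apply -k stack/overlays/vllm-remote-inference-model --dry-run=client -o yaml
```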
stack/overlays/vllm-remote-inference-model/llama-stack-distribution.yaml (new file):

+ ---
+ apiVersion: llamastack.io/v1alpha1
+ kind: LlamaStackDistribution
+ metadata:
+   name: lsd-llama-milvus
+ spec:
+   replicas: 1
+   server:
+     containerSpec:
+       resources:
+         requests:
+           cpu: "250m"
+           memory: "500Mi"
+         limits:
+           cpu: "2"
+           memory: "12Gi"
+       env:
+         - name: INFERENCE_MODEL
+           valueFrom:
+             secretKeyRef:
+               key: INFERENCE_MODEL
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_URL
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_URL
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_TLS_VERIFY
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_TLS_VERIFY
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_API_TOKEN
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_API_TOKEN
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: MILVUS_DB_PATH
+           value: ~/.llama/milvus.db
+         - name: FMS_ORCHESTRATOR_URL
+           value: "http://localhost"
+       name: llama-stack
+       port: 8321
+     distribution:
+       image: quay.io/opendatahub/llama-stack:odh
+     storage:
+       size: "5Gi"
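All four secretKeyRef entries are marked optional: true, so the server still starts if the secret is missing and the remote vLLM settings quietly stay unset. A small sketch for confirming the values actually reached the container is below; selecting the pod by matching the CR name lsd-llama-milvus is an assumption, adjust it to whatever the operator actually creates.

```bash
# Grab the first pod whose name matches the CR (name match is an assumption).
POD=$(oc get pods -n rag-stack -o name | grep lsd-llama-milvus | head -n 1)

# Print the injected environment, keeping only the remote-inference settings.
oc exec -n rag-stack "$POD" -- env | grep -E '^(INFERENCE_MODEL|VLLM_)'
```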
Example Secret manifest (new file):

+ # This Secret is included as an example. It should be created manually with the right values via
+ # oc create secret generic ... before creating the LlamaStackDistribution
+ apiVersion: v1
+ stringData:
+   INFERENCE_MODEL: ""
+   VLLM_API_TOKEN: ""
+   VLLM_TLS_VERIFY: ""
+   VLLM_URL: ""
+ kind: Secret
+ metadata:
+   name: llama-stack-remote-inference-model-secret
+ type: Opaque
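The manifest above is only a placeholder; the real secret is created by hand with oc create secret generic as described in the README change. A quick sketch for confirming all four keys are present before applying the overlay, assuming jq is available:

```bash
# List the data keys of the manually created secret; all four should show up.
oc get secret llama-stack-remote-inference-model-secret -n rag-stack -o json \
  | jq -r '.data | keys[]'
# Expected keys: INFERENCE_MODEL, VLLM_API_TOKEN, VLLM_TLS_VERIFY, VLLM_URL
```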