
Commit 24de683

Merge pull request #23 from jgarciao/add-overlay-remote-model
Add example overlay to use an inference model deployed remotely when creating a LlamaStackDistribution
2 parents 5ed1f57 + 64ea827

4 files changed: +99 -1 lines changed

DEPLOYMENT.md

Lines changed: 26 additions & 1 deletion

````diff
@@ -24,7 +24,7 @@ oc new-project rag-stack
 
 ### 3. Deploy the Stack
 
-The project offers three deployment options:
+The project offers multiple deployment options:
 
 #### Option A: Default Setup (KServe vLLM + Llama 3.2)
 ```bash
@@ -42,6 +42,31 @@ oc apply -k stack/overlays/vllm-standalone-llama3.2
 oc patch secret hf-token-secret --type='merge' -p='{"data":{"HF_TOKEN":"'$(echo -n "hf_your_token" | base64)'"}}'
 ```
 
+#### Option D: Setup Using an Inference Model Deployed Remotely
+
+```bash
+# Create the secret llama-stack-inference-model-secret providing the model info
+# Important:
+# - Make sure the value of INFERENCE_MODEL is correct (it must not contain dots)
+# - VLLM_URL can be an internal or external endpoint for the model; append /v1 at the end
+# - NEVER set VLLM_TLS_VERIFY=false in production
+export INFERENCE_MODEL="llama-3-2-3b"
+export VLLM_URL="https://llama-3-2-3b.apps.remote-cluster.com:443/v1"
+export VLLM_TLS_VERIFY="false"
+export VLLM_API_TOKEN="XXXXXXXXXXXXXXXXXXXXXXX"
+
+oc create secret generic llama-stack-inference-model-secret \
+  --from-literal INFERENCE_MODEL="$INFERENCE_MODEL" \
+  --from-literal VLLM_URL="$VLLM_URL" \
+  --from-literal VLLM_TLS_VERIFY="$VLLM_TLS_VERIFY" \
+  --from-literal VLLM_API_TOKEN="$VLLM_API_TOKEN"
+
+# Deploy the LlamaStackDistribution
+oc apply -k stack/overlays/vllm-remote-inference-model
+```
+
 ### 4. Verify Deployment
 
 Check if all pods are running:
````
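One sanity check worth doing before applying the Option D overlay is to confirm that the remote endpoint actually serves the model. A minimal sketch reusing the variables exported above (vLLM exposes an OpenAI-compatible `/v1/models` route, and `VLLM_URL` already ends in `/v1`; `-k` skips TLS verification, mirroring `VLLM_TLS_VERIFY=false`, and should likewise never be used in production):

```bash
# List the models served by the remote vLLM endpoint;
# the returned ids should include $INFERENCE_MODEL
curl -sk -H "Authorization: Bearer $VLLM_API_TOKEN" "$VLLM_URL/models"
```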
stack/overlays/vllm-remote-inference-model/kustomization.yaml

Lines changed: 6 additions & 0 deletions

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
- llama-stack-distribution.yaml
```
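The overlay itself can be rendered locally before it is applied; a small sketch, taking the overlay path from the `oc apply -k` command in DEPLOYMENT.md (`kubectl kustomize` behaves the same):

```bash
# Render the overlay's manifests to stdout without applying them
oc kustomize stack/overlays/vllm-remote-inference-model

# Or validate them with a client-side dry run
oc apply -k stack/overlays/vllm-remote-inference-model --dry-run=client
```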
stack/overlays/vllm-remote-inference-model/llama-stack-distribution.yaml

Lines changed: 51 additions & 0 deletions

```yaml
---
apiVersion: llamastack.io/v1alpha1
kind: LlamaStackDistribution
metadata:
  name: lsd-llama-milvus
spec:
  replicas: 1
  server:
    containerSpec:
      resources:
        requests:
          cpu: "250m"
          memory: "500Mi"
        limits:
          cpu: "2"
          memory: "12Gi"
      env:
        - name: INFERENCE_MODEL
          valueFrom:
            secretKeyRef:
              key: INFERENCE_MODEL
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_URL
          valueFrom:
            secretKeyRef:
              key: VLLM_URL
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_TLS_VERIFY
          valueFrom:
            secretKeyRef:
              key: VLLM_TLS_VERIFY
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_API_TOKEN
          valueFrom:
            secretKeyRef:
              key: VLLM_API_TOKEN
              name: llama-stack-inference-model-secret
              optional: true
        - name: MILVUS_DB_PATH
          value: ~/.llama/milvus.db
        - name: FMS_ORCHESTRATOR_URL
          value: "http://localhost"
      name: llama-stack
      port: 8321
    distribution:
      image: quay.io/opendatahub/llama-stack:odh
    storage:
      size: "5Gi"
```
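Each secret-backed variable is marked `optional: true`, so the server pod starts even when `llama-stack-inference-model-secret` has not been created yet; it just cannot reach the remote model until the secret exists. A possible smoke test once the pod is running, assuming `<pod>` is filled in from `oc get pods` and that the image exposes llama-stack's `/v1/models` listing route:

```bash
# Forward the server port (8321, per the manifest) to localhost
oc port-forward pod/<pod> 8321:8321 &

# Ask the stack which models are registered; the remote model should appear
curl -s http://localhost:8321/v1/models
```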
Lines changed: 16 additions & 0 deletions

```yaml
# Secret included as an example. It should be created manually with the right
# values via `oc create secret generic ...` before creating the LlamaStackDistribution.
# Important:
# - Make sure the value of INFERENCE_MODEL is correct (it must not contain dots)
# - VLLM_URL can be an internal or external endpoint for the model; append /v1 at the end
# - NEVER set VLLM_TLS_VERIFY=false in production
apiVersion: v1
kind: Secret
metadata:
  name: llama-stack-inference-model-secret
type: Opaque
stringData:
  INFERENCE_MODEL: "<your-model-id>"
  VLLM_API_TOKEN: "<paste-api-token>"
  VLLM_TLS_VERIFY: "true" # or "false"
  VLLM_URL: "https://your-model-id.example.com/v1"
```
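Once created, the secret can be double-checked in place; a brief sketch (keys set via `stringData` are stored base64-encoded under `data`):

```bash
# Print a single key of the secret, decoding its base64 value
oc get secret llama-stack-inference-model-secret \
  -o jsonpath='{.data.VLLM_URL}' | base64 -d
```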
