4 files changed: +91 -1 lines changed. The new files are added under stack/overlays/vllm-remote-inference-model.

@@ -24,7 +24,7 @@ oc new-project rag-stack
### 3. Deploy the Stack

- The project offers three deployment options:
+ The project offers multiple deployment options:

#### Option A: Default Setup (KServe vLLM + Llama 3.2)
```bash
@@ -42,6 +42,27 @@ oc apply -k stack/overlays/vllm-standalone-llama3.2
oc patch secret hf-token-secret --type='merge' -p='{"data":{"HF_TOKEN":"'$(echo -n "hf_your_token" | base64)'"}}'
```

+ #### Option D: Setup Using a Remotely Deployed Inference Model
+
+ Note: do not use `VLLM_TLS_VERIFY=false` in production environments.
+ ```bash
+ # Create the secret llama-stack-remote-inference-model-secret providing the remote model info
+ export INFERENCE_MODEL=llama-3-2-3b
+ export VLLM_URL=https://llama-3-2-3b.apps.remote-cluster.com:443/v1
+ export VLLM_TLS_VERIFY=false
+ export VLLM_API_TOKEN=XXXXXXXXXXXXXXXXXXXXXXX
+
+ oc create secret generic llama-stack-remote-inference-model-secret \
+   --from-literal INFERENCE_MODEL=$INFERENCE_MODEL \
+   --from-literal VLLM_URL=$VLLM_URL \
+   --from-literal VLLM_TLS_VERIFY=$VLLM_TLS_VERIFY \
+   --from-literal VLLM_API_TOKEN=$VLLM_API_TOKEN
+
+ # Deploy the LlamaStackDistribution
+ oc apply -k stack/overlays/vllm-remote-inference-model
+ ```
+

### 4. Verify Deployment

Check if all pods are running:
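The Option D steps above create the secret and then apply the overlay; a minimal verification sketch is shown below. It assumes the rag-stack project from the earlier oc new-project step, the resource names used in this change, and that the operator's LlamaStackDistribution CRD is installed; it is a sanity check, not part of the documented flow.

```bash
# Hypothetical check for the Option D path (names taken from this change).
oc project rag-stack

# The secret has to exist before the LlamaStackDistribution can wire up its env vars.
oc get secret llama-stack-remote-inference-model-secret

# The custom resource applied by the overlay, plus whatever pods the operator created for it.
oc get llamastackdistribution lsd-llama-milvus
oc get pods
```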
stack/overlays/vllm-remote-inference-model/kustomization.yaml (new file):

+ apiVersion: kustomize.config.k8s.io/v1beta1
+ kind: Kustomization
+
+ resources:
+ - llama-stack-distribution.yaml
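Since the overlay is consumed with oc apply -k, it can also be rendered client-side first to confirm that it only pulls in llama-stack-distribution.yaml; a small sketch using the kustomize support built into oc:

```bash
# Render the overlay locally without touching the cluster.
oc kustomize stack/overlays/vllm-remote-inference-model

# Or run a client-side dry run of the actual apply.
oc apply -k stack/overlays/vllm-remote-inference-model --dry-run=client -o yaml
```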
stack/overlays/vllm-remote-inference-model/llama-stack-distribution.yaml (new file):

+ ---
+ apiVersion: llamastack.io/v1alpha1
+ kind: LlamaStackDistribution
+ metadata:
+   name: lsd-llama-milvus
+ spec:
+   replicas: 1
+   server:
+     containerSpec:
+       resources:
+         requests:
+           cpu: "250m"
+           memory: "500Mi"
+         limits:
+           cpu: "2"
+           memory: "12Gi"
+       env:
+         - name: INFERENCE_MODEL
+           valueFrom:
+             secretKeyRef:
+               key: INFERENCE_MODEL
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_URL
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_URL
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_TLS_VERIFY
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_TLS_VERIFY
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: VLLM_API_TOKEN
+           valueFrom:
+             secretKeyRef:
+               key: VLLM_API_TOKEN
+               name: llama-stack-remote-inference-model-secret
+               optional: true
+         - name: MILVUS_DB_PATH
+           value: ~/.llama/milvus.db
+         - name: FMS_ORCHESTRATOR_URL
+           value: "http://localhost"
+       name: llama-stack
+       port: 8321
+     distribution:
+       image: quay.io/opendatahub/llama-stack:odh
+     storage:
+       size: "5Gi"
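All four secretKeyRef entries are marked optional: true, so the server still starts if the secret is missing and the remote vLLM settings quietly stay unset. A small sketch for confirming the values actually reached the container is below; selecting the pod by matching the CR name lsd-llama-milvus is an assumption, adjust it to whatever the operator actually creates.

```bash
# Grab the first pod whose name matches the CR (name match is an assumption).
POD=$(oc get pods -n rag-stack -o name | grep lsd-llama-milvus | head -n 1)

# Print the injected environment, keeping only the remote-inference settings.
oc exec -n rag-stack "$POD" -- env | grep -E '^(INFERENCE_MODEL|VLLM_)'
```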
Example Secret manifest (new file):

+ # This Secret is included as an example. It should be created manually with the right values via
+ # oc create secret generic ... before creating the LlamaStackDistribution
+ apiVersion: v1
+ stringData:
+   INFERENCE_MODEL: ""
+   VLLM_API_TOKEN: ""
+   VLLM_TLS_VERIFY: ""
+   VLLM_URL: ""
+ kind: Secret
+ metadata:
+   name: llama-stack-remote-inference-model-secret
+ type: Opaque
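The manifest above is only a placeholder; the real secret is created by hand with oc create secret generic as described in the README change. A quick sketch for confirming all four keys are present before applying the overlay, assuming jq is available:

```bash
# List the data keys of the manually created secret; all four should show up.
oc get secret llama-stack-remote-inference-model-secret -n rag-stack -o json \
  | jq -r '.data | keys[]'
# Expected keys: INFERENCE_MODEL, VLLM_API_TOKEN, VLLM_TLS_VERIFY, VLLM_URL
```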