Commit 2c60a92

Merge pull request #7 from Bobbins228/llama-stack-rhoai
Guide for deploying llama stack with eval provider on openshift && AWS credentials on the LLS CR
2 parents 94dcc4a + 98e88bc commit 2c60a92

File tree

5 files changed: +185 -2 lines changed
Lines changed: 55 additions & 0 deletions
# Deploying Llama Stack on OpenShift AI with the remote Ragas eval provider

## Prerequisites

* OpenShift AI or Open Data Hub installed on your OpenShift cluster
* A Data Science Pipelines server configured
* The Llama Stack Operator installed
* A vLLM-hosted model, served either through KServe or MaaS. You can follow these [docs](https://docs.redhat.com/en/documentation/red_hat_openshift_ai_cloud_service/1/html/working_with_rag/deploying-a-rag-stack-in-a-data-science-project_rag#Deploying-a-llama-model-with-kserve_rag) up to step 3.4.

## Setup

Create a secret for storing your model's information.
```bash
export INFERENCE_MODEL="llama-3-2-3b"
export VLLM_URL="https://llama-32-3b-instruct-predictor:8443/v1"
export VLLM_TLS_VERIFY="false" # Use "true" in production!
export VLLM_API_TOKEN="<token identifier>"

oc create secret generic llama-stack-inference-model-secret \
  --from-literal INFERENCE_MODEL="$INFERENCE_MODEL" \
  --from-literal VLLM_URL="$VLLM_URL" \
  --from-literal VLLM_TLS_VERIFY="$VLLM_TLS_VERIFY" \
  --from-literal VLLM_API_TOKEN="$VLLM_API_TOKEN"
```
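These secret keys surface as environment variables inside the Llama Stack container (via the `secretKeyRef` entries in the distribution CR). As a rough illustration only, a Python consumer could read them like this; the helper name and its normalization logic are hypothetical, not part of the provider:

```python
import os

# Hypothetical helper, not part of llama-stack-provider-ragas: reads the
# env vars that the secret above injects into the server container.
def load_vllm_settings(env=None):
    env = os.environ if env is None else env
    return {
        "model": env.get("INFERENCE_MODEL"),
        "url": env.get("VLLM_URL"),
        # The secret stores booleans as strings; normalize to a real bool.
        "tls_verify": env.get("VLLM_TLS_VERIFY", "true").lower() == "true",
        "api_token": env.get("VLLM_API_TOKEN"),
    }

# Example with the values exported above:
settings = load_vllm_settings({
    "INFERENCE_MODEL": "llama-3-2-3b",
    "VLLM_URL": "https://llama-32-3b-instruct-predictor:8443/v1",
    "VLLM_TLS_VERIFY": "false",
})
print(settings["tls_verify"])  # → False
```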
## Set up the deployment files

### Configuring the `kubeflow-ragas-config` ConfigMap

Update the [kubeflow-ragas-config](deployment/kubeflow-ragas-config.yaml) ConfigMap with the following data:

```bash
# See the project README for more details
EMBEDDING_MODEL=all-MiniLM-L6-v2
KUBEFLOW_LLAMA_STACK_URL=<your-llama-stack-url>
KUBEFLOW_PIPELINES_ENDPOINT=<your-kfp-endpoint>
KUBEFLOW_NAMESPACE=<your-namespace>
KUBEFLOW_BASE_IMAGE=quay.io/diegosquayorg/my-ragas-provider-image:latest
KUBEFLOW_RESULTS_S3_PREFIX=s3://my-bucket/ragas-results
KUBEFLOW_S3_CREDENTIALS_SECRET_NAME=<secret-name>
```

> [!NOTE]
> The `KUBEFLOW_LLAMA_STACK_URL` must be an external route.
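A common mistake is pointing `KUBEFLOW_LLAMA_STACK_URL` at the in-cluster service address instead of an external route. A quick sanity check could look like the sketch below; the helper and its hostname heuristic are hypothetical, not part of the provider:

```python
from urllib.parse import urlparse

def looks_like_external_route(url: str) -> bool:
    """Heuristic: in-cluster service addresses end in .svc or
    .svc.cluster.local, while OpenShift routes are regular external
    hostnames."""
    host = urlparse(url).hostname or ""
    return bool(host) and not (
        host == "localhost"
        or host.endswith(".svc")
        or host.endswith(".svc.cluster.local")
    )

print(looks_like_external_route("https://lsd-ragas.apps.example.com"))  # → True
print(looks_like_external_route("http://llama-stack.my-ns.svc.cluster.local:8321"))  # → False
```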
### Configuring the `pipelines_token` Secret

Unfortunately, the Llama Stack distribution service account does not have the privileges to create pipeline runs. To work around this, we must provide a user token as a secret to the Llama Stack distribution.

Create the secret with:

```bash
# Gather your token with `oc whoami -t`
oc create secret generic kubeflow-pipelines-token \
  --from-literal=KUBEFLOW_PIPELINES_TOKEN=<your-pipelines-token>
```
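The provider can then forward this token when authenticating against the pipelines API. As a rough sketch, a KFP client might be assembled from these env vars like this; the helper name is hypothetical, though `kfp.Client` does accept `host` and `existing_token` arguments:

```python
import os

# Hypothetical helper: assemble kfp.Client kwargs from the env vars
# populated by the secret and ConfigMap above.
def kfp_client_kwargs(env=None):
    env = os.environ if env is None else env
    return {
        "host": env["KUBEFLOW_PIPELINES_ENDPOINT"],
        "existing_token": env["KUBEFLOW_PIPELINES_TOKEN"],
    }

# Usage (requires the kfp package and a reachable endpoint):
# import kfp
# client = kfp.Client(**kfp_client_kwargs())
```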
## Deploy Llama Stack on OpenShift

You can now deploy the configuration files and the Llama Stack distribution with `oc apply -f deployment/kubeflow-ragas-config.yaml` and `oc apply -f deployment/llama-stack-distribution.yaml`.

You should now have a Llama Stack server on OpenShift with the remote Ragas eval provider configured.
You can now follow the [remote_demo.ipynb](../../demos/remote_demo.ipynb) demo, but ensure you are running it in a Data Science workbench and use the `LLAMA_STACK_URL` defined earlier. Alternatively, you can run it locally if you create a Route.
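Before running the notebook, it can help to confirm the server is reachable. The sketch below assumes Llama Stack's health endpoint lives at `/v1/health`; the URL shown is a placeholder, not a real route:

```python
import urllib.request

def health_url(base_url: str) -> str:
    # Assumed health endpoint path for a Llama Stack server: /v1/health
    return base_url.rstrip("/") + "/v1/health"

def check_server(base_url: str, timeout: int = 10) -> int:
    """Return the HTTP status of the health endpoint (raises if unreachable)."""
    with urllib.request.urlopen(health_url(base_url), timeout=timeout) as resp:
        return resp.status

print(health_url("https://lsd-ragas-example.apps.example.com/"))
# → https://lsd-ragas-example.apps.example.com/v1/health
```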
Lines changed: 12 additions & 0 deletions

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: kubeflow-ragas-config
data:
  EMBEDDING_MODEL: "all-MiniLM-L6-v2"
  KUBEFLOW_LLAMA_STACK_URL: "<your-llama-stack-url>"
  KUBEFLOW_PIPELINES_ENDPOINT: "<your-kfp-endpoint>"
  KUBEFLOW_NAMESPACE: "<your-namespace>"
  KUBEFLOW_BASE_IMAGE: "quay.io/diegosquayorg/my-ragas-provider-image:latest"
  KUBEFLOW_RESULTS_S3_PREFIX: "s3://my-bucket/ragas-results"
  KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: "<secret-name>"
```
Lines changed: 116 additions & 0 deletions

```yaml
apiVersion: llamastack.io/v1alpha1
kind: LlamaStackDistribution
metadata:
  name: lsd-ragas-example
spec:
  replicas: 1
  server:
    containerSpec:
      resources:
        requests:
          cpu: 4
          memory: "12Gi"
        limits:
          cpu: 6
          memory: "14Gi"
      env:
        - name: INFERENCE_MODEL
          valueFrom:
            secretKeyRef:
              key: INFERENCE_MODEL
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_MAX_TOKENS
          value: "4096"
        - name: VLLM_URL
          valueFrom:
            secretKeyRef:
              key: VLLM_URL
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_TLS_VERIFY
          valueFrom:
            secretKeyRef:
              key: VLLM_TLS_VERIFY
              name: llama-stack-inference-model-secret
              optional: true
        - name: VLLM_API_TOKEN
          valueFrom:
            secretKeyRef:
              key: VLLM_API_TOKEN
              name: llama-stack-inference-model-secret
              optional: true
        - name: MILVUS_DB_PATH
          value: ~/milvus.db
        - name: FMS_ORCHESTRATOR_URL
          value: "http://localhost"
        - name: KUBEFLOW_PIPELINES_ENDPOINT
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_PIPELINES_ENDPOINT
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_NAMESPACE
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_NAMESPACE
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_BASE_IMAGE
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_BASE_IMAGE
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_LLAMA_STACK_URL
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_LLAMA_STACK_URL
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_RESULTS_S3_PREFIX
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_RESULTS_S3_PREFIX
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
          valueFrom:
            configMapKeyRef:
              key: KUBEFLOW_S3_CREDENTIALS_SECRET_NAME
              name: kubeflow-ragas-config
              optional: true
        - name: EMBEDDING_MODEL
          valueFrom:
            configMapKeyRef:
              key: EMBEDDING_MODEL
              name: kubeflow-ragas-config
              optional: true
        - name: KUBEFLOW_PIPELINES_TOKEN
          valueFrom:
            secretKeyRef:
              key: KUBEFLOW_PIPELINES_TOKEN
              name: kubeflow-pipelines-token
              optional: true
        - name: AWS_ACCESS_KEY_ID
          valueFrom:
            secretKeyRef:
              key: AWS_ACCESS_KEY_ID
              name: aws-credentials
              optional: true
        - name: AWS_SECRET_ACCESS_KEY
          valueFrom:
            secretKeyRef:
              key: AWS_SECRET_ACCESS_KEY
              name: aws-credentials
              optional: true
        - name: AWS_DEFAULT_REGION
          valueFrom:
            secretKeyRef:
              key: AWS_DEFAULT_REGION
              name: aws-credentials
              optional: true
      name: llama-stack
      port: 8321
    distribution:
      name: rh-dev
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "llama-stack-provider-ragas"
-version = "0.3.5"
+version = "0.3.6"
 description = "Ragas evaluation as an out-of-tree Llama Stack provider"
 readme = "README.md"
 requires-python = ">=3.12"
```

uv.lock

Lines changed: 1 addition & 1 deletion
