
Commit d3e2819

added sample LLM app to AI PODs workshop
1 parent b6b005f commit d3e2819

File tree

8 files changed (+232, -3 lines)


content/en/ninja-workshops/14-cisco-ai-pods/8-deploy-vector-db.md

Lines changed: 1 addition & 0 deletions
@@ -195,6 +195,7 @@ We'll deploy a Kubernetes Job to our OpenShift cluster to load the embeddings.
 A job is used rather than a pod to ensure that this process runs only once:
 
 ``` bash
+oc create namespace llm-app
 oc apply -f k8s-job.yaml
 ```
 
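Before moving on, it's worth confirming that the embeddings Job ran to completion. A minimal check (not part of this commit; the Job name comes from k8s-job.yaml below):

``` bash
# Wait for the Job to complete, then review its output
oc wait --for=condition=complete job/load-embeddings -n llm-app --timeout=300s
oc logs job/load-embeddings -n llm-app
```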
content/en/ninja-workshops/14-cisco-ai-pods/9-deploy-llm-app.md

Lines changed: 54 additions & 3 deletions
@@ -5,7 +5,58 @@ weight: 9
 time: 10 minutes
 ---
 
-Now that our LLM is up and running, we'll add the Prometheus receiver to our
-OpenTelemetry collector to gather metrics from it.
+In the final step of the workshop, we'll deploy an application to our Cisco AI POD
+that uses the instruct and embeddings models that we deployed earlier using the
+NVIDIA NIM operator.
 
-## Capture the NVIDIA DCGM Exporter metrics
+## Deploy the LLM Application
+
+Let's deploy an application to our OpenShift cluster that answers questions
+using the context that we loaded into the Weaviate vector database earlier.
+
+``` bash
+cd workshop/cisco-ai-pods/llm-app
+oc apply -f k8s-manifest.yaml
+```
+
+> Note: to build a Docker image for this Python application, we executed the following commands:
+> ``` bash
+> cd workshop/cisco-ai-pods/llm-app
+> docker build --platform linux/amd64 -t derekmitchell399/llm-app:1.0 .
+> docker push derekmitchell399/llm-app:1.0
+> ```
+
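Before testing, a quick way to confirm the Deployment came up (a sketch, not part of this commit; the names match k8s-manifest.yaml):

``` bash
oc get pods -n llm-app -l app.kubernetes.io/name=llm-app
oc logs deployment/llm-app -n llm-app
```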
+## Test the LLM Application
+
+Let's ensure the application is working as expected.
+
+Start a pod that has access to the curl command:
+
+``` bash
+oc run --rm -it -n default curl --image=curlimages/curl:latest -- sh
+```
+
+Then run the following command to send a question to the LLM:
+
+{{< tabs >}}
+{{% tab title="Script" %}}
+
+``` bash
+curl -X "POST" \
+  'http://llm-app.llm-app:8080/askquestion' \
+  -H 'Accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "question": "How much memory does the NVIDIA H200 have?"
+  }'
+```
+
+{{% /tab %}}
+{{% tab title="Example Output" %}}
+
+``` bash
+TBD
+```
+
+{{% /tab %}}
+{{< /tabs >}}
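If spinning up a curl pod isn't convenient, port-forwarding the Service is an alternative way to run the same test from a workstation (a sketch, assuming cluster access via oc):

``` bash
oc port-forward -n llm-app svc/llm-app 8080:8080 &
curl -X POST 'http://localhost:8080/askquestion' \
  -H 'Content-Type: application/json' \
  -d '{"question": "How much memory does the NVIDIA H200 have?"}'
```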
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+.venv/
workshop/cisco-ai-pods/llm-app/Dockerfile

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+# Use an official Python runtime as a parent image
+FROM python:3.12-slim
+
+# Set working directory
+WORKDIR /app
+
+COPY requirements.txt /app/
+
+# Install dependencies separately so this layer is cached when only code changes
+RUN pip install -r requirements.txt
+
+
+# Add additional OpenTelemetry instrumentation packages
+RUN opentelemetry-bootstrap --action=install
+
+# Copy the application code
+COPY . /app
+
+# Expose the application on port 8080
+EXPOSE 8080
+
+ENTRYPOINT ["opentelemetry-instrument", "flask", "run", "-p", "8080", "--host", "0.0.0.0"]
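For a local smoke test of this image, something like the following should work (a sketch; the model URLs are placeholders for NIM endpoints reachable from the container):

``` bash
docker build --platform linux/amd64 -t llm-app:dev .
docker run --rm -p 8080:8080 \
  -e INSTRUCT_MODEL_URL="http://host.docker.internal:8000/v1" \
  -e EMBEDDINGS_MODEL_URL="http://host.docker.internal:8001/v1" \
  llm-app:dev
```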
workshop/cisco-ai-pods/llm-app/app.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+import os
+import weaviate
+import openlit
+
+from flask import Flask, request
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_weaviate import WeaviateVectorStore
+
+app = Flask(__name__)
+
+openlit.init()
+
+# Read environment variables
+INSTRUCT_MODEL_URL = os.getenv('INSTRUCT_MODEL_URL')      # e.g. http://localhost:8000/v1
+EMBEDDINGS_MODEL_URL = os.getenv('EMBEDDINGS_MODEL_URL')  # e.g. http://localhost:8001/v1
+
+# Connect to an LLM NIM at the specified endpoint, specifying a model
+llm = ChatNVIDIA(base_url=INSTRUCT_MODEL_URL, model="meta/llama-3.2-1b-instruct")
+
+# Initialize and connect to a NeMo Retriever Text Embedding NIM (nvidia/llama-3.2-nv-embedqa-1b-v2)
+embeddings_model = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2",
+                                    base_url=EMBEDDINGS_MODEL_URL)
+
+prompt = ChatPromptTemplate.from_messages([
+    ("system",
+     "You are a helpful and friendly AI! "
+     "Your responses should be concise and no longer than two sentences. "
+     "Do not hallucinate. Say you don't know if you don't have this information."
+     # "Answer the question using only the context"
+     "\n\nQuestion: {question}\n\nContext: {context}"
+     ),
+    ("user", "{question}")
+])
+
+@app.route("/askquestion", methods=['POST'])
+def ask_question():
+
+    data = request.json
+    question = data.get('question')
+
+    weaviate_client = weaviate.connect_to_custom(
+        # URL is: http://weaviate.weaviate.svc.cluster.local:80
+        http_host=os.getenv('WEAVIATE_HTTP_HOST'),
+        http_port=int(os.getenv('WEAVIATE_HTTP_PORT')),
+        http_secure=False,
+        grpc_host=os.getenv('WEAVIATE_GRPC_HOST'),
+        grpc_port=int(os.getenv('WEAVIATE_GRPC_PORT')),
+        grpc_secure=False
+    )
+
+    # Connect to the vector store that was populated earlier
+    # (index_name/text_key must match the collection created by the load-embeddings job)
+    vector_store = WeaviateVectorStore(
+        client=weaviate_client,
+        embedding=embeddings_model
+    )
+
+    chain = (
+        {
+            "context": vector_store.as_retriever(),
+            "question": RunnablePassthrough()
+        }
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    response = chain.invoke(question)
+    print(response)
+
+    weaviate_client.close()
+
+    return response
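The app reads all of its connection details from the environment. For a run outside the cluster, exports along these lines would be needed before `flask run` (values are illustrative, taken from the comments in the code above):

``` bash
export INSTRUCT_MODEL_URL="http://localhost:8000/v1"
export EMBEDDINGS_MODEL_URL="http://localhost:8001/v1"
export WEAVIATE_HTTP_HOST="localhost"
export WEAVIATE_HTTP_PORT="80"
export WEAVIATE_GRPC_HOST="localhost"
export WEAVIATE_GRPC_PORT="50051"
flask run -p 8080 --host 0.0.0.0
```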
workshop/cisco-ai-pods/llm-app/k8s-manifest.yaml

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llm-app
+  namespace: llm-app
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llm-app
+      app.kubernetes.io/instance: llm-app
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llm-app
+        app.kubernetes.io/instance: llm-app
+    spec:
+      containers:
+        - name: llm-app
+          image: "derekmitchell399/llm-app:1.0"
+          imagePullPolicy: Always
+          ports:
+            - name: http
+              containerPort: 8080
+          env:
+            - name: OTEL_SERVICE_NAME
+              value: "llm-app"
+            - name: OTEL_RESOURCE_ATTRIBUTES
+              value: "deployment.environment=llm-app"
+            - name: SPLUNK_OTEL_AGENT
+              valueFrom:
+                fieldRef:
+                  fieldPath: status.hostIP
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: "http://$(SPLUNK_OTEL_AGENT):4317"
+            - name: OTEL_EXPORTER_OTLP_PROTOCOL
+              value: "grpc"
+            # filter out health check requests to the root URL
+            - name: OTEL_PYTHON_EXCLUDED_URLS
+              value: "^(https?://)?[^/]+(/)?$"
+            - name: SPLUNK_PROFILER_ENABLED
+              value: "true"
+            - name: INSTRUCT_MODEL_URL
+              value: "http://meta-llama-3-2-1b-instruct.nim-service:8000/v1"
+            - name: EMBEDDINGS_MODEL_URL
+              value: "http://llama-32-nv-embedqa-1b-v2.nim-service:8000/v1"
+            - name: WEAVIATE_HTTP_HOST
+              value: "weaviate.weaviate.svc.cluster.local"
+            - name: WEAVIATE_HTTP_PORT
+              value: "80"
+            - name: WEAVIATE_GRPC_HOST
+              value: "weaviate.weaviate.svc.cluster.local"
+            - name: WEAVIATE_GRPC_PORT
+              value: "50051"
+          resources: {}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llm-app
+  namespace: llm-app
+spec:
+  type: NodePort
+  ports:
+    - protocol: TCP
+      port: 8080
+      targetPort: 8080
+  selector:
+    app.kubernetes.io/name: llm-app
+    app.kubernetes.io/instance: llm-app
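Because the Service is of type NodePort, Kubernetes assigns the external port; it can be looked up with (not part of this commit):

``` bash
oc get svc llm-app -n llm-app
```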
workshop/cisco-ai-pods/llm-app/requirements.txt

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+Flask==3.0.3
+langchain_community==0.3.22
+langchain-nvidia-ai-endpoints==0.3.7
+langchain-weaviate==0.0.5
+weaviate-client==4.17.0
+splunk-opentelemetry==2.7.0
+openlit==1.35.4

workshop/cisco-ai-pods/load-embeddings/k8s-job.yaml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ apiVersion: batch/v1
 kind: Job
 metadata:
   name: load-embeddings
+  namespace: llm-app
 spec:
   template:
     spec:
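One caveat for this hunk: Job specs are immutable, so if load-embeddings already exists in the old namespace from an earlier run, it has to be deleted before re-applying (a sketch):

``` bash
oc delete job load-embeddings --ignore-not-found
oc apply -f k8s-job.yaml
```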
