To deploy a pod with a vLLM install that we can edit, so that we can run experiments inside it, do:

1- Deploy the pod using vm-fm-vllm.yaml
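A minimal way to do this, assuming you are already logged in with oc and the caching namespace exists (<HF_TOKEN> is a placeholder for your Hugging Face token; the secret name matches the one referenced in the manifest below):
>> oc create secret generic hf-token-secret-llama3 --from-literal=token=<HF_TOKEN> -n caching
>> oc apply -f vm-fm-vllm.yaml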
2- Connect to the pod using
>> oc exec -it pod/llama-3-8b-768cdff9f5-srt77 -- /bin/sh
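The pod name suffix changes on every rollout; one way to look up the current name, using the app label from the manifest:
>> oc get pods -n caching -l app=llama-3-8b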
3- Inside the pod, run these commands:

# uninstall vllm

pip uninstall -y vllm

# install vllm in developer (editable) mode; I think that is also where the vllm source is inside the image

git clone git@github.com:diegocastanibm/vllm.git
VLLM_USE_PRECOMPILED=1 pip install -e vllm
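To check that the editable install is actually the one in use, a quick sanity check (the printed path should point into the cloned vllm tree, not site-packages):
>> python3 -c "import vllm; print(vllm.__file__)"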

# Install benchmark packages and run server and benchmark

1- Run in terminal 1:
>> VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B

2- Run in terminal 2, once the server is up (see the check after this list):
>> python3 vllm/benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.2-1B --endpoint /v1/completions --dataset-name sharegpt --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10
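To confirm the server in terminal 1 is ready before launching the benchmark, a minimal probe, assuming vllm serve is listening on its default port 8000:
>> curl http://localhost:8000/v1/models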

If you need to download the dataset (ShareGPT), do:
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

Note:
If you get an error that the pandas or datasets packages are missing, run:
pip install "vllm[bench]"

vm-fm-vllm.yaml (the deployment manifest referenced in step 1):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-3-8b
  namespace: caching
  labels:
    app: llama-3-8b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-3-8b
  template:
    metadata:
      labels:
        app: llama-3-8b
    spec:
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: llama-3-8b
      # vLLM needs to access the host's shared memory for tensor parallel inference.
      - name: shm
        emptyDir:
          medium: Memory
          sizeLimit: "2Gi"
      containers:
      - name: llama-3-8b
        image: vllm/vllm-openai:latest
        # Keep the container idle so we can exec in and launch vLLM manually.
        command: ["/bin/sh", "-c", "while true; do sleep 10000; done"]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret-llama3
              key: token
        resources:
          limits:
            cpu: "10"
            memory: 20G
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 6G
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: cache-volume
        - name: shm
          mountPath: /dev/shm

---

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-3-8b
  namespace: caching
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 300Gi
  storageClassName: ocs-storagecluster-cephfs
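
After applying the manifest, one way to wait for the pod to come up (same namespace as above):
>> oc rollout status deployment/llama-3-8b -n caching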