To deploy a pod with a vLLM install that we can edit, so that we can run experiments inside it, do:

1- Deploy the pod using vm-fm-vllm.yaml
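A minimal way to do this, assuming you are already logged in with oc and the caching namespace exists (<HF_TOKEN> is a placeholder for your Hugging Face token; the secret name matches the one referenced in the manifest below):
>> oc create secret generic hf-token-secret-llama3 --from-literal=token=<HF_TOKEN> -n caching
>> oc apply -f vm-fm-vllm.yaml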
2- Connect to the pod using
>> oc exec -it pod/llama-3-8b-768cdff9f5-srt77 -- /bin/sh
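The pod name suffix changes on every rollout; one way to look up the current name, using the app label from the manifest:
>> oc get pods -n caching -l app=llama-3-8b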
3- Inside the pod, run these commands:

# uninstall vllm

pip uninstall -y vllm

# install vllm in developer (editable) mode; I think that is also where the vllm source is inside the image

git clone git@github.com:diegocastanibm/vllm.git
VLLM_USE_PRECOMPILED=1 pip install -e vllm
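To check that the editable install is actually the one in use, a quick sanity check (the printed path should point into the cloned vllm tree, not site-packages):
>> python3 -c "import vllm; print(vllm.__file__)"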

# Install benchmark packages and run server and benchmark

1- Run in terminal 1:
>> VLLM_LOGGING_LEVEL=DEBUG vllm serve meta-llama/Llama-3.2-1B

2- Run in terminal 2, once the server is up (see the check after this list):
>> python3 vllm/benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.2-1B --endpoint /v1/completions --dataset-name sharegpt --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10
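To confirm the server in terminal 1 is ready before launching the benchmark, a minimal probe, assuming vllm serve is listening on its default port 8000:
>> curl http://localhost:8000/v1/models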

If you need to download the dataset (ShareGPT), do:
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

Note:
If you get an error that the pandas or datasets packages are missing, run:
pip install "vllm[bench]"

vm-fm-vllm.yaml (the deployment manifest referenced in step 1):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-3-8b
  namespace: caching
  labels:
    app: llama-3-8b
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-3-8b
  template:
    metadata:
      labels:
        app: llama-3-8b
    spec:
      volumes:
      - name: cache-volume
        persistentVolumeClaim:
          claimName: llama-3-8b
      # vLLM needs to access the host's shared memory for tensor parallel inference.
      - name: shm
        emptyDir:
          medium: Memory
          sizeLimit: "2Gi"
      containers:
      - name: llama-3-8b
        image: vllm/vllm-openai:latest
        # Keep the container idle so we can exec in and launch vLLM manually.
        command: ["/bin/sh", "-c", "while true; do sleep 10000; done"]
        env:
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret-llama3
              key: token
        resources:
          limits:
            cpu: "10"
            memory: 20G
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 6G
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: cache-volume
        - name: shm
          mountPath: /dev/shm

---

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-3-8b
  namespace: caching
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 300Gi
  storageClassName: ocs-storagecluster-cephfs
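
After applying the manifest, one way to wait for the pod to come up (same namespace as above):
>> oc rollout status deployment/llama-3-8b -n caching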