forked from ai-dynamo/dynamo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathperf.yaml
More file actions
145 lines (144 loc) · 5.71 KB
/
perf.yaml
File metadata and controls
145 lines (144 loc) · 5.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: batch/v1
kind: Job
metadata:
  name: llama3-70b-disagg-mn-perf
spec:
  # One attempt plus one retry; the job is not idempotent enough for more.
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: llama3-70b-disagg-mn-perf
    spec:
      # Perf runs must not restart mid-measurement; rely on the Job retry instead.
      restartPolicy: Never
      containers:
      - command:
        - /bin/sh
        - -c
        - |
apt-get update && apt-get install -y curl jq procps git && apt-get clean
pip install git+https://github.com/ai-dynamo/aiperf.git@54cd6dc820bff8bfebc875da104e59d745e14f75;
echo "aiperf installation completed";
sysctl -w net.ipv4.ip_local_port_range="1024 65000"
cat /proc/sys/net/ipv4/ip_local_port_range
export COLUMNS=200
EPOCH=$(date +%s)
## utility functions -- can be moved to a bash script / configmap
wait_for_model_ready() {
echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
sleep 5
done
echo "✅ Model '$TARGET_MODEL' is now available!"
echo "Model '$TARGET_MODEL' is now available!"
curl -s "http://$ENDPOINT/v1/models" | jq .
}
run_perf() {
local concurrency=$1
local isl=$2
local osl=$3
local max_threads=${concurrency}
key=concurrency_${concurrency}
export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
mkdir -p "$ARTIFACT_DIR"
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
aiperf profile --artifact-dir $ARTIFACT_DIR \
--model $TARGET_MODEL \
--tokenizer /root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd \
--endpoint-type chat \
--endpoint /v1/chat/completions \
--streaming \
--url http://$ENDPOINT \
--synthetic-input-tokens-mean $isl \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean $osl \
--output-tokens-stddev 0 \
--extra-inputs max_tokens:$osl \
--extra-inputs min_tokens:$osl \
--extra-inputs ignore_eos:true \
--extra-inputs repetition_penalty:1.0 \
--extra-inputs temperature:0.0 \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--concurrency $concurrency \
--request-count $((10*concurrency)) \
--warmup-request-count $concurrency \
--num-dataset-entries 12800 \
--random-seed 100 \
--workers-max $max_threads \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'\
--record-processors 32 \
--ui simple
echo "ARTIFACT_DIR: $ARTIFACT_DIR"
ls -la $ARTIFACT_DIR
}
#### Actual execution ####
wait_for_model_ready
mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
# Calculate total concurrency based on per-GPU concurrency and GPU count
TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
# Write input_config.json
cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
{
"gpu_count": $DEPLOYMENT_GPU_COUNT,
"max_threads": $max_threads,
"concurrency_per_gpu": $CONCURRENCY_PER_GPU,
"total_concurrency": $TOTAL_CONCURRENCY,
"mode": "$DEPLOYMENT_MODE",
"isl": $ISL,
"osl": $OSL,
"endpoint": "$ENDPOINT",
"model endpoint": "$TARGET_MODEL"
}
EOF
# Run perf with calculated total concurrency
run_perf $TOTAL_CONCURRENCY $ISL $OSL
echo "done with concurrency $TOTAL_CONCURRENCY"
env:
- name: TARGET_MODEL
value: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
- name: ENDPOINT
value: llama3-70b-disagg-mn-frontend:8000
- name: CONCURRENCY_PER_GPU
value: "16"
- name: DEPLOYMENT_GPU_COUNT
value: "16"
- name: ISL
value: "8192"
- name: OSL
value: "1024"
- name: DEPLOYMENT_MODE
value: disagg-mn
- name: AIPERF_HTTP_CONNECTION_LIMIT
value: "200"
- name: JOB_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['job-name']
- name: ROOT_ARTIFACT_DIR
value: /root/.cache/huggingface/perf
- name: HF_HOME
value: /root/.cache/huggingface
- name: PYTHONUNBUFFERED
value: "1"
image: python:3.12-slim
imagePullPolicy: IfNotPresent
name: perf
securityContext:
privileged: true
volumeMounts:
- name: model-cache
mountPath: /root/.cache/huggingface
workingDir: /workspace
imagePullSecrets:
- name: nvcrimagepullsecret
volumes:
- name: model-cache
persistentVolumeClaim:
claimName: model-cache