|
# Reusable workflow (triggered through workflow_call) that renders an LWS
# manifest from the inputs below and applies it with kubectl.
name: "e2e nightly test multi_node"

on:
  workflow_call:
    inputs:
      # The e2e job only sets up a kubeconfig when this is 'a3'.
      soc_version:
        description: use a2 or a3
        type: string
        required: true
      # Passed to the manifest template as `image`.
      image:
        description: base image for pods
        type: string
        required: false
        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11'
      # Passed to the manifest template as `config_file_path`.
      config_file_path:
        description: the model config for multi_node test
        type: string
        required: true
      # Numeric values below are declared as strings and quoted on purpose so
      # they reach the template unchanged.
      replicas:
        description: replicas of the k8s cluster
        type: string
        required: false
        default: '1'
      size:
        description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
        type: string
        required: false
        default: '2'
      vllm_version:
        description: vllm version to use
        type: string
        required: false
        default: 'v0.11.0'
      # The two inputs below select which vllm-ascend source is tested.
      vllm_ascend_remote_url:
        description: used for pr level tests
        type: string
        required: false
        default: 'https://github.com/vllm-project/vllm-ascend.git'
      vllm_ascend_ref:
        description: used for pr level tests
        type: string
        required: false
        default: 'main'
| 44 | + |
| 45 | + |
# All `run` steps use a login shell (-l) that stops at the first failing
# command (-e). A plain non-login bash would not read ~/.profile, and the
# login shell is what activates the ascend-toolkit environment variables.
defaults:
  run:
    shell: "bash -el {0}"
| 52 | + |
# Cancel only a superseded run of the *same* test.
# In a called workflow the `github` context belongs to the caller, so a group
# built from github.workflow and github.ref alone is shared with the caller and
# with every other invocation of this workflow from it. With
# cancel-in-progress enabled those runs would cancel one another. The inputs
# appended below keep each invocation in its own group.
# NOTE(review): the e2e job uses a fixed NAMESPACE and LEADER_POD, so callers
# that fan out over several configs should still run them one at a time.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-multi-node-${{ inputs.soc_version }}-${{ inputs.config_file_path }}
  cancel-in-progress: true
| 58 | + |
jobs:
  # Renders the LWS manifest, applies it to the cluster, follows the leader
  # pod's logs, then reads the exit code the test wrote to $RESULT_FILE.
  e2e:
    # This is a runner with no NPU for k8s controller
    runs-on: linux-aarch64-a3-0
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
      # NOTE(review): confirm whether this mapping belongs under container.env
      # or job-level env; run steps read the values as shell variables either way.
      env:
        KUBECONFIG: /tmp/kubeconfig
        KUBECTL: /root/.cache/.kube/kubectl
        NAMESPACE: vllm-project
        LEADER_POD: vllm-0
        RESULT_FILE: /root/.cache/tests/ret/test_result.txt
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          pip install jinja2-cli

          apt-get update -y && apt-get install -y git curl

      - name: Install kubectl
        run: |
          # Copy the kubectl binary located at $KUBECTL onto PATH
          install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl

          # Verify kubectl installation
          kubectl version --client=true

      # TODO: Add A2 tests
      - name: Setup kubeconfig for A3
        if: inputs.soc_version == 'a3'
        env:
          # Handed over through the environment so the secret is never spliced
          # into the script text.
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          # Decode and save kubeconfig, readable by the current user only
          echo "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"
          chmod 600 "$KUBECONFIG"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          # prepare for lws entrypoint scripts
          install -D tests/e2e/nightly/multi_node/scripts/run.sh /root/.cache/tests/run.sh

      - name: Clear result ret
        run: |
          # Remove a stale result so the final step cannot read an old exit code
          rm -f "$RESULT_FILE"

      - name: Launch cluster
        env:
          # Workflow inputs are passed as environment variables instead of being
          # expanded inside the script, so their content cannot alter the script.
          INPUT_SIZE: ${{ inputs.size }}
          INPUT_REPLICAS: ${{ inputs.replicas }}
          INPUT_IMAGE: ${{ inputs.image }}
          INPUT_CONFIG_FILE_PATH: ${{ inputs.config_file_path }}
          INPUT_VLLM_VERSION: ${{ inputs.vllm_version }}
          INPUT_VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref }}
          INPUT_VLLM_ASCEND_REMOTE_URL: ${{ inputs.vllm_ascend_remote_url }}
        run: |
          set -e

          size="$INPUT_SIZE"
          replicas="$INPUT_REPLICAS"
          image="$INPUT_IMAGE"
          config_file_path="$INPUT_CONFIG_FILE_PATH"
          vllm_version="$INPUT_VLLM_VERSION"
          vllm_ascend_ref="$INPUT_VLLM_ASCEND_REF"
          vllm_ascend_remote_url="$INPUT_VLLM_ASCEND_REMOTE_URL"
          result_file_path="$RESULT_FILE"

          # Fail early when a value the template depends on is empty
          required_params=("size" "replicas" "image" "config_file_path")
          for param in "${required_params[@]}"; do
            if [ -z "${!param}" ]; then
              echo "Error: Parameter '$param' is required but empty"
              exit 1
            fi
          done

          # Render the manifest from the template, then apply it
          jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
            -D size="$size" \
            -D replicas="$replicas" \
            -D image="$image" \
            -D config_file_path="$config_file_path" \
            -D vllm_version="$vllm_version" \
            -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
            -D vllm_ascend_ref="$vllm_ascend_ref" \
            -D result_file_path="$result_file_path" \
            --outfile lws.yaml

          kubectl apply -f ./lws.yaml

      - name: Waiting for pod ready
        env:
          # Upper bound for the wait, in seconds; raise it if image pulls are slow.
          POD_READY_TIMEOUT: "1800"
        run: |
          echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."

          DEADLINE=$((SECONDS + POD_READY_TIMEOUT))
          while true; do
            # Read the pod-level Ready condition, which is a single value even
            # when the pod has several containers.
            # `|| true`: right after `kubectl apply` the pod may not exist yet,
            # and a failing command substitution would abort the step under `bash -e`.
            READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
              -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)

            if [[ "$READY_STATUS" == "True" ]]; then
              echo "Pod [$LEADER_POD] is Ready!"
              break
            fi

            if [ "$SECONDS" -ge "$DEADLINE" ]; then
              echo "Timeout: Pod [$LEADER_POD] not Ready after ${POD_READY_TIMEOUT}s"
              # Print whatever the cluster knows, to help diagnose the failure
              kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
              exit 1
            fi

            echo "Pod [$LEADER_POD] not ready, waiting..."
            sleep 3
          done

      - name: Stream logs
        run: |
          # Follows the leader pod's log until the stream ends
          kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"

      - name: Determine is success
        run: |
          # Wait up to TIMEOUT seconds for the result file to appear
          TIMEOUT=600
          ELAPSED=0
          while [ ! -f "$RESULT_FILE" ]; do
            sleep 5
            ELAPSED=$((ELAPSED + 5))
            if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
              echo "Timeout waiting for test result file"
              exit 1
            fi
          done

          RET=$(tr -d '[:space:]' < "$RESULT_FILE")
          echo "Test result: $RET"

          # Succeed only on a well-formed zero. A bare `[ "$RET" -ne 0 ]` errors
          # out on an empty or non-numeric value, which `if` treats as false and
          # would therefore be reported as success.
          if [[ "$RET" =~ ^[0-9]+$ ]] && [ "$RET" -eq 0 ]; then
            echo "Test succeeded"
          else
            echo "Test failed"
            exit 1
          fi

      - name: Post process
        if: always()
        run: |
          # Best effort: a failed listing must not prevent the cleanup below
          kubectl get pods -n "$NAMESPACE" || true

          # lws.yaml only exists once "Launch cluster" has rendered it
          if [ -f ./lws.yaml ]; then
            kubectl delete -f ./lws.yaml --ignore-not-found
          fi
0 commit comments