Clorist33
diff --git a/‎.github/workflows/_e2e_nightly_multi_node.yaml‎
Lines changed: 190 additions & 0 deletions b/‎.github/workflows/_e2e_nightly_multi_node.yaml‎
Lines changed: 190 additions & 0 deletions
diff --git a/‎.github/workflows/_e2e_nightly.yaml‎ ‎…/workflows/_e2e_nightly_single_node.yaml‎.github/workflows/_e2e_nightly.yaml renamed to .github/workflows/_e2e_nightly_single_node.yaml b/‎.github/workflows/_e2e_nightly.yaml‎ ‎…/workflows/_e2e_nightly_single_node.yaml‎.github/workflows/_e2e_nightly.yaml renamed to .github/workflows/_e2e_nightly_single_node.yaml
diff --git a/‎.github/workflows/multi_node_test.yaml‎
Lines changed: 0 additions & 125 deletions b/‎.github/workflows/multi_node_test.yaml‎
Lines changed: 0 additions & 125 deletions
@@ -0,0 +1,190 @@
+name: "e2e nightly test multi_node"
+
+# Reusable workflow: it is only triggered through `workflow_call`, and every
+# knob below is a string input supplied by the calling workflow.
+on:
+  workflow_call:
+    inputs:
+      # Hardware flavour to test on.
+      soc_version:
+        description: 'use a2 or a3'
+        type: string
+        required: true
+      # Container image handed to the lws.yaml template for the test pods.
+      image:
+        description: 'base image for pods'
+        type: string
+        required: false
+        default: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11'
+      # Model configuration handed to the lws.yaml template.
+      config_file_path:
+        description: 'the model config for multi_node test'
+        type: string
+        required: true
+      replicas:
+        description: 'replicas of the k8s cluster'
+        type: string
+        required: false
+        default: '1'
+      size:
+        description: 'how many pods will be pulled up via lws.yaml, indicates number of nodes we need'
+        type: string
+        required: false
+        default: '2'
+      vllm_version:
+        description: 'vllm version to use'
+        type: string
+        required: false
+        default: 'v0.11.0'
+      # Repository URL and ref of vllm-ascend, both forwarded to the template.
+      vllm_ascend_remote_url:
+        description: 'used for pr level tests'
+        type: string
+        required: false
+        default: 'https://github.com/vllm-project/vllm-ascend.git'
+      vllm_ascend_ref:
+        description: 'used for pr level tests'
+        type: string
+        required: false
+        default: 'main'
+
+
+# Every `run:` step uses a login shell (`-l`) so the profile scripts are sourced,
+# which is what activates the ascend-toolkit environment variables; a plain
+# non-login bash would not read ~/.profile or ~/.bashrc.
+# `-e` additionally aborts a step on the first failing command.
+defaults:
+  run:
+    shell: 'bash -el {0}'
+
+# Runs that resolve to the same group (workflow name + git ref) are mutually
+# exclusive: starting a new one cancels the run that is still in progress.
+# NOTE(review): the previous comment said this ignores the "lint / 8 cards" test
+# types, but nothing in the group expression below does that — confirm whether
+# that remark was copied from another workflow.
+# NOTE(review): this file is a reusable workflow, where the `github` context is
+# that of the caller; several calls from the same caller and ref therefore
+# presumably share this group — verify this is intended.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    # This is a runner with no NPU for k8s controller
+    runs-on: 'linux-aarch64-a3-0'
+    container:
+      image: 'm.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11'
+      env:
+        # kubectl binary source and the kubeconfig written by the setup step
+        KUBECTL: '/root/.cache/.kube/kubectl'
+        KUBECONFIG: '/tmp/kubeconfig'
+        # Pod that the later steps wait on and stream logs from
+        NAMESPACE: 'vllm-project'
+        LEADER_POD: 'vllm-0'
+        # File polled at the end of the job for the test exit code
+        RESULT_FILE: '/root/.cache/tests/ret/test_result.txt'
+    steps:
+        # Switches apt and pip to the TUNA mirrors, then installs the tooling the
+        # later steps call: the `jinja2` CLI (jinja2-cli), git and curl.
+        - name: Install system dependencies
+          run: |
+            # configure apt and pip source
+            sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+            pip install jinja2-cli
+
+            apt-get update -y && apt-get install -y git curl
+
+        # Copies the kubectl binary found at $KUBECTL onto the PATH.
+        # NOTE(review): nothing in this workflow downloads that binary, so it is
+        # presumably pre-provisioned under /root/.cache on the runner — confirm.
+        - name: Install kubectl
+          run: |
+            # Fail with an explicit message instead of a bare `install` error
+            if [ ! -f "$KUBECTL" ]; then
+              echo "Error: kubectl binary not found at '$KUBECTL'"
+              exit 1
+            fi
+
+            # Install kubectl
+            install -o root -g root -m 0755 "$KUBECTL" /usr/local/bin/kubectl
+
+            # Verify kubectl installation
+            kubectl version --client=true
+
+        # TODO: Add A2 tests
+        # Writes the cluster credentials to $KUBECONFIG (only when soc_version is a3).
+        - name: Setup kubeconfig for A3
+          if: inputs.soc_version == 'a3'
+          env:
+            # Pass the secret through the environment rather than expanding it
+            # into the script text.
+            KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
+          run: |
+            if [ -z "$KUBECONFIG_B64" ]; then
+              echo "Error: secret KUBECONFIG_B64 is empty or not available to this workflow"
+              exit 1
+            fi
+
+            # The kubeconfig holds credentials: create it readable by the owner only
+            umask 077
+
+            # Decode and save kubeconfig
+            printf '%s\n' "$KUBECONFIG_B64" | base64 -d > "$KUBECONFIG"
+            chmod 600 "$KUBECONFIG"
+
+        # Fetches the repository into the workspace; the following steps read
+        # files under tests/e2e/nightly/multi_node/scripts/ from it.
+        # NOTE(review): no `ref`/`repository` is passed, so the action's default
+        # ref is used and the `vllm_ascend_ref` input does not affect this
+        # checkout — confirm that is intended.
+        - name: Checkout code
+          uses: actions/checkout@v4
+
+        # Stages the lws entrypoint script under /root/.cache/tests, creating the
+        # target directory when it is missing (`install -D`).
+        - name: Prepare scripts
+          run: >-
+            install -D
+            tests/e2e/nightly/multi_node/scripts/run.sh
+            /root/.cache/tests/run.sh
+
+        # Removes a result file left over from an earlier run so that the final
+        # check cannot pick up a stale exit code.
+        - name: Clear result ret
+          run: |
+            # Quoted and `--`-terminated so the path is always treated as one operand
+            rm -f -- "$RESULT_FILE"
+
+        # Renders lws.yaml from the Jinja2 template with the workflow inputs and
+        # applies it to the cluster.
+        - name: Launch cluster
+          env:
+            # Inputs reach the script as environment variables instead of being
+            # expanded into the script text, so a crafted value (e.g. a PR ref or
+            # URL) cannot inject shell commands.
+            LWS_SIZE: ${{ inputs.size }}
+            LWS_REPLICAS: ${{ inputs.replicas }}
+            LWS_IMAGE: ${{ inputs.image }}
+            LWS_CONFIG_FILE_PATH: ${{ inputs.config_file_path }}
+            LWS_VLLM_VERSION: ${{ inputs.vllm_version }}
+            LWS_VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref }}
+            LWS_VLLM_ASCEND_REMOTE_URL: ${{ inputs.vllm_ascend_remote_url }}
+          run: |
+            set -e
+
+            size="$LWS_SIZE"
+            replicas="$LWS_REPLICAS"
+            image="$LWS_IMAGE"
+            config_file_path="$LWS_CONFIG_FILE_PATH"
+            vllm_version="$LWS_VLLM_VERSION"
+            vllm_ascend_ref="$LWS_VLLM_ASCEND_REF"
+            vllm_ascend_remote_url="$LWS_VLLM_ASCEND_REMOTE_URL"
+            result_file_path="$RESULT_FILE"
+
+            # Indirect expansion (${!param}) looks up the variable named by $param
+            required_params=("size" "replicas" "image" "config_file_path")
+            for param in "${required_params[@]}"; do
+              if [ -z "${!param}" ]; then
+                echo "Error: Parameter '$param' is required but empty"
+                exit 1
+              fi
+            done
+
+            jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
+              -D size="$size" \
+              -D replicas="$replicas" \
+              -D image="$image" \
+              -D config_file_path="$config_file_path" \
+              -D vllm_version="$vllm_version" \
+              -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
+              -D vllm_ascend_ref="$vllm_ascend_ref" \
+              -D result_file_path="$result_file_path" \
+              --outfile lws.yaml
+
+            kubectl apply -f ./lws.yaml
+
+        # Polls the leader pod until its Ready condition is True, giving up after
+        # POD_READY_TIMEOUT seconds instead of spinning until the job is killed.
+        - name: Waiting for pod ready
+          env:
+            # NOTE(review): 30 minutes is an assumed upper bound for image pull
+            # plus start-up — tune to the cluster.
+            POD_READY_TIMEOUT: "1800"
+          run: |
+            echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
+
+            timeout_s="${POD_READY_TIMEOUT:-1800}"
+            interval_s=3
+            elapsed_s=0
+
+            while true; do
+              # get pod status
+              # `|| true`: the pod may not exist yet right after `kubectl apply`,
+              # and under `bash -e` a failing lookup would abort the whole step.
+              # The pod-level Ready condition is used so that pods with several
+              # containers are handled as well.
+              READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
+                -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
+
+              if [[ "$READY_STATUS" == "True" ]]; then
+                echo "Pod [$LEADER_POD] is Ready!"
+                break
+              fi
+
+              if [ "$elapsed_s" -ge "$timeout_s" ]; then
+                echo "Timeout: Pod [$LEADER_POD] not Ready after ${timeout_s}s"
+                # Best-effort diagnostics before failing the step
+                kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
+                exit 1
+              fi
+
+              echo "Pod [$LEADER_POD] not ready, waiting..."
+              sleep "$interval_s"
+              elapsed_s=$((elapsed_s + interval_s))
+            done
+
+        # Follows the leader pod's log output (`-f`) so it appears in the job log.
+        - name: Stream logs
+          run: kubectl logs -f "$LEADER_POD" -n "$NAMESPACE"
+
+        # Waits (up to TIMEOUT seconds) for the result file, then fails the job
+        # unless the file contains the integer 0.
+        - name: Determine is success
+          run: |
+            TIMEOUT=600
+            ELAPSED=0
+            while [ ! -f "$RESULT_FILE" ]; do
+              sleep 5
+              ELAPSED=$((ELAPSED + 5))
+              if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
+                echo "Timeout waiting for test result file"
+                exit 1
+              fi
+            done
+
+            # Strip whitespace so a trailing newline or space does not matter
+            RET=$(tr -d '[:space:]' < "$RESULT_FILE")
+            echo "Test result: $RET"
+
+            # An empty or non-numeric value would make `[ ... -ne 0 ]` error out,
+            # which `if` treats as false and would be reported as a success.
+            if ! [[ "$RET" =~ ^-?[0-9]+$ ]]; then
+              echo "Test failed: invalid result '$RET' in $RESULT_FILE"
+              exit 1
+            fi
+
+            if [ "$RET" -ne 0 ]; then
+              echo "Test failed"
+              exit 1
+            else
+              echo "Test succeeded"
+            fi
+
+        # Always runs: lists the pods for diagnostics, then tears down whatever
+        # lws.yaml created.
+        - name: Post process
+          if: always()
+          run: |
+            # Diagnostics are best-effort; under `bash -e` a failure here would
+            # otherwise skip the cleanup below and leak the cluster resources.
+            kubectl get pods -n "$NAMESPACE" || true
+
+            # lws.yaml only exists if the launch step got as far as rendering it
+            if [ -f ./lws.yaml ]; then
+              kubectl delete -f ./lws.yaml --ignore-not-found=true
+            else
+              echo "lws.yaml was not rendered; nothing to delete"
+            fi