Upgrade: 25.10.5 → 26.1.1 | e2e-yaml-file | 192.168.10.211 #4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Bootstraps a cluster using BASE_SBCLI_BRANCH + BASE_SPDK_IMAGE, then runs
# upgrade_e2e.py (TestMajorUpgrade) to upgrade to TARGET_SBCLI_BRANCH + TARGET_SPDK_IMAGE.
name: Upgrade Bootstrap + Run Upgrade Tests
run-name: "Upgrade: ${{ inputs.BASE_SBCLI_BRANCH }} → ${{ inputs.TARGET_SBCLI_BRANCH }} | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
on:
  # Reusable-workflow entry point: every lab/cluster parameter is overridable
  # by the caller; defaults describe the fixed 192.168.10.x lab.
  workflow_call:
    inputs:
      BASE_SBCLI_BRANCH:
        type: string
        required: true
        description: "sbcli branch / version tag to bootstrap the cluster with (e.g. R25.10-Hotfix)"
      CUSTOM_IMAGES:
        type: string
        default: 'base_spdk="" target_spdk="" base_docker="" target_docker=""'
        description: "Image overrides: set base_spdk, target_spdk, base_docker and/or target_docker values, leave as \"\" to skip."
      TARGET_SBCLI_BRANCH:
        type: string
        required: true
        description: "sbcli branch / version tag to upgrade to (e.g. main)"
      STORAGE_PRIVATE_IPS:
        type: string
        default: "192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208"
      API_INVOKE_URL:
        type: string
        default: "http://192.168.10.211/"
      BASTION_IP:
        type: string
        default: "192.168.10.211"
      MNODES:
        type: string
        default: "192.168.10.211"
      NR_HUGEPAGES:
        type: string
        default: "2048"
      GRAFANA_ENDPOINT:
        type: string
        default: "http://192.168.10.211/grafana"
      SBCLI_CMD:
        type: string
        default: "sbctl"
      SSH_USER:
        type: string
        default: "root"
      KEY_PATH:
        type: string
        default: "/home/ec2-user/.ssh/simplyblock-us-east-2.pem"
      CLIENTNODES:
        type: string
        default: "192.168.10.165 192.168.10.166"
      NFS_MOUNTPOINT:
        type: string
        default: "/mnt/nfs_share"
      BOOTSTRAP_MAX_LVOL:
        type: string
        default: "300"
      BOOTSTRAP_DATA_CHUNKS:
        type: string
        default: "2"
      BOOTSTRAP_PARITY_CHUNKS:
        type: string
        default: "2"
      BOOTSTRAP_JOURNAL_PARTITION:
        type: string
        default: "1"
      BOOTSTRAP_HA_JM_COUNT:
        type: string
        default: "3"
      BOOTSTRAP_HA_TYPE:
        type: string
        default: "ha"
      BOOTSTRAP_DATA_NIC:
        type: string
        default: "eth1"
      BOOTSTRAP_IS_SINGLE_NODE:
        type: boolean
        default: false
      BOOTSTRAP_ENABLE_NODE_AFFINITY:
        type: boolean
        default: false
      TEST_CLASS:
        type: string
        default: "major_upgrade"
      RUN_LABEL:
        type: string
        default: ""
        description: "Optional label appended to artifact names to avoid collisions (e.g. 'run1')"
  # NOTE(review): GitHub Actions documents a maximum of 10 top-level `inputs`
  # for workflow_dispatch; this event declares far more — confirm the workflow
  # validates and that all inputs are shown in the dispatch UI, or trim this
  # list to the essentials and rely on defaults for the rest.
  workflow_dispatch:
    inputs:
      BASE_SBCLI_BRANCH:
        type: string
        description: "sbcli branch / version tag to bootstrap the cluster with (e.g. R25.10-Hotfix)"
        required: true
      CUSTOM_IMAGES:
        type: string
        description: "Image overrides: set base_spdk, target_spdk, base_docker and/or target_docker values, leave as \"\" to skip."
        required: false
        default: 'base_spdk="" target_spdk="" base_docker="" target_docker=""'
      TARGET_SBCLI_BRANCH:
        type: string
        description: "sbcli branch / version tag to upgrade to (e.g. main)"
        required: true
      STORAGE_PRIVATE_IPS:
        type: string
        description: "Space-separated storage node IPs (also used for cleanup)"
        required: true
        default: "192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208"
      API_INVOKE_URL:
        type: string
        description: "API invoke URL"
        required: true
        default: "http://192.168.10.211/"
      BASTION_IP:
        type: string
        description: "Bastion IP"
        required: true
        default: "192.168.10.211"
      MNODES:
        type: string
        description: "Management node(s) IPs"
        required: true
        default: "192.168.10.211"
      NR_HUGEPAGES:
        type: string
        description: "Hugepages"
        required: true
        default: "2048"
      GRAFANA_ENDPOINT:
        type: string
        description: "Grafana endpoint"
        required: true
        default: "http://192.168.10.211/grafana"
      SBCLI_CMD:
        type: string
        description: "sbcli command name"
        required: true
        default: "sbctl"
      SSH_USER:
        type: string
        description: "SSH user"
        required: true
        default: "root"
      KEY_PATH:
        type: string
        description: "SSH private key path on runner"
        required: true
        default: "/home/ec2-user/.ssh/simplyblock-us-east-2.pem"
      CLIENTNODES:
        type: string
        description: "Space-separated client node IPs"
        required: true
        default: "192.168.10.165 192.168.10.166"
      NFS_MOUNTPOINT:
        type: string
        description: "NFS mountpoint to unmount everywhere"
        required: true
        default: "/mnt/nfs_share"
      BOOTSTRAP_MAX_LVOL:
        type: string
        description: "bootstrap: --max-lvol"
        required: true
        default: "300"
      BOOTSTRAP_DATA_CHUNKS:
        type: string
        description: "bootstrap: --data-chunks-per-stripe"
        required: true
        default: "2"
      BOOTSTRAP_PARITY_CHUNKS:
        type: string
        description: "bootstrap: --parity-chunks-per-stripe"
        required: true
        default: "2"
      BOOTSTRAP_JOURNAL_PARTITION:
        type: string
        description: "bootstrap: --journal-partition"
        required: true
        default: "1"
      BOOTSTRAP_HA_JM_COUNT:
        type: string
        description: "bootstrap: --ha-jm-count"
        required: true
        default: "3"
      BOOTSTRAP_HA_TYPE:
        type: string
        description: "bootstrap: --ha-type"
        required: true
        default: "ha"
      BOOTSTRAP_DATA_NIC:
        type: string
        description: "bootstrap: --data-nics"
        required: true
        default: "eth1"
      BOOTSTRAP_IS_SINGLE_NODE:
        description: "Bootstrap: deploy as single-node"
        type: boolean
        required: false
        default: false
      BOOTSTRAP_ENABLE_NODE_AFFINITY:
        description: "Bootstrap: enable node affinity"
        type: boolean
        required: false
        default: false
      TEST_CLASS:
        description: "Upgrade test class name (--testname); leave empty to run all upgrade tests"
        required: false
        type: string
        default: "major_upgrade"
      # Added for parity with the workflow_call interface above.
      RUN_LABEL:
        type: string
        description: "Optional label appended to artifact names to avoid collisions (e.g. 'run1')"
        required: false
        default: ""
# Only one upgrade run may touch the shared lab at a time; queued runs wait
# rather than cancelling an in-flight upgrade.
concurrency:
  group: simplyblock-lab-upgrade
  cancel-in-progress: false

jobs:
  bootstrap-and-upgrade:
    name: Pre-clean -> Bootstrap (${{ inputs.BASE_SBCLI_BRANCH }}) -> Upgrade (${{ inputs.TARGET_SBCLI_BRANCH }})
    runs-on: [self-hosted]
    timeout-minutes: 300
    env:
      # Upgrade-specific
      BASE_SBCLI_BRANCH: ${{ inputs.BASE_SBCLI_BRANCH }}
      TARGET_SBCLI_BRANCH: ${{ inputs.TARGET_SBCLI_BRANCH }}
      CUSTOM_IMAGES: ${{ inputs.CUSTOM_IMAGES || 'base_spdk="" target_spdk="" base_docker="" target_docker=""' }}
      # Cluster/lab env (|| fallbacks guard against empty-string inputs)
      STORAGE_PRIVATE_IPS: ${{ inputs.STORAGE_PRIVATE_IPS || '192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208' }}
      API_INVOKE_URL: ${{ inputs.API_INVOKE_URL || 'http://192.168.10.211/' }}
      # API_BASE_URL deliberately mirrors API_INVOKE_URL (both names are consumed downstream)
      API_BASE_URL: ${{ inputs.API_INVOKE_URL || 'http://192.168.10.211/' }}
      BASTION_IP: ${{ inputs.BASTION_IP || '192.168.10.211' }}
      BASTION_SERVER: ${{ inputs.BASTION_IP || '192.168.10.211' }}
      MNODES: ${{ inputs.MNODES || '192.168.10.211' }}
      NR_HUGEPAGES: ${{ inputs.NR_HUGEPAGES || '2048' }}
      GRAFANA_ENDPOINT: ${{ inputs.GRAFANA_ENDPOINT || 'http://192.168.10.211/grafana' }}
      SBCLI_CMD: ${{ inputs.SBCLI_CMD || 'sbctl' }}
      # SSH/client env
      SSH_USER: ${{ inputs.SSH_USER || 'root' }}
      KEY_PATH: ${{ inputs.KEY_PATH || '/home/ec2-user/.ssh/simplyblock-us-east-2.pem' }}
      CLIENTNODES: ${{ inputs.CLIENTNODES || '192.168.10.165 192.168.10.166' }}
      CLIENT_IP: ${{ inputs.CLIENTNODES || '192.168.10.165 192.168.10.166' }}
      # Cleanup
      NFS_MOUNTPOINT: ${{ inputs.NFS_MOUNTPOINT || '/mnt/nfs_share' }}
      # Bootstrap params
      BOOTSTRAP_MAX_LVOL: ${{ inputs.BOOTSTRAP_MAX_LVOL || '300' }}
      BOOTSTRAP_DATA_CHUNKS: ${{ inputs.BOOTSTRAP_DATA_CHUNKS || '2' }}
      BOOTSTRAP_PARITY_CHUNKS: ${{ inputs.BOOTSTRAP_PARITY_CHUNKS || '2' }}
      BOOTSTRAP_JOURNAL_PARTITION: ${{ inputs.BOOTSTRAP_JOURNAL_PARTITION || '1' }}
      BOOTSTRAP_HA_JM_COUNT: ${{ inputs.BOOTSTRAP_HA_JM_COUNT || '3' }}
      BOOTSTRAP_HA_TYPE: ${{ inputs.BOOTSTRAP_HA_TYPE || 'ha' }}
      BOOTSTRAP_DATA_NIC: ${{ inputs.BOOTSTRAP_DATA_NIC || 'eth1' }}
      TEST_CLASS: ${{ inputs.TEST_CLASS || 'major_upgrade' }}
      # Secrets
      SSH_PASSWORD: ${{ secrets.SSH_PASSWORD }}
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
      MINIO_ACCESS_KEY: ${{ secrets.MINIO_ACCESS_KEY }}
      MINIO_SECRET_KEY: ${{ secrets.MINIO_SECRET_KEY }}
      SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }}
      # Placeholders overwritten via $GITHUB_ENV after bootstrap parsing
      CLUSTER_ID: ""
      CLUSTER_SECRET: ""
| steps: | |
| - name: Runner diagnostics | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| uname -a | |
| whoami | |
| pwd | |
| python3 --version || true | |
| git --version | |
| - name: Clear stale test artifacts from previous run | |
| shell: bash | |
| run: | | |
| rm -f sbcli/e2e/output.log || true | |
| - name: Install prereqs (sshpass) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| if command -v sshpass >/dev/null 2>&1; then | |
| exit 0 | |
| fi | |
| if command -v apt-get >/dev/null 2>&1; then | |
| sudo apt-get update -y | |
| sudo apt-get install -y sshpass | |
| elif command -v yum >/dev/null 2>&1; then | |
| sudo yum install -y epel-release || true | |
| sudo yum install -y sshpass | |
| elif command -v dnf >/dev/null 2>&1; then | |
| sudo dnf install -y sshpass | |
| else | |
| echo "ERROR: Cannot install sshpass (unknown package manager)." | |
| exit 1 | |
| fi | |
| - name: Parse CUSTOM_IMAGES overrides | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| custom="${CUSTOM_IMAGES}" | |
| for item in $custom; do | |
| key="${item%%=*}" | |
| value="${item#*=}" | |
| value="${value//\"/}" | |
| if [[ -z "$value" ]]; then | |
| echo "Skipping $key (empty)" | |
| continue | |
| fi | |
| case "$key" in | |
| base_spdk) echo "BASE_SPDK_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Base SPDK image set: $value" ;; | |
| target_spdk) echo "TARGET_SPDK_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Target SPDK image set: $value" ;; | |
| base_docker) echo "BASE_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "SIMPLY_BLOCK_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Base Docker image set: $value" ;; | |
| target_docker) echo "TARGET_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Target Docker image set: $value" ;; | |
| *) echo "Unknown image key: $key (ignored)" ;; | |
| esac | |
| done | |
| - name: Resolve KEY_PATH and validate key exists | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| kp="${KEY_PATH}" | |
| kp="${kp%\"}"; kp="${kp#\"}" | |
| kp="${kp%\'}"; kp="${kp#\'}" | |
| if [[ "$kp" == .ssh/* ]]; then kp="${HOME}/${kp}"; fi | |
| if [[ "$kp" == ~/* ]]; then kp="${HOME}/${kp#~/}"; fi | |
| if [[ "$kp" == "~.ssh/"* ]]; then kp="${HOME}/.${kp#~.}"; fi | |
| echo "Resolved KEY_PATH=$kp" | |
| echo "KEY_PATH=$kp" >> "$GITHUB_ENV" | |
| test -f "$kp" || (echo "ERROR: SSH key not found at $kp" && exit 1) | |
| chmod 600 "$kp" || true | |
| - name: Export KEY_NAME from KEY_PATH | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| key_name="$(basename "${KEY_PATH}")" | |
| echo "KEY_NAME=${key_name}" >> "$GITHUB_ENV" | |
| echo "Exported KEY_NAME=${key_name}" | |
| - name: Validate required secrets exist | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| [[ -n "${SSH_PASSWORD}" ]] || (echo "ERROR: secrets.SSH_PASSWORD required" && exit 1) | |
| # ============================================================ | |
| # PRE-BOOTSTRAP CLEANUP | |
| # ============================================================ | |
| - name: Pre-clean kill fio/tmux and unmount NFS on MNODES + storage + clients | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| echo "---- $ip: kill fio/tmux + umount ${NFS_MOUNTPOINT} ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| pkill -9 fio || true; | |
| pkill -9 tmux || true; | |
| mp='${NFS_MOUNTPOINT}'; | |
| if mountpoint -q \"\$mp\"; then umount -f \"\$mp\" || umount \"\$mp\"; else | |
| if mount | grep -q \" \$mp \"; then umount -f \"\$mp\" || umount \"\$mp\" || true; fi | |
| fi" | |
| done | |
| - name: Destroy/clean storage nodes (deploy-cleaner, docker prune, uninstall sbcli, k3s) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| echo "---- storage destroy/clean: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| systemctl stop firewalld || true; | |
| systemctl stop ufw || true; | |
| sysctl -w net.ipv6.conf.all.disable_ipv6=1 || true; | |
| '${SBCLI_CMD}' sn deploy-cleaner || echo 'WARN: deploy-cleaner failed'; | |
| docker stop \$(docker ps -aq) || true; | |
| docker rm -f \$(docker ps -aq) || true; | |
| docker builder prune --all -f || true; | |
| docker system prune -af || true; | |
| docker volume prune -f || true; | |
| docker rmi -f \$(docker images -aq) || true; | |
| pip uninstall -y '${SBCLI_CMD}' || echo 'WARN: uninstall sbcli failed'; | |
| pip uninstall -y sbctl || echo 'WARN: uninstall sbctl failed'; | |
| rm -rf /usr/local/bin/sbc* || true; | |
| k3s-agent-uninstall.sh || true" | |
| sleep 10 | |
| done | |
| - name: Client cleanup disconnect lvols; unmount all /mnt; remove /mnt dirs | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| for ip in $CLIENTNODES; do | |
| echo "---- client disconnect lvols: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| subsystems=\$(nvme list-subsys | grep -i lvol | awk '{print \$3}' | cut -d '=' -f 2 || true); | |
| for s in \$subsystems; do nvme disconnect -n \"\$s\" || true; done" | |
| done | |
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| still=0 | |
| for ip in $uniq_targets; do | |
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | |
| "mount | grep -q \" ${NFS_MOUNTPOINT} \""; then | |
| echo "ERROR: ${NFS_MOUNTPOINT} still mounted on $ip" | |
| still=1 | |
| fi | |
| done | |
| [[ "$still" -eq 0 ]] || exit 1 | |
| for ip in $CLIENTNODES; do | |
| echo "---- client unmount all /mnt and remove dirs: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| mps=\$(mount | grep ' /mnt' | awk '{print \$3}' || true); | |
| for mp in \$mps; do umount -f \"\$mp\" || umount \"\$mp\" || true; done; | |
| dirs=\$(find /mnt -mindepth 1 -type d 2>/dev/null || true); | |
| for d in \$dirs; do rm -rf \"\$d\" || true; done" | |
| done | |
| - name: Remove /etc/simplyblock; reboot storage; disk reset + PCI rebind + mklabel | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| run_remote "$ip" "rm -rf /etc/simplyblock || true" | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "---- reboot storage: $ip ----" | |
| sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | |
| "nohup reboot >/dev/null 2>&1 &" || true | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "Waiting for $ip..." | |
| for i in {1..60}; do | |
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 "${SSH_USER}@${ip}" \ | |
| "echo online" >/dev/null 2>&1; then | |
| echo "$ip online" | |
| break | |
| fi | |
| sleep 10 | |
| [[ "$i" -lt 60 ]] || (echo "ERROR: $ip did not come online" && exit 1) | |
| done | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "---- disk reset on $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| for dev in /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1; do | |
| if [[ -b \$dev ]]; then | |
| parted \$dev --script rm 1 || true; | |
| parted \$dev --script rm 2 || true; | |
| parted \$dev --script rm 3 || true; | |
| fi | |
| done; | |
| for pci in \$(lspci -D | grep 'QEMU NVM Express' | awk '{print \$1}' || true); do | |
| echo \$pci > /sys/bus/pci/drivers/nvme/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/uio_pci_generic/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/vfio-pci/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo nvme > /sys/bus/pci/devices/0000:00:0\${i}.0/driver_override || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/nvme/bind || true; | |
| done; | |
| for i in 0 1 2 3; do | |
| dev=/dev/nvme\${i}n1; | |
| if [[ -b \$dev ]]; then | |
| parted -fs \$dev mklabel gpt || true; | |
| fi | |
| done" | |
| done | |
| # ============================================================ | |
| # BOOTSTRAP (using BASE_SBCLI_BRANCH + BASE_SPDK_IMAGE) | |
| # ============================================================ | |
| - name: Clone simplyBlockDeploy (bootstrap repo) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| rm -rf simplyBlockDeploy | |
| git clone https://github.com/simplyblock-io/simplyBlockDeploy.git simplyBlockDeploy | |
| test -f simplyBlockDeploy/bare-metal/bootstrap-cluster.sh | |
| - name: Bootstrap cluster (BASE_SBCLI_BRANCH=${{ inputs.BASE_SBCLI_BRANCH }}) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| cd simplyBlockDeploy/bare-metal | |
| chmod +x ./bootstrap-cluster.sh | |
| extra_args=() | |
| # Images exported by the "Parse CUSTOM_IMAGES overrides" step | |
| if [[ -n "${SIMPLY_BLOCK_SPDK_ULTRA_IMAGE-}" ]]; then | |
| extra_args+=( --spdk-image "${SIMPLY_BLOCK_SPDK_ULTRA_IMAGE}" ) | |
| fi | |
| if [[ "${{ inputs.BOOTSTRAP_IS_SINGLE_NODE }}" == "true" ]]; then | |
| extra_args+=( --is-single-node true ) | |
| fi | |
| if [[ "${{ inputs.BOOTSTRAP_ENABLE_NODE_AFFINITY }}" == "true" ]]; then | |
| extra_args+=( --enable-node-affinity true ) | |
| fi | |
| set +e | |
| ./bootstrap-cluster.sh \ | |
| --sbcli-cmd "${SBCLI_CMD}" \ | |
| --max-lvol "${BOOTSTRAP_MAX_LVOL}" \ | |
| --data-chunks-per-stripe "${BOOTSTRAP_DATA_CHUNKS}" \ | |
| --parity-chunks-per-stripe "${BOOTSTRAP_PARITY_CHUNKS}" \ | |
| --journal-partition "${BOOTSTRAP_JOURNAL_PARTITION}" \ | |
| --ha-jm-count "${BOOTSTRAP_HA_JM_COUNT}" \ | |
| --ha-type "${BOOTSTRAP_HA_TYPE}" \ | |
| --data-nics "${BOOTSTRAP_DATA_NIC}" \ | |
| "${extra_args[@]}" | tee bootstrap.log | |
| rc=${PIPESTATUS[0]} | |
| set -e | |
| [[ "$rc" -eq 0 ]] || (echo "ERROR: bootstrap failed (rc=$rc)" && exit "$rc") | |
| - name: Fetch CLUSTER_ID and CLUSTER_SECRET from MNODES | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | |
| ssh_common=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i "${KEY_PATH}") | |
| cluster_id="$(ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" \ | |
| "${SBCLI_CMD} cluster list" | grep -Eo '[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}' | head -n 1)" | |
| if [[ -z "${cluster_id}" ]]; then | |
| echo "ERROR: Could not extract cluster_id from '${SBCLI_CMD} cluster list' on ${mgmt_ip}" | |
| ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" "${SBCLI_CMD} cluster list" || true | |
| exit 1 | |
| fi | |
| cluster_secret="$(ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" \ | |
| "${SBCLI_CMD} cluster get-secret ${cluster_id}" | tr -d '\r' | tail -n 1 | xargs)" | |
| if [[ -z "${cluster_secret}" ]]; then | |
| echo "ERROR: Could not get cluster_secret from '${SBCLI_CMD} cluster get-secret ${cluster_id}' on ${mgmt_ip}" | |
| exit 1 | |
| fi | |
| echo "CLUSTER_ID=${cluster_id}" >> "$GITHUB_ENV" | |
| echo "CLUSTER_SECRET=${cluster_secret}" >> "$GITHUB_ENV" | |
| echo "Fetched CLUSTER_ID=${cluster_id}" | |
| echo "Fetched CLUSTER_SECRET=***set***" | |
| # ============================================================ | |
| # UPGRADE E2E TESTS (clone from TARGET_SBCLI_BRANCH) | |
| # ============================================================ | |
| - name: Clone sbcli repo (prefer workflow branch; fallback to TARGET_SBCLI_BRANCH) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| rm -rf sbcli | |
| wf_branch="${{ github.ref_name }}" | |
| fallback_branch="${TARGET_SBCLI_BRANCH}" | |
| echo "Workflow branch: $wf_branch" | |
| echo "Fallback sbcli branch (TARGET_SBCLI_BRANCH): $fallback_branch" | |
| if git ls-remote --heads https://github.com/simplyblock-io/sbcli.git "$wf_branch" | grep -q "$wf_branch"; then | |
| echo "Cloning sbcli on workflow branch: $wf_branch" | |
| git clone --branch "$wf_branch" --single-branch https://github.com/simplyblock-io/sbcli.git sbcli | |
| else | |
| echo "Branch '$wf_branch' not found in sbcli; cloning fallback branch: $fallback_branch" | |
| git clone --branch "$fallback_branch" --single-branch https://github.com/simplyblock-io/sbcli.git sbcli | |
| fi | |
| test -f sbcli/e2e/upgrade_e2e.py | |
| test -f sbcli/e2e/e2e_tests/upgrade_tests/major_upgrade.py | |
| test -f sbcli/e2e/logs/cleanup.py | |
| - name: Install Python deps (best-effort) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 -m pip install --upgrade pip | |
| if [[ -f "sbcli/e2e/requirements.txt" ]]; then | |
| pip install -r sbcli/e2e/requirements.txt | |
| fi | |
| - name: Cleanup logs before upgrade e2e | |
| shell: bash | |
| working-directory: sbcli/e2e | |
| run: | | |
| set -euxo pipefail | |
| python3 logs/cleanup.py | |
| - name: Set RUN_BASE_DIR | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| RUN_TIMESTAMP="$(date +%Y%m%d-%H%M%S)" | |
| RUN_BASE_DIR="${NFS_MOUNTPOINT}/upgrade-run-${RUN_TIMESTAMP}-${GITHUB_RUN_ID}" | |
| echo "RUN_BASE_DIR=${RUN_BASE_DIR}" >> "$GITHUB_ENV" | |
| mkdir -p "${RUN_BASE_DIR}" | |
| - name: Record test start time | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | |
| echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | |
| - name: Run upgrade e2e tests (major_upgrade) | |
| shell: bash | |
| working-directory: sbcli/e2e | |
| run: | | |
| set -euxo pipefail | |
| TESTNAME_ARGS=() | |
| if [[ -n "${TEST_CLASS:-}" ]]; then | |
| TESTNAME_ARGS=(--testname "${TEST_CLASS}") | |
| fi | |
| python3 -u upgrade_e2e.py \ | |
| --base_version "${BASE_SBCLI_BRANCH}" \ | |
| --target_version "${TARGET_SBCLI_BRANCH}" \ | |
| --base_spdk_image "${BASE_SPDK_IMAGE:-}" \ | |
| --target_spdk_image "${TARGET_SPDK_IMAGE:-}" \ | |
| --target_docker_image "${TARGET_DOCKER_IMAGE:-}" \ | |
| "${TESTNAME_ARGS[@]}" \ | |
| 2>&1 | tee output.log | |
| - name: Mark test end time (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | |
| echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | |
| - name: Collect mgmt snapshots into RUN_BASE_DIR (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 - <<'PY' | |
| import os, subprocess, json | |
| mgmt_ip = os.environ["MNODES"].split()[0] | |
| key = os.environ["KEY_PATH"] | |
| user = os.environ["SSH_USER"] | |
| sbcli = os.environ["SBCLI_CMD"] | |
| cluster_id = os.environ["CLUSTER_ID"] | |
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | |
| outdir = f"{run_base}/{mgmt_ip}/mgmt_details" | |
| os.makedirs(f"{outdir}/mgmt", exist_ok=True) | |
| os.makedirs(f"{outdir}/subtasks", exist_ok=True) | |
| os.makedirs(f"{outdir}/storage_nodes", exist_ok=True) | |
| ssh_base = [ | |
| "ssh", "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ConnectTimeout=10", | |
| f"{user}@{mgmt_ip}", | |
| ] | |
| def run_cmd(cmd, out_file): | |
| print(f" {cmd} -> {out_file}", flush=True) | |
| try: | |
| with open(out_file, "w") as f: | |
| subprocess.run(ssh_base + [cmd], stdout=f, stderr=subprocess.STDOUT, check=False, timeout=60) | |
| except Exception as e: | |
| print(f" WARN: failed: {e}", flush=True) | |
| run_cmd(f"{sbcli} cluster list", f"{outdir}/mgmt/cluster_list.txt") | |
| run_cmd(f"{sbcli} cluster status {cluster_id}", f"{outdir}/mgmt/cluster_status.txt") | |
| run_cmd(f"{sbcli} cluster show {cluster_id}", f"{outdir}/mgmt/cluster_show.txt") | |
| run_cmd(f"{sbcli} cluster get-capacity {cluster_id}", f"{outdir}/mgmt/cluster_capacity.txt") | |
| run_cmd(f"{sbcli} cluster get-logs {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_get_logs.txt") | |
| run_cmd(f"{sbcli} pool list", f"{outdir}/mgmt/pool_list.txt") | |
| run_cmd(f"{sbcli} lvol list", f"{outdir}/mgmt/lvol_list.txt") | |
| run_cmd(f"{sbcli} snapshot list", f"{outdir}/mgmt/snapshot_list.txt") | |
| run_cmd(f"{sbcli} sn list", f"{outdir}/mgmt/sn_list.txt") | |
| run_cmd(f"{sbcli} sn list --json", f"{outdir}/mgmt/sn_list.json") | |
| sn_uuids = [] | |
| try: | |
| with open(f"{outdir}/mgmt/sn_list.json") as f: | |
| data = json.load(f) | |
| for item in (data if isinstance(data, list) else []): | |
| uid = item.get("UUID") or item.get("uuid") or item.get("Id") or item.get("id") | |
| if uid: | |
| sn_uuids.append(uid) | |
| except Exception: | |
| pass | |
| for idx, uuid in enumerate(sn_uuids, 1): | |
| run_cmd(f"{sbcli} sn list-devices {uuid}", f"{outdir}/storage_nodes/node{idx}_list_devices.txt") | |
| run_cmd(f"{sbcli} sn check {uuid}", f"{outdir}/storage_nodes/node{idx}_check.txt") | |
| run_cmd(f"{sbcli} sn get {uuid}", f"{outdir}/storage_nodes/node{idx}_get.txt") | |
| run_cmd(f"{sbcli} cluster list-tasks {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_list_tasks.txt") | |
| bal_ids = [] | |
| try: | |
| with open(f"{outdir}/mgmt/cluster_list_tasks.txt") as f: | |
| for line in f: | |
| if line.startswith("+") or "Task ID" in line or "|" not in line: | |
| continue | |
| cols = [c.strip() for c in line.split("|")] | |
| if len(cols) >= 5 and cols[3] == "balancing_on_restart" and cols[1]: | |
| bal_ids.append(cols[1]) | |
| except Exception: | |
| pass | |
| for tid in bal_ids: | |
| run_cmd(f"{sbcli} cluster get-subtasks {tid}", f"{outdir}/subtasks/{tid}_subtasks.txt") | |
| PY | |
| - name: Collect docker logs into RUN_BASE_DIR (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| TAG="containers-final-$(date +%Y%m%d_%H%M%S)" | |
| SSH_OPTS=(-i "${KEY_PATH}" -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -C) | |
| NODES="$(echo "${MNODES} ${STORAGE_PRIVATE_IPS}" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for NODE in ${NODES}; do | |
| echo ">>> Node: ${NODE}" | |
| LOCAL_NODE_DIR="${RUN_BASE_DIR}/${NODE}/${TAG}" | |
| mkdir -p "${LOCAL_NODE_DIR}" | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "docker ps -a 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/docker_ps_a_${NODE}.txt" || true | |
| CONTAINERS="$(ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker ps -a --format '{{.Names}}' 2>/dev/null || true" 2>/dev/null || true)" | |
| if [[ -z "${CONTAINERS}" ]]; then | |
| echo "No containers found on ${NODE}" > "${LOCAL_NODE_DIR}/_NO_CONTAINERS_${NODE}.txt" | |
| continue | |
| fi | |
| while IFS= read -r C; do | |
| [[ -z "${C}" ]] && continue | |
| echo " dumping: ${C}" | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker logs --timestamps --details '${C}' 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/${C}.txt" || true | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker inspect '${C}' 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/${C}_inspect.json" || true | |
| done <<< "${CONTAINERS}" | |
| done | |
| - name: Collect distrib debug dumps into RUN_BASE_DIR (always) | |
| if: always() | |
| timeout-minutes: 35 | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 - <<'PY' | |
| import os, subprocess, sys | |
| ssh_user = os.environ["SSH_USER"] | |
| key = os.environ["KEY_PATH"] | |
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | |
| tag = "finaldistrib_bdev_logs" | |
| storage_ips = os.environ["STORAGE_PRIVATE_IPS"].split() | |
| ssh_base = [ | |
| "ssh", | |
| "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ServerAliveInterval=15", | |
| "-o", "ServerAliveCountMax=4", | |
| "-o", "ConnectTimeout=10", | |
| "-C", | |
| ] | |
| scp_base = [ | |
| "scp", | |
| "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ConnectTimeout=10", | |
| "-C", | |
| ] | |
| remote_script = """\ | |
| set -euo pipefail | |
| TS="$(date +%Y%m%d_%H%M%S)" | |
| HOST="$(hostname -s 2>/dev/null || hostname)" | |
| STAGING="/tmp/distrib_host_collect_${TS}" | |
| mkdir -p "$STAGING" | |
| CN="$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' | head -n1 || true)" | |
| if [[ -z "$CN" ]]; then echo "NO_SPDK_CONTAINER"; exit 0; fi | |
| SOCK="/mnt/ramdisk/${CN}/spdk.sock" | |
| BDEV_JSON="$(sudo docker exec "$CN" bash -lc "python spdk/scripts/rpc.py -s '$SOCK' bdev_get_bdevs" 2>/dev/null || true)" | |
| if [[ -z "$BDEV_JSON" ]]; then echo "BDEV_EMPTY"; exit 0; fi | |
| if command -v jq >/dev/null 2>&1; then | |
| mapfile -t DISTRIBS < <(printf '%s' "$BDEV_JSON" | jq -r '.[] | select(.name|startswith("distrib_")) | .name' | sort -u) | |
| else | |
| mapfile -t DISTRIBS < <(printf '%s\\n' "$BDEV_JSON" | grep -oE '"name"\\s*:\\s*"distrib_[^"]+"' | sed -E 's/.*"name"\\s*:\\s*"([^"]+)".*/\\1/' | sort -u) | |
| fi | |
| if [[ ${#DISTRIBS[@]} -eq 0 ]]; then echo "NO_DISTRIBS"; exit 0; fi | |
| for d in "${DISTRIBS[@]}"; do | |
| JF="/tmp/stack_${d}.json" | |
| python3 - "$d" "$JF" <<'PYIN' | |
| import json, sys | |
| d = sys.argv[1] | |
| jf = sys.argv[2] | |
| obj = {"subsystems":[{"subsystem":"distr","config":[{"method":"distr_debug_placement_map_dump","params":{"name":d}}]}]} | |
| with open(jf, "w") as f: | |
| f.write(json.dumps(obj)) | |
| PYIN | |
| sudo docker cp "$JF" "$CN:$JF" || true | |
| sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py '$JF' '$SOCK' > /tmp/rpc_${d}.log 2>&1 || true" || true | |
| sudo docker cp "$CN:/tmp/rpc_${d}.log" "$STAGING/rpc_${d}.log" 2>/dev/null || true | |
| for f in $(sudo docker exec "$CN" bash -lc "ls /tmp 2>/dev/null | grep -F \\\"$d\\\" || true"); do | |
| sudo docker cp "$CN:/tmp/$f" "$STAGING/${CN}__$f" 2>/dev/null || true | |
| done | |
| sudo docker exec "$CN" bash -lc "rm -f '$JF' '/tmp/rpc_${d}.log'" || true | |
| rm -f "$JF" || true | |
| done | |
| cat /proc/meminfo | grep -i huge > "$STAGING/hugepage_meminfo.txt" 2>/dev/null || true | |
| TAR="/tmp/${HOST}_distrib_dumps_${TS}.tgz" | |
| tar -C "$STAGING" -czf "$TAR" . 2>/dev/null || true | |
| echo "$TAR" | |
| """ | |
| # For every storage node: run the collection script remotely, then pull the | |
| # produced tarball back onto the runner via scp. Best-effort throughout — | |
| # a failing node is reported with a WARN and skipped, never fatal. | |
| for ip in storage_ips: | |
| print(f"=== {ip} ===", flush=True) | |
| # Feed remote_script on stdin of `bash -s`; merge stderr into stdout so the | |
| # last stdout line is still the sentinel/tar-path contract described above. | |
| cmd = ssh_base + [f"{ssh_user}@{ip}", "bash", "-s"] | |
| p = subprocess.run(cmd, input=remote_script.encode(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) | |
| out = p.stdout.decode(errors="replace").strip().splitlines() | |
| # Contract: last output line is either a sentinel or the tarball path. | |
| last = out[-1].strip() if out else "" | |
| last = last.replace("\r", "") | |
| # The startswith("/tmp/") guard also rejects the sentinels, so the explicit | |
| # tuple check is belt-and-braces for clearer WARN reporting. | |
| if last in ("NO_SPDK_CONTAINER", "BDEV_EMPTY", "NO_DISTRIBS") or not last.startswith("/tmp/"): | |
| print(f"[{ip}] WARN: distrib collection skipped/failed: {last or '(no output)'}", flush=True) | |
| continue | |
| dest_dir = f"{run_base}/{ip}/{tag}" | |
| os.makedirs(dest_dir, exist_ok=True) | |
| # Copy the tarball from the node into the per-run NFS layout. | |
| scp_cmd = scp_base + [f"{ssh_user}@{ip}:{last}", dest_dir + "/"] | |
| subprocess.run(scp_cmd, check=False) | |
| print(f"[{ip}] Saved -> {dest_dir}/{os.path.basename(last)}", flush=True) | |
| PY | |
| # ========================= | |
| # SUMMARY (always) | |
| # ========================= | |
| # Build the GitHub job summary (always runs): timing, pass/fail counts and | |
| # per-test table parsed from output.log, failure excerpt, artifact pointers. | |
| - name: Write Job Summary | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| # First MNODES entry is the management node referenced in the summary. | |
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | |
| out_log="sbcli/e2e/output.log" | |
| # --- Timing --- | |
| start="${TEST_START_EPOCH:-0}" | |
| end="${TEST_END_EPOCH:-0}" | |
| dur_sec=0 | |
| if [[ "$start" =~ ^[0-9]+$ && "$end" =~ ^[0-9]+$ && "$end" -ge "$start" ]]; then | |
| dur_sec=$((end-start)) | |
| fi | |
| dur_h=$((dur_sec/3600)); dur_m=$(((dur_sec%3600)/60)); dur_s=$((dur_sec%60)) | |
| dur_fmt="${dur_h}h ${dur_m}m ${dur_s}s" | |
| # --- Parse test counts from output.log (upgrade_e2e.py format) --- | |
| total_cases=0; passed_cases=0; failed_cases=0; skipped_cases=0 | |
| if [[ -f "${out_log}" ]]; then | |
| v="$(grep -m1 'Total Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && total_cases="${v}" | |
| v="$(grep -m1 'Passed:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && passed_cases="${v}" | |
| v="$(grep -m1 'Failed:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && failed_cases="${v}" | |
| v="$(grep -m1 'Skipped:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && skipped_cases="${v}" | |
| fi | |
| pass_pct=0; fail_pct=0; skip_pct=0 | |
| if [[ "${total_cases}" -gt 0 ]]; then | |
| pass_pct=$(( (passed_cases * 100) / total_cases )) | |
| fail_pct=$(( (failed_cases * 100) / total_cases )) | |
| skip_pct=$(( (skipped_cases * 100) / total_cases )) | |
| fi | |
| # --- Parse per-test status --- | |
| test_details_table="" | |
| if [[ -f "${out_log}" ]]; then | |
| while IFS= read -r line; do | |
| # Strip ANSI color codes before matching. | |
| clean="$(printf '%s' "${line}" | sed 's/\x1b\[[0-9;]*m//g')" | |
| # Allow underscores in test names, matching the Slack notifier's regex | |
| # (Test[A-Za-z0-9_]+) so names like Test_Major_Upgrade are not | |
| # truncated at the first underscore. | |
| test_name="$(printf '%s' "${clean}" | grep -oE 'Test[A-Za-z0-9_]+' | head -n1 || true)" | |
| [[ -z "${test_name}" ]] && continue | |
| if printf '%s' "${clean}" | grep -qi 'PASSED'; then icon="✅"; status="PASSED" | |
| elif printf '%s' "${clean}" | grep -qi 'FAILED'; then icon="❌"; status="FAILED" | |
| elif printf '%s' "${clean}" | grep -qi 'SKIPPED'; then icon="⏭"; status="SKIPPED" | |
| else continue | |
| fi | |
| test_details_table+="| \`${test_name}\` | ${icon} ${status} |"$'\n' | |
| done < <(grep -iE 'PASSED|FAILED|SKIPPED' "${out_log}" 2>/dev/null || true) | |
| fi | |
| # --- Failure reason --- | |
| failure_reason="" | |
| if [[ -f "${out_log}" ]]; then | |
| multi="$(grep 'MultipleExceptions:' "${out_log}" | sed 's/\x1b\[[0-9;]*m//g' || true)" | |
| if [[ -n "${multi}" ]]; then | |
| failure_reason="${multi}" | |
| elif grep -Eqi 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}"; then | |
| failure_reason="$(grep -Ei 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}" | tail -n 3 | sed 's/\x1b\[[0-9;]*m//g' || true)" | |
| fi | |
| fi | |
| # --- Mgmt artifacts list --- | |
| mgmt_dir="${RUN_BASE_DIR:-}/${mgmt_ip}/mgmt_details/mgmt" | |
| mgmt_files="(not found)" | |
| if [[ -n "${RUN_BASE_DIR:-}" && -d "${mgmt_dir}" ]]; then | |
| mgmt_files="$(find "${mgmt_dir}" -maxdepth 1 -type f -printf '%f (%s bytes)\n' 2>/dev/null | sort || true)" | |
| [[ -n "${mgmt_files}" ]] || mgmt_files="(empty)" | |
| fi | |
| # --- Overall result --- | |
| # NOTE: ${{ job.status }} is expanded by GitHub before the shell runs, so | |
| # the comparison below is against a literal string. | |
| conclusion="✅ SUCCESS" | |
| if [[ "${{ job.status }}" != "success" ]]; then | |
| conclusion="❌ FAILED" | |
| fi | |
| # Emit the whole summary through one grouped redirect. | |
| { | |
| echo "## SimplyBlock Upgrade E2E Run Summary" | |
| echo "" | |
| echo "**Result:** ${conclusion} | **Duration:** ${dur_fmt}" | |
| echo "" | |
| echo "### Upgrade" | |
| echo "| | Branch | SPDK Image |" | |
| echo "|---|---|---|" | |
| echo "| **Base** | \`${BASE_SBCLI_BRANCH}\` | \`${BASE_SPDK_IMAGE:-default}\` |" | |
| echo "| **Target** | \`${TARGET_SBCLI_BRANCH}\` | \`${TARGET_SPDK_IMAGE:-default}\` |" | |
| echo "" | |
| echo "### Test Results" | |
| echo "| | Count | % |" | |
| echo "|---|---|---|" | |
| echo "| ✅ Passed | ${passed_cases} | ${pass_pct}% |" | |
| echo "| ❌ Failed | ${failed_cases} | ${fail_pct}% |" | |
| echo "| ⏭ Skipped | ${skipped_cases} | ${skip_pct}% |" | |
| echo "| **Total** | **${total_cases}** | |" | |
| echo "" | |
| if [[ -n "${test_details_table}" ]]; then | |
| echo "### Test Case Details" | |
| echo "| Test | Result |" | |
| echo "|---|---|" | |
| printf '%s' "${test_details_table}" | |
| echo "" | |
| fi | |
| echo "### Run Info" | |
| echo "- **Test class:** \`${TEST_CLASS:-all}\`" | |
| echo "- **Cluster ID:** \`${CLUSTER_ID}\`" | |
| echo "- **Mgmt node:** \`${mgmt_ip}\`" | |
| echo "- **Start (UTC):** ${TEST_START_HUMAN:-unknown}" | |
| echo "- **End (UTC):** ${TEST_END_HUMAN:-unknown}" | |
| echo "" | |
| if [[ -n "${failure_reason}" ]]; then | |
| echo "### Failure Reason" | |
| echo '```' | |
| printf '%s\n' "${failure_reason}" | |
| echo '```' | |
| echo "" | |
| fi | |
| if [[ -n "${RUN_BASE_DIR:-}" ]]; then | |
| echo "<details><summary>Run Artifacts (NFS)</summary>" | |
| echo "" | |
| echo "- **Run dir:** \`${RUN_BASE_DIR}/\`" | |
| echo "- Mgmt details: \`${RUN_BASE_DIR}/${mgmt_ip}/mgmt_details/\`" | |
| echo "- Docker logs: \`${RUN_BASE_DIR}/<node_ip>/containers-final-*/\`" | |
| echo "- Distrib dumps: \`${RUN_BASE_DIR}/<storage_ip>/finaldistrib_bdev_logs/\`" | |
| echo "" | |
| echo "</details>" | |
| echo "" | |
| fi | |
| echo "<details><summary>Mgmt Artifacts (cluster state at end of run)</summary>" | |
| echo "" | |
| echo "Path: \`${mgmt_dir}\`" | |
| echo "" | |
| echo '```' | |
| printf '%s\n' "${mgmt_files}" | |
| echo '```' | |
| echo "" | |
| echo "</details>" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Post a best-effort Slack summary (always runs); exits 0 silently when no | |
| # webhook secret is configured, and never fails the job on delivery errors. | |
| - name: Send Slack Notification | |
| if: always() | |
| shell: bash | |
| env: | |
| SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} | |
| JOB_STATUS: ${{ job.status }} | |
| SLACK_RUN_URL: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| GITHUB_REF_NAME: ${{ github.ref_name }} | |
| SLACK_WF_NAME: "E2E (Upgrade Bootstrap)" | |
| run: | | |
| python3 - <<'PYEOF' | |
| import json, os, re, sys, urllib.request, urllib.error | |
| webhook = os.environ.get("SLACK_WEBHOOK_URL", "") | |
| if not webhook: | |
| print("No SLACK_WEBHOOK_URL set, skipping.") | |
| sys.exit(0) | |
| out_log = "sbcli/e2e/output.log" | |
| # errors="replace" keeps stray non-UTF-8 bytes in the log from raising | |
| # UnicodeDecodeError and killing the notification. | |
| content = open(out_log, encoding="utf-8", errors="replace").read() if os.path.isfile(out_log) else "" | |
| # --- Counts (upgrade_e2e.py format) --- | |
| # px: first integer captured by pat anywhere in the log, 0 when absent. | |
| def px(pat): | |
| m = re.search(pat, content) | |
| return int(m.group(1)) if m else 0 | |
| total = px(r'Total Cases:\s*(\d+)') | |
| passed = px(r'Passed:\s*(\d+)') | |
| failed = px(r'Failed:\s*(\d+)') | |
| skipped = px(r'Skipped:\s*(\d+)') | |
| pass_pct = (passed * 100 // total) if total > 0 else 0 | |
| # --- Per-test results --- | |
| # Strip ANSI color codes before matching test names / statuses. | |
| ansi = re.compile(r'\x1b\[[0-9;]*m') | |
| test_results = [] | |
| for line in content.splitlines(): | |
| clean = ansi.sub('', line) | |
| m = re.search(r'Test[A-Za-z0-9_]+', clean) | |
| if not m: | |
| continue | |
| name = m.group(0) | |
| if 'PASSED' in clean: test_results.append(('PASSED', name)) | |
| elif 'FAILED' in clean: test_results.append(('FAILED', name)) | |
| elif 'SKIPPED' in clean: test_results.append(('SKIPPED', name)) | |
| # --- Failure reason --- | |
| # Prefer a MultipleExceptions summary line; otherwise the last few | |
| # exception-looking lines. Capped at 2000 chars for Slack. | |
| failure_reason = "" | |
| multi = [ansi.sub('', l) for l in content.splitlines() if 'MultipleExceptions:' in l] | |
| if multi: | |
| failure_reason = multi[0][:2000] | |
| elif content: | |
| exc_lines = [ansi.sub('', l) for l in content.splitlines() | |
| if re.search(r'(Exception:|AssertionError|Input/output error)', l)] | |
| if exc_lines: | |
| failure_reason = '\n'.join(exc_lines[-5:])[:2000] | |
| # --- Env --- | |
| s = int(os.environ.get("TEST_START_EPOCH", "0") or "0") | |
| e = int(os.environ.get("TEST_END_EPOCH", "0") or "0") | |
| secs = max(0, e - s) if e >= s > 0 else 0 | |
| dur = f"{secs//3600}h {(secs%3600)//60}m {secs%60}s" | |
| run_url = os.environ.get("SLACK_RUN_URL", "") | |
| log_dir = os.environ.get("RUN_BASE_DIR", "N/A") | |
| base_branch = os.environ.get("BASE_SBCLI_BRANCH", "?") | |
| base_spdk = os.environ.get("BASE_SPDK_IMAGE", "") or "default" | |
| target_branch = os.environ.get("TARGET_SBCLI_BRANCH", "?") | |
| target_spdk = os.environ.get("TARGET_SPDK_IMAGE", "") or "default" | |
| test_cls = os.environ.get("TEST_CLASS", "") or "all" | |
| branch = os.environ.get("GITHUB_REF_NAME", "?") | |
| wf_name = os.environ.get("SLACK_WF_NAME", "Run") | |
| ok = os.environ.get("JOB_STATUS", "") == "success" | |
| icon = ":white_check_mark:" if ok else ":x:" | |
| status = "SUCCESS" if ok else "FAILURE" | |
| mention = "" if ok else " <!channel>" | |
| lines = [ | |
| f"{icon} *SimplyBlock {wf_name}*{mention}", | |
| f"*Status:* {status} | *Duration:* {dur}", | |
| f"*Branch:* `{branch}` | *Test class:* `{test_cls}`", | |
| f"*Upgrade:* `{base_branch}` → `{target_branch}`", | |
| f"*Base SPDK:* `{base_spdk}` | *Target SPDK:* `{target_spdk}`", | |
| "", | |
| ] | |
| if total > 0: | |
| lines += [ | |
| f":white_check_mark: *Passed:* {passed}/{total} ({pass_pct}%)", | |
| f":x: *Failed:* {failed}", | |
| f":fast_forward: *Skipped:* {skipped}", | |
| ] | |
| else: | |
| lines.append("_(test counts not found in log)_") | |
| if test_results: | |
| lines.append("") | |
| lines.append("*Test Results:*") | |
| icons = {'PASSED': ':white_check_mark:', 'FAILED': ':x:', 'SKIPPED': ':fast_forward:'} | |
| for st, nm in test_results: | |
| lines.append(f"{icons.get(st, ':grey_question:')} `{nm}`") | |
| if failure_reason: | |
| lines += ["", "*Failure:*", f"```{failure_reason}```"] | |
| lines += [ | |
| "", | |
| f":link: *Run:* <{run_url}|View on GitHub>", | |
| f":file_folder: *Final Logs:* `{log_dir}`", | |
| ] | |
| # Plain incoming-webhook payload; Slack renders mrkdwn in "text". | |
| payload = {"text": "\n".join(lines)} | |
| req = urllib.request.Request( | |
| webhook, | |
| data=json.dumps(payload).encode(), | |
| headers={"Content-Type": "application/json"}, | |
| ) | |
| # Delivery is best-effort: warn on any failure, never fail the job. | |
| try: | |
| urllib.request.urlopen(req, timeout=15) | |
| print("Slack notification sent.") | |
| except Exception as exc: | |
| print(f"WARN: Slack notification failed: {exc}", file=sys.stderr) | |
| PYEOF | |
| # Always upload bootstrap + e2e logs as a run artifact, even after failures; | |
| # RUN_LABEL (when set) is appended to keep artifact names unique per matrix run. | |
| - name: Upload logs (always) | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: simplyblock-upgrade-logs-${{ github.run_id }}${{ inputs.RUN_LABEL != '' && format('-{0}', inputs.RUN_LABEL) || '' }} | |
| path: | | |
| simplyBlockDeploy/bare-metal/bootstrap.log | |
| sbcli/e2e/output.log | |
| sbcli/e2e/logs/** | |
| if-no-files-found: warn | |
| # Export the first MNODES entry as MGMT_IP so later steps can build | |
| # per-management-node artifact paths via ${{ env.MGMT_IP }}. | |
| - name: Export MGMT_IP (first MNODES) | |
| if: always() | |
| shell: bash | |
| run: | | |
| echo "MGMT_IP=$(echo "${MNODES}" | awk '{print $1}')" >> "$GITHUB_ENV" | |
| # Upload a small curated subset (output.log + mgmt text dumps from the NFS | |
| # run dir); only runs when both RUN_BASE_DIR and MGMT_IP env vars were set | |
| # by earlier steps. | |
| - name: Upload small artifacts (always) | |
| if: always() && env.RUN_BASE_DIR != '' && env.MGMT_IP != '' | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: simplyblock-upgrade-small-logs-${{ github.run_id }}${{ inputs.RUN_LABEL != '' && format('-{0}', inputs.RUN_LABEL) || '' }} | |
| path: | | |
| sbcli/e2e/output.log | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/mgmt/*.txt | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/subtasks/*.txt | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/storage_nodes/*.txt | |