Upgrade: 25.10.5 → 26.1.1 | e2e-yaml-file | 192.168.10.211 #4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Bootstraps a cluster using BASE_SBCLI_BRANCH + BASE_SPDK_IMAGE, then runs
# upgrade_e2e.py (TestMajorUpgrade) to upgrade to TARGET_SBCLI_BRANCH + TARGET_SPDK_IMAGE.
name: Upgrade Bootstrap + Run Upgrade Tests
run-name: "Upgrade: ${{ inputs.BASE_SBCLI_BRANCH }} → ${{ inputs.TARGET_SBCLI_BRANCH }} | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
on:
  # Reusable-workflow entry point: every lab/cluster parameter is overridable
  # by the caller; defaults describe the fixed 192.168.10.x lab.
  workflow_call:
    inputs:
      BASE_SBCLI_BRANCH:
        type: string
        required: true
        description: "sbcli branch / version tag to bootstrap the cluster with (e.g. R25.10-Hotfix)"
      CUSTOM_IMAGES:
        type: string
        default: 'base_spdk="" target_spdk="" base_docker="" target_docker=""'
        description: "Image overrides: set base_spdk, target_spdk, base_docker and/or target_docker values, leave as \"\" to skip."
      TARGET_SBCLI_BRANCH:
        type: string
        required: true
        description: "sbcli branch / version tag to upgrade to (e.g. main)"
      STORAGE_PRIVATE_IPS:
        type: string
        default: "192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208"
      API_INVOKE_URL:
        type: string
        default: "http://192.168.10.211/"
      BASTION_IP:
        type: string
        default: "192.168.10.211"
      MNODES:
        type: string
        default: "192.168.10.211"
      NR_HUGEPAGES:
        type: string
        default: "2048"
      GRAFANA_ENDPOINT:
        type: string
        default: "http://192.168.10.211/grafana"
      SBCLI_CMD:
        type: string
        default: "sbctl"
      SSH_USER:
        type: string
        default: "root"
      KEY_PATH:
        type: string
        default: "/home/ec2-user/.ssh/simplyblock-us-east-2.pem"
      CLIENTNODES:
        type: string
        default: "192.168.10.165 192.168.10.166"
      NFS_MOUNTPOINT:
        type: string
        default: "/mnt/nfs_share"
      BOOTSTRAP_MAX_LVOL:
        type: string
        default: "300"
      BOOTSTRAP_DATA_CHUNKS:
        type: string
        default: "2"
      BOOTSTRAP_PARITY_CHUNKS:
        type: string
        default: "2"
      BOOTSTRAP_JOURNAL_PARTITION:
        type: string
        default: "1"
      BOOTSTRAP_HA_JM_COUNT:
        type: string
        default: "3"
      BOOTSTRAP_HA_TYPE:
        type: string
        default: "ha"
      BOOTSTRAP_DATA_NIC:
        type: string
        default: "eth1"
      BOOTSTRAP_IS_SINGLE_NODE:
        type: boolean
        default: false
      BOOTSTRAP_ENABLE_NODE_AFFINITY:
        type: boolean
        default: false
      TEST_CLASS:
        type: string
        default: "major_upgrade"
      RUN_LABEL:
        type: string
        default: ""
        description: "Optional label appended to artifact names to avoid collisions (e.g. 'run1')"
  # NOTE(review): GitHub Actions documents a maximum of 10 top-level `inputs`
  # for workflow_dispatch; this event declares far more — confirm the workflow
  # validates and that all inputs are shown in the dispatch UI, or trim this
  # list to the essentials and rely on defaults for the rest.
  workflow_dispatch:
    inputs:
      BASE_SBCLI_BRANCH:
        type: string
        description: "sbcli branch / version tag to bootstrap the cluster with (e.g. R25.10-Hotfix)"
        required: true
      CUSTOM_IMAGES:
        type: string
        description: "Image overrides: set base_spdk, target_spdk, base_docker and/or target_docker values, leave as \"\" to skip."
        required: false
        default: 'base_spdk="" target_spdk="" base_docker="" target_docker=""'
      TARGET_SBCLI_BRANCH:
        type: string
        description: "sbcli branch / version tag to upgrade to (e.g. main)"
        required: true
      STORAGE_PRIVATE_IPS:
        type: string
        description: "Space-separated storage node IPs (also used for cleanup)"
        required: true
        default: "192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208"
      API_INVOKE_URL:
        type: string
        description: "API invoke URL"
        required: true
        default: "http://192.168.10.211/"
      BASTION_IP:
        type: string
        description: "Bastion IP"
        required: true
        default: "192.168.10.211"
      MNODES:
        type: string
        description: "Management node(s) IPs"
        required: true
        default: "192.168.10.211"
      NR_HUGEPAGES:
        type: string
        description: "Hugepages"
        required: true
        default: "2048"
      GRAFANA_ENDPOINT:
        type: string
        description: "Grafana endpoint"
        required: true
        default: "http://192.168.10.211/grafana"
      SBCLI_CMD:
        type: string
        description: "sbcli command name"
        required: true
        default: "sbctl"
      SSH_USER:
        type: string
        description: "SSH user"
        required: true
        default: "root"
      KEY_PATH:
        type: string
        description: "SSH private key path on runner"
        required: true
        default: "/home/ec2-user/.ssh/simplyblock-us-east-2.pem"
      CLIENTNODES:
        type: string
        description: "Space-separated client node IPs"
        required: true
        default: "192.168.10.165 192.168.10.166"
      NFS_MOUNTPOINT:
        type: string
        description: "NFS mountpoint to unmount everywhere"
        required: true
        default: "/mnt/nfs_share"
      BOOTSTRAP_MAX_LVOL:
        type: string
        description: "bootstrap: --max-lvol"
        required: true
        default: "300"
      BOOTSTRAP_DATA_CHUNKS:
        type: string
        description: "bootstrap: --data-chunks-per-stripe"
        required: true
        default: "2"
      BOOTSTRAP_PARITY_CHUNKS:
        type: string
        description: "bootstrap: --parity-chunks-per-stripe"
        required: true
        default: "2"
      BOOTSTRAP_JOURNAL_PARTITION:
        type: string
        description: "bootstrap: --journal-partition"
        required: true
        default: "1"
      BOOTSTRAP_HA_JM_COUNT:
        type: string
        description: "bootstrap: --ha-jm-count"
        required: true
        default: "3"
      BOOTSTRAP_HA_TYPE:
        type: string
        description: "bootstrap: --ha-type"
        required: true
        default: "ha"
      BOOTSTRAP_DATA_NIC:
        type: string
        description: "bootstrap: --data-nics"
        required: true
        default: "eth1"
      BOOTSTRAP_IS_SINGLE_NODE:
        description: "Bootstrap: deploy as single-node"
        type: boolean
        required: false
        default: false
      BOOTSTRAP_ENABLE_NODE_AFFINITY:
        description: "Bootstrap: enable node affinity"
        type: boolean
        required: false
        default: false
      TEST_CLASS:
        description: "Upgrade test class name (--testname); leave empty to run all upgrade tests"
        required: false
        type: string
        default: "major_upgrade"
      # Added for parity with the workflow_call interface above.
      RUN_LABEL:
        type: string
        description: "Optional label appended to artifact names to avoid collisions (e.g. 'run1')"
        required: false
        default: ""
# Only one upgrade run may touch the shared lab at a time; queued runs wait
# rather than cancelling an in-flight upgrade.
concurrency:
  group: simplyblock-lab-upgrade
  cancel-in-progress: false

jobs:
  bootstrap-and-upgrade:
    name: Pre-clean -> Bootstrap (${{ inputs.BASE_SBCLI_BRANCH }}) -> Upgrade (${{ inputs.TARGET_SBCLI_BRANCH }})
    runs-on: [self-hosted]
    timeout-minutes: 300
    env:
      # Upgrade-specific
      BASE_SBCLI_BRANCH: ${{ inputs.BASE_SBCLI_BRANCH }}
      TARGET_SBCLI_BRANCH: ${{ inputs.TARGET_SBCLI_BRANCH }}
      CUSTOM_IMAGES: ${{ inputs.CUSTOM_IMAGES || 'base_spdk="" target_spdk="" base_docker="" target_docker=""' }}
      # Cluster/lab env (|| fallbacks guard against empty-string inputs)
      STORAGE_PRIVATE_IPS: ${{ inputs.STORAGE_PRIVATE_IPS || '192.168.10.205 192.168.10.206 192.168.10.207 192.168.10.208' }}
      API_INVOKE_URL: ${{ inputs.API_INVOKE_URL || 'http://192.168.10.211/' }}
      # API_BASE_URL deliberately mirrors API_INVOKE_URL (both names are consumed downstream)
      API_BASE_URL: ${{ inputs.API_INVOKE_URL || 'http://192.168.10.211/' }}
      BASTION_IP: ${{ inputs.BASTION_IP || '192.168.10.211' }}
      BASTION_SERVER: ${{ inputs.BASTION_IP || '192.168.10.211' }}
      MNODES: ${{ inputs.MNODES || '192.168.10.211' }}
      NR_HUGEPAGES: ${{ inputs.NR_HUGEPAGES || '2048' }}
      GRAFANA_ENDPOINT: ${{ inputs.GRAFANA_ENDPOINT || 'http://192.168.10.211/grafana' }}
      SBCLI_CMD: ${{ inputs.SBCLI_CMD || 'sbctl' }}
      # SSH/client env
      SSH_USER: ${{ inputs.SSH_USER || 'root' }}
      KEY_PATH: ${{ inputs.KEY_PATH || '/home/ec2-user/.ssh/simplyblock-us-east-2.pem' }}
      CLIENTNODES: ${{ inputs.CLIENTNODES || '192.168.10.165 192.168.10.166' }}
      CLIENT_IP: ${{ inputs.CLIENTNODES || '192.168.10.165 192.168.10.166' }}
      # Cleanup
      NFS_MOUNTPOINT: ${{ inputs.NFS_MOUNTPOINT || '/mnt/nfs_share' }}
      # Bootstrap params
      BOOTSTRAP_MAX_LVOL: ${{ inputs.BOOTSTRAP_MAX_LVOL || '300' }}
      BOOTSTRAP_DATA_CHUNKS: ${{ inputs.BOOTSTRAP_DATA_CHUNKS || '2' }}
      BOOTSTRAP_PARITY_CHUNKS: ${{ inputs.BOOTSTRAP_PARITY_CHUNKS || '2' }}
      BOOTSTRAP_JOURNAL_PARTITION: ${{ inputs.BOOTSTRAP_JOURNAL_PARTITION || '1' }}
      BOOTSTRAP_HA_JM_COUNT: ${{ inputs.BOOTSTRAP_HA_JM_COUNT || '3' }}
      BOOTSTRAP_HA_TYPE: ${{ inputs.BOOTSTRAP_HA_TYPE || 'ha' }}
      BOOTSTRAP_DATA_NIC: ${{ inputs.BOOTSTRAP_DATA_NIC || 'eth1' }}
      TEST_CLASS: ${{ inputs.TEST_CLASS || 'major_upgrade' }}
      # Secrets
      SSH_PASSWORD: ${{ secrets.SSH_PASSWORD }}
      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
      MINIO_ACCESS_KEY: ${{ secrets.MINIO_ACCESS_KEY }}
      MINIO_SECRET_KEY: ${{ secrets.MINIO_SECRET_KEY }}
      SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }}
      # Placeholders overwritten via $GITHUB_ENV after bootstrap parsing
      CLUSTER_ID: ""
      CLUSTER_SECRET: ""
| steps: | |
| - name: Runner diagnostics | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| uname -a | |
| whoami | |
| pwd | |
| python3 --version || true | |
| git --version | |
| - name: Clear stale test artifacts from previous run | |
| shell: bash | |
| run: | | |
| rm -f sbcli/e2e/output.log || true | |
| - name: Install prereqs (sshpass) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| if command -v sshpass >/dev/null 2>&1; then | |
| exit 0 | |
| fi | |
| if command -v apt-get >/dev/null 2>&1; then | |
| sudo apt-get update -y | |
| sudo apt-get install -y sshpass | |
| elif command -v yum >/dev/null 2>&1; then | |
| sudo yum install -y epel-release || true | |
| sudo yum install -y sshpass | |
| elif command -v dnf >/dev/null 2>&1; then | |
| sudo dnf install -y sshpass | |
| else | |
| echo "ERROR: Cannot install sshpass (unknown package manager)." | |
| exit 1 | |
| fi | |
| - name: Parse CUSTOM_IMAGES overrides | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| custom="${CUSTOM_IMAGES}" | |
| for item in $custom; do | |
| key="${item%%=*}" | |
| value="${item#*=}" | |
| value="${value//\"/}" | |
| if [[ -z "$value" ]]; then | |
| echo "Skipping $key (empty)" | |
| continue | |
| fi | |
| case "$key" in | |
| base_spdk) echo "BASE_SPDK_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Base SPDK image set: $value" ;; | |
| target_spdk) echo "TARGET_SPDK_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Target SPDK image set: $value" ;; | |
| base_docker) echo "BASE_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "SIMPLY_BLOCK_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Base Docker image set: $value" ;; | |
| target_docker) echo "TARGET_DOCKER_IMAGE=$value" >> "$GITHUB_ENV" | |
| echo "Target Docker image set: $value" ;; | |
| *) echo "Unknown image key: $key (ignored)" ;; | |
| esac | |
| done | |
| - name: Resolve KEY_PATH and validate key exists | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| kp="${KEY_PATH}" | |
| kp="${kp%\"}"; kp="${kp#\"}" | |
| kp="${kp%\'}"; kp="${kp#\'}" | |
| if [[ "$kp" == .ssh/* ]]; then kp="${HOME}/${kp}"; fi | |
| if [[ "$kp" == ~/* ]]; then kp="${HOME}/${kp#~/}"; fi | |
| if [[ "$kp" == "~.ssh/"* ]]; then kp="${HOME}/.${kp#~.}"; fi | |
| echo "Resolved KEY_PATH=$kp" | |
| echo "KEY_PATH=$kp" >> "$GITHUB_ENV" | |
| test -f "$kp" || (echo "ERROR: SSH key not found at $kp" && exit 1) | |
| chmod 600 "$kp" || true | |
| - name: Export KEY_NAME from KEY_PATH | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| key_name="$(basename "${KEY_PATH}")" | |
| echo "KEY_NAME=${key_name}" >> "$GITHUB_ENV" | |
| echo "Exported KEY_NAME=${key_name}" | |
| - name: Validate required secrets exist | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| [[ -n "${SSH_PASSWORD}" ]] || (echo "ERROR: secrets.SSH_PASSWORD required" && exit 1) | |
| # ============================================================ | |
| # PRE-BOOTSTRAP CLEANUP | |
| # ============================================================ | |
| - name: Pre-clean kill fio/tmux and unmount NFS on MNODES + storage + clients | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| echo "---- $ip: kill fio/tmux + umount ${NFS_MOUNTPOINT} ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| pkill -9 fio || true; | |
| pkill -9 tmux || true; | |
| mp='${NFS_MOUNTPOINT}'; | |
| if mountpoint -q \"\$mp\"; then umount -f \"\$mp\" || umount \"\$mp\"; else | |
| if mount | grep -q \" \$mp \"; then umount -f \"\$mp\" || umount \"\$mp\" || true; fi | |
| fi" | |
| done | |
| - name: Destroy/clean storage nodes (deploy-cleaner, docker prune, uninstall sbcli, k3s) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| echo "---- storage destroy/clean: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| systemctl stop firewalld || true; | |
| systemctl stop ufw || true; | |
| sysctl -w net.ipv6.conf.all.disable_ipv6=1 || true; | |
| '${SBCLI_CMD}' sn deploy-cleaner || echo 'WARN: deploy-cleaner failed'; | |
| docker stop \$(docker ps -aq) || true; | |
| docker rm -f \$(docker ps -aq) || true; | |
| docker builder prune --all -f || true; | |
| docker system prune -af || true; | |
| docker volume prune -f || true; | |
| docker rmi -f \$(docker images -aq) || true; | |
| pip uninstall -y '${SBCLI_CMD}' || echo 'WARN: uninstall sbcli failed'; | |
| pip uninstall -y sbctl || echo 'WARN: uninstall sbctl failed'; | |
| rm -rf /usr/local/bin/sbc* || true; | |
| k3s-agent-uninstall.sh || true" | |
| sleep 10 | |
| done | |
| - name: Client cleanup disconnect lvols; unmount all /mnt; remove /mnt dirs | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| for ip in $CLIENTNODES; do | |
| echo "---- client disconnect lvols: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| subsystems=\$(nvme list-subsys | grep -i lvol | awk '{print \$3}' | cut -d '=' -f 2 || true); | |
| for s in \$subsystems; do nvme disconnect -n \"\$s\" || true; done" | |
| done | |
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| still=0 | |
| for ip in $uniq_targets; do | |
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | |
| "mount | grep -q \" ${NFS_MOUNTPOINT} \""; then | |
| echo "ERROR: ${NFS_MOUNTPOINT} still mounted on $ip" | |
| still=1 | |
| fi | |
| done | |
| [[ "$still" -eq 0 ]] || exit 1 | |
| for ip in $CLIENTNODES; do | |
| echo "---- client unmount all /mnt and remove dirs: $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| mps=\$(mount | grep ' /mnt' | awk '{print \$3}' || true); | |
| for mp in \$mps; do umount -f \"\$mp\" || umount \"\$mp\" || true; done; | |
| dirs=\$(find /mnt -mindepth 1 -type d 2>/dev/null || true); | |
| for d in \$dirs; do rm -rf \"\$d\" || true; done" | |
| done | |
| - name: Remove /etc/simplyblock; reboot storage; disk reset + PCI rebind + mklabel | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| run_remote() { | |
| local ip="$1" | |
| local script="$2" | |
| sshpass -p "${SSH_PASSWORD}" ssh \ | |
| -o StrictHostKeyChecking=no \ | |
| -o UserKnownHostsFile=/dev/null \ | |
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | |
| } | |
| targets="$MNODES $STORAGE_PRIVATE_IPS" | |
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for ip in $uniq_targets; do | |
| run_remote "$ip" "rm -rf /etc/simplyblock || true" | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "---- reboot storage: $ip ----" | |
| sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | |
| "nohup reboot >/dev/null 2>&1 &" || true | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "Waiting for $ip..." | |
| for i in {1..60}; do | |
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 "${SSH_USER}@${ip}" \ | |
| "echo online" >/dev/null 2>&1; then | |
| echo "$ip online" | |
| break | |
| fi | |
| sleep 10 | |
| [[ "$i" -lt 60 ]] || (echo "ERROR: $ip did not come online" && exit 1) | |
| done | |
| done | |
| for ip in $STORAGE_PRIVATE_IPS; do | |
| echo "---- disk reset on $ip ----" | |
| run_remote "$ip" "set -euxo pipefail; | |
| for dev in /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1; do | |
| if [[ -b \$dev ]]; then | |
| parted \$dev --script rm 1 || true; | |
| parted \$dev --script rm 2 || true; | |
| parted \$dev --script rm 3 || true; | |
| fi | |
| done; | |
| for pci in \$(lspci -D | grep 'QEMU NVM Express' | awk '{print \$1}' || true); do | |
| echo \$pci > /sys/bus/pci/drivers/nvme/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/uio_pci_generic/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/vfio-pci/unbind || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo nvme > /sys/bus/pci/devices/0000:00:0\${i}.0/driver_override || true; | |
| done; | |
| for i in 2 3 4 5; do | |
| echo 0000:00:0\${i}.0 > /sys/bus/pci/drivers/nvme/bind || true; | |
| done; | |
| for i in 0 1 2 3; do | |
| dev=/dev/nvme\${i}n1; | |
| if [[ -b \$dev ]]; then | |
| parted -fs \$dev mklabel gpt || true; | |
| fi | |
| done" | |
| done | |
| # ============================================================ | |
| # BOOTSTRAP (using BASE_SBCLI_BRANCH + BASE_SPDK_IMAGE) | |
| # ============================================================ | |
| - name: Clone simplyBlockDeploy (bootstrap repo) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| rm -rf simplyBlockDeploy | |
| git clone https://github.com/simplyblock-io/simplyBlockDeploy.git simplyBlockDeploy | |
| test -f simplyBlockDeploy/bare-metal/bootstrap-cluster.sh | |
| - name: Bootstrap cluster (BASE_SBCLI_BRANCH=${{ inputs.BASE_SBCLI_BRANCH }}) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| cd simplyBlockDeploy/bare-metal | |
| chmod +x ./bootstrap-cluster.sh | |
| extra_args=() | |
| # Images exported by the "Parse CUSTOM_IMAGES overrides" step | |
| if [[ -n "${SIMPLY_BLOCK_SPDK_ULTRA_IMAGE-}" ]]; then | |
| extra_args+=( --spdk-image "${SIMPLY_BLOCK_SPDK_ULTRA_IMAGE}" ) | |
| fi | |
| if [[ "${{ inputs.BOOTSTRAP_IS_SINGLE_NODE }}" == "true" ]]; then | |
| extra_args+=( --is-single-node true ) | |
| fi | |
| if [[ "${{ inputs.BOOTSTRAP_ENABLE_NODE_AFFINITY }}" == "true" ]]; then | |
| extra_args+=( --enable-node-affinity true ) | |
| fi | |
| set +e | |
| ./bootstrap-cluster.sh \ | |
| --sbcli-cmd "${SBCLI_CMD}" \ | |
| --max-lvol "${BOOTSTRAP_MAX_LVOL}" \ | |
| --data-chunks-per-stripe "${BOOTSTRAP_DATA_CHUNKS}" \ | |
| --parity-chunks-per-stripe "${BOOTSTRAP_PARITY_CHUNKS}" \ | |
| --journal-partition "${BOOTSTRAP_JOURNAL_PARTITION}" \ | |
| --ha-jm-count "${BOOTSTRAP_HA_JM_COUNT}" \ | |
| --ha-type "${BOOTSTRAP_HA_TYPE}" \ | |
| --data-nics "${BOOTSTRAP_DATA_NIC}" \ | |
| "${extra_args[@]}" | tee bootstrap.log | |
| rc=${PIPESTATUS[0]} | |
| set -e | |
| [[ "$rc" -eq 0 ]] || (echo "ERROR: bootstrap failed (rc=$rc)" && exit "$rc") | |
| - name: Fetch CLUSTER_ID and CLUSTER_SECRET from MNODES | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | |
| ssh_common=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i "${KEY_PATH}") | |
| cluster_id="$(ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" \ | |
| "${SBCLI_CMD} cluster list" | grep -Eo '[0-9a-fA-F]{8}-([0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}' | head -n 1)" | |
| if [[ -z "${cluster_id}" ]]; then | |
| echo "ERROR: Could not extract cluster_id from '${SBCLI_CMD} cluster list' on ${mgmt_ip}" | |
| ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" "${SBCLI_CMD} cluster list" || true | |
| exit 1 | |
| fi | |
| cluster_secret="$(ssh "${ssh_common[@]}" "${SSH_USER}@${mgmt_ip}" \ | |
| "${SBCLI_CMD} cluster get-secret ${cluster_id}" | tr -d '\r' | tail -n 1 | xargs)" | |
| if [[ -z "${cluster_secret}" ]]; then | |
| echo "ERROR: Could not get cluster_secret from '${SBCLI_CMD} cluster get-secret ${cluster_id}' on ${mgmt_ip}" | |
| exit 1 | |
| fi | |
| echo "CLUSTER_ID=${cluster_id}" >> "$GITHUB_ENV" | |
| echo "CLUSTER_SECRET=${cluster_secret}" >> "$GITHUB_ENV" | |
| echo "Fetched CLUSTER_ID=${cluster_id}" | |
| echo "Fetched CLUSTER_SECRET=***set***" | |
| # ============================================================ | |
| # UPGRADE E2E TESTS (clone from TARGET_SBCLI_BRANCH) | |
| # ============================================================ | |
| - name: Clone sbcli repo (prefer workflow branch; fallback to TARGET_SBCLI_BRANCH) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| rm -rf sbcli | |
| wf_branch="${{ github.ref_name }}" | |
| fallback_branch="${TARGET_SBCLI_BRANCH}" | |
| echo "Workflow branch: $wf_branch" | |
| echo "Fallback sbcli branch (TARGET_SBCLI_BRANCH): $fallback_branch" | |
| if git ls-remote --heads https://github.com/simplyblock-io/sbcli.git "$wf_branch" | grep -q "$wf_branch"; then | |
| echo "Cloning sbcli on workflow branch: $wf_branch" | |
| git clone --branch "$wf_branch" --single-branch https://github.com/simplyblock-io/sbcli.git sbcli | |
| else | |
| echo "Branch '$wf_branch' not found in sbcli; cloning fallback branch: $fallback_branch" | |
| git clone --branch "$fallback_branch" --single-branch https://github.com/simplyblock-io/sbcli.git sbcli | |
| fi | |
| test -f sbcli/e2e/upgrade_e2e.py | |
| test -f sbcli/e2e/e2e_tests/upgrade_tests/major_upgrade.py | |
| test -f sbcli/e2e/logs/cleanup.py | |
| - name: Install Python deps (best-effort) | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 -m pip install --upgrade pip | |
| if [[ -f "sbcli/e2e/requirements.txt" ]]; then | |
| pip install -r sbcli/e2e/requirements.txt | |
| fi | |
| - name: Cleanup logs before upgrade e2e | |
| shell: bash | |
| working-directory: sbcli/e2e | |
| run: | | |
| set -euxo pipefail | |
| python3 logs/cleanup.py | |
| - name: Set RUN_BASE_DIR | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| RUN_TIMESTAMP="$(date +%Y%m%d-%H%M%S)" | |
| RUN_BASE_DIR="${NFS_MOUNTPOINT}/upgrade-run-${RUN_TIMESTAMP}-${GITHUB_RUN_ID}" | |
| echo "RUN_BASE_DIR=${RUN_BASE_DIR}" >> "$GITHUB_ENV" | |
| mkdir -p "${RUN_BASE_DIR}" | |
| - name: Record test start time | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | |
| echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | |
| - name: Run upgrade e2e tests (major_upgrade) | |
| shell: bash | |
| working-directory: sbcli/e2e | |
| run: | | |
| set -euxo pipefail | |
| TESTNAME_ARGS=() | |
| if [[ -n "${TEST_CLASS:-}" ]]; then | |
| TESTNAME_ARGS=(--testname "${TEST_CLASS}") | |
| fi | |
| python3 -u upgrade_e2e.py \ | |
| --base_version "${BASE_SBCLI_BRANCH}" \ | |
| --target_version "${TARGET_SBCLI_BRANCH}" \ | |
| --base_spdk_image "${BASE_SPDK_IMAGE:-}" \ | |
| --target_spdk_image "${TARGET_SPDK_IMAGE:-}" \ | |
| --target_docker_image "${TARGET_DOCKER_IMAGE:-}" \ | |
| "${TESTNAME_ARGS[@]}" \ | |
| 2>&1 | tee output.log | |
| - name: Mark test end time (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | |
| echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | |
| - name: Collect mgmt snapshots into RUN_BASE_DIR (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 - <<'PY' | |
| import os, subprocess, json | |
| mgmt_ip = os.environ["MNODES"].split()[0] | |
| key = os.environ["KEY_PATH"] | |
| user = os.environ["SSH_USER"] | |
| sbcli = os.environ["SBCLI_CMD"] | |
| cluster_id = os.environ["CLUSTER_ID"] | |
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | |
| outdir = f"{run_base}/{mgmt_ip}/mgmt_details" | |
| os.makedirs(f"{outdir}/mgmt", exist_ok=True) | |
| os.makedirs(f"{outdir}/subtasks", exist_ok=True) | |
| os.makedirs(f"{outdir}/storage_nodes", exist_ok=True) | |
| ssh_base = [ | |
| "ssh", "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ConnectTimeout=10", | |
| f"{user}@{mgmt_ip}", | |
| ] | |
| def run_cmd(cmd, out_file): | |
| print(f" {cmd} -> {out_file}", flush=True) | |
| try: | |
| with open(out_file, "w") as f: | |
| subprocess.run(ssh_base + [cmd], stdout=f, stderr=subprocess.STDOUT, check=False, timeout=60) | |
| except Exception as e: | |
| print(f" WARN: failed: {e}", flush=True) | |
| run_cmd(f"{sbcli} cluster list", f"{outdir}/mgmt/cluster_list.txt") | |
| run_cmd(f"{sbcli} cluster status {cluster_id}", f"{outdir}/mgmt/cluster_status.txt") | |
| run_cmd(f"{sbcli} cluster show {cluster_id}", f"{outdir}/mgmt/cluster_show.txt") | |
| run_cmd(f"{sbcli} cluster get-capacity {cluster_id}", f"{outdir}/mgmt/cluster_capacity.txt") | |
| run_cmd(f"{sbcli} cluster get-logs {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_get_logs.txt") | |
| run_cmd(f"{sbcli} pool list", f"{outdir}/mgmt/pool_list.txt") | |
| run_cmd(f"{sbcli} lvol list", f"{outdir}/mgmt/lvol_list.txt") | |
| run_cmd(f"{sbcli} snapshot list", f"{outdir}/mgmt/snapshot_list.txt") | |
| run_cmd(f"{sbcli} sn list", f"{outdir}/mgmt/sn_list.txt") | |
| run_cmd(f"{sbcli} sn list --json", f"{outdir}/mgmt/sn_list.json") | |
| sn_uuids = [] | |
| try: | |
| with open(f"{outdir}/mgmt/sn_list.json") as f: | |
| data = json.load(f) | |
| for item in (data if isinstance(data, list) else []): | |
| uid = item.get("UUID") or item.get("uuid") or item.get("Id") or item.get("id") | |
| if uid: | |
| sn_uuids.append(uid) | |
| except Exception: | |
| pass | |
| for idx, uuid in enumerate(sn_uuids, 1): | |
| run_cmd(f"{sbcli} sn list-devices {uuid}", f"{outdir}/storage_nodes/node{idx}_list_devices.txt") | |
| run_cmd(f"{sbcli} sn check {uuid}", f"{outdir}/storage_nodes/node{idx}_check.txt") | |
| run_cmd(f"{sbcli} sn get {uuid}", f"{outdir}/storage_nodes/node{idx}_get.txt") | |
| run_cmd(f"{sbcli} cluster list-tasks {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_list_tasks.txt") | |
| bal_ids = [] | |
| try: | |
| with open(f"{outdir}/mgmt/cluster_list_tasks.txt") as f: | |
| for line in f: | |
| if line.startswith("+") or "Task ID" in line or "|" not in line: | |
| continue | |
| cols = [c.strip() for c in line.split("|")] | |
| if len(cols) >= 5 and cols[3] == "balancing_on_restart" and cols[1]: | |
| bal_ids.append(cols[1]) | |
| except Exception: | |
| pass | |
| for tid in bal_ids: | |
| run_cmd(f"{sbcli} cluster get-subtasks {tid}", f"{outdir}/subtasks/{tid}_subtasks.txt") | |
| PY | |
| - name: Collect docker logs into RUN_BASE_DIR (always) | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| TAG="containers-final-$(date +%Y%m%d_%H%M%S)" | |
| SSH_OPTS=(-i "${KEY_PATH}" -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -C) | |
| NODES="$(echo "${MNODES} ${STORAGE_PRIVATE_IPS}" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | |
| for NODE in ${NODES}; do | |
| echo ">>> Node: ${NODE}" | |
| LOCAL_NODE_DIR="${RUN_BASE_DIR}/${NODE}/${TAG}" | |
| mkdir -p "${LOCAL_NODE_DIR}" | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "docker ps -a 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/docker_ps_a_${NODE}.txt" || true | |
| CONTAINERS="$(ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker ps -a --format '{{.Names}}' 2>/dev/null || true" 2>/dev/null || true)" | |
| if [[ -z "${CONTAINERS}" ]]; then | |
| echo "No containers found on ${NODE}" > "${LOCAL_NODE_DIR}/_NO_CONTAINERS_${NODE}.txt" | |
| continue | |
| fi | |
| while IFS= read -r C; do | |
| [[ -z "${C}" ]] && continue | |
| echo " dumping: ${C}" | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker logs --timestamps --details '${C}' 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/${C}.txt" || true | |
| ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \ | |
| "docker inspect '${C}' 2>&1 || true" \ | |
| > "${LOCAL_NODE_DIR}/${C}_inspect.json" || true | |
| done <<< "${CONTAINERS}" | |
| done | |
| - name: Collect distrib debug dumps into RUN_BASE_DIR (always) | |
| if: always() | |
| timeout-minutes: 35 | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| python3 - <<'PY' | |
| import os, subprocess, sys | |
| ssh_user = os.environ["SSH_USER"] | |
| key = os.environ["KEY_PATH"] | |
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | |
| tag = "finaldistrib_bdev_logs" | |
| storage_ips = os.environ["STORAGE_PRIVATE_IPS"].split() | |
| ssh_base = [ | |
| "ssh", | |
| "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ServerAliveInterval=15", | |
| "-o", "ServerAliveCountMax=4", | |
| "-o", "ConnectTimeout=10", | |
| "-C", | |
| ] | |
| scp_base = [ | |
| "scp", | |
| "-i", key, | |
| "-o", "StrictHostKeyChecking=no", | |
| "-o", "UserKnownHostsFile=/dev/null", | |
| "-o", "ConnectTimeout=10", | |
| "-C", | |
| ] | |
| remote_script = """\ | |
| set -euo pipefail | |
| TS="$(date +%Y%m%d_%H%M%S)" | |
| HOST="$(hostname -s 2>/dev/null || hostname)" | |
| STAGING="/tmp/distrib_host_collect_${TS}" | |
| mkdir -p "$STAGING" | |
| CN="$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' | head -n1 || true)" | |
| if [[ -z "$CN" ]]; then echo "NO_SPDK_CONTAINER"; exit 0; fi | |
| SOCK="/mnt/ramdisk/${CN}/spdk.sock" | |
| BDEV_JSON="$(sudo docker exec "$CN" bash -lc "python spdk/scripts/rpc.py -s '$SOCK' bdev_get_bdevs" 2>/dev/null || true)" | |
| if [[ -z "$BDEV_JSON" ]]; then echo "BDEV_EMPTY"; exit 0; fi | |
| if command -v jq >/dev/null 2>&1; then | |
| mapfile -t DISTRIBS < <(printf '%s' "$BDEV_JSON" | jq -r '.[] | select(.name|startswith("distrib_")) | .name' | sort -u) | |
| else | |
| mapfile -t DISTRIBS < <(printf '%s\\n' "$BDEV_JSON" | grep -oE '"name"\\s*:\\s*"distrib_[^"]+"' | sed -E 's/.*"name"\\s*:\\s*"([^"]+)".*/\\1/' | sort -u) | |
| fi | |
| if [[ ${#DISTRIBS[@]} -eq 0 ]]; then echo "NO_DISTRIBS"; exit 0; fi | |
| for d in "${DISTRIBS[@]}"; do | |
| JF="/tmp/stack_${d}.json" | |
| python3 - "$d" "$JF" <<'PYIN' | |
| import json, sys | |
| d = sys.argv[1] | |
| jf = sys.argv[2] | |
| obj = {"subsystems":[{"subsystem":"distr","config":[{"method":"distr_debug_placement_map_dump","params":{"name":d}}]}]} | |
| with open(jf, "w") as f: | |
| f.write(json.dumps(obj)) | |
| PYIN | |
| sudo docker cp "$JF" "$CN:$JF" || true | |
| sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py '$JF' '$SOCK' > /tmp/rpc_${d}.log 2>&1 || true" || true | |
| sudo docker cp "$CN:/tmp/rpc_${d}.log" "$STAGING/rpc_${d}.log" 2>/dev/null || true | |
| for f in $(sudo docker exec "$CN" bash -lc "ls /tmp 2>/dev/null | grep -F \\\"$d\\\" || true"); do | |
| sudo docker cp "$CN:/tmp/$f" "$STAGING/${CN}__$f" 2>/dev/null || true | |
| done | |
| sudo docker exec "$CN" bash -lc "rm -f '$JF' '/tmp/rpc_${d}.log'" || true | |
| rm -f "$JF" || true | |
| done | |
| cat /proc/meminfo | grep -i huge > "$STAGING/hugepage_meminfo.txt" 2>/dev/null || true | |
| TAR="/tmp/${HOST}_distrib_dumps_${TS}.tgz" | |
| tar -C "$STAGING" -czf "$TAR" . 2>/dev/null || true | |
| echo "$TAR" | |
| """ | |
| # For every storage node: run the collection script remotely, then pull the | |
| # produced tarball back onto the runner via scp. Best-effort throughout — | |
| # a failing node is reported with a WARN and skipped, never fatal. | |
| for ip in storage_ips: | |
| print(f"=== {ip} ===", flush=True) | |
| # Feed remote_script on stdin of `bash -s`; merge stderr into stdout so the | |
| # last stdout line is still the sentinel/tar-path contract described above. | |
| cmd = ssh_base + [f"{ssh_user}@{ip}", "bash", "-s"] | |
| p = subprocess.run(cmd, input=remote_script.encode(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) | |
| out = p.stdout.decode(errors="replace").strip().splitlines() | |
| # Contract: last output line is either a sentinel or the tarball path. | |
| last = out[-1].strip() if out else "" | |
| last = last.replace("\r", "") | |
| # The startswith("/tmp/") guard also rejects the sentinels, so the explicit | |
| # tuple check is belt-and-braces for clearer WARN reporting. | |
| if last in ("NO_SPDK_CONTAINER", "BDEV_EMPTY", "NO_DISTRIBS") or not last.startswith("/tmp/"): | |
| print(f"[{ip}] WARN: distrib collection skipped/failed: {last or '(no output)'}", flush=True) | |
| continue | |
| dest_dir = f"{run_base}/{ip}/{tag}" | |
| os.makedirs(dest_dir, exist_ok=True) | |
| # Copy the tarball from the node into the per-run NFS layout. | |
| scp_cmd = scp_base + [f"{ssh_user}@{ip}:{last}", dest_dir + "/"] | |
| subprocess.run(scp_cmd, check=False) | |
| print(f"[{ip}] Saved -> {dest_dir}/{os.path.basename(last)}", flush=True) | |
| PY | |
| # ========================= | |
| # SUMMARY (always) | |
| # ========================= | |
| # Build the GitHub job summary (always runs): timing, pass/fail counts and | |
| # per-test table parsed from output.log, failure excerpt, artifact pointers. | |
| - name: Write Job Summary | |
| if: always() | |
| shell: bash | |
| run: | | |
| set -euxo pipefail | |
| # First MNODES entry is the management node referenced in the summary. | |
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | |
| out_log="sbcli/e2e/output.log" | |
| # --- Timing --- | |
| start="${TEST_START_EPOCH:-0}" | |
| end="${TEST_END_EPOCH:-0}" | |
| dur_sec=0 | |
| if [[ "$start" =~ ^[0-9]+$ && "$end" =~ ^[0-9]+$ && "$end" -ge "$start" ]]; then | |
| dur_sec=$((end-start)) | |
| fi | |
| dur_h=$((dur_sec/3600)); dur_m=$(((dur_sec%3600)/60)); dur_s=$((dur_sec%60)) | |
| dur_fmt="${dur_h}h ${dur_m}m ${dur_s}s" | |
| # --- Parse test counts from output.log (upgrade_e2e.py format) --- | |
| total_cases=0; passed_cases=0; failed_cases=0; skipped_cases=0 | |
| if [[ -f "${out_log}" ]]; then | |
| v="$(grep -m1 'Total Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && total_cases="${v}" | |
| v="$(grep -m1 'Passed:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && passed_cases="${v}" | |
| v="$(grep -m1 'Failed:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && failed_cases="${v}" | |
| v="$(grep -m1 'Skipped:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && skipped_cases="${v}" | |
| fi | |
| pass_pct=0; fail_pct=0; skip_pct=0 | |
| if [[ "${total_cases}" -gt 0 ]]; then | |
| pass_pct=$(( (passed_cases * 100) / total_cases )) | |
| fail_pct=$(( (failed_cases * 100) / total_cases )) | |
| skip_pct=$(( (skipped_cases * 100) / total_cases )) | |
| fi | |
| # --- Parse per-test status --- | |
| test_details_table="" | |
| if [[ -f "${out_log}" ]]; then | |
| while IFS= read -r line; do | |
| # Strip ANSI color codes before matching. | |
| clean="$(printf '%s' "${line}" | sed 's/\x1b\[[0-9;]*m//g')" | |
| # Allow underscores in test names, matching the Slack notifier's regex | |
| # (Test[A-Za-z0-9_]+) so names like Test_Major_Upgrade are not | |
| # truncated at the first underscore. | |
| test_name="$(printf '%s' "${clean}" | grep -oE 'Test[A-Za-z0-9_]+' | head -n1 || true)" | |
| [[ -z "${test_name}" ]] && continue | |
| if printf '%s' "${clean}" | grep -qi 'PASSED'; then icon="✅"; status="PASSED" | |
| elif printf '%s' "${clean}" | grep -qi 'FAILED'; then icon="❌"; status="FAILED" | |
| elif printf '%s' "${clean}" | grep -qi 'SKIPPED'; then icon="⏭"; status="SKIPPED" | |
| else continue | |
| fi | |
| test_details_table+="| \`${test_name}\` | ${icon} ${status} |"$'\n' | |
| done < <(grep -iE 'PASSED|FAILED|SKIPPED' "${out_log}" 2>/dev/null || true) | |
| fi | |
| # --- Failure reason --- | |
| failure_reason="" | |
| if [[ -f "${out_log}" ]]; then | |
| multi="$(grep 'MultipleExceptions:' "${out_log}" | sed 's/\x1b\[[0-9;]*m//g' || true)" | |
| if [[ -n "${multi}" ]]; then | |
| failure_reason="${multi}" | |
| elif grep -Eqi 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}"; then | |
| failure_reason="$(grep -Ei 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}" | tail -n 3 | sed 's/\x1b\[[0-9;]*m//g' || true)" | |
| fi | |
| fi | |
| # --- Mgmt artifacts list --- | |
| mgmt_dir="${RUN_BASE_DIR:-}/${mgmt_ip}/mgmt_details/mgmt" | |
| mgmt_files="(not found)" | |
| if [[ -n "${RUN_BASE_DIR:-}" && -d "${mgmt_dir}" ]]; then | |
| mgmt_files="$(find "${mgmt_dir}" -maxdepth 1 -type f -printf '%f (%s bytes)\n' 2>/dev/null | sort || true)" | |
| [[ -n "${mgmt_files}" ]] || mgmt_files="(empty)" | |
| fi | |
| # --- Overall result --- | |
| # NOTE: ${{ job.status }} is expanded by GitHub before the shell runs, so | |
| # the comparison below is against a literal string. | |
| conclusion="✅ SUCCESS" | |
| if [[ "${{ job.status }}" != "success" ]]; then | |
| conclusion="❌ FAILED" | |
| fi | |
| # Emit the whole summary through one grouped redirect. | |
| { | |
| echo "## SimplyBlock Upgrade E2E Run Summary" | |
| echo "" | |
| echo "**Result:** ${conclusion} | **Duration:** ${dur_fmt}" | |
| echo "" | |
| echo "### Upgrade" | |
| echo "| | Branch | SPDK Image |" | |
| echo "|---|---|---|" | |
| echo "| **Base** | \`${BASE_SBCLI_BRANCH}\` | \`${BASE_SPDK_IMAGE:-default}\` |" | |
| echo "| **Target** | \`${TARGET_SBCLI_BRANCH}\` | \`${TARGET_SPDK_IMAGE:-default}\` |" | |
| echo "" | |
| echo "### Test Results" | |
| echo "| | Count | % |" | |
| echo "|---|---|---|" | |
| echo "| ✅ Passed | ${passed_cases} | ${pass_pct}% |" | |
| echo "| ❌ Failed | ${failed_cases} | ${fail_pct}% |" | |
| echo "| ⏭ Skipped | ${skipped_cases} | ${skip_pct}% |" | |
| echo "| **Total** | **${total_cases}** | |" | |
| echo "" | |
| if [[ -n "${test_details_table}" ]]; then | |
| echo "### Test Case Details" | |
| echo "| Test | Result |" | |
| echo "|---|---|" | |
| printf '%s' "${test_details_table}" | |
| echo "" | |
| fi | |
| echo "### Run Info" | |
| echo "- **Test class:** \`${TEST_CLASS:-all}\`" | |
| echo "- **Cluster ID:** \`${CLUSTER_ID}\`" | |
| echo "- **Mgmt node:** \`${mgmt_ip}\`" | |
| echo "- **Start (UTC):** ${TEST_START_HUMAN:-unknown}" | |
| echo "- **End (UTC):** ${TEST_END_HUMAN:-unknown}" | |
| echo "" | |
| if [[ -n "${failure_reason}" ]]; then | |
| echo "### Failure Reason" | |
| echo '```' | |
| printf '%s\n' "${failure_reason}" | |
| echo '```' | |
| echo "" | |
| fi | |
| if [[ -n "${RUN_BASE_DIR:-}" ]]; then | |
| echo "<details><summary>Run Artifacts (NFS)</summary>" | |
| echo "" | |
| echo "- **Run dir:** \`${RUN_BASE_DIR}/\`" | |
| echo "- Mgmt details: \`${RUN_BASE_DIR}/${mgmt_ip}/mgmt_details/\`" | |
| echo "- Docker logs: \`${RUN_BASE_DIR}/<node_ip>/containers-final-*/\`" | |
| echo "- Distrib dumps: \`${RUN_BASE_DIR}/<storage_ip>/finaldistrib_bdev_logs/\`" | |
| echo "" | |
| echo "</details>" | |
| echo "" | |
| fi | |
| echo "<details><summary>Mgmt Artifacts (cluster state at end of run)</summary>" | |
| echo "" | |
| echo "Path: \`${mgmt_dir}\`" | |
| echo "" | |
| echo '```' | |
| printf '%s\n' "${mgmt_files}" | |
| echo '```' | |
| echo "" | |
| echo "</details>" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Post a best-effort Slack summary (always runs); exits 0 silently when no | |
| # webhook secret is configured, and never fails the job on delivery errors. | |
| - name: Send Slack Notification | |
| if: always() | |
| shell: bash | |
| env: | |
| SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} | |
| JOB_STATUS: ${{ job.status }} | |
| SLACK_RUN_URL: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| GITHUB_REF_NAME: ${{ github.ref_name }} | |
| SLACK_WF_NAME: "E2E (Upgrade Bootstrap)" | |
| run: | | |
| python3 - <<'PYEOF' | |
| import json, os, re, sys, urllib.request, urllib.error | |
| webhook = os.environ.get("SLACK_WEBHOOK_URL", "") | |
| if not webhook: | |
| print("No SLACK_WEBHOOK_URL set, skipping.") | |
| sys.exit(0) | |
| out_log = "sbcli/e2e/output.log" | |
| # errors="replace" keeps stray non-UTF-8 bytes in the log from raising | |
| # UnicodeDecodeError and killing the notification. | |
| content = open(out_log, encoding="utf-8", errors="replace").read() if os.path.isfile(out_log) else "" | |
| # --- Counts (upgrade_e2e.py format) --- | |
| # px: first integer captured by pat anywhere in the log, 0 when absent. | |
| def px(pat): | |
| m = re.search(pat, content) | |
| return int(m.group(1)) if m else 0 | |
| total = px(r'Total Cases:\s*(\d+)') | |
| passed = px(r'Passed:\s*(\d+)') | |
| failed = px(r'Failed:\s*(\d+)') | |
| skipped = px(r'Skipped:\s*(\d+)') | |
| pass_pct = (passed * 100 // total) if total > 0 else 0 | |
| # --- Per-test results --- | |
| # Strip ANSI color codes before matching test names / statuses. | |
| ansi = re.compile(r'\x1b\[[0-9;]*m') | |
| test_results = [] | |
| for line in content.splitlines(): | |
| clean = ansi.sub('', line) | |
| m = re.search(r'Test[A-Za-z0-9_]+', clean) | |
| if not m: | |
| continue | |
| name = m.group(0) | |
| if 'PASSED' in clean: test_results.append(('PASSED', name)) | |
| elif 'FAILED' in clean: test_results.append(('FAILED', name)) | |
| elif 'SKIPPED' in clean: test_results.append(('SKIPPED', name)) | |
| # --- Failure reason --- | |
| # Prefer a MultipleExceptions summary line; otherwise the last few | |
| # exception-looking lines. Capped at 2000 chars for Slack. | |
| failure_reason = "" | |
| multi = [ansi.sub('', l) for l in content.splitlines() if 'MultipleExceptions:' in l] | |
| if multi: | |
| failure_reason = multi[0][:2000] | |
| elif content: | |
| exc_lines = [ansi.sub('', l) for l in content.splitlines() | |
| if re.search(r'(Exception:|AssertionError|Input/output error)', l)] | |
| if exc_lines: | |
| failure_reason = '\n'.join(exc_lines[-5:])[:2000] | |
| # --- Env --- | |
| s = int(os.environ.get("TEST_START_EPOCH", "0") or "0") | |
| e = int(os.environ.get("TEST_END_EPOCH", "0") or "0") | |
| secs = max(0, e - s) if e >= s > 0 else 0 | |
| dur = f"{secs//3600}h {(secs%3600)//60}m {secs%60}s" | |
| run_url = os.environ.get("SLACK_RUN_URL", "") | |
| log_dir = os.environ.get("RUN_BASE_DIR", "N/A") | |
| base_branch = os.environ.get("BASE_SBCLI_BRANCH", "?") | |
| base_spdk = os.environ.get("BASE_SPDK_IMAGE", "") or "default" | |
| target_branch = os.environ.get("TARGET_SBCLI_BRANCH", "?") | |
| target_spdk = os.environ.get("TARGET_SPDK_IMAGE", "") or "default" | |
| test_cls = os.environ.get("TEST_CLASS", "") or "all" | |
| branch = os.environ.get("GITHUB_REF_NAME", "?") | |
| wf_name = os.environ.get("SLACK_WF_NAME", "Run") | |
| ok = os.environ.get("JOB_STATUS", "") == "success" | |
| icon = ":white_check_mark:" if ok else ":x:" | |
| status = "SUCCESS" if ok else "FAILURE" | |
| mention = "" if ok else " <!channel>" | |
| lines = [ | |
| f"{icon} *SimplyBlock {wf_name}*{mention}", | |
| f"*Status:* {status} | *Duration:* {dur}", | |
| f"*Branch:* `{branch}` | *Test class:* `{test_cls}`", | |
| f"*Upgrade:* `{base_branch}` → `{target_branch}`", | |
| f"*Base SPDK:* `{base_spdk}` | *Target SPDK:* `{target_spdk}`", | |
| "", | |
| ] | |
| if total > 0: | |
| lines += [ | |
| f":white_check_mark: *Passed:* {passed}/{total} ({pass_pct}%)", | |
| f":x: *Failed:* {failed}", | |
| f":fast_forward: *Skipped:* {skipped}", | |
| ] | |
| else: | |
| lines.append("_(test counts not found in log)_") | |
| if test_results: | |
| lines.append("") | |
| lines.append("*Test Results:*") | |
| icons = {'PASSED': ':white_check_mark:', 'FAILED': ':x:', 'SKIPPED': ':fast_forward:'} | |
| for st, nm in test_results: | |
| lines.append(f"{icons.get(st, ':grey_question:')} `{nm}`") | |
| if failure_reason: | |
| lines += ["", "*Failure:*", f"```{failure_reason}```"] | |
| lines += [ | |
| "", | |
| f":link: *Run:* <{run_url}|View on GitHub>", | |
| f":file_folder: *Final Logs:* `{log_dir}`", | |
| ] | |
| # Plain incoming-webhook payload; Slack renders mrkdwn in "text". | |
| payload = {"text": "\n".join(lines)} | |
| req = urllib.request.Request( | |
| webhook, | |
| data=json.dumps(payload).encode(), | |
| headers={"Content-Type": "application/json"}, | |
| ) | |
| # Delivery is best-effort: warn on any failure, never fail the job. | |
| try: | |
| urllib.request.urlopen(req, timeout=15) | |
| print("Slack notification sent.") | |
| except Exception as exc: | |
| print(f"WARN: Slack notification failed: {exc}", file=sys.stderr) | |
| PYEOF | |
| # Always upload bootstrap + e2e logs as a run artifact, even after failures; | |
| # RUN_LABEL (when set) is appended to keep artifact names unique per matrix run. | |
| - name: Upload logs (always) | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: simplyblock-upgrade-logs-${{ github.run_id }}${{ inputs.RUN_LABEL != '' && format('-{0}', inputs.RUN_LABEL) || '' }} | |
| path: | | |
| simplyBlockDeploy/bare-metal/bootstrap.log | |
| sbcli/e2e/output.log | |
| sbcli/e2e/logs/** | |
| if-no-files-found: warn | |
| # Export the first MNODES entry as MGMT_IP so later steps can build | |
| # per-management-node artifact paths via ${{ env.MGMT_IP }}. | |
| - name: Export MGMT_IP (first MNODES) | |
| if: always() | |
| shell: bash | |
| run: | | |
| echo "MGMT_IP=$(echo "${MNODES}" | awk '{print $1}')" >> "$GITHUB_ENV" | |
| # Upload a small curated subset (output.log + mgmt text dumps from the NFS | |
| # run dir); only runs when both RUN_BASE_DIR and MGMT_IP env vars were set | |
| # by earlier steps. | |
| - name: Upload small artifacts (always) | |
| if: always() && env.RUN_BASE_DIR != '' && env.MGMT_IP != '' | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: simplyblock-upgrade-small-logs-${{ github.run_id }}${{ inputs.RUN_LABEL != '' && format('-{0}', inputs.RUN_LABEL) || '' }} | |
| path: | | |
| sbcli/e2e/output.log | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/mgmt/*.txt | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/subtasks/*.txt | |
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/storage_nodes/*.txt | |