#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# File: 01-verify-node-filesystem-mounts.sh
# Purpose:
#   Verify that the Nebius Shared Filesystem is mounted on the selected
#   Kubernetes node(s) at the expected host path before any pod-level storage
#   testing begins.
#
# Why We Run This:
#   The Nebius CSI workflow in this repo depends on the shared filesystem
#   already being attached and mounted on each node. If a node is missing the
#   host mount, later PVC or pod checks can fail in ways that are harder to
#   diagnose.
#
# Reference Docs:
#   https://docs.nebius.com/kubernetes/storage/filesystem-over-csi
#
# Repo Sources of Truth:
#   - ../../modules/cloud-init/k8s-cloud-init.tftpl
#   - ../main.tf
#
# What This Script Checks:
#   - The mount exists at /mnt/data (or the value of MOUNT_POINT)
#   - The mount is present in /etc/fstab
#   - The mounted filesystem reports capacity via df
#   - The target directory exists on the host
#
# Usage:
#   ./01-verify-node-filesystem-mounts.sh
#
# Optional Environment Variables:
#   TEST_NAMESPACE    Namespace used for the temporary node-debugger pods.
#                     Defaults to the current kubectl namespace or default.
#   MOUNT_POINT       Host path to validate. Defaults to the Terraform mount.
#   DEBUG_IMAGE       Image used by kubectl debug. Defaults to ubuntu.
#   VERIFY_ALL_NODES  When true, validates every node in the cluster. Defaults
#                     to false, in which case only the first node returned by
#                     kubectl is validated.
#   TARGET_NODE       Specific node to validate. Accepts either
#                     node/<name> or <name>. Overrides VERIFY_ALL_NODES.
#
# Created By: Aaron Fagan
# Created On: 2026-03-17
# Version: 0.1.0
# -----------------------------------------------------------------------------
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail). The node check loop relies
# on pipefail so that a failing `kubectl debug | tee` pipeline is detected.
set -euo pipefail

# Resolve the directory containing this script so common.sh is found no matter
# which working directory the script is launched from.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# NOTE(review): the logging helpers (log_step/log_info/log_pass/log_fail),
# require_command, ensure_state_dir, and the TEST_NAMESPACE, MOUNT_POINT and
# DEBUG_POD_RECORD_FILE variables are used below but not defined in this file;
# presumably common.sh provides them - confirm.
# shellcheck source=./common.sh
source "${SCRIPT_DIR}/common.sh"

# Caller-overridable settings; each keeps a value already set in the environment.
DEBUG_IMAGE="${DEBUG_IMAGE:-ubuntu}"
VERIFY_ALL_NODES="${VERIFY_ALL_NODES:-false}"
TARGET_NODE="${TARGET_NODE:-}"
# Set to 1 as soon as any node fails its checks; used as the script exit status.
FAILED=0

#######################################
# Normalize a node reference to the "node/<name>" form printed by
# `kubectl get nodes -o name`.
# Arguments:
#   $1 - node reference: "<name>", "node/<name>", or "nodes/<name>"
# Outputs:
#   Writes "node/<name>" followed by a newline to stdout.
# Returns:
#   0 on success; the shell running the function exits non-zero with a
#   message on stderr when the argument is missing or empty.
#######################################
normalize_node_name() {
  local node_name="${1:?normalize_node_name: a node name is required}"

  case "${node_name}" in
    node/*)
      # Already in canonical form.
      printf '%s\n' "${node_name}"
      ;;
    nodes/*)
      # Plural resource prefix: swap it for the singular form instead of
      # producing "node/nodes/<name>".
      printf 'node/%s\n' "${node_name#nodes/}"
      ;;
    *)
      printf 'node/%s\n' "${node_name}"
      ;;
  esac
}

# Announce the effective configuration so the run is self-describing in logs.
log_step "Starting Nebius Shared Filesystem mount verification"
log_info "Namespace for temporary debug pods: ${TEST_NAMESPACE}"
log_info "Expected mount point: ${MOUNT_POINT}"
log_info "Debug image: ${DEBUG_IMAGE}"

# Check every external command this script invokes locally before doing any
# cluster work. tee is included because the node check loop pipes kubectl's
# output through it.
log_step "Checking required local dependencies"
require_command kubectl
require_command awk
require_command mktemp
require_command tee
log_pass "Required local commands for node mount verification are available"

# Make sure the record file exists up front; the node check loop appends one
# "<namespace> <pod>" line per debugger pod it can identify.
log_step "Preparing local state for debugger pod cleanup"
ensure_state_dir
touch "${DEBUG_POD_RECORD_FILE}"
log_info "Debugger pod record file: ${DEBUG_POD_RECORD_FILE}"
log_info "New debugger pods from this run will be appended for later cleanup"

log_step "Selecting which nodes to validate"

# Capture the node list before parsing it. Reading straight from a process
# substitution would hide a kubectl failure (its exit status is discarded) and
# misreport it as an empty cluster.
if ! node_listing="$(kubectl get nodes -o name)"; then
  log_fail "Unable to list Kubernetes nodes with kubectl"
  exit 1
fi

# One "node/<name>" entry per element; blank lines are skipped.
ALL_NODES=()
while IFS= read -r node; do
  if [[ -n "${node}" ]]; then
    ALL_NODES+=("${node}")
  fi
done <<< "${node_listing}"

if [[ "${#ALL_NODES[@]}" -eq 0 ]]; then
  log_fail "No Kubernetes nodes were returned by kubectl"
  exit 1
fi

# Selection precedence: explicit TARGET_NODE, then VERIFY_ALL_NODES=true,
# otherwise only the first node kubectl returned.
if [[ -n "${TARGET_NODE}" ]]; then
  TARGET_NODE="$(normalize_node_name "${TARGET_NODE}")"

  # Fail early on a mistyped node name instead of letting kubectl debug fail
  # later with a less obvious error.
  target_found="false"
  for node in "${ALL_NODES[@]}"; do
    if [[ "${node}" == "${TARGET_NODE}" ]]; then
      target_found="true"
      break
    fi
  done
  if [[ "${target_found}" != "true" ]]; then
    log_fail "Requested node ${TARGET_NODE} was not found in the cluster node list"
    exit 1
  fi

  NODES_TO_CHECK=("${TARGET_NODE}")
  log_info "Using explicitly requested node: ${TARGET_NODE}"
elif [[ "${VERIFY_ALL_NODES}" == "true" ]]; then
  NODES_TO_CHECK=("${ALL_NODES[@]}")
  log_info "VERIFY_ALL_NODES=true, so every node will be checked"
else
  NODES_TO_CHECK=("${ALL_NODES[0]}")
  log_info "Defaulting to a single-node validation using: ${NODES_TO_CHECK[0]}"
fi

log_pass "Selected ${#NODES_TO_CHECK[@]} node(s) for shared filesystem mount validation"

log_step "Checking Nebius Shared Filesystem mounts on the selected Kubernetes nodes"

# Script run by `sh` inside the node's root filesystem (via chroot /host).
# It is identical for every node, so it is built once here. The mount point
# arrives as the positional parameter $1 and is handed to awk with -v, so its
# value is never spliced into shell or awk source text and is always quoted.
# `set -eu` stops at the first failing check, which means the final
# "[result] PASS" line is only printed when every check succeeded.
remote_check_script='
set -eu
mount_point="$1"
printf "%s\n" "[check] Verifying that the Nebius Shared Filesystem is actively mounted at ${mount_point}"
mount | awk -v mp="${mount_point}" "\$3 == mp { print; found=1 } END { exit found ? 0 : 1 }"
printf "%s\n" "[check] Verifying that the mount is persisted in /etc/fstab for node reboot safety"
awk -v mp="${mount_point}" "\$2 == mp { print; found=1 } END { exit found ? 0 : 1 }" /etc/fstab
printf "%s\n" "[check] Verifying that the mounted filesystem reports capacity and is readable"
df -h "${mount_point}"
printf "%s\n" "[check] Verifying that the target directory exists on the host"
test -d "${mount_point}"
printf "%s\n" "[result] PASS: shared filesystem host mount is active and healthy at ${mount_point} on this node"
'

for node in "${NODES_TO_CHECK[@]}"; do
  echo
  echo "------------------------------------------------------------"
  echo "=== ${node} ==="

  # kubectl's combined output is shown live and also captured so the debugger
  # pod name and the PASS marker can be extracted afterwards.
  output_file="$(mktemp)"
  node_ok="true"

  # --quiet is intentionally not passed: it suppresses kubectl's informational
  # messages, and the "Creating debugging pod <name> ..." message is the line
  # parsed below to record the pod for later cleanup.
  if ! kubectl debug -n "${TEST_NAMESPACE}" "${node}" \
    --attach=true \
    --image="${DEBUG_IMAGE}" \
    --profile=sysadmin -- \
    chroot /host sh -lc "${remote_check_script}" sh "${MOUNT_POINT}" \
    2>&1 | tee "${output_file}"; then
    # With pipefail set, this branch is taken when kubectl (or tee) fails.
    node_ok="false"
  elif ! awk 'index($0, "[result] PASS:") { found = 1 } END { exit found ? 0 : 1 }' \
    "${output_file}"; then
    # NOTE(review): kubectl debug may not propagate the remote command's exit
    # status - confirm. Requiring the PASS marker keeps a failed remote check
    # from being counted as healthy either way.
    node_ok="false"
  fi

  if [[ "${node_ok}" != "true" ]]; then
    FAILED=1
    echo "[result] FAIL: ${node} does not have a healthy shared filesystem mount at ${MOUNT_POINT}" >&2
  fi

  # Field 4 of "Creating debugging pod <name> with container ..." is the pod
  # name; keep the last match and append "<namespace> <pod>" to the record file.
  debug_pod_name="$(awk '/Creating debugging pod / { print $4 }' "${output_file}" | tail -n 1)"
  if [[ -n "${debug_pod_name}" ]]; then
    printf '%s %s\n' "${TEST_NAMESPACE}" "${debug_pod_name}" >> "${DEBUG_POD_RECORD_FILE}"
  fi
  rm -f -- "${output_file}"
done

# Final summary. FAILED is 0 only when every checked node passed, and it is
# also used as the process exit status so callers and CI can gate on it.
if [[ "${FAILED}" -eq 0 ]]; then
  log_step "Shared filesystem mount verification completed successfully"
  log_info "All checked nodes reported a healthy mount at ${MOUNT_POINT}"
else
  log_step "Shared filesystem mount verification completed with failures"
  log_info "Review the node output above for the failing mount checks"
fi

exit "${FAILED}"