|
| 1 | +#!/bin/sh |
| 2 | + |
| 3 | +# ReadonlyFilesystem detector with auto-recovery capability. |
| 4 | +# |
| 5 | +# Strategy: |
| 6 | +# 1. Find all devices remounted read-only in last 5 minutes (from /dev/kmsg) |
| 7 | +# 2. Check if ANY of those devices are CURRENTLY mounted read-only |
| 8 | +# 3. If ANY device is RO -> set ReadonlyFilesystem=True (exit 1) |
| 9 | +# 4. Get all devices remounted read-only in history (from /dev/kmsg) |
| 10 | +# 5. Check if ALL of those devices are CURRENTLY mounted read-write OR not mounted at all |
| 11 | +# 6. If ALL devices recovered -> set ReadonlyFilesystem=False (exit 0) |
| 12 | + |
| 13 | + |
# Exit codes reported to the caller.
OK=0      # All devices recovered, clear condition (condition: False)
NONOK=1   # At least one device still RO (condition: True)

# Mount tables consulted by is_device_readonly: host view first, local view as fallback.
MOUNTS_FILE_HOST="/host/proc/1/mounts"
MOUNTS_FILE_LOCAL="/proc/mounts"

# Detection window in seconds (5 minutes).
LOOKBACK_SEC=300

# Freeze all of the above so later code cannot reassign them.
readonly OK NONOK MOUNTS_FILE_HOST MOUNTS_FILE_LOCAL LOOKBACK_SEC
| 19 | + |
# Extract the device name from a kernel message.
#
# The device is taken from the FIRST parenthesized group of the message, which
# is where it appears in the examples below (right after the filesystem tag).
# Parenthesized text later in the message body is ignored, and a leading
# "device " word inside the parentheses is dropped.
#
# Example: "EXT4-fs (sda1): remounting filesystem read-only"  -> sda1
# Example: "XFS (dm-3): remounting filesystem read-only"      -> dm-3
# Example: "BTRFS error (device sdb2): ..."                   -> sdb2
#
# Arguments: $1 - one kernel message line
# Output:    the device name on stdout (nothing when none is found)
# Returns:   0 if a non-empty device name was found, 1 otherwise
extract_device_name() {
    _msg="$1"

    # [^(]* (instead of .*) anchors the match to the first "(" in the line,
    # so a later "(...)" in the message cannot replace the device name.
    _device=$(printf '%s\n' "$_msg" | sed -n 's/^[^(]*(\([^)]*\)).*/\1/p')

    # Some messages spell the group as "(device NAME)"; keep only NAME.
    _device="${_device#device }"

    [ -n "$_device" ] || return 1
    printf '%s\n' "$_device"
    return 0
}
| 33 | + |
# Collect the device names mentioned in a batch of kmsg messages.
#
# Arguments: $1 - kmsg messages, one per line
# Output:    one device name per line, de-duplicated and sorted (sort -u)
extract_devices_from_messages() {
    _kmsg_text="$1"

    printf '%s\n' "$_kmsg_text" | while IFS= read -r _entry; do
        # Blank lines carry no device; move on to the next message.
        if [ -z "$_entry" ]; then
            continue
        fi

        # Emit the device only when the extractor produced a non-empty name.
        _found=$(extract_device_name "$_entry")
        case "$_found" in
            '') ;;
            *) printf '%s\n' "$_found" ;;
        esac
    done | sort -u
}
| 50 | + |
# Build the list of paths under which a kernel device name may show up as the
# source column of a mount table.
#
# Arguments: $1 - device name as reported by the kernel (e.g. sda1, dm-3, pxd!123)
# Output:    a single line with space-separated candidate paths; the bare
#            name and /dev/<name> always come first
get_device_paths() {
    _name="$1"

    # Baseline candidates: the bare name and its /dev node.
    _out="$_name /dev/$_name"

    # dm-X devices: add every udev symlink (reported relative to /dev) when
    # udevadm is available; a failing udevadm simply contributes nothing.
    case "$_name" in
        dm-*)
            if command -v udevadm >/dev/null 2>&1; then
                for _alias in $(udevadm info --query=symlink --name="/dev/$_name" 2>/dev/null || true); do
                    _out="$_out /dev/$_alias"
                done
            fi
            ;;
    esac

    # Names containing '!' (Portworx dm-name format).
    # Example: pxd!123 -> /dev/mapper/pxd!123
    # Example: pxd!pxd952712810427059188 -> /dev/mapper/pxd!pxd952712810427059188
    #          AND /dev/pxd/pxd952712810427059188
    case "$_name" in
        *!*)
            _out="$_out /dev/mapper/$_name"

            # When the part after the first '!' looks like pxd<digit>...,
            # it may also live under /dev/pxd/.
            _suffix="${_name#*!}"
            case "$_suffix" in
                pxd[0-9]*) _out="$_out /dev/pxd/$_suffix" ;;
            esac
            ;;
    esac

    # Names containing '-' or starting with four hex characters (WWID-like)
    # are often found under /dev/mapper/.
    # Example: pwx0-206233844786798552 -> /dev/mapper/pwx0-206233844786798552
    # Example: 3624a93704cdc47b41e974dd913a8eac2 -> /dev/mapper/3624a93704cdc47b41e974dd913a8eac2
    case "$_name" in
        *-*|[0-9a-f][0-9a-f][0-9a-f][0-9a-f]*) _out="$_out /dev/mapper/$_name" ;;
    esac

    # Portworx pxd devices named pxd<digit>... live in /dev/pxd/.
    # Example: pxd342462708072724230 -> /dev/pxd/pxd342462708072724230
    case "$_name" in
        pxd[0-9]*) _out="$_out /dev/pxd/$_name" ;;
    esac

    printf '%s\n' "$_out"
}
| 118 | + |
# Tell whether a device is currently mounted read-only.
#
# Arguments: $1 - device name as reported by the kernel
# Output:    one "Device ... is read-only" line for the first read-only mount found
# Returns:   0 if the device is RO; 1 if it is RW, not mounted, or no mount
#            table could be read (treated as "not RO")
is_device_readonly() {
    _target="$1"

    # Prefer the host's mount table; use the local one when it is unreadable.
    if [ -r "$MOUNTS_FILE_HOST" ]; then
        _table="$MOUNTS_FILE_HOST"
    else
        _table="$MOUNTS_FILE_LOCAL"
    fi

    # Without any readable table the state cannot be determined: assume not RO.
    [ -r "$_table" ] || return 1

    # Every path the device might be mounted under.
    _candidates=$(get_device_paths "$_target")

    # Mount table format: device mountpoint fstype options dump pass
    while IFS=' ' read -r _src _where _type _opts _tail; do
        for _cand in $_candidates; do
            # Only the first candidate equal to this mount's source matters.
            [ "$_src" = "$_cand" ] || continue

            # Wrapping the option list in commas lets "ro" match as a whole option.
            case ",$_opts," in
                *,ro,*)
                    printf 'Device %s at %s is read-only\n' "$_src" "$_where"
                    return 0  # Device is RO
                    ;;
            esac
            break
        done
    done < "$_table"

    # Device not found or is RW
    return 1
}
printf 'Scanning /dev/kmsg for '\''Remounting filesystem read-only'\'' messages...\n'

# Report whether any device in a whitespace-separated list is currently
# mounted read-only. Every device is checked (no early exit) so that
# is_device_readonly prints a line for each read-only device.
# Arguments: $1 - device names separated by whitespace/newlines
# Returns:   0 if at least one device is RO, 1 otherwise
any_device_readonly() {
    _ro_seen=1
    for _ro_dev in $1; do
        if is_device_readonly "$_ro_dev"; then
            _ro_seen=0
        fi
    done
    return "$_ro_seen"
}

# Step 1: Capture the matching records from /dev/kmsg.
# Without a readable /dev/kmsg nothing can be detected; report condition False.
if [ ! -r /dev/kmsg ]; then
    printf 'Warning: /dev/kmsg not readable, ReadonlyFilesystem condition: False\n'
    exit $OK
fi

# Calculate cutoff timestamp in microseconds for the lookback window.
# /proc/uptime -> seconds since boot (float). Convert to microseconds and
# subtract the lookback; clamp at 0 when the system has been up for less.
if [ -f /proc/uptime ] && [ -r /proc/uptime ]; then
    CUTOFF_US=$(awk -v lb="$LOOKBACK_SEC" '
        NR==1 {
            up = $1 + 0
            c = (up - lb) * 1000000
            if (c < 0) c = 0
            printf("%.0f\n", c)
            exit
        }' /proc/uptime 2>/dev/null)

    # If awk produced nothing, fall back to "no cutoff".
    if [ -z "$CUTOFF_US" ]; then
        CUTOFF_US=0
    fi
else
    # /proc/uptime not available: cutoff 0 keeps all messages.
    CUTOFF_US=0
fi

# Read /dev/kmsg exactly ONCE and keep only the records of interest.
# cat does not stop by itself at the end of the buffer, so the read is bounded
# by timeout; doing it once (instead of once per step) halves the run time and
# makes detection and recovery work on the same snapshot.
# Records are kept with their "prio,seq,timestamp_us,...;" header so the
# timestamp can still be inspected below.
if command -v timeout >/dev/null 2>&1; then
    kmsg_records=$(timeout 10 cat /dev/kmsg 2>/dev/null | grep -iF "remounting filesystem read-only" 2>/dev/null || true)
else
    # Without timeout the read could block forever: skip detection and recovery.
    printf 'Warning: timeout command not available, cannot safely read /dev/kmsg\n'
    kmsg_records=""
fi

# Messages inside the lookback window (DETECTION): keep records whose third
# header field (timestamp in microseconds) is at or after the cutoff, and
# print only the message text after the first ';'.
kmsg_output_recent=$(printf '%s\n' "$kmsg_records" | awk -v cutoff="$CUTOFF_US" '
    /^[ \t]/ { next }
    {
        semi = index($0, ";"); if (!semi) next
        n = split(substr($0, 1, semi - 1), h, ","); if (n < 3) next
        if (h[3] + 0 >= cutoff) print substr($0, semi + 1)
    }' 2>/dev/null || true)

# All messages still in the buffer (RECOVERY): same extraction, no time filter.
kmsg_output_all=$(printf '%s\n' "$kmsg_records" | awk '
    /^[ \t]/ { next }
    {
        semi = index($0, ";"); if (!semi) next
        print substr($0, semi + 1)
    }' 2>/dev/null || true)

# Extract device names from recent messages (lookback window)
devices_recent=$(extract_devices_from_messages "$kmsg_output_recent")

# Step 2: Check current mount state of recently affected devices.
# If any of them is RO right now, the condition is True.
# Otherwise fall through to the recovery check over the full history.
if [ -n "$devices_recent" ]; then
    any_device_ro=0
    if any_device_readonly "$devices_recent"; then
        any_device_ro=1
    fi

    printf '\n'
    if [ "$any_device_ro" -eq 1 ]; then
        printf 'At least one device is currently read-only. ReadonlyFilesystem condition: True\n'
        exit $NONOK  # Exit 1 = Condition True
    fi
fi

# Step 3: Extract ALL device names ever mentioned (no time limit) for the RECOVERY check
devices_all=$(extract_devices_from_messages "$kmsg_output_all")

# Step 4: Check if any of those devices is still RO
if [ -n "$devices_all" ]; then
    any_device_ro=0
    if any_device_readonly "$devices_all"; then
        any_device_ro=1
    fi

    printf '\n'
    if [ "$any_device_ro" -eq 1 ]; then
        printf 'At least one device is still read-only, ReadonlyFilesystem condition: True\n'
        exit $NONOK  # Exit 1 = Condition True (not recovered)
    else
        printf 'All devices have recovered, ReadonlyFilesystem condition: False\n'
        exit $OK  # Exit 0 = Condition False (recovered!)
    fi
else
    printf 'No '\''Remounting filesystem read-only'\'' messages found in kmsg, ReadonlyFilesystem condition: False\n'
    exit $OK
fi
0 commit comments