Skip to content

Commit 937994a

Browse files
Ability to revert ReadonlyFilesystem condition to False when recovered
1 parent 49ee7e7 commit 937994a

File tree

3 files changed

+436
-0
lines changed

3 files changed

+436
-0
lines changed
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
#!/bin/sh
2+
3+
# ReadonlyFilesystem detector with auto-recovery capability.
4+
#
5+
# Strategy:
6+
# 1. Find all devices remounted read-only in last 5 minutes (from /dev/kmsg)
7+
# 2. Check if ANY of those devices are CURRENTLY mounted read-only
8+
# 3. If ANY device is RO -> set ReadonlyFilesystem=True (exit 1)
9+
# 4. Get all devices remounted read-only in history (from /dev/kmsg)
10+
# 5. Check if ALL of those devices are CURRENTLY mounted read-write OR not mounted at all
11+
# 6. If ALL devices recovered -> set ReadonlyFilesystem=False (exit 0)
12+
13+
14+
readonly OK=0 # Exit status reported when no tracked device is read-only (condition: False)
15+
readonly NONOK=1 # Exit status reported when at least one device is still RO (condition: True)
16+
readonly MOUNTS_FILE_HOST="/host/proc/1/mounts" # Mount table tried first by is_device_readonly (presumably the host's table mounted into the container - confirm)
17+
readonly MOUNTS_FILE_LOCAL="/proc/mounts" # Fallback mount table used when MOUNTS_FILE_HOST is not readable
18+
readonly LOOKBACK_SEC=300 # Detection window: 5 minutes of kernel log, in seconds
19+
20+
# Extract device name from kernel message
21+
# Example: "EXT4-fs (sda1): remounting filesystem read-only"
22+
# Example: "XFS (dm-3): remounting filesystem read-only"
23+
extract_device_name() {
    # Extract the block device name from one kernel log message.
    #
    # The device is taken from the FIRST parenthesised group of the message:
    #   "EXT4-fs (sda1): remounting filesystem read-only" -> sda1
    #   "XFS (dm-3): remounting filesystem read-only"     -> dm-3
    # A leading "device " inside the parentheses, as in "(device sda1)", is
    # dropped. Anchoring on the first group keeps any later parenthesised
    # text in the message from being mistaken for the device name.
    #
    # Arguments: $1 - kernel message text (a single line)
    # Output:    the device name on stdout
    # Returns:   0 when a non-empty device name was found, 1 otherwise
    _msg="$1"
    # [^(]* cannot cross an opening parenthesis, so the match starts at the
    # first "(" rather than the last one (which a greedy ".*" would select).
    _device=$(printf '%s\n' "$_msg" | sed -n 's/^[^(]*(\(device \)\{0,1\}\([^)]*\)).*/\2/p')
    if [ -n "$_device" ]; then
        printf '%s\n' "$_device"
        return 0
    fi
    return 1
}
33+
34+
# Extract device names from kmsg output
35+
# Input: kmsg messages (one per line)
36+
# Output: unique device names (sorted)
37+
extract_devices_from_messages() {
    # Collect the device names mentioned in a batch of kernel messages.
    #
    # Arguments: $1 - kernel messages, one per line
    # Output:    one device name per line, de-duplicated and sorted (sort -u)
    _batch="$1"
    printf '%s\n' "$_batch" | while IFS= read -r _entry; do
        # Blank lines cannot name a device.
        if [ -z "$_entry" ]; then
            continue
        fi

        # Messages without a recognisable device yield an empty string
        # and contribute nothing to the output.
        _found=$(extract_device_name "$_entry")
        [ -z "$_found" ] || printf '%s\n' "$_found"
    done | sort -u
}
50+
51+
# Get all possible device paths for a given device name
52+
# Handles dm-X devices by resolving symlinks via udevadm
53+
# Returns: space-separated list of device paths to check
54+
get_device_paths() {
    # List every path under which a kernel device name may appear as the
    # source field of a mount table entry.
    #
    # Arguments: $1 - device name as printed by the kernel (sda1, dm-3, pxd!123, ...)
    # Output:    a single line of space-separated candidate paths
    _name="$1"

    # The bare name and its /dev node are always candidates.
    _candidates="$_name /dev/$_name"

    # dm-X devices: add every symlink udevadm reports, each prefixed with
    # /dev/. Skipped silently when udevadm is missing or returns nothing.
    if [ "${_name#dm-}" != "$_name" ]; then
        if command -v udevadm >/dev/null 2>&1; then
            _aliases=$(udevadm info --query=symlink --name="/dev/$_name" 2>/dev/null || true)
            for _alias in $_aliases; do
                _candidates="$_candidates /dev/$_alias"
            done
        fi
    fi

    # Names containing "!" (Portworx dm-name format):
    #   pxd!123      -> /dev/mapper/pxd!123
    #   pxd!pxd9527  -> /dev/mapper/pxd!pxd9527 and /dev/pxd/pxd9527
    _suffix="${_name#*!}"
    if [ "$_suffix" != "$_name" ]; then
        _candidates="$_candidates /dev/mapper/$_name"
        # The part after the first "!" maps into /dev/pxd/ when it starts
        # with "pxd" followed by a digit.
        case "$_suffix" in
            pxd[0-9]*) _candidates="$_candidates /dev/pxd/$_suffix" ;;
        esac
    fi

    # Names containing "-" or starting with four hex characters (WWID-like)
    # are often found under /dev/mapper/:
    #   pwx0-206233844786798552          -> /dev/mapper/pwx0-206233844786798552
    #   3624a93704cdc47b41e974dd913a8eac2 -> /dev/mapper/3624a93704cdc47b41e974dd913a8eac2
    case "$_name" in
        *-*|[0-9a-f][0-9a-f][0-9a-f][0-9a-f]*)
            _candidates="$_candidates /dev/mapper/$_name"
            ;;
    esac

    # Portworx devices named pxd<digit...> live in /dev/pxd/:
    #   pxd342462708072724230 -> /dev/pxd/pxd342462708072724230
    case "$_name" in
        pxd[0-9]*) _candidates="$_candidates /dev/pxd/$_name" ;;
    esac

    printf '%s\n' "$_candidates"
}
118+
119+
# Check if device is currently mounted read-only
120+
# Returns: 0 if device is RO, 1 if device is RW or not mounted
121+
is_device_readonly() {
    # Report whether a device is currently mounted read-only.
    #
    # Arguments: $1 - device name as printed by the kernel
    # Output:    "Device <source> at <mountpoint> is read-only" for the first
    #            matching read-only mount entry
    # Returns:   0 if such an entry exists; 1 if the device is read-write,
    #            not mounted, or no mount table can be read
    _target="$1"

    # Prefer the host mount table; fall back to the local one. With neither
    # readable the state is unknown and the device is treated as not RO.
    if [ -r "$MOUNTS_FILE_HOST" ]; then
        _table="$MOUNTS_FILE_HOST"
    elif [ -r "$MOUNTS_FILE_LOCAL" ]; then
        _table="$MOUNTS_FILE_LOCAL"
    else
        return 1
    fi

    # Every path the device may be mounted under, space-separated.
    _wanted=$(get_device_paths "$_target")

    # Mount table format: device mountpoint fstype options dump pass
    while IFS=' ' read -r _src _where _type _opts _ignored; do
        # Only entries carrying the exact "ro" option are of interest;
        # wrapping in commas avoids matching substrings of other options.
        case ",$_opts," in
            *,ro,*) ;;
            *) continue ;;
        esac

        # A read-only entry counts when its source is one of our paths.
        for _cand in $_wanted; do
            if [ "$_src" = "$_cand" ]; then
                printf 'Device %s at %s is read-only\n' "$_src" "$_where"
                return 0
            fi
        done
    done < "$_table"

    # No read-only mount entry matched this device.
    return 1
}
162+
printf 'Scanning /dev/kmsg for '\''Remounting filesystem read-only'\'' messages...\n'

# Main flow:
#   1. Read /dev/kmsg ONCE and keep the "remounting filesystem read-only"
#      messages, each tagged as recent (inside LOOKBACK_SEC) or older.
#   2. Detection: if any recently reported device is read-only now -> exit NONOK.
#   3. Recovery: if any device ever reported is read-only now -> exit NONOK,
#      otherwise -> exit OK.
#
# Reading /dev/kmsg blocks once the end of the buffer is reached, so the
# "timeout 10 cat" below always runs for the full 10 seconds. Doing it a
# single time (instead of once per step) keeps the script well inside the
# plugin timeout and makes both steps work from the same snapshot.

# /dev/kmsg is the only data source; without it nothing can be detected.
if [ ! -r /dev/kmsg ]; then
    printf 'Warning: /dev/kmsg not readable, ReadonlyFilesystem condition: False\n'
    exit $OK
fi

# Cutoff timestamp in microseconds: (uptime - LOOKBACK_SEC), floored at 0.
# /proc/uptime's first field is seconds since boot as a float.
if [ -f /proc/uptime ] && [ -r /proc/uptime ]; then
    CUTOFF_US=$(awk -v lb="$LOOKBACK_SEC" '
        NR==1 {
            up = $1 + 0
            c = (up - lb) * 1000000
            if (c < 0) c = 0
            printf("%.0f\n", c)
            exit
        }' /proc/uptime 2>/dev/null)

    # awk produced nothing: fall back to "everything is recent".
    if [ -z "$CUTOFF_US" ]; then
        CUTOFF_US=0
    fi
else
    # No /proc/uptime: cutoff 0 means every message counts as recent.
    CUTOFF_US=0
fi

# Single pass over /dev/kmsg. Each record is "<header>;<message>" where the
# third comma-separated header field is the timestamp in microseconds.
# Output lines are "R;<message>" for records at or after the cutoff and
# "A;<message>" for everything else (including records whose header has
# fewer than three fields). Lines starting with whitespace are continuation
# lines holding key=value metadata, not message text, and are dropped.
if command -v timeout >/dev/null 2>&1; then
    kmsg_tagged=$(timeout 10 cat /dev/kmsg 2>/dev/null | awk -v cutoff="$CUTOFF_US" '
        /^[ \t]/ { next }
        {
            semi = index($0, ";"); if (!semi) next
            n = split(substr($0, 1, semi-1), h, ",")
            tag = (n >= 3 && (h[3] + 0) >= (cutoff + 0)) ? "R" : "A"
            print tag ";" substr($0, semi+1)
        }' 2>/dev/null | grep -iE "remounting filesystem read-only" 2>/dev/null || true)
else
    # Without timeout(1) the blocking read could hang forever; skip the scan.
    printf 'Warning: timeout command not available, cannot safely read /dev/kmsg\n'
    kmsg_tagged=""
fi

# Recent view: only records tagged R. Full view: every tagged record.
kmsg_output_recent=$(printf '%s\n' "$kmsg_tagged" | sed -n 's/^R;//p')
kmsg_output_all=$(printf '%s\n' "$kmsg_tagged" | sed -n 's/^[RA];//p')

# Device names reported within the lookback window.
devices_recent=$(extract_devices_from_messages "$kmsg_output_recent")

# Detection: any recently reported device that is read-only right now sets
# the condition. Every device is checked (no early exit) so that each
# read-only device is printed.
if [ -n "$devices_recent" ]; then
    any_device_ro=0
    for _dev in $devices_recent; do
        if is_device_readonly "$_dev"; then
            any_device_ro=1
        fi
    done

    printf '\n'
    if [ $any_device_ro -eq 1 ]; then
        printf 'At least one device is currently read-only. ReadonlyFilesystem condition: True\n'
        exit $NONOK # Exit 1 = Condition True
    fi
fi

# Device names ever reported in the kernel log buffer (no time limit).
devices_all=$(extract_devices_from_messages "$kmsg_output_all")

# Recovery: the condition clears only when none of the devices ever
# reported is read-only any more (read-write or not mounted at all).
if [ -n "$devices_all" ]; then
    any_device_ro=0
    for _dev in $devices_all; do
        if is_device_readonly "$_dev"; then
            any_device_ro=1
        fi
    done

    printf '\n'
    if [ $any_device_ro -eq 1 ]; then
        printf 'At least one device is still read-only, ReadonlyFilesystem condition: True\n'
        exit $NONOK # Exit 1 = Condition True (not recovered)
    else
        printf 'All devices have recovered, ReadonlyFilesystem condition: False\n'
        exit $OK # Exit 0 = Condition False (recovered!)
    fi
else
    printf 'No '\''Remounting filesystem read-only'\'' messages found in kmsg, ReadonlyFilesystem condition: False\n'
    exit $OK
fi
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "30s",
5+
"timeout": "25s",
6+
"max_output_length": 512,
7+
"concurrency": 1
8+
},
9+
"source": "readonly-recovery-plugin-monitor",
10+
"conditions": [
11+
{
12+
"type": "ReadonlyFilesystem",
13+
"reason": "FilesystemRecovered",
14+
"message": "All read-only filesystems have recovered"
15+
}
16+
],
17+
"rules": [
18+
{
19+
"type": "permanent",
20+
"condition": "ReadonlyFilesystem",
21+
"reason": "FilesystemRecovered",
22+
"path": "/config/plugin/check_ro_filesystem.sh",
23+
"timeout": "25s"
24+
}
25+
]
26+
}
27+

0 commit comments

Comments
 (0)