Skip to content

Commit 5444e0c

Browse files
authored
Update FS watcher to check shared SCRATCH (#45)
1 parent ed196cd commit 5444e0c

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

tools/fs-watchdog.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,25 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
135135
alerts.append(response)
136136
alerts.append("")
137137

138+
def analyse_shared_disk(partition_name, alert_bytes_threshold):
    """Alert when the shared disk backing a partition is running out of free space.

    Runs `df -P -k`, picks the first row whose line starts with the disk name mapped
    to `partition_name`, and appends a message to the `alerts` list (taken from the
    surrounding scope) when the available space is below `alert_bytes_threshold`.
    If the disk can't be found in the `df` output an alert is appended as well,
    instead of crashing the watchdog with an `IndexError`.

    Args:
        partition_name: one of "SCRATCH", "WORK" or "STORE".
        alert_bytes_threshold: minimal number of free bytes; anything below triggers an alert.

    Raises:
        KeyError: if `partition_name` is not one of the known partitions.
    """
    partition_name_2_disk = {
        "SCRATCH": "gpfsssd",
        "WORK": "gpfsdswork",
        "STORE": "gpfsdsstore",
    }
    disk_name = partition_name_2_disk[partition_name]

    # `-P` forces the POSIX layout (exactly one line per filesystem, so long device names
    # never wrap onto a second line) and `-k` forces 1024-byte units regardless of
    # POSIXLY_CORRECT / BLOCK_SIZE / DF_BLOCK_SIZE being set in the environment
    cmd = "df -P -k"
    # NOTE(review): `run_cmd` is assumed to return the command's stdout as a `str` - confirm
    response = run_cmd(cmd.split())
    disk_metas = response.splitlines()
    column_names = disk_metas[0].split() if disk_metas else []
    matching_rows = [line.split() for line in disk_metas if line.startswith(disk_name)]
    # the header's trailing "Mounted on" splits into two names, but `zip` stops at the
    # shorter row so the leading columns (including "Available") still line up
    disk_meta = dict(zip(column_names, matching_rows[0])) if matching_rows else {}

    if "Available" not in disk_meta:
        # a missing disk is something the watchdog should report rather than die on
        alerts.append(f"Shared {partition_name}: can't find disk {disk_name} in the output of `{cmd}`")
        alerts.append("")
        return

    # `df -k` counts use 1024-byte units, and `1024 == 2 ** 10`
    available_disk_left = int(disk_meta["Available"]) * 2 ** 10
    if available_disk_left < alert_bytes_threshold:
        alerts.append(f"Shared {partition_name} has {available_disk_left/2**40:.2f}TB left")
        alerts.append("")
156+
138157
# WORK and STORE partitions stats can be accessed much faster through `idrquota`, and it already
139158
# includes the quota info
140159
analyse_partition_idrquota(partition_name="WORK", partition_flag="-w", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)
@@ -143,6 +162,9 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
143162
# SCRATCH - check only bytes w/ a hard quota of 400TB - alert on lower threshold than other
144163
# partitions due to it filling up at a faster rate (dumping huge checkpoints)
145164
analyse_partition_bytes(partition_name="SCRATCH", partition_path="/gpfsssd/scratch/rech/six/", hard_limit_bytes=400*2**40, alert_bytes_threshold=0.75)
165+
# Actually SCRATCH is shared with everyone and we should monitor the output of `df -h | grep gpfsssd`
166+
# Check that there's still 100TB left (the threshold passed below is 100 * 2**40 bytes)
167+
analyse_shared_disk("SCRATCH", 100 * 2 ** 40)
146168

147169
# WORKFS - check both bytes and inodes w/ hard quotas of 2TB / 3M
148170
analyse_partition_bytes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=2*2**40, alert_bytes_threshold=0.85)

0 commit comments

Comments
 (0)