Skip to content

Commit 8c7b7be

Browse files
work around NFS caching issues (#6603)
* scan: refresh NFS cache for contact file
* load_contact_file: refresh NFS cache for directories above contact file
* infer_latest_run: refresh NFS cache for the run directory
1 parent aad39a9 commit 8c7b7be

File tree

3 files changed

+52
-4
lines changed

3 files changed

+52
-4
lines changed

changes.d/6506.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Work around caching behaviour observed on NFS filesystems which could cause workflows to appear to be stopped or even to not exist, when they are running.

cylc/flow/network/scan.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -316,10 +316,21 @@ async def is_active(flow, is_active):
316316
False to filter for stopped and unregistered flows.
317317
318318
"""
319-
contact = flow['path'] / SERVICE / CONTACT
320-
_is_active = contact.exists()
319+
service = flow['path'] / SERVICE
320+
# NOTE: We must list the service directory contents rather than checking
321+
# for the existence of the contact file directly, because listing the
322+
# directory forces NFS filesystems to recompute their local cache.
323+
# See https://github.com/cylc/cylc-flow/issues/6506
324+
try:
325+
contents = await scandir(service)
326+
except FileNotFoundError:
327+
_is_active = False
328+
else:
329+
_is_active = any(
330+
path.name == WorkflowFiles.Service.CONTACT for path in contents
331+
)
321332
if _is_active:
322-
flow['contact'] = contact
333+
flow['contact'] = service / CONTACT
323334
return _is_active == is_active
324335

325336

cylc/flow/workflow_files.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
2323
"""
2424

25+
from contextlib import suppress
2526
from enum import Enum
2627
import errno
28+
from collections import deque
2729
import os
2830
from pathlib import Path
2931
import re
@@ -620,8 +622,29 @@ def get_workflow_srv_dir(id_):
620622
return os.path.join(run_d, WorkflowFiles.Service.DIRNAME)
621623

622624

625+
def refresh_nfs_cache(path: Path):
626+
"""Refresh NFS cache for dirs between ~/cylc-run and <path> inclusive.
627+
628+
On NFS filesystems, the non-existence of files/directories may become
629+
cached. To work around this, we can list the contents of these directories
630+
which refreshes the NFS cache.
631+
632+
See: https://github.com/cylc/cylc-flow/issues/6506
633+
634+
Arguments:
635+
path: The directory to refresh.
636+
637+
Raises:
638+
FileNotFoundError: If any of the directories between ~/cylc-run and
639+
this directory (inclusive) are not present.
640+
641+
"""
642+
cylc_run_dir = get_cylc_run_dir()
643+
for subdir in reversed(path.relative_to(cylc_run_dir).parents):
644+
deque((cylc_run_dir / subdir).iterdir(), maxlen=0)
645+
646+
623647
def load_contact_file(id_: str, run_dir=None) -> Dict[str, str]:
624-
"""Load contact file. Return data as key=value dict."""
625648
if not run_dir:
626649
path = Path(get_contact_file_path(id_))
627650
else:
@@ -630,6 +653,14 @@ def load_contact_file(id_: str, run_dir=None) -> Dict[str, str]:
630653
WorkflowFiles.Service.DIRNAME,
631654
WorkflowFiles.Service.CONTACT
632655
)
656+
657+
if not path.exists():
658+
# work around NFS caching issues
659+
try:
660+
refresh_nfs_cache(path)
661+
except FileNotFoundError as exc:
662+
raise ServiceFileError("Couldn't load contact file") from exc
663+
633664
try:
634665
with open(path) as f:
635666
file_content = f.read()
@@ -919,6 +950,11 @@ def infer_latest_run(
919950
except ValueError:
920951
raise ValueError(f"{path} is not in the cylc-run directory") from None
921952

953+
if not path.exists():
954+
# work around NFS caching issues
955+
with suppress(FileNotFoundError):
956+
refresh_nfs_cache(path)
957+
922958
if not path.exists():
923959
raise InputError(
924960
f'Workflow ID not found: {id_}\n(Directory not found: {path})'

0 commit comments

Comments
 (0)