Skip to content

Commit efb69ee

Browse files
committed
qa/cephadm: add test for cephadm asyncio based timeout
Adds a test that will set the default cephadm command timeout and then force a timeout to occur by holding the cephadm lock and triggering a device refresh. This works because cephadm ceph-volume commands require the cephadm lock to run, so the command will time out waiting for the lock to become available. Signed-off-by: Adam King <[email protected]>
1 parent 4df348c commit efb69ee

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
roles:
2+
- - host.a
3+
- mon.a
4+
- mgr.a
5+
- osd.0
6+
- client.0
7+
tasks:
8+
- install:
9+
- cephadm:
10+
- workunit:
11+
clients:
12+
client.0:
13+
- cephadm/test_cephadm_timeout.py
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/usr/bin/python3 -s
2+
3+
import time
4+
import os
5+
import fcntl
6+
import subprocess
7+
import uuid
8+
import sys
9+
10+
from typing import Optional, Any
11+
12+
LOCK_DIR = '/run/cephadm'
13+
DATA_DIR = '/var/lib/ceph'
14+
15+
class _Acquire_ReturnProxy:
    """Context-manager handle returned by ``FileLock.acquire()``.

    Entering the ``with`` block yields the wrapped lock object; leaving the
    block calls ``release()`` on it.  The proxy itself never acquires
    anything -- it only guarantees the matching release.
    """

    def __init__(self, lock: 'FileLock') -> None:
        # The lock whose release() is called when the with-block exits.
        self.lock = lock

    def __enter__(self) -> 'FileLock':
        return self.lock

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        # Implicitly returns None, so an exception raised inside the
        # with-block is not suppressed.
        self.lock.release()
26+
27+
class FileLock(object):
    """Advisory, counted file lock built on ``fcntl.flock``.

    The lock file is ``<LOCK_DIR>/<name>.lock``.  The same instance may be
    acquired several times; a counter tracks the nesting depth and the
    underlying flock is only dropped when the counter returns to zero (or
    when ``release(force=True)`` is used).
    """

    def __init__(self, name: str, timeout: int = -1) -> None:
        """Prepare (but do not take) the lock called *name*.

        :param name: base name of the lock file, without the ``.lock`` suffix
        :param timeout: default number of seconds ``acquire()`` waits;
            a negative value means wait forever
        """
        # Initialise the state first so that __del__ -> release() is safe
        # even if creating the directory below raises.
        self._lock_file_fd: Optional[int] = None
        self._lock_counter = 0
        self.timeout = timeout

        # exist_ok avoids the check-then-create race with another process
        # that creates the directory at the same moment.
        os.makedirs(LOCK_DIR, mode=0o700, exist_ok=True)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')

    @property
    def is_locked(self) -> bool:
        """True while this instance holds an open, locked file descriptor."""
        return self._lock_file_fd is not None

    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> '_Acquire_ReturnProxy':
        """Take the lock, polling until it is available or *timeout* expires.

        :param timeout: seconds to wait; ``None`` uses the instance default,
            a negative value waits forever
        :param poll_intervall: seconds to sleep between attempts
        :returns: a proxy usable as a context manager that releases the lock
        :raises Exception: carrying the lock file path when the timeout
            expires before the lock could be taken
        """
        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        # A monotonic clock keeps the timeout correct even if the wall
        # clock is adjusted while we are waiting.
        start_time = time.monotonic()
        try:
            while True:
                if not self.is_locked:
                    self._acquire()

                if self.is_locked:
                    break
                elif timeout >= 0 and time.monotonic() - start_time > timeout:
                    raise Exception(self._lock_file)
                else:
                    time.sleep(poll_intervall)
        except Exception:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)
            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force: bool = False) -> None:
        """Undo one ``acquire()``; drop the flock when the count reaches zero.

        :param force: release the flock regardless of the nesting count
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                self._release()
                self._lock_counter = 0

    def __enter__(self) -> 'FileLock':
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.release()

    def __del__(self) -> None:
        # Make sure a still-held lock is dropped when the object goes away.
        self.release(force=True)

    def _acquire(self) -> None:
        """Make one non-blocking attempt to flock the lock file."""
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            # Lock is held elsewhere (or flock failed): leave is_locked
            # False so the caller's polling loop can retry.
            os.close(fd)
        else:
            self._lock_file_fd = fd

    def _release(self) -> None:
        """Unlock and close the descriptor held by this instance."""
        fd = self._lock_file_fd
        self._lock_file_fd = None
        try:
            fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        finally:
            # Close the descriptor even if unlocking raised, so it is
            # never leaked.
            os.close(fd)  # type: ignore
110+
111+
def _is_fsid(s):
    """Return True if *s* can be parsed by ``uuid.UUID``, else False.

    Anything that ``uuid.UUID`` rejects -- malformed strings as well as
    ``None`` or other non-string values -- yields False instead of raising.
    """
    try:
        uuid.UUID(s)
    except (ValueError, TypeError, AttributeError):
        # ValueError: badly formed hex string.
        # TypeError / AttributeError: input was None or not a str.
        return False
    return True
117+
118+
def find_fsid():
    """Return the name of the first fsid-like entry found in DATA_DIR.

    :returns: the directory entry name that parses as a UUID
    :raises Exception: if DATA_DIR does not exist, or if none of its
        entries looks like an fsid
    """
    if not os.path.exists(DATA_DIR):
        raise Exception(f'{DATA_DIR} does not exist. Aborting...')

    for d in os.listdir(DATA_DIR):
        # assume the first thing we find that is an fsid
        # is what we want. Not expecting multiple clusters
        # to have been installed here.
        if _is_fsid(d):
            return d
    # DATA_DIR exists at this point; it simply holds no fsid entry.
    raise Exception(f'No fsid dir found in {DATA_DIR}. Aborting...')
129+
130+
def main():
    """Force a cephadm command timeout and verify the health warning.

    Steps: find the cluster fsid, set the mgr/cephadm default command
    timeout to 120 seconds, take the cephadm file lock for that fsid,
    trigger a device refresh, wait 150 seconds, then check that
    ``ceph health detail`` reports the refresh failure and the timeout
    message.

    :returns: 0 on success
    :raises Exception: if the expected health warnings are missing
    :raises subprocess.CalledProcessError: if any cephadm command fails
    """
    print('Looking for cluster fsid...')
    fsid = find_fsid()
    print(f'Found fsid {fsid}')

    print('Setting cephadm command timeout to 120...')
    subprocess.run(['cephadm', 'shell', '--', 'ceph', 'config', 'set',
                    'mgr', 'mgr/cephadm/default_cephadm_command_timeout', '120'],
                   check=True)

    print('Taking hold of cephadm lock for 300 seconds...')
    # NOTE: 300 is the timeout for *acquiring* the lock.  Once taken, the
    # lock is held until the with-block below exits.
    lock = FileLock(fsid, 300)
    with lock.acquire():
        print('Triggering cephadm device refresh...')
        subprocess.run(['cephadm', 'shell', '--', 'ceph', 'orch', 'device', 'ls', '--refresh'],
                       check=True)

        print('Sleeping 150 seconds to allow for timeout to occur...')
        time.sleep(150)

        print('Checking ceph health detail...')
        # subprocess.run's "capture_output" option only exists from
        # python 3.7 on, but stdout/stderr=PIPE with universal_newlines
        # works on 3.6 as well and avoids writing predictable file names
        # in /tmp while running as root.
        res = subprocess.run(['cephadm', 'shell', '--', 'ceph', 'health', 'detail'],
                             check=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, universal_newlines=True)

    res_stdout = res.stdout
    res_stderr = res.stderr
    print(f'"cephadm shell -- ceph health detail" stdout:\n{res_stdout}')
    print(f'"cephadm shell -- ceph health detail" stderr:\n{res_stderr}')

    print('Checking for correct health warning in health detail...')
    if 'CEPHADM_REFRESH_FAILED' not in res_stdout:
        raise Exception('No health warning caused by timeout was raised')
    if 'Command "cephadm ceph-volume -- inventory" timed out' not in res_stdout:
        raise Exception('Health warnings did not contain message about time out')

    print('Health warnings found succesfully. Exiting.')
    return 0
173+
174+
175+
if __name__ == '__main__':
    # The test manipulates files under /run/cephadm and runs cephadm, so it
    # re-executes itself through sudo when not started as root.
    if os.getuid() != 0:
        # exec replaces the process without flushing Python's buffers, so
        # flush explicitly or the message is lost when stdout is a pipe.
        print('Trying to run myself with sudo...', flush=True)
        # The first list element is argv[0] of the new process, so it must
        # be 'sudo'; the command sudo should run follows it.
        os.execvp('sudo', ['sudo', sys.executable] + list(sys.argv))
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())

0 commit comments

Comments
 (0)