
Commit 29650c2

Merge pull request ceph#44359 from mchangir/mds-uninline-file-during-scrub
mds: un-inline data on scrub

Reviewed-by: Xiubo Li <[email protected]>
2 parents 710357d + 277423c commit 29650c2
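For orientation, the end-to-end flow this change enables mirrors the test added below: enable inline data, write small files, then let a recursive scrub migrate the inline payloads back into RADOS objects. The following is a hedged sketch, not part of the commit; the file system name "a" and rank "0" are placeholders.

    # enable inline data, as the test does (deprecated feature, hence the extra flag)
    ceph fs set a inline_data true --yes-i-really-really-mean-it
    # ... clients write files of at most client_max_inline_size bytes, which get inlined ...
    # a recursive scrub now un-inlines that data
    ceph tell mds.a:0 scrub start / recursive
    # poll until the scrub completes
    ceph tell mds.a:0 scrub status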

19 files changed: +1095 -11 lines changed

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
overrides:
  ceph:
    conf:
      mgr:
        debug mgr: 20
        debug ms: 1
        debug finisher: 20
        debug client: 20
      mds:
        # to force replication without waiting for hit ratio to ramp up
        # this helps with quicker testing against replicas
        mds_bal_replicate_threshold: 1
    log-whitelist:
      - OSD full dropping all updates
      - OSD near full
      - pausewr flag
      - failsafe engaged, dropping updates
      - failsafe disengaged, no longer dropping
      - is full \(reached quota
      - POOL_FULL
      - POOL_BACKFILLFULL

tasks:
  - cephfs_test_runner:
      modules:
        - tasks.cephfs.test_uninlining
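For reference, the override above can also be applied to a running cluster with the standard config command; a sketch, not part of this change:

    # force replication without waiting for the hit ratio to ramp up
    ceph config set mds mds_bal_replicate_threshold 1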

qa/tasks/cephfs/test_uninlining.py

Lines changed: 332 additions & 0 deletions

@@ -0,0 +1,332 @@

"""
Test that data is uninlined using scrubbing.

The idea is to untar a linux-5.4.0 kernel tarball's kernel/ dir
consisting of about 8000 files and uninline about 5145 of those which are
less than or equal to client_max_inline_size bytes and can be inlined when
written to while the inline_data config option is enabled.

This test runs across 1 or 2 active MDS, where a subset of the dirs under the
kernel/ dir are pinned to either of the MDS.
"""

import os
import logging
import threading
import time
import json

from io import StringIO
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from tasks.cephfs.mount import CephFSMount

log = logging.getLogger(__name__)


def remote_mntpt_cmd(mount, cmd):
    final_cmd = f'cd {mount.hostfs_mntpt} && ' + cmd
    out = mount.client_remote.sh(final_cmd, stdout=StringIO())
    return out.strip()


class InlineDataInfo:
    def __init__(self, length: int, version: int):
        self.inline_data_length = length
        self.inline_data_version = version


class SnapshotterThread(threading.Thread):
    def __init__(self, base_dir: str, snap_count: int, mount: CephFSMount):
        super(SnapshotterThread, self).__init__()
        self.base_dir: str = base_dir
        self.snap_count: int = snap_count
        self.mount = mount

    def run(self):
        for i in range(self.snap_count):
            cmd = f"mkdir {self.base_dir}/.snap/snap_{i}"
            remote_mntpt_cmd(self.mount, cmd)
            time.sleep(1)


class TestDataUninlining(CephFSTestCase):
    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 2

    # data version number of uninlined inode: ((1 << 64) - 1)
    CEPH_INLINE_NONE = 18446744073709551615

    NUM_SNAPS = 10
    DUMP_INODE_RETRIES = 10

    def setUp(self):
        super(TestDataUninlining, self).setUp()
        self.cache_info = dict()
        self.unmount_info = dict()
        self.mount_openbg_info = dict()
        self.multimds_info = dict()
        self.snapshot_info = dict()

        self.cache_info[0] = "without clearing cache"
        self.cache_info[1] = "clear cache before scrub"
        self.cache_info[2] = "clear cache after scrub"
        self.unmount_info[0] = "without unmount client"
        self.unmount_info[1] = "unmount client before scrub"
        self.unmount_info[2] = "unmount client after scrub"
        self.mount_openbg_info[0] = "without mount.open_background"
        self.mount_openbg_info[1] = "with mount.open_background"
        self.multimds_info[0] = "without multimds"
        self.multimds_info[1] = "with multimds"
        self.snapshot_info[0] = "without snapshots"
        self.snapshot_info[1] = "with snapshots"

    def tearDown(self):
        super(TestDataUninlining, self).tearDown()

    def extract_inodes(self, files):
        inodes = []
        for fil in files:
            log.debug(f"getting inode for:{fil}")
            cmd = f'ls -i {fil}'
            o = remote_mntpt_cmd(self.mount_a, cmd)
            inodes.append(o.split(' ')[0])
        return inodes

    def get_inline_data_info(self, inodes, files, dir_pins, num_mds):
        def get_inode_dump(inode, rank, retries):
            for i in range(retries):
                log.debug(f"try #{i+1} - dump inode {inode}")
                try:
                    json_out = self.fs.rank_tell(['dump', 'inode', inode], rank=rank)
                    if len(json_out) != 0:
                        return json_out
                except json.decoder.JSONDecodeError:
                    time.sleep(1)
                finally:
                    if len(json_out) == 0:
                        time.sleep(1)
            raise json.decoder.JSONDecodeError(f'No JSON found after {retries} attempts', None, 0)

        info = []
        for i in range(len(inodes)):
            inode = inodes[i]
            log.debug(f"getting inode info #{i+1} of {len(inodes)}:{inode}")
            path = os.path.dirname(files[i])
            rank = dir_pins[path] if path in dir_pins else 0
            r = rank
            while r < rank + num_mds:
                try:
                    json_out = get_inode_dump(inode,
                                              r % num_mds,
                                              self.DUMP_INODE_RETRIES)
                    break
                except json.decoder.JSONDecodeError:
                    pass
                finally:
                    r += 1
            self.assertTrue(json_out is not None)
            self.assertTrue('inline_data_length' in json_out)
            self.assertTrue('inline_data_version' in json_out)
            info.append(InlineDataInfo(json_out['inline_data_length'],
                                       json_out['inline_data_version']))
        return info

    def run_test_worker(self,
                        opt_clear_cache,
                        opt_unmount,
                        opt_mount_openbg,
                        opt_multimds,
                        opt_snapshot):
        log.info("Running Data Uninlining test with: "
                 f"{self.cache_info[opt_clear_cache]}, "
                 f"{self.unmount_info[opt_unmount]}, "
                 f"{self.mount_openbg_info[opt_mount_openbg]}, "
                 f"{self.multimds_info[opt_multimds]}, "
                 f"{self.snapshot_info[opt_snapshot]}")

        # Set max_mds to 1 or 2
        num_mds = 2 if opt_multimds else 1
        log.debug(f"setting max_mds:{num_mds}")
        self.fs.set_max_mds(num_mds)

        # Get configured max inline data size
        log.debug("getting client_max_inline_size")
        idsize = self.fs.fs_config.get('client_max_inline_size', 4096)
        idsize = int(idsize)
        log.debug(f"got client_max_inline_size:{idsize}")

        # IMPORTANT
        # At this time, the kernel client doesn't work correctly if
        # client_max_inline_size is greater than 4096
        self.assertTrue(idsize == 4096)

        snapshotter = None
        if opt_snapshot:
            log.debug("starting snapshotter thread")
            cmd = 'mkdir linux-5.4'
            remote_mntpt_cmd(self.mount_b, cmd)
            snapshotter = SnapshotterThread("linux-5.4",
                                            self.NUM_SNAPS,
                                            self.mount_b)
            snapshotter.start()

        # Extract test data tarball
        # FIXME
        log.debug("extracting tarball")
        cmd = 'tar -x -z -f linux-5.4.tar.gz linux-5.4/fs/ceph linux-5.4/fs/orangefs linux-5.4/fs/ext2'
        # cmd = 'tar -x -z -f linux-5.4.tar.gz'
        remote_mntpt_cmd(self.mount_a, cmd)

        bg_proc = None
        # the data uninlining or snapshot should cause the caps to be revoked
        # and get the data uninlined without any problems
        if opt_mount_openbg:
            log.debug("opening file in background")
            cap_test_dir = "linux-5.4/fs/cap_revoke_test"
            cmd = f"mkdir {cap_test_dir}"
            remote_mntpt_cmd(self.mount_b, cmd)
            test_file = f"{cap_test_dir}/test_file"
            bg_proc = self.mount_b.open_background(test_file, True)

        # Get dirs under linux-5.4.0/kernel/
        # FIXME
        log.debug("fetching dir list")
        cmd = 'find linux-5.4/ -mindepth 2 -maxdepth 2 -type d'
        # cmd = 'find linux-5.4/ -mindepth 1 -maxdepth 1 -type d'
        o = remote_mntpt_cmd(self.mount_a, cmd)
        dirs = o.split('\n')

        # Pin dirs alternately to available mds
        dir_pins = {}
        log.debug("distributing dir pins")
        for i in range(len(dirs)):
            self.mount_a.setfattr(dirs[i], 'ceph.dir.pin', str(i % num_mds))
            dir_pins[dirs[i]] = i % num_mds

        # Count files with size <= idsize
        log.debug(f"listing files with size <= {idsize}")
        cmd = f'find linux-5.4/ -type f -size -{idsize + 1}c'
        o = remote_mntpt_cmd(self.mount_a, cmd)
        files = o.split('\n')

        # Dump file count
        log.info(f'Found {len(files)} inlined files')

        if opt_unmount == 1:
            log.debug("unmounting mount_a before scrub")
            self.mount_a.umount()

        if opt_clear_cache == 1:
            log.debug("clearing cache")
            for i in range(num_mds):
                self.fs.rank_tell(['cache', 'drop'], rank=i)

        # Start recursive scrub on rank 0
        log.debug("starting scrub")
        out_json = self.fs.run_scrub(["start", "/", "recursive"])
        log.debug(f"scrub start response: {out_json}")

        # Wait for scrub completion
        log.debug("waiting for scrub to complete")
        status = self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"])
        self.assertEqual(status, True)

        if opt_unmount == 2:
            log.debug("unmounting mount_a after scrub")
            self.mount_a.umount()

        if opt_snapshot:
            log.debug("joining snapshotter thread")
            snapshotter.join()
            for i in range(self.NUM_SNAPS):
                cmd = f"rmdir linux-5.4/.snap/snap_{i}"
                remote_mntpt_cmd(self.mount_b, cmd)

        if opt_clear_cache == 2:
            log.debug("clearing cache")
            for i in range(num_mds):
                self.fs.rank_tell(['cache', 'drop'], rank=i)

        if opt_unmount > 0:
            log.debug("remounting mount_a")
            self.mount_a.mount()

        # Extract inode numbers of inlined files
        log.debug("extracting inodes")
        inodes = self.extract_inodes(files)

        # Dump inode info of files with size <= idsize
        self.assertEqual(len(files), len(inodes))

        log.debug("getting inline data info")
        info = self.get_inline_data_info(inodes, files, dir_pins, num_mds)

        # cleanup
        if opt_mount_openbg:
            log.debug("killing background open file process")
            self.mount_b.kill_background(bg_proc)

        log.debug("removing dir linux-5.4")
        remote_mntpt_cmd(self.mount_a, "rm -rf linux-5.4/")

        self.assertEqual(len(info), len(inodes))

        # Count files with inline_data_length == 0 and validate
        zero_length_count = 0
        for finfo in info:
            if int(finfo.inline_data_length) == 0:
                zero_length_count += 1
        log.info(f'Found {zero_length_count} files with '
                 'inline_data_length == 0')
        self.assertTrue(zero_length_count == len(files))

        # Count files with inline_data_version == 18446744073709551615
        # and validate
        uninlined_version_count = 0
        for finfo in info:
            if int(finfo.inline_data_version) == self.CEPH_INLINE_NONE:
                uninlined_version_count += 1
        log.info(f'Found {uninlined_version_count} files with '
                 'inline_data_version == CEPH_INLINE_NONE')
        self.assertTrue(uninlined_version_count == len(files))

    def test_data_uninlining(self):
        # Enable inline_data
        log.debug("setting inline_data:1")
        self.fs.set_var('inline_data', '1', '--yes-i-really-really-mean-it')

        # Fetch tarball
        log.debug("fetching tarball")
        cmd = 'wget http://download.ceph.com/qa/linux-5.4.tar.gz'
        remote_mntpt_cmd(self.mount_a, cmd)

        # multimds
        # 0: without multimds
        # 1: with multimds
        for opt_multimds in [0, 1]:
            # unmount
            # 0: do not unmount
            # 1: unmount before scrub
            # 2: unmount after scrub
            for opt_unmount in [0, 1, 2]:
                # mount
                # 0: no mount.open_background
                # 1: mount.open_background
                for opt_mount_openbg in [0, 1]:
                    # clear cache
                    # 0: do not clear cache
                    # 1: clear cache before scrub
                    # 2: clear cache after scrub
                    for opt_clear_cache in [0, 1, 2]:
                        # snapshots
                        # 0: without snapshots
                        # 1: with snapshots
                        for opt_snapshot in [0, 1]:
                            self.run_test_worker(opt_clear_cache,
                                                 opt_unmount,
                                                 opt_mount_openbg,
                                                 opt_multimds,
                                                 opt_snapshot)

        remote_mntpt_cmd(self.mount_a, "rm -f linux-5.4.tar.gz")
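To iterate on this test outside a full teuthology run, it can typically be driven by the local vstart runner; a rough sketch assuming a built source tree with a vstart cluster already running (exact paths and setup depend on the checkout):

    # run only the new uninlining test against a local vstart cluster
    cd build
    python3 ../qa/tasks/vstart_runner.py tasks.cephfs.test_uninlining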

src/common/options/mds.yaml.in

Lines changed: 11 additions & 0 deletions

@@ -1666,6 +1666,17 @@ options:
   - mds
   flags:
   - runtime
+- name: mds_scrub_stats_review_period
+  type: uint
+  level: advanced
+  desc: Period for which scrub stats will be available for review.
+  long_desc: Number of days for which scrub stats will be available for review since
+    start of scrub operation. After this period, the stats will be auto purged.
+    These stats will not be saved to the disk. So any restart or failover of mds
+    will cause stats to be lost forever.
+  default: 1
+  min: 1
+  max: 60
 - name: mds_session_metadata_threshold
   type: size
   level: advanced
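Operationally, the new option can be tuned at runtime like any other MDS option; a sketch with an illustrative value:

    # keep scrub stats available for review for 7 days instead of the default 1
    ceph config set mds mds_scrub_stats_review_period 7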

src/include/ceph_fs.h

Lines changed: 1 addition & 0 deletions

@@ -440,6 +440,7 @@ enum {
   CEPH_MDS_OP_QUIESCE_PATH = 0x01508,
   CEPH_MDS_OP_QUIESCE_INODE = 0x01509,
   CEPH_MDS_OP_LOCK_PATH = 0x0150a,
+  CEPH_MDS_OP_UNINLINE_DATA = 0x0150b
 };

 #define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \

src/include/cephfs/types.h

Lines changed: 2 additions & 0 deletions

@@ -861,6 +861,8 @@ void inode_t<Allocator>::dump(ceph::Formatter *f) const
   f->dump_unsigned("file_data_version", file_data_version);
   f->dump_unsigned("xattr_version", xattr_version);
   f->dump_unsigned("backtrace_version", backtrace_version);
+  f->dump_unsigned("inline_data_version", inline_data.version);
+  f->dump_unsigned("inline_data_length", inline_data.length());

   f->dump_string("stray_prior_path", stray_prior_path);
   f->dump_unsigned("max_size_ever", max_size_ever);
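With these two fields added to the inode dump, the un-inlined state can be verified straight from the MDS admin interface, which is what the new test does via rank_tell; a sketch where the file system name "a", rank "0", and the inode number are placeholders:

    # dump an inode and inspect the fields added above
    ceph tell mds.a:0 dump inode <inode-number>
    # after a successful un-inline, the dump should report:
    #   "inline_data_length": 0
    #   "inline_data_version": 18446744073709551615   (CEPH_INLINE_NONE)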
