Skip to content

Commit 4ee8e59

Browse files
authored
Merge pull request ceph#56597 from liangmingyuanneo/optimize-reshard
rgw reshard: optimize reshard process to minimize blocking time Reviewed-by: Casey Bodley <[email protected]>
2 parents 5e51a6e + 196a73c commit 4ee8e59

28 files changed

+1672
-278
lines changed

doc/radosgw/dynamicresharding.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ Configuration
5151
.. confval:: rgw_reshard_bucket_lock_duration
5252
.. confval:: rgw_reshard_thread_interval
5353
.. confval:: rgw_reshard_num_logs
54+
.. confval:: rgw_reshard_progress_judge_interval
55+
.. confval:: rgw_reshard_progress_judge_ratio
5456

5557
Admin commands
5658
==============

qa/suites/rgw/verify/overrides.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ overrides:
1414
rgw bucket counters cache: true
1515
rgw sts key: abcdefghijklmnop
1616
rgw s3 auth use sts: true
17+
rgw reshard progress judge interval: 10
1718
rgw:
1819
compression type: random
1920
storage classes: LUKEWARM, FROZEN

qa/workunits/rgw/test_rgw_reshard.py

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ def get_bucket_num_shards(bucket_name, bucket_id):
7676
num_shards = json_op['data']['bucket_info']['num_shards']
7777
return num_shards
7878

79+
def get_bucket_reshard_status(bucket_name):
80+
"""
81+
function to get bucket reshard status
82+
"""
83+
cmd = exec_cmd("radosgw-admin bucket stats --bucket {}".format(bucket_name))
84+
json_op = json.loads(cmd)
85+
#print(json.dumps(json_op, indent = 4, sort_keys=True))
86+
reshard_status = json_op['reshard_status']
87+
return reshard_status
88+
7989
def run_bucket_reshard_cmd(bucket_name, num_shards, **kwargs):
8090
cmd = 'radosgw-admin bucket reshard --bucket {} --num-shards {}'.format(bucket_name, num_shards)
8191
cmd += ' --rgw-reshard-bucket-lock-duration 30' # reduce to minimum
@@ -104,7 +114,7 @@ def test_bucket_reshard(conn, name, **fault):
104114
# try reshard with fault injection
105115
_, ret = run_bucket_reshard_cmd(name, num_shards_expected, check_retcode=False, **fault)
106116

107-
if fault.get('error_code') == errno.ECANCELED:
117+
if fault.get('error_code') == errno.ECANCELED or fault.get('error_code') == errno.EOPNOTSUPP:
108118
assert(ret == 0) # expect ECANCELED to retry and succeed
109119
else:
110120
assert(ret != 0 and ret != errno.EBUSY)
@@ -139,6 +149,11 @@ def test_bucket_reshard(conn, name, **fault):
139149
bucket.delete_objects(Delete={'Objects':[{'Key':o.key} for o in objs]})
140150
bucket.delete()
141151

152+
def calc_reshardlog_count(json_op):
153+
cnt = 0
154+
for shard in json_op:
155+
cnt += len(shard['shard_entries'])
156+
return cnt
142157

143158
def main():
144159
"""
@@ -210,13 +225,27 @@ def main():
210225
log.error("Resharding failed on bucket {}. Expected number of shards are not created\n".format(BUCKET_NAME))
211226

212227
# TESTCASE 'manual bucket resharding','inject error','fail','check bucket accessibility', 'retry reshard'
228+
log.debug('TEST: reshard bucket with EIO injected at init_index\n')
229+
test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index')
230+
log.debug('TEST: reshard bucket with EOPNOTSUPP injected at init_index\n')
231+
test_bucket_reshard(connection, 'error-at-init-index', error_at='init_index', error_code=errno.EOPNOTSUPP)
232+
log.debug('TEST: reshard bucket with abort at init_index\n')
233+
test_bucket_reshard(connection, 'abort-at-init-index', abort_at='init_index')
234+
213235
log.debug('TEST: reshard bucket with EIO injected at set_target_layout\n')
214236
test_bucket_reshard(connection, 'error-at-set-target-layout', error_at='set_target_layout')
215237
log.debug('TEST: reshard bucket with ECANCELED injected at set_target_layout\n')
216238
test_bucket_reshard(connection, 'error-at-set-target-layout', error_at='set_target_layout', error_code=errno.ECANCELED)
217239
log.debug('TEST: reshard bucket with abort at set_target_layout\n')
218240
test_bucket_reshard(connection, 'abort-at-set-target-layout', abort_at='set_target_layout')
219241

242+
log.debug('TEST: reshard bucket with EIO injected at trim_reshard_log_entries\n')
243+
test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries')
244+
log.debug('TEST: reshard bucket with EOPNOTSUPP injected at trim_reshard_log_entries\n')
245+
test_bucket_reshard(connection, 'error-at-trim-reshard-log-entries', error_at='trim_reshard_log_entries', error_code=errno.EOPNOTSUPP)
246+
log.debug('TEST: reshard bucket with abort at trim_reshard_log_entries\n')
247+
test_bucket_reshard(connection, 'abort-at-trim-reshard-log-entries', abort_at='trim_reshard_log_entries')
248+
220249
log.debug('TEST: reshard bucket with EIO injected at block_writes\n')
221250
test_bucket_reshard(connection, 'error-at-block-writes', error_at='block_writes')
222251
log.debug('TEST: reshard bucket with abort at block_writes\n')
@@ -234,6 +263,80 @@ def main():
234263
log.debug('TEST: reshard bucket with abort at do_reshard\n')
235264
test_bucket_reshard(connection, 'abort-at-do-reshard', abort_at='do_reshard')
236265

266+
log.debug('TEST: reshard bucket with EIO injected at logrecord_writes\n')
267+
test_bucket_reshard(connection, 'error-at-logrecord-writes', error_at='logrecord_writes')
268+
log.debug('TEST: reshard bucket with abort at logrecord_writes\n')
269+
test_bucket_reshard(connection, 'abort-at-logrecord-writes', abort_at='logrecord_writes')
270+
271+
log.debug('TEST: reshard bucket with EIO injected at change_reshard_state\n')
272+
test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state')
273+
log.debug('TEST: reshard bucket with ECANCELED injected at change_reshard_state\n')
274+
test_bucket_reshard(connection, 'error-at-change-reshard-state', error_at='change_reshard_state', error_code=errno.ECANCELED)
275+
log.debug('TEST: reshard bucket with abort at change_reshard_state\n')
276+
test_bucket_reshard(connection, 'abort-at-change-reshard-state', abort_at='change_reshard_state')
277+
278+
# TESTCASE 'logrecord could be stopped after reshard failed'
279+
log.debug(' test: logrecord could be stopped after reshard failed')
280+
num_shards = get_bucket_stats(BUCKET_NAME).num_shards
281+
assert "None" == get_bucket_reshard_status(BUCKET_NAME)
282+
_, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='change_reshard_state')
283+
assert(ret != 0 and ret != errno.EBUSY)
284+
assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
285+
286+
bucket.put_object(Key='put_during_logrecord', Body=b"some_data")
287+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
288+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
289+
assert calc_reshardlog_count(json_op) == 1
290+
291+
# end up with logrecord status, the logrecord will be purged
292+
time.sleep(30)
293+
assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
294+
bucket.put_object(Key='put_during_logrecord1', Body=b"some_data1")
295+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
296+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
297+
assert calc_reshardlog_count(json_op) == 0
298+
assert "None" == get_bucket_reshard_status(BUCKET_NAME)
299+
300+
# TESTCASE 'duplicated entries should be purged before reshard'
301+
log.debug(' test: duplicated entries should be purged before reshard')
302+
num_shards = get_bucket_stats(BUCKET_NAME).num_shards
303+
_, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard')
304+
assert(ret != 0 and ret != errno.EBUSY)
305+
assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
306+
307+
bucket.put_object(Key='put_during_logrecord2', Body=b"some_data2")
308+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
309+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
310+
assert calc_reshardlog_count(json_op) == 1
311+
312+
# begin to reshard again, the duplicated entries will be purged
313+
time.sleep(30)
314+
_, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='logrecord_writes')
315+
assert(ret != 0 and ret != errno.EBUSY)
316+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
317+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
318+
assert calc_reshardlog_count(json_op) == 0
319+
320+
# TESTCASE 'duplicated entries can be purged manually'
321+
log.debug(' test: duplicated entries can be purged manually')
322+
time.sleep(30)
323+
num_shards = get_bucket_stats(BUCKET_NAME).num_shards
324+
_, ret = run_bucket_reshard_cmd(BUCKET_NAME, num_shards + 1, check_retcode=False, abort_at='do_reshard')
325+
assert(ret != 0 and ret != errno.EBUSY)
326+
assert "InLogrecord" == get_bucket_reshard_status(BUCKET_NAME)
327+
328+
bucket.put_object(Key='put_during_logrecord3', Body=b"some_data3")
329+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
330+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
331+
assert calc_reshardlog_count(json_op) == 1
332+
333+
time.sleep(30)
334+
exec_cmd('radosgw-admin reshardlog purge --bucket %s' % BUCKET_NAME)
335+
cmd = exec_cmd('radosgw-admin reshardlog list --bucket %s' % BUCKET_NAME)
336+
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
337+
assert calc_reshardlog_count(json_op) == 0
338+
log.debug('check reshard logrecord successfully')
339+
237340
# TESTCASE 'versioning reshard-','bucket', reshard','versioning reshard','succeeds'
238341
log.debug(' test: reshard versioned bucket')
239342
num_shards_expected = get_bucket_stats(VER_BUCKET_NAME).num_shards + 1
@@ -287,6 +390,8 @@ def main():
287390
time.sleep(1)
288391
ver_bucket.put_object(Key='put_during_reshard', Body=b"some_data")
289392
log.debug('put object successful')
393+
# wait for delayed reshard to finish
394+
time.sleep(5)
290395

291396
# TESTCASE 'check that bucket stats are correct after reshard with unlinked entries'
292397
log.debug('TEST: check that bucket stats are correct after reshard with unlinked entries\n')

0 commit comments

Comments
 (0)