Skip to content

Commit f8f95ab

Browse files
committed
rgw: provide testing support to dynamic resharding with reduction
Adds a config option rgw_reshard_debug_interval that will allow us to make the resharding algorithms run on a faster schedule by allowing one day to be simulated by a set number of seconds. Signed-off-by: J. Eric Ivancich <[email protected]>
1 parent 0f1726d commit f8f95ab

File tree

3 files changed

+84
-13
lines changed

3 files changed

+84
-13
lines changed

src/common/options/rgw.yaml.in

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3301,6 +3301,23 @@ options:
33013301
services:
33023302
- rgw
33033303
min: 10
3304+
- name: rgw_reshard_debug_interval
3305+
type: int
3306+
level: dev
3307+
desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding.
3308+
Do *not* modify for a production cluster.
3309+
long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to
3310+
one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior.
3311+
For example, during debugging if one wanted every 10 minutes to be equivalent to one day,
3312+
then this would be set to 600, the number of seconds in 10 minutes.
3313+
default: -1
3314+
services:
3315+
- rgw
3316+
with_legacy: true
3317+
see_also:
3318+
- rgw_dynamic_resharding
3319+
- rgw_reshard_thread_interval
3320+
- rgw_dynamic_resharding_reduction_wait
33043321
- name: rgw_cache_expiry_interval
33053322
type: uint
33063323
level: advanced

src/rgw/driver/rados/rgw_reshard.cc

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,15 +1398,21 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
13981398
ceph::real_time when_queued = entry.time;
13991399
ceph::real_time now = real_clock::now();
14001400

1401-
// convert hours to seconds
1402-
const uint32_t reshard_reduction_wait_period_hours =
1401+
// use double so we can handle fractions
1402+
double reshard_reduction_wait_hours =
14031403
uint32_t(store->ctx()->_conf.get_val<uint64_t>("rgw_dynamic_resharding_reduction_wait"));
14041404

1405-
auto timespan =
1406-
ceph::make_timespan(reshard_reduction_wait_period_hours * 60 * 60);
1407-
// if (now < when_queued + reshard_reduction_wait_period) {
1405+
// see if we have to reduce the waiting interval due to debug
1406+
// config
1407+
int debug_interval = store->ctx()->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
1408+
if (debug_interval >= 1) {
1409+
constexpr int secs_per_day = 60 * 60 * 24;
1410+
reshard_reduction_wait_hours = reshard_reduction_wait_hours * debug_interval / secs_per_day;
1411+
}
1412+
1413+
auto timespan = std::chrono::seconds(int(60 * 60 * reshard_reduction_wait_hours));
14081414
if (now < when_queued + timespan) {
1409-
// skip for now
1415+
// too early to reshard; log and skip
14101416
ldpp_dout(dpp, 20) << __func__ <<
14111417
": INFO: reshard reduction for bucket \"" <<
14121418
entry.bucket_name << "\" will not proceed until " <<
@@ -1415,6 +1421,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
14151421
return 0;
14161422
}
14171423

1424+
// only if we allow the resharding logic to continue should we log
1425+
// the fact that the reduction_wait_time was shortened due to
1426+
// debugging mode
1427+
if (debug_interval >= 1) {
1428+
ldpp_dout(dpp, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
1429+
debug_interval << " the rgw_dynamic_resharding_reduction_wait is now " <<
1430+
reshard_reduction_wait_hours << " hours (" <<
1431+
int(reshard_reduction_wait_hours * 60 * 60) << " seconds) and bucket \"" <<
1432+
entry.bucket_name << "\" has reached the reduction wait period" << dendl;
1433+
}
1434+
14181435
// all checks passed; we can drop through and proceed
14191436
}
14201437

@@ -1544,6 +1561,17 @@ void RGWReshard::stop_processor()
15441561
}
15451562

15461563
void *RGWReshard::ReshardWorker::entry() {
1564+
const auto debug_interval = cct->_conf.get_val<int64_t>("rgw_reshard_debug_interval");
1565+
double interval_factor = 1.0;
1566+
if (debug_interval >= 1) {
1567+
constexpr double secs_per_day = 60 * 60 * 24;
1568+
interval_factor = debug_interval / secs_per_day;
1569+
1570+
ldpp_dout(this, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
1571+
debug_interval << " the rgw_reshard_thread_interval will be "
1572+
"multiplied by a factor of " << interval_factor << dendl;
1573+
}
1574+
15471575
do {
15481576
utime_t start = ceph_clock_now();
15491577
reshard->process_all_logshards(this, null_yield);
@@ -1552,14 +1580,19 @@ void *RGWReshard::ReshardWorker::entry() {
15521580
break;
15531581

15541582
utime_t end = ceph_clock_now();
1555-
end -= start;
1583+
utime_t elapsed = end - start;
1584+
15561585
int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
1586+
secs = std::max(1, int(secs * interval_factor));
15571587

1558-
if (secs <= end.sec())
1588+
if (secs <= elapsed.sec()) {
15591589
continue; // next round
1590+
}
15601591

1561-
secs -= end.sec();
1592+
secs -= elapsed.sec();
15621593

1594+
// note: wait_for() normally blocks for the full period, but it
1595+
// can return early if cond is notified or wakes spuriously
15631596
std::unique_lock locker{lock};
15641597
cond.wait_for(locker, std::chrono::seconds(secs));
15651598
} while (!reshard->going_down());

src/rgw/rgw_quota.cc

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,21 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
358358

359359
void *entry() override {
360360
ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
361+
362+
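// rgw_reshard_debug_interval is a DEV-level configuration
363+
// option, so we assume it won't change while the RGW server is
364+
// running and compute the scaling factor once, before the loop.
// NOTE(review): the option is a signed int defaulting to -1, but it
// is read into a uint64_t below, so -1 would wrap to a huge value and
// pass the ">= 1" check -- confirm, and consider int64_t instead.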
365+
double sync_interval_factor = 1.0;
366+
const uint64_t debug_interval = cct->_conf->rgw_reshard_debug_interval;
367+
if (debug_interval >= 1) {
368+
constexpr double secs_per_day = 60 * 60 * 24;
369+
sync_interval_factor = debug_interval / secs_per_day;
370+
371+
ldout(cct, 0) << "DEBUG: since the rgw_reshard_debug_interval is set at " <<
372+
debug_interval << " the rgw_user_quota_bucket_sync_interval will be "
373+
"multiplied by a factor of " << sync_interval_factor << dendl;
374+
}
375+
361376
do {
362377
map<rgw_bucket, rgw_owner> buckets;
363378

@@ -372,14 +387,20 @@ class RGWOwnerStatsCache : public RGWQuotaCache<rgw_owner> {
372387
}
373388
}
374389

375-
if (stats->going_down())
390+
if (stats->going_down()) {
376391
break;
392+
}
393+
394+
uint64_t wait_secs = cct->_conf->rgw_user_quota_bucket_sync_interval;
395+
wait_secs = std::max(uint64_t(1),
396+
uint64_t(wait_secs * sync_interval_factor));
377397

398+
// note: wait_for() normally blocks for the full period, but it
399+
// can return early if cond is notified or wakes spuriously
378400
std::unique_lock locker{lock};
379-
cond.wait_for(
380-
locker,
381-
std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval));
401+
cond.wait_for(locker, std::chrono::seconds(wait_secs));
382402
} while (!stats->going_down());
403+
383404
ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
384405

385406
return NULL;

0 commit comments

Comments
 (0)