
Commit 087aaa2

Merge pull request ceph#57538 from ivancich/wip-shrinky-dink
rgw: add shard reduction ability to dynamic resharding

Reviewed-by: Matt Benjamin <[email protected]>
2 parents 5ef9f89 + 9302fbb commit 087aaa2

File tree: 15 files changed, +530 -165 lines

src/cls/rgw/cls_rgw.cc

Lines changed: 20 additions & 4 deletions
@@ -4422,15 +4422,31 @@ static int rgw_reshard_add(cls_method_context_t hctx, bufferlist *in, bufferlist
     return -EINVAL;
   }
 
-
-  string key;
+  std::string key;
   op.entry.get_key(&key);
 
+  int ret;
   bufferlist bl;
+
+  if (op.create_only) {
+    ret = cls_cxx_map_get_val(hctx, key, &bl);
+    if (ret == 0) {
+      // entry already exists; make no changes
+      return -EEXIST;
+    } else if (ret != -ENOENT) {
+      CLS_ERR("error accessing reshard queue for %s with key %s",
+              op.entry.bucket_name.c_str(), key.c_str());
+      return ret;
+    }
+
+    // we got a -ENOENT and can just fall through...
+  }
+
   encode(op.entry, bl);
-  int ret = cls_cxx_map_set_val(hctx, key, &bl);
+  ret = cls_cxx_map_set_val(hctx, key, &bl);
   if (ret < 0) {
-    CLS_ERR("error adding reshard job for bucket %s with key %s",op.entry.bucket_name.c_str(), key.c_str());
+    CLS_ERR("error adding reshard job for bucket %s with key %s",
+            op.entry.bucket_name.c_str(), key.c_str());
     return ret;
   }
 

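The create_only branch gives the OSD-side class method create-if-absent semantics on the reshard queue's omap. A minimal stand-alone sketch of that behavior, with a std::map standing in for the omap that cls_cxx_map_get_val/cls_cxx_map_set_val operate on:

#include <cerrno>
#include <map>
#include <string>

// Stand-in for the reshard queue's omap: key -> serialized reshard entry.
using reshard_queue = std::map<std::string, std::string>;

// Models the new create_only behavior of rgw_reshard_add(): when the caller
// asks for create-only and the key is already present, leave it untouched
// and report -EEXIST; otherwise add or overwrite the entry.
int reshard_add(reshard_queue& q, const std::string& key,
                const std::string& entry, bool create_only)
{
  if (create_only && q.count(key) > 0) {
    return -EEXIST;  // entry already exists; make no changes
  }
  q[key] = entry;    // add (or overwrite) the queued reshard entry
  return 0;
}
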
src/cls/rgw/cls_rgw_client.cc

Lines changed: 4 additions & 1 deletion
@@ -1083,11 +1083,14 @@ void cls_rgw_mp_upload_part_info_update(librados::ObjectWriteOperation& op,
   op.exec(RGW_CLASS, RGW_MP_UPLOAD_PART_INFO_UPDATE, in);
 }
 
-void cls_rgw_reshard_add(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry)
+void cls_rgw_reshard_add(librados::ObjectWriteOperation& op,
+                         const cls_rgw_reshard_entry& entry,
+                         const bool create_only)
 {
   bufferlist in;
   cls_rgw_reshard_add_op call;
   call.entry = entry;
+  call.create_only = create_only;
   encode(call, in);
   op.exec(RGW_CLASS, RGW_RESHARD_ADD, in);
 }

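A hypothetical caller-side fragment (not part of this commit) showing how the extended helper might be driven; the IoCtx, object name, and entry values are placeholders, and RGW itself submits such ops through rgw_rados_operate() rather than io_ctx.operate(), as the header below notes.

// Assumes <rados/librados.hpp> and cls/rgw/cls_rgw_client.h are included
// and that io_ctx is an open librados::IoCtx on the reshard-log pool.
cls_rgw_reshard_entry entry;
entry.tenant = "acme";                  // placeholder values
entry.bucket_name = "logs";
entry.old_num_shards = 31;
entry.new_num_shards = 11;              // a reduction request
entry.initiator = cls_rgw_reshard_initiator::Dynamic;

librados::ObjectWriteOperation op;
cls_rgw_reshard_add(op, entry, true /* create_only */);
int r = io_ctx.operate(reshard_oid, &op);   // reshard_oid is a placeholder
if (r == -EEXIST) {
  // an entry for this bucket is already queued; leave it alone
}
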
src/cls/rgw/cls_rgw_client.h

Lines changed: 3 additions & 1 deletion
@@ -626,7 +626,9 @@ int cls_rgw_lc_list(librados::IoCtx& io_ctx, const std::string& oid,
 void cls_rgw_mp_upload_part_info_update(librados::ObjectWriteOperation& op, const std::string& part_key, const RGWUploadPartInfo& info);
 
 /* resharding */
-void cls_rgw_reshard_add(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry);
+void cls_rgw_reshard_add(librados::ObjectWriteOperation& op,
+                         const cls_rgw_reshard_entry& entry,
+                         const bool create_only);
 void cls_rgw_reshard_remove(librados::ObjectWriteOperation& op, const cls_rgw_reshard_entry& entry);
 // these overloads which call io_ctx.operate() should not be called in the rgw.
 // rgw_rados_operate() should be called after the overloads w/o calls to io_ctx.operate()

src/cls/rgw/cls_rgw_ops.h

Lines changed: 11 additions & 3 deletions
@@ -1480,19 +1480,27 @@ struct cls_rgw_mp_upload_part_info_update_op {
 WRITE_CLASS_ENCODER(cls_rgw_mp_upload_part_info_update_op)
 
 struct cls_rgw_reshard_add_op {
-  cls_rgw_reshard_entry entry;
+  cls_rgw_reshard_entry entry;
+
+  // true -> will not overwrite existing entry
+  bool create_only {false};
 
   cls_rgw_reshard_add_op() {}
 
   void encode(ceph::buffer::list& bl) const {
-    ENCODE_START(1, 1, bl);
+    ENCODE_START(2, 1, bl);
     encode(entry, bl);
+    encode(create_only, bl);
     ENCODE_FINISH(bl);
   }
 
   void decode(ceph::buffer::list::const_iterator& bl) {
-    DECODE_START(1, bl);
+    DECODE_START(2, bl);
     decode(entry, bl);
+    create_only = false;
+    if (struct_v >= 2) {
+      decode(create_only, bl);
+    }
     DECODE_FINISH(bl);
   }
   static void generate_test_instances(std::list<cls_rgw_reshard_add_op*>& o);

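The version bump in cls_rgw_reshard_add_op follows Ceph's usual wire-compatibility pattern: the new field is appended, the version passed to ENCODE_START/DECODE_START is raised, and the decoder reads the field only when struct_v is new enough, so blobs written by older daemons still decode. A simplified stand-alone model of that pattern (not the real ceph::buffer encoder):

#include <cstdint>
#include <optional>

// Toy stand-in for an encoded blob: a v1 blob carries only `entry`,
// a v2 blob additionally carries `create_only`.
struct blob {
  uint8_t struct_v;                 // version written by the encoder
  uint32_t entry;                   // stands in for cls_rgw_reshard_entry
  std::optional<bool> create_only;  // present only when struct_v >= 2
};

struct reshard_add_op {
  uint32_t entry = 0;
  bool create_only = false;

  blob encode() const {             // mirrors ENCODE_START(2, 1, bl)
    return blob{2, entry, create_only};
  }

  void decode(const blob& b) {      // mirrors DECODE_START(2, bl)
    entry = b.entry;
    create_only = false;            // safe default for v1 encodings
    if (b.struct_v >= 2) {          // only newer blobs carry the flag
      create_only = *b.create_only;
    }
  }
};
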
src/cls/rgw/cls_rgw_types.cc

Lines changed: 15 additions & 1 deletion
@@ -814,6 +814,19 @@ void rgw_usage_log_entry::generate_test_instances(list<rgw_usage_log_entry *> &o
   o.push_back(new rgw_usage_log_entry);
 }
 
+std::string to_string(cls_rgw_reshard_initiator i) {
+  switch (i) {
+  case cls_rgw_reshard_initiator::Unknown:
+    return "unknown";
+  case cls_rgw_reshard_initiator::Admin:
+    return "administrator";
+  case cls_rgw_reshard_initiator::Dynamic:
+    return "dynamic resharding";
+  default:
+    return "error";
+  }
+}
+
 void cls_rgw_reshard_entry::generate_key(const string& tenant, const string& bucket_name, string *key)
 {
   *key = tenant + ":" + bucket_name;
@@ -827,12 +840,13 @@ void cls_rgw_reshard_entry::get_key(string *key) const
 void cls_rgw_reshard_entry::dump(Formatter *f) const
 {
   utime_t ut(time);
-  encode_json("time",ut, f);
+  encode_json("time", ut, f);
   encode_json("tenant", tenant, f);
   encode_json("bucket_name", bucket_name, f);
   encode_json("bucket_id", bucket_id, f);
   encode_json("old_num_shards", old_num_shards, f);
   encode_json("tentative_new_num_shards", new_num_shards, f);
+  encode_json("initiator", to_string(initiator), f);
 }
 
 void cls_rgw_reshard_entry::generate_test_instances(list<cls_rgw_reshard_entry*>& ls)

src/cls/rgw/cls_rgw_types.h

Lines changed: 23 additions & 3 deletions
@@ -1325,25 +1325,40 @@ struct cls_rgw_lc_entry {
 };
 WRITE_CLASS_ENCODER(cls_rgw_lc_entry);
 
+
+// used to track the initiator of a reshard entry on the reshard queue (log)
+enum class cls_rgw_reshard_initiator : uint8_t {
+  Unknown = 0,
+  Admin = 1,
+  Dynamic = 2,
+};
+std::string to_string(cls_rgw_reshard_initiator i);
+inline std::ostream& operator<<(std::ostream& out, cls_rgw_reshard_initiator i) {
+  return out << to_string(i);
+}
+
+
 struct cls_rgw_reshard_entry
 {
   ceph::real_time time;
   std::string tenant;
   std::string bucket_name;
   std::string bucket_id;
-  uint32_t old_num_shards{0};
-  uint32_t new_num_shards{0};
+  uint32_t old_num_shards {0};
+  uint32_t new_num_shards {0};
+  cls_rgw_reshard_initiator initiator {cls_rgw_reshard_initiator::Unknown};
 
   cls_rgw_reshard_entry() {}
 
   void encode(ceph::buffer::list& bl) const {
-    ENCODE_START(2, 1, bl);
+    ENCODE_START(3, 1, bl);
     encode(time, bl);
     encode(tenant, bl);
     encode(bucket_name, bl);
     encode(bucket_id, bl);
     encode(old_num_shards, bl);
     encode(new_num_shards, bl);
+    encode(initiator, bl);
     ENCODE_FINISH(bl);
   }
 
@@ -1359,6 +1374,11 @@ struct cls_rgw_reshard_entry
     }
     decode(old_num_shards, bl);
     decode(new_num_shards, bl);
+    if (struct_v >= 3) {
+      decode(initiator, bl);
+    } else {
+      initiator = cls_rgw_reshard_initiator::Unknown;
+    }
     DECODE_FINISH(bl);
   }
 

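With to_string() declared here and defined in cls_rgw_types.cc above, plus the inline stream operator, the initiator tag can be printed directly; a minimal illustrative snippet (assumes the header above is included):

#include <iostream>

int main() {
  auto who = cls_rgw_reshard_initiator::Dynamic;
  // prints "reshard entry initiated by: dynamic resharding"
  std::cout << "reshard entry initiated by: " << who << std::endl;
  return 0;
}
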
src/common/options/rgw.yaml.in

Lines changed: 47 additions & 0 deletions
@@ -3236,6 +3236,36 @@ options:
   see_also:
   - rgw_max_objs_per_shard
   - rgw_max_dynamic_shards
+- name: rgw_dynamic_resharding_may_reduce
+  type: bool
+  level: advanced
+  desc: Whether dynamic resharding can reduce the number of shards
+  long_desc: If true, RGW's dynamic resharding ability is allowed to
+    reduce the number of shards if it appears there are too many.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_dynamic_resharding
+- name: rgw_dynamic_resharding_reduction_wait
+  type: uint
+  level: advanced
+  desc: Number of hours to delay bucket index shard reduction.
+  long_desc: >-
+    In order to avoid resharding buckets with object
+    counts that fluctuate up and down regularly, we implement a delay
+    between noting a shard reduction might be appropriate and when it's
+    actually done. This allows us to cancel the reshard operation if the
+    number of objects increases significantly during this delay.
+    WARNING: Setting this value too low could result in significantly reduced
+    cluster performance.
+  default: 120
+  min: 0
+  services:
+  - rgw
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_dynamic_resharding_may_reduce
 - name: rgw_max_objs_per_shard
   type: uint
   level: basic
@@ -3271,6 +3301,23 @@ options:
   services:
   - rgw
   min: 10
+- name: rgw_reshard_debug_interval
+  type: int
+  level: dev
+  desc: The number of seconds that simulate one "day" in order to debug RGW dynamic resharding.
+    Do *not* modify for a production cluster.
+  long_desc: For debugging RGW dynamic resharding, the number of seconds that are equivalent to
+    one simulated "day". Values less than 1 are ignored and do not change dynamic resharding behavior.
+    For example, during debugging if one wanted every 10 minutes to be equivalent to one day,
+    then this would be set to 600, the number of seconds in 10 minutes.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_reshard_thread_interval
+  - rgw_dynamic_resharding_reduction_wait
 - name: rgw_cache_expiry_interval
   type: uint
   level: advanced

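The two reduction-related options interact with the debug knob: the reduction wait is configured in hours, while rgw_reshard_debug_interval redefines how many wall-clock seconds count as one simulated "day" during testing. A purely hypothetical helper (not taken from this commit) illustrating how such a scaled delay could be derived from those settings:

#include <cstdint>

// Hypothetical helper: convert the reduction wait (hours, as in
// rgw_dynamic_resharding_reduction_wait) into seconds, compressing time
// when a debug interval redefines the length of one "day" for testing.
uint64_t reduction_wait_secs(uint64_t wait_hours, int debug_interval_secs)
{
  constexpr uint64_t secs_per_hour = 3600;
  if (debug_interval_secs < 1) {
    return wait_hours * secs_per_hour;               // normal operation
  }
  // debug mode: debug_interval_secs seconds stand in for one 24-hour day
  return wait_hours * uint64_t(debug_interval_secs) / 24;
}
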
src/rgw/driver/rados/rgw_rados.cc

Lines changed: 63 additions & 32 deletions
@@ -10324,66 +10324,96 @@ int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBuck
   return 0;
 }
 
+
+// uses information that the store has easy access to transition to the shard calculation logic
+void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
+                                          const uint64_t num_objs,
+                                          const uint32_t num_source_shards,
+                                          bool& need_resharding,
+                                          uint32_t* suggested_num_shards)
+{
+  const uint32_t max_dynamic_shards =
+    uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
+  const uint64_t max_objs_per_shard =
+    cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+  const bool is_multisite = svc.zone->need_to_log_data();
+
+  RGWBucketReshard::calculate_preferred_shards(dpp,
+                                               max_dynamic_shards,
+                                               max_objs_per_shard,
+                                               is_multisite,
+                                               num_objs,
+                                               num_source_shards,
+                                               need_resharding,
+                                               suggested_num_shards);
+}
+
+
+// Check whether a bucket is a candidate for dynamic resharding and if
+// so, add it to the reshard queue (log).
+//
+// We implement dynamic reshard reduction (where the number of shards
+// can be reduced) in the following manner. In addition to the maximum
+// number of desired entries per shard, we now set a minimum
 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
                                   uint64_t num_objs,
-                                  const DoutPrefixProvider *dpp, optional_yield y)
+                                  const DoutPrefixProvider* dpp, optional_yield y)
 {
   if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
-      return 0;
+    return 0;
   }
 
   if (! is_layout_reshardable(bucket_info.layout)) {
     return 0;
   }
 
-  bool need_resharding = false;
-  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
-  const uint32_t max_dynamic_shards =
-    uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
-
-  if (num_source_shards >= max_dynamic_shards) {
-    return 0;
-  }
+  // TODO: consider per-bucket sync policy here?
 
+  bool need_resharding = false;
   uint32_t suggested_num_shards = 0;
-  const uint64_t max_objs_per_shard =
-    cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+  const uint32_t num_source_shards =
+    rgw::current_num_shards(bucket_info.layout);
 
-  // TODO: consider per-bucket sync policy here?
-  const bool is_multisite = svc.zone->need_to_log_data();
-
-  quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
-                                     num_objs, is_multisite, need_resharding,
-                                     &suggested_num_shards);
+  calculate_preferred_shards(dpp, num_objs, num_source_shards,
+                             need_resharding, &suggested_num_shards);
   if (! need_resharding) {
     return 0;
  }
 
-  const uint32_t final_num_shards =
-    RGWBucketReshard::get_preferred_shards(suggested_num_shards,
-                                           max_dynamic_shards);
   // final verification, so we don't reduce number of shards
-  if (final_num_shards <= num_source_shards) {
+  const bool may_reduce =
+    uint32_t(cct->_conf.get_val<bool>("rgw_dynamic_resharding_may_reduce"));
+  if (! may_reduce && suggested_num_shards <= num_source_shards) {
     return 0;
   }
 
-  ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket_info.bucket.name <<
-    " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
-    "; new num shards " << final_num_shards << " (suggested " <<
-    suggested_num_shards << ")" << dendl;
+  ldpp_dout(dpp, 1) << "RGWRados::" << __func__ <<
+    " bucket " << bucket_info.bucket.name <<
+    " needs resharding; current num shards " << num_source_shards <<
+    "; new num shards " << suggested_num_shards << dendl;
 
-  return add_bucket_to_reshard(dpp, bucket_info, final_num_shards, y);
+  return add_bucket_to_reshard(dpp, bucket_info, suggested_num_shards, y);
 }
 
-int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards, optional_yield y)
+int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp,
+                                    const RGWBucketInfo& bucket_info,
+                                    uint32_t new_num_shards,
+                                    optional_yield y)
 {
   RGWReshard reshard(this->driver, dpp);
 
-  uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
-
+  const uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+  const bool may_reduce =
+    uint32_t(cct->_conf.get_val<bool>("rgw_dynamic_resharding_may_reduce"));
  new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
-  if (new_num_shards <= num_source_shards) {
-    ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
+
+  if ((! may_reduce && new_num_shards < num_source_shards) ||
+      new_num_shards == num_source_shards) {
+    ldpp_dout(dpp, 10) << "WARNING: " << __func__ <<
+      ": rejecting resharding request for bucket name=" <<
+      bucket_info.bucket.name << ", shard count=" << num_source_shards <<
+      ", new shard count=" << new_num_shards <<
+      ", rgw_dynamic_resharding_may_reduce=" << may_reduce << dendl;
     return 0;
   }
 
@@ -10394,6 +10424,7 @@ int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBuck
   entry.bucket_id = bucket_info.bucket.bucket_id;
   entry.old_num_shards = num_source_shards;
   entry.new_num_shards = new_num_shards;
+  entry.initiator = cls_rgw_reshard_initiator::Dynamic;
 
   return reshard.add(dpp, entry, y);
 }

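The behavioral change in add_bucket_to_reshard() is concentrated in the acceptance check: previously any target shard count less than or equal to the current count was dropped, whereas now a smaller count is rejected only when rgw_dynamic_resharding_may_reduce is false, and an unchanged count is always rejected. A self-contained restatement of that guard:

#include <cstdint>

// Mirrors the guard added to RGWRados::add_bucket_to_reshard(): reject a
// request that would shrink the bucket index when reduction is disabled,
// and always reject a no-op request that keeps the shard count unchanged.
bool accept_reshard_request(uint32_t current_shards,
                            uint32_t new_shards,
                            bool may_reduce)
{
  if ((!may_reduce && new_shards < current_shards) ||
      new_shards == current_shards) {
    return false;  // RGW logs this as a rejected resharding request
  }
  return true;     // growth always allowed; shrinking only with may_reduce
}
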
src/rgw/driver/rados/rgw_rados.h

Lines changed: 6 additions & 0 deletions
@@ -1586,6 +1586,12 @@ class RGWRados
                          RGWQuota& quota, uint64_t obj_size,
                          optional_yield y, bool check_size_only = false);
 
+  void calculate_preferred_shards(const DoutPrefixProvider* dpp,
+                                  const uint64_t num_objs,
+                                  const uint32_t current_shard_count,
+                                  bool& need_resharding,
+                                  uint32_t* suggested_num_shard_count = nullptr);
+
   int check_bucket_shards(const RGWBucketInfo& bucket_info, uint64_t num_objs,
                           const DoutPrefixProvider *dpp, optional_yield y);
 
