Skip to content

Commit 5ee4a92

Browse files
committed
rgw: allow per-bucket minimum number of shards
Dynamic resharding can now reduce the number of shards. The code currently has a hard-coded value of 11 as the minimum number of shards dynamic resharding can reshard to. There may be cases where the user wants to set an alternate minimum, such as when they have a sense of how many objects the bucket will eventually hold. This PR builds off of ceph#61269. That PR allows the user to specify an initial number of shards during bucket creation. This PR then takes that number to be the minimum and saves it in the layout field of the bucket instance object (RGWBucketInfo). When dynamic resharding is triggered, it will use that stored value as a minimum number of shards for resharding. Signed-off-by: J. Eric Ivancich <[email protected]>
1 parent bdd6ce2 commit 5ee4a92

File tree

9 files changed

+122
-47
lines changed

9 files changed

+122
-47
lines changed

src/rgw/driver/rados/rgw_bucket.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2815,6 +2815,7 @@ void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
28152815

28162816
if (shards) {
28172817
layout.current_index.layout.normal.num_shards = *shards;
2818+
layout.current_index.layout.normal.min_num_shards = *shards;
28182819
} else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
28192820
layout.current_index.layout.normal.num_shards =
28202821
cct->_conf->rgw_override_bucket_index_max_shards;

src/rgw/driver/rados/rgw_rados.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10823,6 +10823,7 @@ int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBuck
1082310823
void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
1082410824
const uint64_t num_objs,
1082510825
const uint32_t num_source_shards,
10826+
const uint32_t min_layout_shards,
1082610827
bool& need_resharding,
1082710828
uint32_t* suggested_num_shards)
1082810829
{
@@ -10834,6 +10835,7 @@ void RGWRados::calculate_preferred_shards(const DoutPrefixProvider* dpp,
1083410835

1083510836
RGWBucketReshard::calculate_preferred_shards(dpp,
1083610837
max_dynamic_shards,
10838+
min_layout_shards,
1083710839
max_objs_per_shard,
1083810840
is_multisite,
1083910841
num_objs,
@@ -10867,8 +10869,11 @@ int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
1086710869
uint32_t suggested_num_shards = 0;
1086810870
const uint32_t num_source_shards =
1086910871
rgw::current_num_shards(bucket_info.layout);
10872+
const uint32_t min_layout_shards =
10873+
rgw::current_min_layout_shards(bucket_info.layout);
1087010874

10871-
calculate_preferred_shards(dpp, num_objs, num_source_shards,
10875+
calculate_preferred_shards(dpp, num_objs,
10876+
num_source_shards, min_layout_shards,
1087210877
need_resharding, &suggested_num_shards);
1087310878
if (! need_resharding) {
1087410879
return 0;

src/rgw/driver/rados/rgw_rados.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1630,6 +1630,7 @@ int restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx,
16301630
void calculate_preferred_shards(const DoutPrefixProvider* dpp,
16311631
const uint64_t num_objs,
16321632
const uint32_t current_shard_count,
1633+
const uint32_t min_layout_shards,
16331634
bool& need_resharding,
16341635
uint32_t* suggested_num_shard_count = nullptr);
16351636

src/rgw/driver/rados/rgw_reshard.cc

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -69,23 +69,13 @@ const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
6969
};
7070

7171

72-
uint32_t RGWBucketReshard::get_prime_shard_count(
73-
uint32_t shard_count,
74-
uint32_t max_dynamic_shards,
75-
uint32_t min_dynamic_shards)
76-
{
72+
uint32_t RGWBucketReshard::nearest_prime(uint32_t shard_count) {
7773
uint32_t prime_shard_count =
7874
get_prime_shards_greater_or_equal(shard_count);
7975

8076
// if we cannot find a larger prime number, then just use what was
8177
// passed in
82-
if (! prime_shard_count) {
83-
prime_shard_count = shard_count;
84-
}
85-
86-
// keep within min/max bounds
87-
return std::min(max_dynamic_shards,
88-
std::max(min_dynamic_shards, prime_shard_count));
78+
return prime_shard_count ? prime_shard_count : shard_count;
8979
}
9080

9181

@@ -96,6 +86,7 @@ uint32_t RGWBucketReshard::get_prime_shard_count(
9686
void RGWBucketReshard::calculate_preferred_shards(
9787
const DoutPrefixProvider* dpp,
9888
const uint32_t max_dynamic_shards,
89+
const uint32_t min_layout_shards,
9990
const uint64_t max_objs_per_shard,
10091
const bool is_multisite,
10192
const uint64_t num_objs,
@@ -139,10 +130,13 @@ void RGWBucketReshard::calculate_preferred_shards(
139130
}
140131

141132
if (prefer_prime) {
142-
calculated_num_shards = get_prime_shard_count(
143-
calculated_num_shards, max_dynamic_shards, min_dynamic_shards);
133+
calculated_num_shards = nearest_prime(calculated_num_shards);
144134
}
145135

136+
calculated_num_shards =
137+
std::min(max_dynamic_shards,
138+
std::max({ calculated_num_shards, min_dynamic_shards, min_layout_shards }));
139+
146140
ldpp_dout(dpp, 20) << __func__ << ": reshard " << verb <<
147141
" suggested; current average (objects/shard) is " <<
148142
float(num_objs) / current_num_shards << ", which is not within " <<
@@ -461,6 +455,7 @@ static int init_target_layout(rgw::sal::RadosStore* store,
461455
rgw::bucket_index_layout_generation target;
462456
target.layout.type = rgw::BucketIndexType::Normal;
463457
target.layout.normal.num_shards = new_num_shards;
458+
target.layout.normal.min_num_shards = current.layout.normal.min_num_shards;
464459
target.gen = current.gen + 1;
465460

466461
if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
@@ -1256,7 +1251,7 @@ int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& curr
12561251
// block the client op and complete the resharding
12571252
ceph_assert(bucket_info.layout.resharding == rgw::BucketReshardState::InProgress);
12581253
ret = reshard_process(current, max_op_entries, target_shards_mgr, verbose_json_out, out,
1259-
formatter, bucket_info.layout.resharding, dpp, y);
1254+
formatter, bucket_info.layout.resharding, dpp, y);
12601255
if (ret < 0) {
12611256
ldpp_dout(dpp, 0) << __func__ << ": failed in progress state of reshard ret = " << ret << dendl;
12621257
return ret;
@@ -1637,6 +1632,9 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
16371632
ret = store->getRados()->get_bucket_stats(dpp, bucket_info,
16381633
bucket_info.layout.current_index,
16391634
-1, nullptr, nullptr, stats, nullptr, nullptr);
1635+
if (ret < 0) {
1636+
return clean_up("unable to access buckets current stats");
1637+
}
16401638

16411639
// determine current number of bucket entries across shards
16421640
uint64_t num_entries = 0;
@@ -1645,15 +1643,17 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
16451643
}
16461644

16471645
const uint32_t current_shard_count =
1648-
rgw::num_shards(bucket_info.get_current_index().layout.normal);
1646+
rgw::current_num_shards(bucket_info.layout);
1647+
const uint32_t min_layout_shards =
1648+
rgw::current_min_layout_shards(bucket_info.layout);
16491649

16501650
bool needs_resharding { false };
16511651
uint32_t suggested_shard_count { 0 };
16521652
// calling this rados function determines various rados values
16531653
// needed to perform the calculation before calling
16541654
// calculate_preferred_shards() in this class
16551655
store->getRados()->calculate_preferred_shards(
1656-
dpp, num_entries, current_shard_count,
1656+
dpp, num_entries, current_shard_count, min_layout_shards,
16571657
needs_resharding, &suggested_shard_count);
16581658

16591659
// if we no longer need resharding or currently need to expand
@@ -1711,7 +1711,6 @@ int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
17111711
}
17121712

17131713
// all checks passed; we can reshard...
1714-
17151714
RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
17161715

17171716
ReshardFaultInjector f; // no fault injected

src/rgw/driver/rados/rgw_reshard.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,14 +160,12 @@ class RGWBucketReshard {
160160
}
161161
}
162162

163-
// returns a preferred number of shards given a calculated number of
164-
// shards based on max_dynamic_shards and the list of prime values
165-
static uint32_t get_prime_shard_count(uint32_t suggested_shards,
166-
uint32_t max_dynamic_shards,
167-
uint32_t min_dynamic_shards);
163+
// returns a prime from the reshard prime list that is >= suggested_shards, or suggested_shards itself when the list has no such prime
164+
static uint32_t nearest_prime(uint32_t suggested_shards);
168165

169166
static void calculate_preferred_shards(const DoutPrefixProvider* dpp,
170167
const uint32_t max_dynamic_shards,
168+
const uint32_t min_layout_shards,
171169
const uint64_t max_objs_per_shard,
172170
const bool is_multisite,
173171
const uint64_t num_objs,

src/rgw/rgw_bucket_layout.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,29 +81,37 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj)
8181
// bucket_index_normal_layout
8282
void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f)
8383
{
84-
ENCODE_START(1, 1, bl);
84+
ENCODE_START(2, 1, bl);
8585
encode(l.num_shards, bl);
8686
encode(l.hash_type, bl);
87+
encode(l.min_num_shards, bl);
8788
ENCODE_FINISH(bl);
8889
}
8990
void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl)
9091
{
91-
DECODE_START(1, bl);
92+
DECODE_START(2, bl);
9293
decode(l.num_shards, bl);
9394
decode(l.hash_type, bl);
95+
if (struct_v >= 2) {
96+
decode(l.min_num_shards, bl);
97+
}
9498
DECODE_FINISH(bl);
9599
}
96100
void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f)
97101
{
98102
f->open_object_section(name);
99103
encode_json("num_shards", l.num_shards, f);
100104
encode_json("hash_type", l.hash_type, f);
105+
encode_json("min_num_shards", l.min_num_shards, f);
101106
f->close_section();
102107
}
103108
void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj)
104109
{
105110
JSONDecoder::decode_json("num_shards", l.num_shards, obj);
106111
JSONDecoder::decode_json("hash_type", l.hash_type, obj);
112+
113+
// if not set in json, set to default value of 1
114+
JSONDecoder::decode_json("min_num_shards", l.min_num_shards, obj, 1);
107115
}
108116

109117
// bucket_index_layout

src/rgw/rgw_bucket_layout.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,15 @@ void decode_json_obj(BucketHashType& t, JSONObj *obj);
5454
struct bucket_index_normal_layout {
5555
uint32_t num_shards = 1;
5656

57+
// the fewest number of shards this bucket layout allows
58+
uint32_t min_num_shards = 1;
59+
5760
BucketHashType hash_type = BucketHashType::Mod;
5861

59-
friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
60-
out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
62+
friend std::ostream& operator<<(std::ostream& out,
63+
const bucket_index_normal_layout& l) {
64+
out << "num_shards=" << l.num_shards << ", min_num_shards=" <<
65+
l.min_num_shards << ", hash_type=" << to_string(l.hash_type);
6166
return out;
6267
}
6368
};
@@ -278,9 +283,13 @@ inline uint32_t num_shards(const bucket_index_layout& index) {
278283
inline uint32_t num_shards(const bucket_index_layout_generation& index) {
279284
return num_shards(index.layout);
280285
}
286+
281287
inline uint32_t current_num_shards(const BucketLayout& layout) {
282288
return num_shards(layout.current_index);
283289
}
290+
inline uint32_t current_min_layout_shards(const BucketLayout& layout) {
291+
return layout.current_index.layout.normal.min_num_shards;
292+
}
284293
inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
285294
return layout.layout.type == BucketIndexType::Indexless;
286295
}

src/rgw/rgw_common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,6 +1045,9 @@ class RGWSI_Zone;
10451045

10461046
#include "rgw_cksum.h"
10471047

1048+
1049+
// this represents the at-rest bucket instance object and is stored as
1050+
// a system object
10481051
struct RGWBucketInfo {
10491052
rgw_bucket bucket;
10501053
rgw_owner owner;

src/test/rgw/test_rgw_reshard.cc

Lines changed: 70 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
#include <gtest/gtest.h>
1717

1818

19-
TEST(TestRGWReshard, dynamic_reshard_shard_count)
19+
TEST(TestRGWReshard, max_prime_shards)
2020
{
2121
// assuming we have prime numbers up to 1999
2222
ASSERT_EQ(1999u, RGWBucketReshard::get_max_prime_shards()) <<
2323
"initial list has primes up to 1999";
24+
}
2425

26+
TEST(TestRGWReshard, prime_lookups)
27+
{
2528
ASSERT_EQ(1u, RGWBucketReshard::get_prime_shards_greater_or_equal(1)) <<
2629
"we allow for 1 shard even though it's not prime";
2730
ASSERT_EQ(809u, RGWBucketReshard::get_prime_shards_greater_or_equal(808)) <<
@@ -47,24 +50,72 @@ TEST(TestRGWReshard, dynamic_reshard_shard_count)
4750
"811 is prime";
4851
ASSERT_EQ(811u, RGWBucketReshard::get_prime_shards_less_or_equal(812)) <<
4952
"821 is prime";
53+
}
5054

55+
TEST(TestRGWReshard, nearest_prime)
56+
{
5157
// tests rounding up to a prime, including inputs at and beyond the end of the prime list
52-
ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 1999, 11));
53-
ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 1999, 11));
54-
ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(2000, 1999, 11));
55-
56-
// tests when max dynamic shards is above end of prime list
57-
ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1998, 3000, 11));
58-
ASSERT_EQ(1999u, RGWBucketReshard::get_prime_shard_count(1999, 3000, 11));
59-
ASSERT_EQ(2000u, RGWBucketReshard::get_prime_shard_count(2000, 3000, 11));
60-
ASSERT_EQ(2001u, RGWBucketReshard::get_prime_shard_count(2001, 3000, 11));
61-
62-
// tests when max dynamic shards is below end of prime list
63-
ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(1998, 500, 11));
64-
ASSERT_EQ(500u, RGWBucketReshard::get_prime_shard_count(2001, 500, 11));
65-
66-
// tests when max dynamic shards is below end of prime list
67-
ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(498, 1999, 499));
68-
ASSERT_EQ(499u, RGWBucketReshard::get_prime_shard_count(499, 1999, 499));
69-
ASSERT_EQ(503u, RGWBucketReshard::get_prime_shard_count(500, 1999, 499));
58+
59+
ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(238));
60+
ASSERT_EQ(239u, RGWBucketReshard::nearest_prime(239));
61+
ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(240));
62+
ASSERT_EQ(241u, RGWBucketReshard::nearest_prime(241));
63+
ASSERT_EQ(251u, RGWBucketReshard::nearest_prime(242));
64+
65+
ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1995));
66+
ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1996));
67+
ASSERT_EQ(1997u, RGWBucketReshard::nearest_prime(1997));
68+
ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1998));
69+
ASSERT_EQ(1999u, RGWBucketReshard::nearest_prime(1999));
70+
ASSERT_EQ(2000u, RGWBucketReshard::nearest_prime(2000));
71+
}
72+
73+
TEST(TestRGWReshard, calculate_preferred_shards)
74+
{
75+
bool needs_resharding;
76+
uint32_t suggested_shard_count = 0;
77+
78+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 10000000, 200,
79+
needs_resharding, &suggested_shard_count);
80+
81+
ASSERT_EQ(false, needs_resharding) << "no need to reshard when shards are half-used";
82+
83+
84+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
85+
needs_resharding, &suggested_shard_count, false);
86+
ASSERT_EQ(true, needs_resharding);
87+
ASSERT_EQ(404, suggested_shard_count) << "number of shards when primes are not preferred";
88+
89+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, false, 20200000, 200,
90+
needs_resharding, &suggested_shard_count, true);
91+
ASSERT_EQ(true, needs_resharding);
92+
ASSERT_EQ(409, suggested_shard_count) << "number of shards when primes are preferred";
93+
94+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 101, 100000, true, 20200000, 200,
95+
needs_resharding, &suggested_shard_count, true);
96+
ASSERT_EQ(true, needs_resharding);
97+
ASSERT_EQ(1619, suggested_shard_count) <<
98+
"number of shards under multisite with primes preferred since "
99+
"multisite quadruples number of shards to reduce need to reshaard";
100+
101+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 650000, 700,
102+
needs_resharding, &suggested_shard_count, true);
103+
// 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
104+
// 50000 = 13
105+
ASSERT_EQ(true, needs_resharding);
106+
ASSERT_EQ(13, suggested_shard_count) << "shard reduction without hitting min_layout_shards";
107+
108+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 3, 100000, false, 350000, 400,
109+
needs_resharding, &suggested_shard_count, true);
110+
// 350,000 objs across 400 shards -> <1000 objs per shard; 350000 /
111+
// 50000 = 7, but hard-coded minimum of 11
112+
ASSERT_EQ(true, needs_resharding);
113+
ASSERT_EQ(11, suggested_shard_count) << "shard reduction and hitting hard-coded minimum of 11";
114+
115+
RGWBucketReshard::calculate_preferred_shards(nullptr, 1999, 51, 100000, false, 650000, 700,
116+
needs_resharding, &suggested_shard_count, true);
117+
// 650,000 objs across 700 shards -> <1000 objs per shard; 650000 /
118+
// 50000 = 13, but bucket min of 51
119+
ASSERT_EQ(true, needs_resharding);
120+
ASSERT_EQ(51, suggested_shard_count) << "shard reduction and hitting min_layout_shards";
70121
}

0 commit comments

Comments
 (0)