Skip to content

Commit 49225d3

Browse files
authored
Merge pull request ceph#62223 from aainscow/enable-switch-flag
osd: Add ability to switch EC optimizations on

Reviewed-by: Bill Scales <[email protected]>
Reviewed-by: Radoslaw Zarzynski <[email protected]>
Reviewed-by: Samuel Just <[email protected]>
2 parents 73c6e1b + 446df87 commit 49225d3

File tree

11 files changed

+123
-37
lines changed

11 files changed

+123
-37
lines changed

src/common/io_exerciser/RadosIo.cc

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ RadosIo::RadosIo(librados::Rados& rados, boost::asio::io_context& asio,
4141
const std::string& pool, const std::string& oid,
4242
const std::optional<std::vector<int>>& cached_shard_order,
4343
uint64_t block_size, int seed, int threads, ceph::mutex& lock,
44-
ceph::condition_variable& cond)
44+
ceph::condition_variable& cond, bool ec_optimizations)
4545
: Model(oid, block_size),
4646
rados(rados),
4747
asio(asio),
@@ -58,6 +58,9 @@ RadosIo::RadosIo(librados::Rados& rados, boost::asio::io_context& asio,
5858
rc = rados.ioctx_create(pool.c_str(), io);
5959
ceph_assert(rc == 0);
6060
allow_ec_overwrites(true);
61+
if (ec_optimizations) {
62+
allow_ec_optimizations();
63+
}
6164
}
6265

6366
RadosIo::~RadosIo() {}
@@ -92,6 +95,17 @@ void RadosIo::allow_ec_overwrites(bool allow) {
9295
ceph_assert(rc == 0);
9396
}
9497

98+
void RadosIo::allow_ec_optimizations()
99+
{
100+
int rc;
101+
bufferlist inbl, outbl;
102+
std::string cmdstr =
103+
"{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \
104+
\"var\": \"allow_ec_optimizations\", \"val\": \"true\"}";
105+
rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr);
106+
ceph_assert(rc == 0);
107+
}
108+
95109
template <int N>
96110
RadosIo::AsyncOpInfo<N>::AsyncOpInfo(const std::array<uint64_t, N>& offset,
97111
const std::array<uint64_t, N>& length)

src/common/io_exerciser/RadosIo.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,12 @@ class RadosIo : public Model {
4242
const std::string& pool, const std::string& oid,
4343
const std::optional<std::vector<int>>& cached_shard_order,
4444
uint64_t block_size, int seed, int threads, ceph::mutex& lock,
45-
ceph::condition_variable& cond);
45+
ceph::condition_variable& cond, bool ec_optimizations);
4646

4747
~RadosIo();
4848

4949
void allow_ec_overwrites(bool allow);
50+
void allow_ec_optimizations();
5051

5152
template <int N>
5253
class AsyncOpInfo {

src/mon/MonCommands.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,11 +1170,11 @@ COMMAND("osd pool rename "
11701170
"rename <srcpool> to <destpool>", "osd", "rw")
11711171
COMMAND("osd pool get "
11721172
"name=pool,type=CephPoolname "
1173-
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay",
1173+
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations",
11741174
"get pool parameter <var>", "osd", "r")
11751175
COMMAND("osd pool set "
11761176
"name=pool,type=CephPoolname "
1177-
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay "
1177+
"name=var,type=CephChoices,strings=size|min_size|pg_num|pgp_num|pgp_num_actual|crush_rule|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites|fingerprint_algorithm|pg_autoscale_mode|pg_autoscale_bias|pg_num_min|pg_num_max|target_size_bytes|target_size_ratio|dedup_tier|dedup_chunk_algorithm|dedup_cdc_chunk_size|eio|bulk|read_ratio|pct_update_delay|allow_ec_optimizations "
11781178
"name=val,type=CephString "
11791179
"name=yes_i_really_mean_it,type=CephBool,req=false",
11801180
"set pool parameter <var> to <val>", "osd", "rw")

src/mon/OSDMonitor.cc

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5410,7 +5410,8 @@ namespace {
54105410
CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
54115411
PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
54125412
PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
5413-
DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO };
5413+
DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX, READ_RATIO,
5414+
EC_OPTIMIZATIONS };
54145415

54155416
std::set<osd_pool_get_choices>
54165417
subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -6215,7 +6216,8 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
62156216
{"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
62166217
{"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
62176218
{"bulk", BULK},
6218-
{"read_ratio", READ_RATIO}
6219+
{"read_ratio", READ_RATIO},
6220+
{"allow_ec_optimizations", EC_OPTIMIZATIONS}
62196221
};
62206222

62216223
typedef std::set<osd_pool_get_choices> choices_set_t;
@@ -6230,7 +6232,7 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
62306232
HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
62316233
};
62326234
const choices_set_t ONLY_ERASURE_CHOICES = {
6233-
EC_OVERWRITES, ERASURE_CODE_PROFILE
6235+
EC_OVERWRITES, ERASURE_CODE_PROFILE, EC_OPTIMIZATIONS
62346236
};
62356237
const choices_set_t ONLY_REPLICA_CHOICES = {
62366238
READ_RATIO
@@ -6462,17 +6464,23 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
64626464
case DEDUP_CHUNK_ALGORITHM:
64636465
case DEDUP_CDC_CHUNK_SIZE:
64646466
case READ_RATIO:
6465-
pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6466-
if (p->opts.is_set(key)) {
6467-
if(*it == CSUM_TYPE) {
6468-
int64_t val;
6469-
p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6470-
f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6471-
} else {
6472-
p->opts.dump(i->first, f.get());
6473-
}
6467+
{
6468+
pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6469+
if (p->opts.is_set(key)) {
6470+
if(*it == CSUM_TYPE) {
6471+
int64_t val;
6472+
p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6473+
f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6474+
} else {
6475+
p->opts.dump(i->first, f.get());
6476+
}
6477+
}
64746478
}
64756479
break;
6480+
case EC_OPTIMIZATIONS:
6481+
f->dump_bool("allow_ec_optimizations",
6482+
p->has_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS));
6483+
break;
64766484
}
64776485
}
64786486
f->close_section();
@@ -6644,6 +6652,11 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
66446652
}
66456653
}
66466654
break;
6655+
case EC_OPTIMIZATIONS:
6656+
ss << "allow_ec_optimizations: " <<
6657+
(p->has_flag(pg_pool_t::FLAG_EC_OPTIMIZATIONS) ? "true" : "false") <<
6658+
"\n";
6659+
break;
66476660
}
66486661
rdata.append(ss.str());
66496662
ss.str("");
@@ -8796,8 +8809,31 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
87968809
if (val == "true" || (interr.empty() && n == 1)) {
87978810
p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
87988811
} else if (val == "false" || (interr.empty() && n == 0)) {
8799-
ss << "ec overwrites cannot be disabled once enabled";
8812+
if ((p.flags & pg_pool_t::FLAG_EC_OVERWRITES) != 0) {
8813+
ss << "ec overwrites cannot be disabled once enabled";
8814+
return -EINVAL;
8815+
}
8816+
} else {
8817+
ss << "expecting value 'true', 'false', '0', or '1'";
88008818
return -EINVAL;
8819+
}
8820+
} else if (var == "allow_ec_optimizations") {
8821+
if (!p.is_erasure()) {
8822+
ss << "allow_ec_optimizations can only be enabled for an erasure coded pool";
8823+
return -EINVAL;
8824+
}
8825+
if (osdmap.require_osd_release < ceph_release_t::tentacle) {
8826+
ss << "All OSDs must be upgraded to tentacle or "
8827+
<< "later before setting allow_ec_optimizations";
8828+
return -EINVAL;
8829+
}
8830+
if (val == "true" || (interr.empty() && n == 1)) {
8831+
p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS;
8832+
} else if (val == "false" || (interr.empty() && n == 0)) {
8833+
if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) {
8834+
ss << "allow_ec_optimizations cannot be disabled once enabled";
8835+
return -EINVAL;
8836+
}
88018837
} else {
88028838
ss << "expecting value 'true', 'false', '0', or '1'";
88038839
return -EINVAL;

src/osd/ECSwitch.h

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class ECSwitch : public PGBackend
2929
friend class ECReadPred;
3030

3131
ECLegacy::ECBackendL legacy;
32-
ECLegacy::ECBackendL optimized;
32+
ECBackend optimized;
3333
bool is_optimized_actual;
3434

3535
public:
@@ -44,12 +44,13 @@ class ECSwitch : public PGBackend
4444
PGBackend(cct, pg, store, coll, ch),
4545
legacy(pg, cct, ec_impl, stripe_width, this),
4646
optimized(pg, cct, ec_impl, stripe_width, this),
47-
is_optimized_actual(false) {}
47+
is_optimized_actual(get_parent()->get_pool().allows_ecoptimizations()) {}
4848

4949
bool is_optimized() const
5050
{
51-
// FIXME: Interface not yet implemented.
52-
//ceph_assert(is_optimized_actual == get_parent()->get_pool().allows_ecoptimizations());
51+
// FIXME: Once we trust this, we can remove this assert, as it adds
52+
// function call overhead.
53+
ceph_assert(is_optimized_actual == get_parent()->get_pool().allows_ecoptimizations());
5354
return is_optimized_actual;
5455
}
5556

@@ -84,7 +85,7 @@ class ECSwitch : public PGBackend
8485
private:
8586
const ECSwitch *switcher;
8687
std::unique_ptr<ECLegacy::ECBackendL::ECRecPred> legacy;
87-
std::unique_ptr<ECLegacy::ECBackendL::ECRecPred> optimized;
88+
std::unique_ptr<ECBackend::ECRecPred> optimized;
8889
};
8990

9091
class ECReadPred : public IsPGReadablePredicate
@@ -111,7 +112,7 @@ class ECSwitch : public PGBackend
111112
private:
112113
const ECSwitch *switcher;
113114
std::unique_ptr<ECLegacy::ECBackendL::ECReadPred> legacy;
114-
std::unique_ptr<ECLegacy::ECBackendL::ECReadPred> optimized;
115+
std::unique_ptr<ECBackend::ECReadPred> optimized;
115116
};
116117

117118
RecoveryHandle *open_recovery_op() override
@@ -181,7 +182,11 @@ class ECSwitch : public PGBackend
181182
else {
182183
legacy.on_change();
183184
}
184-
//FIXME: Switch to new EC here.
185+
186+
if (!is_optimized_actual)
187+
is_optimized_actual = get_parent()->get_pool().allows_ecoptimizations();
188+
else
189+
ceph_assert(get_parent()->get_pool().allows_ecoptimizations());
185190
}
186191

187192
void clear_recovery_state() override
@@ -345,7 +350,7 @@ class ECSwitch : public PGBackend
345350
object_size_to_shard_size(const uint64_t size, int shard) const override
346351
{
347352
if (is_optimized()) {
348-
return optimized.object_size_to_shard_size(size);
353+
return optimized.object_size_to_shard_size(size, shard);
349354
}
350355
return legacy.object_size_to_shard_size(size);
351356
// All shards are the same size.

src/osd/osd_types.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4199,6 +4199,8 @@ bool PastIntervals::is_new_interval(
41994199
uint32_t new_crush_barrier,
42004200
int32_t old_crush_member,
42014201
int32_t new_crush_member,
4202+
bool old_allow_ec_optimizations,
4203+
bool new_allow_ec_optimizations,
42024204
pg_t pgid) {
42034205
return old_acting_primary != new_acting_primary ||
42044206
new_acting != old_acting ||
@@ -4222,7 +4224,8 @@ bool PastIntervals::is_new_interval(
42224224
old_crush_count != new_crush_count ||
42234225
old_crush_target != new_crush_target ||
42244226
old_crush_barrier != new_crush_barrier ||
4225-
old_crush_member != new_crush_member;
4227+
old_crush_member != new_crush_member ||
4228+
old_allow_ec_optimizations != new_allow_ec_optimizations;
42264229
}
42274230

42284231
bool PastIntervals::is_new_interval(
@@ -4271,6 +4274,7 @@ bool PastIntervals::is_new_interval(
42714274
plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
42724275
plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
42734276
plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
4277+
plast->allows_ecoptimizations(), pi->allows_ecoptimizations(),
42744278
pgid);
42754279
}
42764280

src/osd/osd_types.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,7 @@ struct pg_pool_t {
12971297
// Pool features are restricted to those supported by crimson-osd.
12981298
// Note, does not prohibit being created on classic osd.
12991299
FLAG_CRIMSON = 1<<18,
1300+
FLAG_EC_OPTIMIZATIONS = 1<<19, // enable EC optimizations; once enabled, they cannot be disabled
13001301
};
13011302

13021303
static const char *get_flag_name(uint64_t f) {
@@ -1320,6 +1321,7 @@ struct pg_pool_t {
13201321
case FLAG_EIO: return "eio";
13211322
case FLAG_BULK: return "bulk";
13221323
case FLAG_CRIMSON: return "crimson";
1324+
case FLAG_EC_OPTIMIZATIONS: return "ec_optimizations";
13231325
default: return "???";
13241326
}
13251327
}
@@ -1376,6 +1378,8 @@ struct pg_pool_t {
13761378
return FLAG_BULK;
13771379
if (name == "crimson")
13781380
return FLAG_CRIMSON;
1381+
if (name == "ec_optimizations")
1382+
return FLAG_EC_OPTIMIZATIONS;
13791383
return 0;
13801384
}
13811385

@@ -1790,6 +1794,10 @@ struct pg_pool_t {
17901794
return has_flag(FLAG_EC_OVERWRITES);
17911795
}
17921796

1797+
bool allows_ecoptimizations() const {
1798+
return has_flag(FLAG_EC_OPTIMIZATIONS);
1799+
}
1800+
17931801
bool is_crimson() const {
17941802
return has_flag(FLAG_CRIMSON);
17951803
}
@@ -3453,6 +3461,8 @@ class PastIntervals {
34533461
uint32_t new_crush_barrier,
34543462
int32_t old_crush_member,
34553463
int32_t new_crush_member,
3464+
bool old_allow_ec_optimizations,
3465+
bool new_allow_ec_optimizations,
34563466
pg_t pgid
34573467
);
34583468

src/osdc/Objecter.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2934,6 +2934,8 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
29342934
pi->peering_crush_bucket_barrier,
29352935
t->peering_crush_mandatory_member,
29362936
pi->peering_crush_mandatory_member,
2937+
t->allows_ecoptimizations,
2938+
pi->allows_ecoptimizations(),
29372939
prev_pgid)) {
29382940
force_resend = true;
29392941
}
@@ -2989,6 +2991,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
29892991
t->peering_crush_bucket_target = pi->peering_crush_bucket_target;
29902992
t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier;
29912993
t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member;
2994+
t->allows_ecoptimizations = pi->allows_ecoptimizations();
29922995
ldout(cct, 10) << __func__ << " "
29932996
<< " raw pgid " << pgid << " -> actual " << t->actual_pgid
29942997
<< " acting " << t->acting

src/osdc/Objecter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,6 +1856,7 @@ class Objecter : public md_config_obs_t, public Dispatcher {
18561856
int min_size = -1; ///< the min size of the pool when we were last mapped
18571857
bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise
18581858
bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering
1859+
bool allows_ecoptimizations = false; ///< whether EC plugin optimizations are enabled.
18591860
uint32_t peering_crush_bucket_count = 0;
18601861
uint32_t peering_crush_bucket_target = 0;
18611862
uint32_t peering_crush_bucket_barrier = 0;

0 commit comments

Comments
 (0)