Skip to content

Commit f3966ca

Browse files
committed
osd: New options for configuring new EC
Adding three new configuration options which will apply once new EC is in place: osd_pool_default_flag_ec_optimizations: This allows EC optimizations to be turned on by default for new pools. ec_extent_cache_size: This allows the user to specify the size of the per-shard extent cache if they feel that the default of 10 MiB is too large or too small. The default value may well change following more extensive testing. ec_pdw_write_mode: This is a development flag for testing the parity delta write (PDW) read-modify-write (RMW) mechanism within the EC code. Setting it to anything other than 0 will cause performance problems. It is provided as a test mechanism for performance testing and teuthology. Performance testing may wish to turn off all PDWs for a particular IO pattern. This will allow us to determine whether the automatic mode should be using conventional RMW writes instead. The force-on mode allows testing of more unusual scenarios and of smaller configurations. Finally, we tweak the way optimisations are enabled, so that the same code path is shared between enabling them explicitly and enabling them by default. Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
1 parent cf3c413 commit f3966ca

File tree

6 files changed

+160
-35
lines changed

6 files changed

+160
-35
lines changed

src/common/options/global.yaml.in

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2703,6 +2703,15 @@ options:
27032703
services:
27042704
- mon
27052705
with_legacy: true
2706+
- name: osd_pool_default_flag_ec_optimizations
2707+
type: bool
2708+
level: advanced
2709+
desc: Control whether to create new erasure coded pools with EC optimizations turned on by default.
2710+
fmt_desc: Set the ``allow_ec_optimizations`` flag on new erasure coded pools.
2711+
default: false
2712+
services:
2713+
- mon
2714+
with_legacy: true
27062715
- name: osd_pool_default_hit_set_bloom_fpp
27072716
type: float
27082717
level: advanced
@@ -6765,3 +6774,15 @@ options:
67656774
The format is ``{file}:{line} [, {file}:{line}]``
67666775
level: dev
67676776
with_legacy: false
6777+
- name: ec_extent_cache_size
6778+
type: uint
6779+
level: advanced
6780+
desc: Size of per-shard extent cache
6781+
default: 10485760
6782+
services:
6783+
- osd
6784+
- name: ec_pdw_write_mode
6785+
type: uint
6786+
level: dev
6787+
default: 0
6788+
desc: When EC writes should generate PDWs (development only) 0=optimal 1=never 2=when possible

src/erasure-code/ErasureCodeInterface.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,10 @@ namespace ceph {
677677
* clay). Other plugins will not process the overhead of stub sub-chunks.
678678
*/
679679
FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS = 1<<5,
680+
/* Optimized EC is supported only if this flag is set. All other flags
681+
* are irrelevant if this flag is false.
682+
*/
683+
FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED = 1<<6,
680684
};
681685
static const char *get_optimization_flag_name(const plugin_flags flag) {
682686
switch (flag) {
@@ -686,6 +690,7 @@ namespace ceph {
686690
case FLAG_EC_PLUGIN_ZERO_PADDING_OPTIMIZATION: return "zeropadding";
687691
case FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION: return "paritydelta";
688692
case FLAG_EC_PLUGIN_REQUIRE_SUB_CHUNKS: return "requiresubchunks";
693+
case FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED: return "optimizedsupport";
689694
default: return "???";
690695
}
691696
}

src/erasure-code/isa/ErasureCodeIsa.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@
2626
#define CEPH_ERASURE_CODE_ISA_L_H
2727

2828
// -----------------------------------------------------------------------------
29+
#include <string_view>
2930
#include "erasure-code/ErasureCode.h"
3031
#include "ErasureCodeIsaTableCache.h"
3132
// -----------------------------------------------------------------------------
3233

34+
using namespace std::literals;
35+
3336
#define EC_ISA_ADDRESS_ALIGNMENT 32u
3437

3538
#define is_aligned(POINTER, BYTE_COUNT) \
@@ -51,6 +54,7 @@ class ErasureCodeIsa : public ceph::ErasureCode {
5154

5255
ErasureCodeIsaTableCache &tcache;
5356
const char *technique;
57+
uint64_t flags;
5458

5559
ErasureCodeIsa(const char *_technique,
5660
ErasureCodeIsaTableCache &_tcache) :
@@ -60,6 +64,15 @@ class ErasureCodeIsa : public ceph::ErasureCode {
6064
tcache(_tcache),
6165
technique(_technique)
6266
{
67+
flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
68+
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
69+
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
70+
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
71+
72+
if (technique == "reed_sol_van"sv ||
73+
technique == "default"sv) {
74+
flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED;
75+
}
6376
}
6477

6578

@@ -68,10 +81,7 @@ class ErasureCodeIsa : public ceph::ErasureCode {
6881
}
6982

7083
uint64_t get_supported_optimizations() const override {
71-
return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
72-
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
73-
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
74-
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
84+
return flags;
7585
}
7686

7787
unsigned int

src/erasure-code/jerasure/ErasureCodeJerasure.h

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@
1818
#ifndef CEPH_ERASURE_CODE_JERASURE_H
1919
#define CEPH_ERASURE_CODE_JERASURE_H
2020

21+
#include <string_view>
22+
2123
#include "erasure-code/ErasureCode.h"
2224

25+
using namespace std::literals;
26+
2327
class ErasureCodeJerasure : public ceph::ErasureCode {
2428
public:
2529
int k;
@@ -32,28 +36,33 @@ class ErasureCodeJerasure : public ceph::ErasureCode {
3236
std::string rule_root;
3337
std::string rule_failure_domain;
3438
bool per_chunk_alignment;
39+
uint64_t flags;
40+
41+
explicit ErasureCodeJerasure(const char *_technique)
42+
: k(0),
43+
DEFAULT_K("2"),
44+
m(0),
45+
DEFAULT_M("1"),
46+
w(0),
47+
DEFAULT_W("8"),
48+
technique(_technique),
49+
per_chunk_alignment(false) {
50+
flags = FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
51+
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
52+
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
53+
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
3554

36-
explicit ErasureCodeJerasure(const char *_technique) :
37-
k(0),
38-
DEFAULT_K("2"),
39-
m(0),
40-
DEFAULT_M("1"),
41-
w(0),
42-
DEFAULT_W("8"),
43-
technique(_technique),
44-
per_chunk_alignment(false)
45-
{}
55+
if (technique == "reed_sol_van"sv) {
56+
flags |= FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED;
57+
}
58+
}
4659

4760
~ErasureCodeJerasure() override {}
4861

4962
uint64_t get_supported_optimizations() const override {
50-
return FLAG_EC_PLUGIN_PARTIAL_READ_OPTIMIZATION |
51-
FLAG_EC_PLUGIN_PARTIAL_WRITE_OPTIMIZATION |
52-
FLAG_EC_PLUGIN_ZERO_INPUT_ZERO_OUTPUT_OPTIMIZATION |
53-
FLAG_EC_PLUGIN_PARITY_DELTA_OPTIMIZATION;
63+
return flags;
5464
}
5565

56-
5766
// Returns the total number of chunks, computed as the sum of the k and m
// members.
unsigned int get_chunk_count() const override {
5867
return k + m;
5968
}

src/mon/OSDMonitor.cc

Lines changed: 93 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8226,12 +8226,12 @@ int OSDMonitor::prepare_new_pool(string& name,
82268226
pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
82278227
pi->create_time = ceph_clock_now();
82288228
pi->type = pool_type;
8229-
pi->fast_read = fread;
8229+
pi->fast_read = fread;
82308230
pi->flags = g_conf()->osd_pool_default_flags;
82318231
if (bulk) {
82328232
pi->set_flag(pg_pool_t::FLAG_BULK);
82338233
} else if (g_conf()->osd_pool_default_flag_bulk) {
8234-
pi->set_flag(pg_pool_t::FLAG_BULK);
8234+
pi->set_flag(pg_pool_t::FLAG_BULK);
82358235
}
82368236
if (g_conf()->osd_pool_default_flag_hashpspool)
82378237
pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
@@ -8331,6 +8331,11 @@ int OSDMonitor::prepare_new_pool(string& name,
83318331
pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
83328332
pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
83338333

8334+
if (cct->_conf.get_val<bool>("osd_pool_default_flag_ec_optimizations")) {
8335+
// This will fail if the pool cannot support ec optimizations.
8336+
enable_pool_ec_optimizations(*pi, nullptr, true);
8337+
}
8338+
83348339
pending_inc.new_pool_names[pool] = name;
83358340
return 0;
83368341
}
@@ -8361,6 +8366,70 @@ bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
83618366
return true;
83628367
}
83638368

8369+
// Validate and apply a request to turn the allow_ec_optimizations pool flag
// on or off for pool `p`.
//
// p      - pool to modify; when enabling succeeds, p.nonprimary_shards is
//          rebuilt and pg_pool_t::FLAG_EC_OPTIMIZATIONS is set in p.flags.
// ss     - optional sink for a human-readable error message; may be null,
//          in which case failures are reported only via the return code.
// enable - true to switch the flag on, false to request it off.
//
// Returns 0 on success and -EINVAL on any rejected request. Requesting
// "off" for a pool that does not have the flag set is a successful no-op;
// requesting "off" for a pool that already has it set is rejected.
int OSDMonitor::enable_pool_ec_optimizations(pg_pool_t &p,
8370+
stringstream *ss, bool enable) {
8371+
// This check runs before `enable` is examined, so a non-erasure pool is
// rejected for both enable and disable requests.
if (!p.is_erasure()) {
8372+
if (ss) {
8373+
*ss << "allow_ec_optimizations can only be enabled for an erasure coded pool";
8374+
}
8375+
return -EINVAL;
8376+
}
8377+
// Likewise applied to both enable and disable requests.
if (osdmap.require_osd_release < ceph_release_t::tentacle) {
8378+
if (ss) {
8379+
*ss << "All OSDs must be upgraded to tentacle or "
8380+
<< "later before setting allow_ec_optimizations";
8381+
}
8382+
return -EINVAL;
8383+
}
8384+
if (enable) {
8385+
ErasureCodeInterfaceRef erasure_code;
8386+
// k and m are assigned only on the err == 0 path below; the error path
// returns before they are read.
unsigned int k, m;
8387+
// Collects get_erasure_code's own error text so it can be prefixed before
// being forwarded to the caller's stream.
stringstream tmp;
8388+
int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8389+
if (err == 0) {
8390+
k = erasure_code->get_data_chunk_count();
8391+
m = erasure_code->get_coding_chunk_count();
8392+
} else {
8393+
if (ss) {
8394+
*ss << "get_erasure_code failed: " << tmp.str();
8395+
}
8396+
// Note: the original error code from get_erasure_code is not propagated;
// every failure is reported as -EINVAL.
return -EINVAL;
8397+
}
8398+
// The plugin must advertise FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED for the
// pool's profile; otherwise the request is refused.
if ((erasure_code->get_supported_optimizations() &
8399+
ErasureCodeInterface::FLAG_EC_PLUGIN_OPTIMIZED_SUPPORTED) == 0) {
8400+
if (ss) {
8401+
*ss << "ec optimizations not currently supported for pool profile.";
8402+
}
8403+
return -EINVAL;
8404+
}
8405+
// Restrict the set of shards that can be a primary to the 1st data
8406+
// raw_shard (raw_shard 0) and the coding parity raw_shards because
8407+
// the other shards (including local parity for LRC) may not have
8408+
// up to date copies of xattrs including OI
8409+
p.nonprimary_shards.clear();
8410+
// The loop visits every raw shard in [0, k + m), but only raw shards
// 1 .. k-1 pass the filter below and are added to nonprimary_shards.
for (raw_shard_id_t raw_shard; raw_shard < k + m; ++raw_shard) {
8411+
if (raw_shard > 0 && raw_shard < k) {
8412+
shard_id_t shard;
8413+
// Translate the raw shard through the plugin's chunk mapping when the
// mapping has an entry for it; otherwise use the raw shard index as-is.
if (erasure_code->get_chunk_mapping().size() > raw_shard ) {
8414+
shard = shard_id_t(erasure_code->get_chunk_mapping().at(int(raw_shard)));
8415+
} else {
8416+
shard = shard_id_t(int(raw_shard));
8417+
}
8418+
p.nonprimary_shards.insert(shard);
8419+
}
8420+
}
8421+
p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS;
8422+
} else {
8423+
// The flag is one-way: once set on a pool it cannot be cleared. A disable
// request for a pool without the flag falls through and returns 0.
if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) {
8424+
if (ss) {
8425+
*ss << "allow_ec_optimizations cannot be disabled once enabled";
8426+
}
8427+
return -EINVAL;
8428+
}
8429+
}
8430+
return 0;
8431+
}
8432+
83648433
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
83658434
stringstream& ss)
83668435
{
@@ -8828,26 +8897,34 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
88288897
return -EINVAL;
88298898
}
88308899
} else if (var == "allow_ec_optimizations") {
8831-
if (!p.is_erasure()) {
8832-
ss << "allow_ec_optimizations can only be enabled for an erasure coded pool";
8833-
return -EINVAL;
8834-
}
8835-
if (osdmap.require_osd_release < ceph_release_t::tentacle) {
8836-
ss << "All OSDs must be upgraded to tentacle or "
8837-
<< "later before setting allow_ec_optimizations";
8838-
return -EINVAL;
8839-
}
8900+
bool enable = false;
88408901
if (val == "true" || (interr.empty() && n == 1)) {
8841-
p.flags |= pg_pool_t::FLAG_EC_OPTIMIZATIONS;
8902+
enable = true;
88428903
} else if (val == "false" || (interr.empty() && n == 0)) {
8843-
if ((p.flags & pg_pool_t::FLAG_EC_OPTIMIZATIONS) != 0) {
8844-
ss << "allow_ec_optimizations cannot be disabled once enabled";
8845-
return -EINVAL;
8846-
}
8904+
enable = false;
88478905
} else {
88488906
ss << "expecting value 'true', 'false', '0', or '1'";
88498907
return -EINVAL;
88508908
}
8909+
bool was_enabled = p.allows_ecoptimizations();
8910+
int r = enable_pool_ec_optimizations(p, nullptr, enable);
8911+
if (r != 0) {
8912+
return r;
8913+
}
8914+
if (!was_enabled && p.allows_ecoptimizations()) {
8915+
// Pools with allow_ec_optimizations set store pg_temp in a different
8916+
// order to change the primary selection algorithm without breaking
8917+
// old clients. Modify any existing pg_temp for the pool now.
8918+
// This is only needed when switching on optimisations after creation.
8919+
for (auto pg_temp = osdmap.pg_temp->begin();
8920+
pg_temp != osdmap.pg_temp->end();
8921+
++pg_temp) {
8922+
if (pg_temp->first.pool() == pool) {
8923+
std::vector<int> new_pg_temp = osdmap.pgtemp_primaryfirst(p, pg_temp->second);
8924+
pending_inc.new_pg_temp[pg_temp->first] = mempool::osdmap::vector<int>(new_pg_temp.begin(), new_pg_temp.end());
8925+
}
8926+
}
8927+
}
88518928
} else if (var == "target_max_objects") {
88528929
if (interr.length()) {
88538930
ss << "error parsing int '" << val << "': " << interr;

src/mon/OSDMonitor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,9 @@ class OSDMonitor : public PaxosService,
741741
std::stringstream &ss,
742742
ceph::Formatter *f);
743743

744+
// Validate and apply a request to enable or disable the
// allow_ec_optimizations flag on `pool`.
//   pool   - pool to modify.
//   ss     - optional stream for an error message; may be null.
//   enable - true to turn the flag on, false to request it off.
// Returns 0 on success, -EINVAL if the request is rejected.
int enable_pool_ec_optimizations(pg_pool_t &pool,
745+
std::stringstream *ss,
746+
bool enable);
744747
int prepare_command_pool_set(const cmdmap_t& cmdmap,
745748
std::stringstream& ss);
746749

0 commit comments

Comments
 (0)