Skip to content

Commit 661e8e2

Browse files
committed
Allow setting peering_crush_bucket_{count|target|barrier}
In the command `ceph osd pool stretch set` <pool> <peering_crush_bucket_count> <peering_crush_bucket_target> <peering_crush_bucket_barrier> <crush_rule> <size> <min_size> the user has the option of setting the value of `peering_crush_bucket_{count|target|barrier}`. This then allows the use of `calc_replicated_acting_stretch`, since with `peering_crush_bucket_count != 0` the pool is now a stretch pool and we can handle pg_temp better by setting barriers and limits on how many OSDs should be in a pg_temp. This enables the specified pool to handle pg_temp properly during create_acting, as a stretch pool should. The user can also use the command: `osd pool stretch show <pool>` to show all the stretch related information for the pool pool: cephfs.a.data pool_id: 3 is_stretch_pool: 1 peering_crush_bucket_count: 3 peering_crush_bucket_target: 3 peering_crush_bucket_barrier: 8 crush_rule: replicated_rule_custom size: 3 min_size: 2 The user can also unset the stretch pool with the command: `osd pool stretch unset <pool>`. However, the pool must be a stretch pool. Fixes: https://tracker.ceph.com/issues/64802 Signed-off-by: Kamoltat <[email protected]>
1 parent 103cd8e commit 661e8e2

File tree

4 files changed

+245
-0
lines changed

4 files changed

+245
-0
lines changed

src/mon/MonCommands.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,25 @@ COMMAND("osd pool application get "
12001200
"name=key,type=CephString,req=false",
12011201
"get value of key <key> of application <app> on pool <poolname>",
12021202
"osd", "r")
1203+
// Stretch-pool management commands.  The argument strings and help text
// must stay exactly as registered, since clients match on them.
COMMAND("osd pool stretch show "
        "name=pool,type=CephPoolname",
        "show all the stretch related information for the pool",
        "osd", "r")
COMMAND("osd pool stretch set "
        "name=pool,type=CephPoolname "
        "name=peering_crush_bucket_count,type=CephInt,range=0 "
        "name=peering_crush_bucket_target,type=CephInt,range=0 "
        "name=peering_crush_bucket_barrier,type=CephString "
        "name=crush_rule,type=CephString "
        "name=size,type=CephInt,range=0 "
        "name=min_size,type=CephInt,range=0 "
        "name=yes_i_really_mean_it,type=CephBool,req=false",
        "make the pool stretched across the specified number of CRUSH buckets",
        "osd", "rw")
COMMAND("osd pool stretch unset "
        "name=pool,type=CephPoolname",
        "unset the stretch mode for the pool",
        "osd", "rw")
12031222
COMMAND("osd utilization",
12041223
"get basic pg distribution stats",
12051224
"osd", "r")

src/mon/OSDMonitor.cc

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6089,6 +6089,62 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
60896089
}
60906090
r = 0;
60916091

6092+
} else if (prefix == "osd pool stretch show") {
6093+
string poolstr;
6094+
cmd_getval(cmdmap, "pool", poolstr);
6095+
int64_t pool = osdmap.lookup_pg_pool_name(poolstr);
6096+
if (pool < 0) {
6097+
ss << "unrecognized pool '" << poolstr << "'";
6098+
r = -ENOENT;
6099+
goto reply;
6100+
}
6101+
const pg_pool_t *p = osdmap.get_pg_pool(pool);
6102+
6103+
if (!p->is_stretch_pool()) {
6104+
ss << poolstr << " " << " is not a stretch pool.";
6105+
r = -ENOENT;
6106+
goto reply;
6107+
} else {
6108+
if (f) {
6109+
f->open_object_section("pool");
6110+
f->dump_string("pool", poolstr);
6111+
f->dump_int("pool_id", pool);
6112+
f->dump_bool("is_stretch_pool", p->is_stretch_pool());
6113+
f->dump_int("peering_crush_bucket_count", p->peering_crush_bucket_count);
6114+
f->dump_int("peering_crush_bucket_target", p->peering_crush_bucket_target);
6115+
f->dump_string("peering_crush_bucket_barrier", stringify(osdmap.crush->get_type_name(p->peering_crush_bucket_barrier)));
6116+
if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6117+
f->dump_string("crush_rule", osdmap.crush->get_rule_name(p->get_crush_rule()));
6118+
} else {
6119+
f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6120+
// warn if the rule does not exist
6121+
mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist";
6122+
}
6123+
f->dump_int("size", p->get_size());
6124+
f->dump_int("min_size", p->get_min_size());
6125+
f->close_section();
6126+
f->flush(rdata);
6127+
} else {
6128+
stringstream ss;
6129+
ss << "pool: " << poolstr << "\n";
6130+
ss << "pool_id: " << pool << "\n";
6131+
ss << "is_stretch_pool: " << p->is_stretch_pool() << "\n";
6132+
ss << "peering_crush_bucket_count: " << p->peering_crush_bucket_count << "\n";
6133+
ss << "peering_crush_bucket_target: " << p->peering_crush_bucket_target << "\n";
6134+
ss << "peering_crush_bucket_barrier: " << osdmap.crush->get_type_name(p->peering_crush_bucket_barrier) << "\n";
6135+
if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6136+
ss << "crush_rule: " << osdmap.crush->get_rule_name(p->get_crush_rule()) << "\n";
6137+
} else {
6138+
ss << "crush_rule: " << p->get_crush_rule() << "\n";
6139+
// warn if the rule does not exist
6140+
mon.clog->warn() << __func__ << " pool " << poolstr << " crush rule " << stringify(p->get_crush_rule()) << " does not exist";
6141+
}
6142+
ss << "size: " << p->get_size() << "\n";
6143+
ss << "min_size: " << p->get_min_size() << "\n";
6144+
rdata.append(ss.str());
6145+
}
6146+
}
6147+
r = 0;
60926148
} else if (prefix == "osd pool get") {
60936149
string poolstr;
60946150
cmd_getval(cmdmap, "pool", poolstr);
@@ -9042,6 +9098,149 @@ int OSDMonitor::prepare_command_pool_application(const string &prefix,
90429098
return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
90439099
}
90449100

9101+
int OSDMonitor::prepare_command_pool_stretch_set(const cmdmap_t& cmdmap,
9102+
stringstream& ss)
9103+
{
9104+
string pool_name;
9105+
cmd_getval(cmdmap, "pool", pool_name);
9106+
int64_t pool = osdmap.lookup_pg_pool_name(pool_name);
9107+
if (pool < 0) {
9108+
ss << "unrecognized pool '" << pool_name << "'";
9109+
return -ENOENT;
9110+
}
9111+
9112+
pg_pool_t p = *osdmap.get_pg_pool(pool);
9113+
if (pending_inc.new_pools.count(pool))
9114+
p = pending_inc.new_pools[pool];
9115+
9116+
int64_t bucket_count = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_count", 0);
9117+
if (bucket_count <= 0) {
9118+
ss << "peering_crush_bucket_count must be >= 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
9119+
return -EINVAL;
9120+
}
9121+
9122+
int64_t bucket_target = cmd_getval_or<int64_t>(cmdmap, "peering_crush_bucket_target", 0);
9123+
if (bucket_target <= 0) {
9124+
ss << "peering_crush_bucket_target must be >= 0! FYI use 'ceph osd pool stretch unset' to unset the stretch values";
9125+
return -EINVAL;
9126+
}
9127+
9128+
int bucket_barrier = 0;
9129+
string bucket_barrier_str;
9130+
cmd_getval(cmdmap, "peering_crush_bucket_barrier", bucket_barrier_str);
9131+
CrushWrapper& crush = _get_stable_crush();
9132+
if (bucket_barrier_str.empty()) {
9133+
ss << "peering_crush_bucket_barrier must be provided";
9134+
return -EINVAL;
9135+
} else {
9136+
bucket_barrier = crush.get_type_id(bucket_barrier_str);
9137+
if (bucket_barrier < 0) {
9138+
ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " does not exist";
9139+
return -EINVAL;
9140+
} else if (bucket_barrier == 0) {
9141+
ss << "peering_crush_bucket_barrier " << bucket_barrier_str << " is not a bucket type";
9142+
return -EINVAL;
9143+
}
9144+
}
9145+
// Check if the number of peering_crush_bucket_count and peering_crush_bucket_target
9146+
// exceeds the number of subtrees of the specified bucket_barrier in the cluster.
9147+
vector<int> subtrees;
9148+
bool sure = false;
9149+
crush.get_subtree_of_type(bucket_barrier, &subtrees);
9150+
cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
9151+
if (static_cast<uint32_t>(bucket_count) > subtrees.size()) {
9152+
if (!sure) {
9153+
ss << "peering_crush_bucket_count=" << bucket_count
9154+
<< " > " << bucket_barrier_str << "=" << subtrees.size()
9155+
<< " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
9156+
return -EPERM;
9157+
}
9158+
} else if (static_cast<uint32_t>(bucket_target) > subtrees.size()) {
9159+
if (!sure) {
9160+
ss << "peering_crush_bucket_target=" << bucket_target
9161+
<< " > " << bucket_barrier_str << "=" << subtrees.size()
9162+
<< " can lead to data unavailability, pass --yes-i-really-mean-it to proceed";
9163+
return -EPERM;
9164+
}
9165+
}
9166+
9167+
string crush_rule_str;
9168+
cmd_getval(cmdmap, "crush_rule", crush_rule_str);
9169+
if (crush_rule_str.empty()) {
9170+
ss << "crush_rule must be provided";
9171+
return -EINVAL;
9172+
}
9173+
int crush_rule = crush.get_rule_id(crush_rule_str);
9174+
if (crush_rule < 0) {
9175+
ss << "crush rule " << crush_rule_str << " does not exist";
9176+
return -ENOENT;
9177+
}
9178+
if (!crush.rule_valid_for_pool_type(crush_rule, p.get_type())) {
9179+
ss << "crush rule " << crush_rule << " type does not match pool";
9180+
return -EINVAL;
9181+
}
9182+
int64_t pool_size = cmd_getval_or<int64_t>(cmdmap, "size", 0);
9183+
if (pool_size < 0) {
9184+
ss << "pool size must be non-negative";
9185+
return -EINVAL;
9186+
}
9187+
9188+
int64_t pool_min_size = cmd_getval_or<int64_t>(cmdmap, "min_size", 0);
9189+
if (pool_min_size < 0) {
9190+
ss << "pool min_size must be non-negative";
9191+
return -EINVAL;
9192+
}
9193+
9194+
p.peering_crush_bucket_count = static_cast<uint32_t>(bucket_count);
9195+
p.peering_crush_bucket_target = static_cast<uint32_t>(bucket_target);
9196+
p.peering_crush_bucket_barrier = static_cast<uint32_t>(bucket_barrier);
9197+
p.crush_rule = static_cast<__u8>(crush_rule);
9198+
p.size = static_cast<__u8>(pool_size);
9199+
p.min_size = static_cast<__u8>(pool_min_size);
9200+
p.last_change = pending_inc.epoch;
9201+
pending_inc.new_pools[pool] = p;
9202+
ss << "pool " << pool_name << " stretch values are set successfully";
9203+
return 0;
9204+
}
9205+
9206+
int OSDMonitor::prepare_command_pool_stretch_unset(const cmdmap_t& cmdmap,
9207+
stringstream& ss)
9208+
{
9209+
/**
9210+
* Command syntax:
9211+
* ceph osd pool stretch unset <pool>
9212+
*/
9213+
string pool_name;
9214+
cmd_getval(cmdmap, "pool", pool_name);
9215+
int64_t pool = osdmap.lookup_pg_pool_name(pool_name);
9216+
// check if pool exists
9217+
if (pool < 0) {
9218+
ss << "unrecognized pool '" << pool_name << "'";
9219+
return -ENOENT;
9220+
}
9221+
9222+
// get pool
9223+
pg_pool_t p = *osdmap.get_pg_pool(pool);
9224+
if (pending_inc.new_pools.count(pool))
9225+
p = pending_inc.new_pools[pool];
9226+
9227+
// check if pool is a stretch pool
9228+
if (!p.is_stretch_pool()) {
9229+
ss << "pool " << pool_name << " is not a stretch pool";
9230+
return -ENOENT;
9231+
}
9232+
9233+
// unset stretch values
9234+
p.peering_crush_bucket_count = 0;
9235+
p.peering_crush_bucket_target = 0;
9236+
p.peering_crush_bucket_barrier = 0;
9237+
p.last_change = pending_inc.epoch;
9238+
pending_inc.new_pools[pool] = p;
9239+
ss << "pool " << pool_name
9240+
<< " is no longer a stretch pool, all stretch values are unset successfully";
9241+
return 0;
9242+
}
9243+
90459244
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
90469245
const cmdmap_t& cmdmap,
90479246
stringstream& ss,
@@ -13897,6 +14096,28 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
1389714096
} else {
1389814097
goto update;
1389914098
}
14099+
} else if (prefix == "osd pool stretch set") {
14100+
err = prepare_command_pool_stretch_set(cmdmap, ss);
14101+
if (err == -EAGAIN)
14102+
goto wait;
14103+
if (err < 0)
14104+
goto reply_no_propose;
14105+
14106+
getline(ss, rs);
14107+
wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
14108+
get_last_committed() + 1));
14109+
return true;
14110+
} else if (prefix == "osd pool stretch unset") {
14111+
err = prepare_command_pool_stretch_unset(cmdmap, ss);
14112+
if (err == -EAGAIN)
14113+
goto wait;
14114+
if (err < 0)
14115+
goto reply_no_propose;
14116+
14117+
getline(ss, rs);
14118+
wait_for_commit(op, new Monitor::C_Command(mon, op, 0, rs,
14119+
get_last_committed() + 1));
14120+
return true;
1390014121
} else if (prefix == "osd force-create-pg") {
1390114122
pg_t pgid;
1390214123
string pgidstr;

src/mon/OSDMonitor.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,10 @@ class OSDMonitor : public PaxosService,
751751
const cmdmap_t& cmdmap,
752752
std::stringstream& ss,
753753
bool *modified);
754+
int prepare_command_pool_stretch_set(const cmdmap_t& cmdmap,
755+
std::stringstream& ss);
756+
int prepare_command_pool_stretch_unset(const cmdmap_t& cmdmap,
757+
std::stringstream& ss);
754758
int _command_pool_application(const std::string &prefix,
755759
const cmdmap_t& cmdmap,
756760
std::stringstream& ss,

src/osd/osd_types.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1568,6 +1568,7 @@ void pg_pool_t::dump(Formatter *f) const
15681568
f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
15691569
f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
15701570
f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
1571+
f->dump_bool("is_stretch_pool", is_stretch_pool());
15711572
f->dump_int("object_hash", get_object_hash());
15721573
f->dump_string("pg_autoscale_mode",
15731574
get_pg_autoscale_mode_name(pg_autoscale_mode));

0 commit comments

Comments
 (0)