Skip to content

Commit a222787

Browse files
authored
Merge pull request ceph#65940 from Tom-Sollers/fixing_blaum_roth_for_ec_profiles
Mon: Add health warning for blaum-roth EC profile when w+1 is not prime
2 parents cf11d9d + 256e277 commit a222787

File tree

9 files changed

+139
-4
lines changed

9 files changed

+139
-4
lines changed

doc/rados/operations/health-checks.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1900,3 +1900,21 @@ To disable the debug mode, run the following command:
19001900
.. prompt:: bash $
19011901

19021902
ceph dashboard debug disable
1903+
1904+
BLAUM_ROTH_W_IS_NOT_PRIME
1905+
_________________________
1906+
1907+
An EC pool is using the ``blaum_roth`` technique and ``w + 1`` is not a prime number.
1908+
This can result in data corruption.
1909+
1910+
To check the list of Erasure Code Profiles use the command:
1911+
1912+
.. prompt:: bash $
1913+
1914+
ceph osd erasure-code-profile ls
1915+
1916+
Then to check the ``w`` value for a particular profile use the command:
1917+
1918+
.. prompt:: bash $
1919+
1920+
ceph osd erasure-code-profile get <name of profile>

qa/standalone/erasure-code/test-erasure-code-plugins.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,29 @@ function TEST_ec_profile_warning() {
115115
teardown $dir || return 1
116116
}
117117

118+
function TEST_ec_profile_blaum_roth_warning() {
119+
local dir=$1
120+
121+
setup $dir || return 1
122+
run_mon $dir a || return 1
123+
run_mgr $dir x || return 1
124+
for id in $(seq 0 2) ; do
125+
run_osd $dir $id || return 1
126+
done
127+
create_rbd_pool || return 1
128+
wait_for_clean || return 1
129+
130+
echo "Starting test for blaum-roth profile health warning"
131+
132+
#Check that the health warn for incorrect blaum-roth profiles is correct.
133+
ceph osd erasure-code-profile set prof-${plugin} plugin=jerasure k=3 m=1 technique=blaum_roth w=7 --yes-i-really-mean-it --force
134+
CEPH_ARGS='' ceph --admin-daemon $(get_asok_path mon.a) log flush || return 1
135+
sleep 10
136+
grep -F "1 or more EC profiles have a w value such that w+1 is not prime. This can result in data corruption" $dir/mon.a.log || return 1
137+
grep -F "w+1=8 for the EC profile prof-${plugin} is not prime and could lead to data corruption" $dir/mon.a.log || return 1
138+
139+
teardown $dir || return 1
140+
return 0
141+
}
142+
118143
main test-erasure-code-plugins "$@"

src/erasure-code/jerasure/ErasureCodeJerasure.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -707,14 +707,14 @@ void ErasureCodeJerasureLiberation::prepare()
707707
// ErasureCodeJerasureBlaumRoth
708708
//
709709
bool ErasureCodeJerasureBlaumRoth::check_w(ostream *ss) const
710-
{
710+
{
711711
// back in Firefly, w = 7 was the default and produced usable
712712
// chunks. Tolerate this value for backward compatibility.
713713
if (w == 7)
714714
return true;
715715
if (w <= 2 || !is_prime(w+1)) {
716716
*ss << "w=" << w << " must be greater than two and "
717-
<< "w+1 must be prime" << std::endl;
717+
<< "w+1 must be prime" << std::endl;
718718
return false;
719719
} else {
720720
return true;

src/erasure-code/jerasure/ErasureCodeJerasure.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ class ErasureCodeJerasureBlaumRoth : public ErasureCodeJerasureLiberation {
305305
ErasureCodeJerasureBlaumRoth() :
306306
ErasureCodeJerasureLiberation("blaum_roth")
307307
{
308+
DEFAULT_W = "6";
308309
}
309310

310311
bool check_w(std::ostream *ss) const override;

src/mon/HealthMonitor.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "mon/Monitor.h"
2828
#include "mon/HealthMonitor.h"
2929
#include "mon/OSDMonitor.h"
30+
#include "osd/OSDMap.h"
31+
3032

3133
#include "messages/MMonCommand.h"
3234
#include "messages/MMonHealthChecks.h"
@@ -749,6 +751,9 @@ bool HealthMonitor::check_leader_health()
749751
// STRETCH MODE
750752
check_mon_crush_loc_stretch_mode(&next);
751753

754+
//CHECK_ERASURE_CODE_PROFILE
755+
check_erasure_code_profiles(&next);
756+
752757
if (next != leader_checks) {
753758
changed = true;
754759
leader_checks = next;
@@ -1169,3 +1174,34 @@ void HealthMonitor::check_netsplit(health_check_map_t *checks, std::set<std::str
11691174
d.detail.swap(details);
11701175
}
11711176
}
1177+
1178+
void HealthMonitor::check_erasure_code_profiles(health_check_map_t *checks)
1179+
{
1180+
list<string> details;
1181+
1182+
//This is a loop that will go through all the erasure code profiles
1183+
for (auto& erasure_code_profile : mon.osdmon()->osdmap.get_erasure_code_profiles()) {
1184+
dout(20) << "check_erasure_code_profiles" << "checking" << erasure_code_profile << dendl;
1185+
1186+
//This will look at the erasure code profiles technique is blaum_roth and will check that the w key exists
1187+
if (erasure_code_profile.second.at("technique") == "blaum_roth" &&
1188+
erasure_code_profile.second.count("w") == 1) {
1189+
//Read the w value from the profile and convert it to an int
1190+
int w = std::stoi(erasure_code_profile.second.at("w"));
1191+
1192+
if (!mon.is_prime(w + 1)) {
1193+
ostringstream ds;
1194+
ds << "w+1="<< w+1 << " for the EC profile " << erasure_code_profile.first
1195+
<< " is not prime and could lead to data corruption";
1196+
details.push_back(ds.str());
1197+
}
1198+
}
1199+
}
1200+
if (!details.empty()) {
1201+
ostringstream ss;
1202+
ss << "1 or more EC profiles have a w value such that w+1 is not prime."
1203+
<< " This can result in data corruption";
1204+
auto &d = checks->add("BLAUM_ROTH_W_IS_NOT_PRIME", HEALTH_WARN, ss.str(), details.size());
1205+
d.detail.swap(details);
1206+
}
1207+
}

src/mon/HealthMonitor.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ class HealthMonitor : public PaxosService
7373
void check_for_clock_skew(health_check_map_t *checks);
7474
void check_mon_crush_loc_stretch_mode(health_check_map_t *checks);
7575
void check_if_msgr2_enabled(health_check_map_t *checks);
76+
void check_erasure_code_profiles(health_check_map_t *checks);
7677
void check_netsplit(health_check_map_t *checks, std::set<std::string> &mons_down);
7778
bool check_leader_health();
7879
bool check_member_health();

src/mon/Monitor.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7083,3 +7083,19 @@ void Monitor::disconnect_disallowed_stretch_sessions()
70837083
session_stretch_allowed(*j, blank);
70847084
}
70857085
}
7086+
7087+
// A utility function that will check to see if the given value is prime using a set of the first 55 prime numbers
7088+
bool Monitor::is_prime(int value) {
7089+
int prime55[] = {
7090+
2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,
7091+
73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,
7092+
151,157,163,167,173,179,
7093+
181,191,193,197,199,211,223,227,229,233,239,241,251,257
7094+
};
7095+
7096+
for (int i: prime55)
7097+
if (value == i)
7098+
return true;
7099+
7100+
return false;
7101+
}

src/mon/Monitor.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,8 @@ class Monitor : public Dispatcher,
283283
bool is_degraded_stretch_mode() { return degraded_stretch_mode; }
284284
bool is_recovering_stretch_mode() { return recovering_stretch_mode; }
285285

286+
bool is_prime(int value); // A utility funtion that returns true if value is prime
287+
286288
/**
287289
* This set of functions maintains the in-memory stretch state
288290
* and sets up transitions of the map states by calling in to

src/mon/OSDMonitor.cc

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11668,6 +11668,43 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
1166811668
err = -EINVAL;
1166911669
goto reply_no_propose;
1167011670
}
11671+
11672+
bool force_no_fake = false;
11673+
cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake);
11674+
11675+
11676+
11677+
//This is the start of the validation for the w value in a blaum_roth profile
11678+
//this will search the Profile map, which contains the values for the parameters given in the command, for the technique parameter
11679+
if (auto found = profile_map.find("technique"); found != profile_map.end()) {
11680+
11681+
//if the technique parameter is found then save the value of it
11682+
string technique = found->second;
11683+
11684+
11685+
//then search the profile map again for the w value, which doesnt have to be specified, and if it is found and the technique used is blaum-roth then check that the w value is correct.
11686+
if (found = profile_map.find("w"); technique == "blaum_roth" && found != profile_map.end()) {
11687+
int w = std::stoi(found->second);
11688+
11689+
//checks if w+1 is not prime
11690+
if (w <= 2 || !mon.is_prime(w + 1)) {
11691+
11692+
if (force ^ force_no_fake) {
11693+
err = -EPERM;
11694+
ss << "Creating a blaum-roth erasure code profile with a w+1 value that is not prime is dangerious,"
11695+
<< " as it can cause data corruption."
11696+
<< " You need to use both --yes-i-really-mean-it and --force flags." << std::endl;
11697+
goto reply_no_propose;
11698+
} else if (!force && !force_no_fake) {
11699+
ss << "erasure-code-profile: " << profile_map
11700+
<< " must use a w value such that w+1 is prime and w is greater than 2." << std::endl;
11701+
err = -EINVAL;
11702+
goto reply_no_propose;
11703+
}
11704+
}
11705+
}
11706+
}
11707+
1167111708
string plugin = profile_map["plugin"];
1167211709

1167311710
if (pending_inc.has_erasure_code_profile(name)) {
@@ -11689,8 +11726,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
1168911726
err = 0;
1169011727
goto reply_no_propose;
1169111728
}
11692-
bool force_no_fake = false;
11693-
cmd_getval(cmdmap, "yes_i_really_mean_it", force_no_fake);
11729+
1169411730
if (!force) {
1169511731
err = -EPERM;
1169611732
ss << "will not override erasure code profile " << name

0 commit comments

Comments
 (0)