Skip to content

Commit 13b1d28

Browse files
authored
Merge pull request ceph#64045 from shraddhaag/wip-71743-tentacle
tentacle: mon: add config option to toggle availability score feature
2 parents ab22957 + b4fbc95 commit 13b1d28

File tree

8 files changed

+123
-5
lines changed

8 files changed

+123
-5
lines changed

PendingReleaseNotes

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@
151151
users to view the availability score for each pool in a cluster. A pool is considered
152152
unavailable if any PG in the pool is not in active state or if there are unfound
153153
objects. Otherwise the pool is considered available. The score is updated every
154-
5 seconds. This feature is in tech preview.
154+
5 seconds. The feature is on by default. A new config option `enable_availability_tracking`
155+
can be used to turn off the feature if required. This feature is in tech preview.
155156
Related trackers:
156157
- https://tracker.ceph.com/issues/67777
157158

doc/rados/configuration/mon-config-ref.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ Miscellaneous
627627
.. confval:: mon_osd_cache_size_min
628628
.. confval:: mon_memory_target
629629
.. confval:: mon_memory_autotune
630+
.. confval:: enable_availability_tracking
630631

631632
.. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
632633
.. _Monitor Keyrings: ../../../dev/mon-bootstrap#secret-keys

doc/rados/operations/monitoring.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -784,4 +784,14 @@ the ratio of MTBF to the total time.
784784
The score is updated every five seconds. This interval is currently
785785
not configurable. Any intermittent changes to the pools that
786786
occur within this interval but are reset before we recheck the pool
787-
status will not be captured by this feature.
787+
status will not be captured by this feature.
788+
789+
This feature is on by default. To turn the feature off (for example, during an
790+
expected downtime), set the ``enable_availability_tracking`` config option to ``false``.
791+
792+
.. prompt:: bash $
793+
794+
ceph config set mon enable_availability_tracking false
795+
796+
While the feature is turned off, the last calculated score will be preserved. The
797+
score will start updating again once the feature is turned back on.

qa/standalone/mon/availability.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,30 @@ function TEST_availablity_score() {
5757
AVAILABILITY_STATUS=$(ceph osd pool availability-status | grep -w "foo")
5858
SCORE=$(echo "$AVAILABILITY_STATUS" | awk '{print $7}')
5959
IS_AVAILABLE=$(echo "$AVAILABILITY_STATUS" | awk '{print $8}')
60+
UPTIME_DURATION=$(echo "$AVAILABILITY_STATUS" | awk '{print $2}')
# Convert "<N>s" / "<N>m" to seconds. The unit check has to be a string test:
# inside $(( )) double quotes are removed and bare letters are evaluated as
# (unset) shell variables, so `s == "m"` becomes 0 == 0 and is always true,
# which would scale every value by 60.
if [[ "$UPTIME_DURATION" == *m ]]; then
    UPTIME_SECONDS=$(( ${UPTIME_DURATION%m} * 60 ))
else
    UPTIME_SECONDS=$(( ${UPTIME_DURATION%s} ))
fi
if [ "$IS_AVAILABLE" -ne 1 ]; then
    echo "Failed: Pool is not available in availabilty status"
    return 1
fi

# set enable_availability_tracking to false to disable the feature;
# the status command is then expected to print nothing for the pool
ceph config set mon enable_availability_tracking false
AVAILABILITY_STATUS=$(ceph osd pool availability-status | grep -w "foo")
if [ "$AVAILABILITY_STATUS" != "" ]; then
    echo "Failed: feature not disabled successfully."
    return 1
fi
sleep 120

# re-enable the feature and check that the uptime did not advance
# while tracking was off
ceph config set mon enable_availability_tracking true
AVAILABILITY_STATUS=$(ceph osd pool availability-status | grep -w "foo")
UPTIME_DURATION=$(echo "$AVAILABILITY_STATUS" | awk '{print $2}')
if [[ "$UPTIME_DURATION" == *m ]]; then
    NEW_UPTIME_SECONDS=$(( ${UPTIME_DURATION%m} * 60 ))
else
    NEW_UPTIME_SECONDS=$(( ${UPTIME_DURATION%s} ))
fi
if [ "$NEW_UPTIME_SECONDS" -gt $((UPTIME_SECONDS + 120)) ]; then
    echo "Failed: score is updated even when feature is disabled"
    return 1
fi
6385

6486
# write some objects

src/common/options/mon.yaml.in

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,3 +1395,14 @@ options:
13951395
default: 2
13961396
services:
13971397
- mon
1398+
- name: enable_availability_tracking
1399+
type: bool
1400+
level: advanced
1401+
desc: Calculate and store availability score for each pool in the
1402+
cluster at regular intervals
1403+
default: true
1404+
services :
1405+
- mon
1406+
flags:
1407+
- runtime
1408+

src/mon/MgrStatMonitor.cc

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,44 @@ static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
5252
// Builds the service on top of PaxosService and registers this object as an
// observer on the global config (g_conf()).
MgrStatMonitor::MgrStatMonitor(Monitor &mn, Paxos &p, const string& service_name)
5353
: PaxosService(mn, p, service_name)
5454
{
55+
g_conf().add_observer(this);
5556
}
5657

57-
MgrStatMonitor::~MgrStatMonitor() = default;
58+
// Removes this object from the global config's observers; counterpart of the
// add_observer() call made in the constructor.
MgrStatMonitor::~MgrStatMonitor()
59+
{
60+
g_conf().remove_observer(this);
61+
}
62+
63+
std::vector<std::string> MgrStatMonitor::get_tracked_keys() const noexcept
64+
{
65+
return {
66+
"enable_availability_tracking",
67+
};
68+
}
69+
70+
void MgrStatMonitor::handle_conf_change(
71+
const ConfigProxy& conf,
72+
const std::set<std::string>& changed)
73+
{
74+
if (changed.count("enable_availability_tracking")) {
75+
std::scoped_lock l(lock);
76+
bool oldval = enable_availability_tracking;
77+
bool newval = g_conf().get_val<bool>("enable_availability_tracking");
78+
dout(10) << __func__ << " enable_availability_tracking config option is changed from "
79+
<< oldval << " to " << newval
80+
<< dendl;
81+
82+
// if fetaure is toggled from off to on,
83+
// store the new value of last_uptime and last_downtime
84+
// (to be updated in calc_pool_availability)
85+
if (newval > oldval) {
86+
reset_availability_last_uptime_downtime_val = ceph_clock_now();
87+
dout(10) << __func__ << " reset_availability_last_uptime_downtime_val "
88+
<< reset_availability_last_uptime_downtime_val << dendl;
89+
}
90+
enable_availability_tracking = newval;
91+
}
92+
}
5893

5994
void MgrStatMonitor::create_initial()
6095
{
@@ -69,6 +104,29 @@ void MgrStatMonitor::create_initial()
69104
void MgrStatMonitor::calc_pool_availability()
70105
{
71106
dout(20) << __func__ << dendl;
107+
std::scoped_lock l(lock);
108+
109+
// if feature is disabled by user, do not update the uptime
110+
// and downtime, exit early
111+
if (!enable_availability_tracking) {
112+
dout(20) << __func__ << " tracking availability score is disabled" << dendl;
113+
return;
114+
}
115+
116+
// if reset_availability_last_uptime_downtime_val holds a value,
117+
// update last_uptime and last_downtime for all pools to the
118+
// recorded values
119+
if (reset_availability_last_uptime_downtime_val.has_value()) {
120+
for (const auto& i : pool_availability) {
121+
const auto& poolid = i.first;
122+
pool_availability[poolid].last_downtime = reset_availability_last_uptime_downtime_val.value();
123+
pool_availability[poolid].last_uptime = reset_availability_last_uptime_downtime_val.value();
124+
}
125+
dout(20) << __func__ << " reset last_uptime and last_downtime to "
126+
<< reset_availability_last_uptime_downtime_val << dendl;
127+
reset_availability_last_uptime_downtime_val.reset();
128+
}
129+
72130
auto pool_avail_end = pool_availability.end();
73131
for (const auto& i : digest.pool_pg_unavailable_map) {
74132
const auto& poolid = i.first;

src/mon/MgrStatMonitor.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
#include "mon/PGMap.h"
99
#include "mgr/ServiceMap.h"
1010

11-
class MgrStatMonitor : public PaxosService {
11+
class MgrStatMonitor : public PaxosService,
12+
public md_config_obs_t {
1213
// live version
1314
version_t version = 0;
1415
PGMapDigest digest;
@@ -27,6 +28,8 @@ class MgrStatMonitor : public PaxosService {
2728
MgrStatMonitor(Monitor &mn, Paxos &p, const std::string& service_name);
2829
~MgrStatMonitor() override;
2930

31+
ceph::mutex lock = ceph::make_mutex("MgrStatMonitor::lock");
32+
3033
void init() override {}
3134
void on_shutdown() override {}
3235

@@ -52,7 +55,9 @@ class MgrStatMonitor : public PaxosService {
5255
bool preprocess_statfs(MonOpRequestRef op);
5356

5457
void calc_pool_availability();
55-
58+
bool enable_availability_tracking = g_conf().get_val<bool>("enable_availability_tracking"); ///< tracking availability score feature
59+
std::optional<utime_t> reset_availability_last_uptime_downtime_val;
60+
5661
void check_sub(Subscription *sub);
5762
void check_subs();
5863
void send_digests();
@@ -114,4 +119,9 @@ class MgrStatMonitor : public PaxosService {
114119
bool verbose) const {
115120
digest.dump_pool_stats_full(osdm, ss, f, verbose);
116121
}
122+
123+
// config observer
124+
std::vector<std::string> get_tracked_keys() const noexcept override;
125+
void handle_conf_change(const ConfigProxy& conf,
126+
const std::set <std::string> &changed) override;
117127
};

src/mon/OSDMonitor.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14410,6 +14410,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
1441014410
get_last_committed() + 1));
1441114411
return true;
1441214412
} else if (prefix == "osd pool availability-status") {
14413+
if (!g_conf().get_val<bool>("enable_availability_tracking")) {
14414+
ss << "availability tracking is disabled; you can enable it by setting the config option enable_availability_tracking";
14415+
err = -EOPNOTSUPP;
14416+
goto reply_no_propose;
14417+
}
1441314418
TextTable tbl;
1441414419
tbl.define_column("POOL", TextTable::LEFT, TextTable::LEFT);
1441514420
tbl.define_column("UPTIME", TextTable::LEFT, TextTable::RIGHT);

0 commit comments

Comments
 (0)