Skip to content

Commit 1b881fe

Browse files
Merge pull request ceph#65172 from shraddhaag/wip-shraddhaag-availability-frequency-config
mon: add config option to change availability score update interval
2 parents 43c69a3 + 1cbe41b commit 1b881fe

File tree

6 files changed

+62
-6
lines changed

6 files changed

+62
-6
lines changed

PendingReleaseNotes

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,14 @@
179179
five seconds and the feature is enabled by default. A new config option ``enable_availability_tracking``
180180
can be used to turn off the feature if required. Another command is added to clear the
181181
availability status for a specific pool, ``ceph osd pool clear-availability-status <pool-name>``.
182+
users to view the availability score for each pool in a cluster. A pool is considered
183+
unavailable if any PG in the pool is not in active state or if there are unfound
184+
objects. Otherwise the pool is considered available. The score is updated every
185+
one second by default. This interval can be changed using the new config option
186+
``pool_availability_update_interval``. The feature is on by default. A new config option
187+
``enable_availability_tracking`` can be used to turn off the feature if required.
188+
Another command is added to clear the availability status for a specific pool,
189+
``ceph osd pool clear-availability-status <pool-name>``.
182190
This feature is in tech preview.
183191
Related trackers:
184192
- https://tracker.ceph.com/issues/67777

doc/rados/configuration/mon-config-ref.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,7 @@ Miscellaneous
628628
.. confval:: mon_memory_target
629629
.. confval:: mon_memory_autotune
630630
.. confval:: enable_availability_tracking
631+
.. confval:: pool_availability_update_interval
631632

632633
NVMe-oF Monitor Client
633634
======================

doc/rados/operations/monitoring.rst

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -781,10 +781,19 @@ the Mean Time Between Failures (MTBF) and Mean Time To Recover (MTTR)
781781
for each pool. The availability score is then calculated by finding
782782
the ratio of MTBF to the total time.
783783

784-
The score is updated every five seconds. This interval is currently
785-
not configurable. Any intermittent changes to the pools that
786-
occur between this duration but are reset before we recheck the pool
787-
status will not be captured by this feature.
784+
The score is updated every second by default. Transient changes to pools that
785+
occur and are reverted between successive updates will not be captured.
786+
It is possible to configure this interval with a command of the following
787+
form:
788+
789+
.. prompt:: bash $
790+
791+
ceph config set mon pool_availability_update_interval 2
792+
793+
This will set the update interval to two seconds. Please note that
794+
it is not possible to set this interval to a value lower than the one set
795+
for ``paxos_propose_interval``.
796+
788797

789798
This feature is on by default. To turn the feature off, e.g. - for an expected
790799
downtime, the ``enable_availability_tracking`` config option can be set to ``false``.

src/common/options/mon.yaml.in

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,3 +1420,13 @@ options:
14201420
default: 30
14211421
services:
14221422
- mon
1423+
- name: pool_availability_update_interval
1424+
type: float
1425+
level: advanced
1426+
desc: Update data availability score at this interval. By default the interval
1427+
is the same as the paxos_propose_interval configuration.
1428+
default: 1
1429+
services :
1430+
- mon
1431+
flags:
1432+
- runtime

src/mon/MgrStatMonitor.cc

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ std::vector<std::string> MgrStatMonitor::get_tracked_keys() const noexcept
6565
{
6666
return {
6767
"enable_availability_tracking",
68+
"pool_availability_update_interval",
6869
};
6970
}
7071

@@ -94,6 +95,16 @@ void MgrStatMonitor::handle_conf_change(
9495
}
9596
enable_availability_tracking = newval;
9697
}
98+
99+
if (changed.count("pool_availability_update_interval")) {
100+
std::scoped_lock l(lock);
101+
dout(10) << __func__ << " pool_availability_update_interval config changed from "
102+
<< pool_availability_update_interval << " to "
103+
<< g_conf().get_val<double>("pool_availability_update_interval")
104+
<< dendl;
105+
106+
pool_availability_update_interval = g_conf().get_val<double>("pool_availability_update_interval");
107+
}
97108
}
98109

99110
void MgrStatMonitor::create_initial()
@@ -120,6 +131,18 @@ void MgrStatMonitor::clear_pool_availability(int64_t poolid)
120131
dout(20) << __func__ << " cleared availability score for pool: " << poolid << dendl;
121132
}
122133

134+
// Reports whether the pool availability score is due for recalculation:
// returns true when the time elapsed since pool_availability_last_updated is
// at least pool_availability_update_interval, false otherwise.
bool MgrStatMonitor::should_calc_pool_availability()
135+
{
136+
dout(20) << __func__ << dendl;
137+
std::scoped_lock l(lock);  // held while the two members below are read
138+
139+
utime_t now = ceph_clock_now();
140+
// >= : an interval that has exactly elapsed already counts as due
if ((now - pool_availability_last_updated) >= pool_availability_update_interval) {
141+
return true;
142+
}
143+
return false;
144+
}
145+
123146
void MgrStatMonitor::calc_pool_availability()
124147
{
125148
dout(20) << __func__ << dendl;
@@ -202,6 +225,7 @@ void MgrStatMonitor::calc_pool_availability()
202225

203226
}
204227
pending_pool_availability = pool_availability;
228+
pool_availability_last_updated = now;
205229
}
206230

207231
void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
@@ -239,7 +263,8 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap)
239263
mon.osdmon()->notify_new_pg_digest();
240264

241265
// only calculate pool_availability within leader mon
242-
if (mon.is_leader()) {
266+
// and if configured interval has elapsed
267+
if (mon.is_leader() && should_calc_pool_availability()) {
243268
calc_pool_availability();
244269
}
245270
}

src/mon/MgrStatMonitor.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,11 @@
5656

5757
void calc_pool_availability();
5858
bool enable_availability_tracking = g_conf().get_val<bool>("enable_availability_tracking"); ///< tracking availability score feature
59-
59+
double pool_availability_update_interval = g_conf().get_val<double>("pool_availability_update_interval");
60+
utime_t pool_availability_last_updated = ceph_clock_now();
61+
6062
void clear_pool_availability(int64_t poolid);
63+
bool should_calc_pool_availability();
6164

6265
void check_sub(Subscription *sub);
6366
void check_subs();

0 commit comments

Comments
 (0)