Skip to content

Commit 1017b1d

Browse files
Merge pull request ceph#59743 from sseshasa/wip-fix-mclock-low-iops-capacity-threshold
common,osd: Use last valid OSD IOPS value if measured IOPS is unrealistic
2 parents f94ff60 + da4b85c commit 1017b1d

File tree

5 files changed

+68
-16
lines changed

5 files changed

+68
-16
lines changed

doc/rados/configuration/mclock-config-ref.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,8 @@ mClock Config Options
748748
.. confval:: osd_mclock_skip_benchmark
749749
.. confval:: osd_mclock_override_recovery_settings
750750
.. confval:: osd_mclock_iops_capacity_threshold_hdd
751+
.. confval:: osd_mclock_iops_capacity_low_threshold_hdd
751752
.. confval:: osd_mclock_iops_capacity_threshold_ssd
753+
.. confval:: osd_mclock_iops_capacity_low_threshold_ssd
752754

753755
.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf

qa/suites/rados/valgrind-leaks/1-start.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ overrides:
1212
- overall HEALTH_
1313
- \(PG_
1414
- \(POOL_APP_NOT_ENABLED\)
15+
- OSD bench result
1516
conf:
1617
global:
1718
osd heartbeat grace: 40

qa/suites/rados/verify/validater/valgrind.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ overrides:
2626
- \(MON_DOWN\)
2727
- \(SLOW_OPS\)
2828
- slow request
29+
- OSD bench result
2930
valgrind:
3031
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
3132
osd: [--tool=memcheck]

src/common/options/osd.yaml.in

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1283,25 +1283,67 @@ options:
12831283
level: basic
12841284
desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
12851285
the OSD bench results for an OSD (for rotational media)
1286-
long_desc: This option specifies the threshold IOPS capacity for an OSD under
1287-
which the OSD bench results can be considered for QoS calculations. Only
1288-
considered for osd_op_queue = mclock_scheduler
1286+
long_desc: This option specifies the high threshold IOPS capacity for an OSD
1287+
below which the OSD bench results can be considered for QoS calculations.
1288+
Only considered when osd_op_queue = mclock_scheduler
12891289
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
1290-
ignore OSD bench results for an OSD (for rotational media)
1290+
ignore OSD bench results for an OSD (for rotational media) and fall back to
1291+
the last valid or default IOPS capacity defined by
1292+
``osd_mclock_max_capacity_iops_hdd``.
12911293
default: 500
1294+
see_also:
1295+
- osd_mclock_max_capacity_iops_hdd
1296+
flags:
1297+
- runtime
1298+
- name: osd_mclock_iops_capacity_low_threshold_hdd
1299+
type: float
1300+
level: basic
1301+
desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
1302+
the OSD bench results for an OSD (for rotational media)
1303+
long_desc: This option specifies the low threshold IOPS capacity for an OSD
1304+
above which the OSD bench results can be considered for QoS calculations.
1305+
Only considered when osd_op_queue = mclock_scheduler
1306+
fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
1307+
ignore OSD bench results for an OSD (for rotational media) and fall back to
1308+
the last valid or default IOPS capacity defined by
1309+
``osd_mclock_max_capacity_iops_hdd``.
1310+
default: 50
1311+
see_also:
1312+
- osd_mclock_max_capacity_iops_hdd
12921313
flags:
12931314
- runtime
12941315
- name: osd_mclock_iops_capacity_threshold_ssd
12951316
type: float
12961317
level: basic
12971318
desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
12981319
the OSD bench results for an OSD (for solid state media)
1299-
long_desc: This option specifies the threshold IOPS capacity for an OSD under
1300-
which the OSD bench results can be considered for QoS calculations. Only
1301-
considered for osd_op_queue = mclock_scheduler
1320+
long_desc: This option specifies the high threshold IOPS capacity for an OSD
1321+
below which the OSD bench results can be considered for QoS calculations.
1322+
Only considered when osd_op_queue = mclock_scheduler
13021323
fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
1303-
ignore OSD bench results for an OSD (for solid state media)
1324+
ignore OSD bench results for an OSD (for solid state media) and fall back to
1325+
the last valid or default IOPS capacity defined by
1326+
``osd_mclock_max_capacity_iops_ssd``.
13041327
default: 80000
1328+
see_also:
1329+
- osd_mclock_max_capacity_iops_ssd
1330+
flags:
1331+
- runtime
1332+
- name: osd_mclock_iops_capacity_low_threshold_ssd
1333+
type: float
1334+
level: basic
1335+
desc: The threshold IOPS capacity (at 4KiB block size) below which to ignore
1336+
the OSD bench results for an OSD (for solid state media)
1337+
long_desc: This option specifies the low threshold IOPS capacity for an OSD
1338+
above which the OSD bench results can be considered for QoS calculations.
1339+
Only considered when osd_op_queue = mclock_scheduler
1340+
fmt_desc: The threshold IOPS capacity (at 4KiB block size) below which to
1341+
ignore OSD bench results for an OSD (for solid state media) and fall back to
1342+
the last valid or default IOPS capacity defined by
1343+
``osd_mclock_max_capacity_iops_ssd``.
1344+
default: 1000
1345+
see_also:
1346+
- osd_mclock_max_capacity_iops_ssd
13051347
flags:
13061348
- runtime
13071349
# Set to true for testing. Users should NOT set this.

src/osd/OSD.cc

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10173,22 +10173,28 @@ void OSD::maybe_override_max_osd_capacity_for_qos()
1017310173
<< dendl;
1017410174

1017510175
// Get the threshold IOPS set for the underlying hdd/ssd.
10176-
double threshold_iops = 0.0;
10176+
double hi_threshold_iops = 0.0;
10177+
double lo_threshold_iops = 0.0;
1017710178
if (store_is_rotational) {
10178-
threshold_iops = cct->_conf.get_val<double>(
10179+
hi_threshold_iops = cct->_conf.get_val<double>(
1017910180
"osd_mclock_iops_capacity_threshold_hdd");
10181+
lo_threshold_iops = cct->_conf.get_val<double>(
10182+
"osd_mclock_iops_capacity_low_threshold_hdd");
1018010183
} else {
10181-
threshold_iops = cct->_conf.get_val<double>(
10184+
hi_threshold_iops = cct->_conf.get_val<double>(
1018210185
"osd_mclock_iops_capacity_threshold_ssd");
10186+
lo_threshold_iops = cct->_conf.get_val<double>(
10187+
"osd_mclock_iops_capacity_low_threshold_ssd");
1018310188
}
1018410189

1018510190
// Persist the iops value to the MON store or throw cluster warning
10186-
// if the measured iops exceeds the set threshold. If the iops exceed
10187-
// the threshold, the default value is used.
10188-
if (iops > threshold_iops) {
10191+
// if the measured iops is outside the [low, high] threshold range. In that
10192+
// case, the current/default value is retained.
10193+
if (iops < lo_threshold_iops || iops > hi_threshold_iops) {
1018910194
clog->warn() << "OSD bench result of " << std::to_string(iops)
10190-
<< " IOPS exceeded the threshold limit of "
10191-
<< std::to_string(threshold_iops) << " IOPS for osd."
10195+
<< " IOPS is not within the threshold limit range of "
10196+
<< std::to_string(lo_threshold_iops) << " IOPS and "
10197+
<< std::to_string(hi_threshold_iops) << " IOPS for osd."
1019210198
<< std::to_string(whoami) << ". IOPS capacity is unchanged"
1019310199
<< " at " << std::to_string(cur_iops) << " IOPS. The"
1019410200
<< " recommendation is to establish the osd's IOPS capacity"

0 commit comments

Comments
 (0)