Skip to content

Commit 9295089

Browse files
authored
Merge pull request ceph#54597 from sseshasa/wip-mclk-snaptrim-cost
osd: Tune snap trim item cost to reflect a PG's average object size for mClock scheduler Reviewed-by: Samuel Just <[email protected]>
2 parents 559773e + fbd5c40 commit 9295089

File tree

5 files changed

+38
-13
lines changed

5 files changed

+38
-13
lines changed

src/osd/OSD.cc

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,14 +1712,32 @@ void OSDService::queue_recovery_context(
17121712
e));
17131713
}
17141714

1715-
void OSDService::queue_for_snap_trim(PG *pg)
1715+
void OSDService::queue_for_snap_trim(PG *pg, uint64_t cost_per_object)
17161716
{
17171717
dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1718+
uint64_t cost_for_queue = [this, cost_per_object] {
1719+
if (cct->_conf->osd_op_queue == "mclock_scheduler") {
1720+
/* The cost calculation is valid for most snap trim iterations except
1721+
* for the following cases:
1722+
* 1) The penultimate iteration which may return 1 object to trim, in
1723+
* which case the cost will be off by a factor equivalent to the
1724+
* average object size, and,
1725+
* 2) The final iteration which returns -ENOENT and performs clean-ups.
1726+
*/
1727+
return cost_per_object * cct->_conf->osd_pg_max_concurrent_snap_trims;
1728+
} else {
1729+
/* We retain this legacy behavior for WeightedPriorityQueue.
1730+
* This branch should be removed after Squid.
1731+
*/
1732+
return cct->_conf->osd_snap_trim_cost;
1733+
}
1734+
}();
1735+
17181736
enqueue_back(
17191737
OpSchedulerItem(
17201738
unique_ptr<OpSchedulerItem::OpQueueable>(
17211739
new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1722-
cct->_conf->osd_snap_trim_cost,
1740+
cost_for_queue,
17231741
cct->_conf->osd_snap_trim_priority,
17241742
ceph_clock_now(),
17251743
0,

src/osd/OSD.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ class OSDService : public Scrub::ScrubSchedListener {
498498
GenContext<ThreadPool::TPHandle&> *c,
499499
uint64_t cost,
500500
int priority);
501-
void queue_for_snap_trim(PG *pg);
501+
void queue_for_snap_trim(PG *pg, uint64_t cost);
502502
void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
503503

504504
void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);

src/osd/PG.cc

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -420,15 +420,7 @@ void PG::queue_recovery()
420420
dout(10) << "queue_recovery -- queuing" << dendl;
421421
recovery_queued = true;
422422
// Let cost per object be the average object size
423-
auto num_bytes = static_cast<uint64_t>(
424-
std::max<int64_t>(
425-
0, // ensure bytes is non-negative
426-
info.stats.stats.sum.num_bytes));
427-
auto num_objects = static_cast<uint64_t>(
428-
std::max<int64_t>(
429-
1, // ensure objects is non-negative and non-zero
430-
info.stats.stats.sum.num_objects));
431-
uint64_t cost_per_object = std::max<uint64_t>(num_bytes / num_objects, 1);
423+
uint64_t cost_per_object = get_average_object_size();
432424
osd->queue_for_recovery(
433425
this, cost_per_object, recovery_state.get_recovery_op_priority()
434426
);

src/osd/PG.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,19 @@ class PG : public DoutPrefixProvider,
10301030
return num_bytes;
10311031
}
10321032

1033+
uint64_t get_average_object_size() {
1034+
ceph_assert(ceph_mutex_is_locked_by_me(_lock));
1035+
auto num_bytes = static_cast<uint64_t>(
1036+
std::max<int64_t>(
1037+
0, // ensure bytes is non-negative
1038+
info.stats.stats.sum.num_bytes));
1039+
auto num_objects = static_cast<uint64_t>(
1040+
std::max<int64_t>(
1041+
1, // ensure objects is non-negative and non-zero
1042+
info.stats.stats.sum.num_objects));
1043+
return std::max<uint64_t>(num_bytes / num_objects, 1);
1044+
}
1045+
10331046
protected:
10341047

10351048
/*

src/osd/PrimaryLogPG.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15605,8 +15605,10 @@ PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
1560515605
NamedState(nullptr, "Trimming/AwaitAsyncWork")
1560615606
{
1560715607
auto *pg = context< SnapTrimmer >().pg;
15608+
// Determine cost in terms of the average object size
15609+
uint64_t cost_per_object = pg->get_average_object_size();
1560815610
context< SnapTrimmer >().log_enter(state_name);
15609-
context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15611+
context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg, cost_per_object);
1561015612
pg->state_set(PG_STATE_SNAPTRIM);
1561115613
pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
1561215614
pg->publish_stats_to_osd();

0 commit comments

Comments
 (0)