Skip to content

Commit 50e984c

Browse files
authored
Merge pull request ceph#51171 from amathuria/wip-amat-scrub-cost-related-changes
osd/scrub: Change scrub cost to average object size Reviewed-by: Samuel Just <[email protected]> Reviewed-by: Pere Diaz Bou <[email protected]> Reviewed-by: Sridhar Seshasayee <[email protected]>
2 parents 6db8780 + 77dda5e commit 50e984c

File tree

6 files changed

+120
-45
lines changed

6 files changed

+120
-45
lines changed

src/osd/OSD.cc

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1693,6 +1693,11 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
16931693
osd->op_shardedwq.queue_front(std::move(qi));
16941694
}
16951695

1696+
/// Returns the op scheduler's cost per I/O by forwarding to the sharded op
/// work queue (op_shardedwq). Per the declaration in OSD.h this value is only
/// valid for mclock; the base OpScheduler implementation asserts instead.
double OSDService::get_cost_per_io() const
1697+
{
1698+
return osd->op_shardedwq.get_cost_per_io();
1699+
}
1700+
16961701
void OSDService::queue_recovery_context(
16971702
PG *pg,
16981703
GenContext<ThreadPool::TPHandle&> *c,
@@ -1761,56 +1766,66 @@ template <class MSG_TYPE>
17611766
void OSDService::queue_scrub_event_msg(PG* pg,
17621767
Scrub::scrub_prio_t with_priority,
17631768
unsigned int qu_priority,
1764-
Scrub::act_token_t act_token)
1769+
Scrub::act_token_t act_token,
1770+
uint64_t cost)
17651771
{
17661772
const auto epoch = pg->get_osdmap_epoch();
17671773
auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
17681774
dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
17691775
<< ". Epoch: " << epoch << " token: " << act_token << dendl;
17701776
enqueue_back(OpSchedulerItem(
1771-
unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
1777+
unique_ptr<OpSchedulerItem::OpQueueable>(msg), cost,
17721778
pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
17731779
}
17741780

17751781
template <class MSG_TYPE>
17761782
void OSDService::queue_scrub_event_msg(PG* pg,
1777-
Scrub::scrub_prio_t with_priority)
1783+
Scrub::scrub_prio_t with_priority,
1784+
uint64_t cost)
17781785
{
17791786
const auto epoch = pg->get_osdmap_epoch();
17801787
auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
17811788
dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
17821789
enqueue_back(OpSchedulerItem(
1783-
unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
1790+
unique_ptr<OpSchedulerItem::OpQueueable>(msg), cost,
17841791
pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
17851792
}
17861793

1787-
int64_t OSDService::get_scrub_cost()
1794+
/// Queues a scrub event of type MSG_TYPE with a fixed scheduler cost read
/// from the 'osd_scrub_event_cost' configuration value, then forwards to the
/// overload of queue_scrub_event_msg<MSG_TYPE>() that takes a queue priority,
/// an activation token and an explicit cost.
template <class MSG_TYPE>
1795+
void OSDService::queue_scrub_event_msg_default_cost(PG* pg,
1796+
Scrub::scrub_prio_t with_priority,
1797+
unsigned int qu_priority,
1798+
Scrub::act_token_t act_token)
17881799
{
1800+
uint64_t cost = cct->_conf->osd_scrub_event_cost;
1801+
queue_scrub_event_msg<MSG_TYPE>(pg, with_priority, qu_priority, act_token, cost);
1802+
}
17891803

1790-
int64_t cost_for_queue = cct->_conf->osd_scrub_cost;
1791-
if (op_queue_type_t::mClockScheduler == osd->osd_op_queue_type()) {
1792-
cost_for_queue = cct->_conf->osd_scrub_event_cost *
1793-
cct->_conf->osd_shallow_scrub_chunk_max;
1794-
}
1795-
return cost_for_queue;
1804+
/// Priority-only variant: queues a scrub event of type MSG_TYPE with a fixed
/// scheduler cost read from the 'osd_scrub_event_cost' configuration value,
/// forwarding to queue_scrub_event_msg<MSG_TYPE>(pg, with_priority, cost).
template <class MSG_TYPE>
1805+
void OSDService::queue_scrub_event_msg_default_cost(PG* pg,
1806+
Scrub::scrub_prio_t with_priority)
1807+
{
1808+
uint64_t cost = cct->_conf->osd_scrub_event_cost;
1809+
queue_scrub_event_msg<MSG_TYPE>(pg, with_priority, cost);
17961810
}
17971811

17981812
void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
17991813
{
1800-
queue_scrub_event_msg<PGScrub>(pg, with_priority);
1814+
queue_scrub_event_msg_default_cost<PGScrub>(pg, with_priority);
18011815
}
18021816

18031817
void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
18041818
{
1805-
queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1819+
queue_scrub_event_msg_default_cost<PGScrubAfterRepair>(pg, with_priority);
18061820
}
18071821

18081822
void OSDService::queue_for_rep_scrub(PG* pg,
18091823
Scrub::scrub_prio_t with_priority,
18101824
unsigned int qu_priority,
1811-
Scrub::act_token_t act_token)
1825+
Scrub::act_token_t act_token,
1826+
uint64_t cost)
18121827
{
1813-
queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
1828+
queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token, cost);
18141829
}
18151830

18161831
void OSDService::queue_for_rep_scrub_resched(PG* pg,
@@ -1819,73 +1834,73 @@ void OSDService::queue_for_rep_scrub_resched(PG* pg,
18191834
Scrub::act_token_t act_token)
18201835
{
18211836
// Resulting scrub event: 'SchedReplica'
1822-
queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1823-
act_token);
1837+
queue_scrub_event_msg_default_cost<PGRepScrubResched>(pg, with_priority, qu_priority,
1838+
act_token);
18241839
}
18251840

18261841
void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
18271842
{
18281843
// Resulting scrub event: 'InternalSchedScrub'
1829-
queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1844+
queue_scrub_event_msg_default_cost<PGScrubResched>(pg, with_priority);
18301845
}
18311846

18321847
void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
18331848
{
18341849
// Resulting scrub event: 'ActivePushesUpd'
1835-
queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1850+
queue_scrub_event_msg_default_cost<PGScrubPushesUpdate>(pg, with_priority);
18361851
}
18371852

1838-
void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1853+
void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority, uint64_t cost)
18391854
{
18401855
// Resulting scrub event: 'SelectedChunkFree'
1841-
queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1856+
queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority, cost);
18421857
}
18431858

18441859
void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
18451860
{
18461861
// Resulting scrub event: 'ChunkIsBusy'
1847-
queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1862+
queue_scrub_event_msg_default_cost<PGScrubChunkIsBusy>(pg, with_priority);
18481863
}
18491864

18501865
void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
18511866
{
1852-
queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1867+
queue_scrub_event_msg_default_cost<PGScrubAppliedUpdate>(pg, with_priority);
18531868
}
18541869

18551870
void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
18561871
{
18571872
// Resulting scrub event: 'Unblocked'
1858-
queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1873+
queue_scrub_event_msg_default_cost<PGScrubUnblocked>(pg, with_priority);
18591874
}
18601875

18611876
void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
18621877
{
18631878
// Resulting scrub event: 'DigestUpdate'
1864-
queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1879+
queue_scrub_event_msg_default_cost<PGScrubDigestUpdate>(pg, with_priority);
18651880
}
18661881

18671882
void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
18681883
{
18691884
// Resulting scrub event: 'GotReplicas'
1870-
queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1885+
queue_scrub_event_msg_default_cost<PGScrubGotReplMaps>(pg, with_priority);
18711886
}
18721887

18731888
void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
18741889
{
18751890
// Resulting scrub event: 'ReplicaPushesUpd'
1876-
queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1891+
queue_scrub_event_msg_default_cost<PGScrubReplicaPushes>(pg, with_priority);
18771892
}
18781893

18791894
void OSDService::queue_scrub_is_finished(PG *pg)
18801895
{
18811896
// Resulting scrub event: 'ScrubFinished'
1882-
queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1897+
queue_scrub_event_msg_default_cost<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
18831898
}
18841899

18851900
void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
18861901
{
18871902
// Resulting scrub event: 'NextChunk'
1888-
queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1903+
queue_scrub_event_msg_default_cost<PGScrubGetNextChunk>(pg, with_priority);
18891904
}
18901905

18911906
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e, int64_t num_objects)

src/osd/OSD.h

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ class OSDService : public Scrub::ScrubSchedListener {
119119

120120
void enqueue_back(OpSchedulerItem&& qi);
121121
void enqueue_front(OpSchedulerItem&& qi);
122+
/// scheduler cost per io, only valid for mclock, asserts for wpq
123+
double get_cost_per_io() const;
122124

123125
void maybe_inject_dispatch_delay() {
124126
if (g_conf()->osd_debug_inject_dispatch_delay_probability > 0) {
@@ -525,7 +527,7 @@ class OSDService : public Scrub::ScrubSchedListener {
525527
void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority);
526528

527529
/// Signals that the selected chunk (objects range) is available for scrubbing
528-
void queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority);
530+
void queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority, uint64_t cost);
529531

530532
/// The chunk selected is blocked by user operations, and cannot be scrubbed now
531533
void queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority);
@@ -551,7 +553,8 @@ class OSDService : public Scrub::ScrubSchedListener {
551553
void queue_for_rep_scrub(PG* pg,
552554
Scrub::scrub_prio_t with_high_priority,
553555
unsigned int qu_priority,
554-
Scrub::act_token_t act_token);
556+
Scrub::act_token_t act_token,
557+
uint64_t cost);
555558

556559
/// Signals a change in the number of in-flight recovery writes
557560
void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
@@ -584,14 +587,20 @@ class OSDService : public Scrub::ScrubSchedListener {
584587
void queue_scrub_event_msg(PG* pg,
585588
Scrub::scrub_prio_t with_priority,
586589
unsigned int qu_priority,
587-
Scrub::act_token_t act_token);
590+
Scrub::act_token_t act_token,
591+
uint64_t cost);
588592

589593
/// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
590594
/// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
591595
template <class MSG_TYPE>
592-
void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority);
593-
int64_t get_scrub_cost();
594-
596+
void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority, uint64_t cost);
597+
template <class MSG_TYPE>
598+
void queue_scrub_event_msg_default_cost(PG* pg, Scrub::scrub_prio_t with_priority);
599+
template <class MSG_TYPE>
600+
void queue_scrub_event_msg_default_cost(PG* pg,
601+
Scrub::scrub_prio_t with_priority,
602+
unsigned int qu_priority,
603+
Scrub::act_token_t act_token);
595604
utime_t defer_recovery_until;
596605
uint64_t recovery_ops_active;
597606
uint64_t recovery_ops_reserved;
@@ -1622,6 +1631,11 @@ class OSD : public Dispatcher,
16221631
p->complete(0);
16231632
}
16241633
}
1634+
1635+
/// Returns the cost per I/O reported by the scheduler of the first shard
/// (shards[0]); no other shard is consulted.
// NOTE(review): presumably every shard's scheduler reports the same value and
// at least one shard always exists -- confirm against the shard setup code.
double get_cost_per_io() const {
1636+
auto &sdata = osd->shards[0];
1637+
return sdata->scheduler->get_cost_per_io();
1638+
}
16251639
} op_shardedwq;
16261640

16271641

src/osd/scheduler/OpScheduler.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "mon/MonClient.h"
2323
#include "osd/scheduler/OpSchedulerItem.h"
2424

25+
#include "include/ceph_assert.h"
26+
2527
namespace ceph::osd::scheduler {
2628

2729
using client = uint64_t;
@@ -58,6 +60,11 @@ class OpScheduler {
5860
// Get the scheduler type set for the queue
5961
virtual op_queue_type_t get_type() const = 0;
6062

63+
// Cost per I/O as seen by the scheduler. This default implementation is not
// meant to be called: comparing 0 against a string literal is always false,
// so the assertion fires with the message "impossible for wpq". Schedulers
// that track a per-I/O cost provide their own implementation.
// NOTE(review): the trailing return presumably exists only to satisfy the
// declared return type -- confirm.
virtual double get_cost_per_io() const {
64+
ceph_assert(0 == "impossible for wpq");
65+
return 0.0;
66+
}
67+
6168
// Destructor
6269
virtual ~OpScheduler() {};
6370
};

src/osd/scheduler/mClockScheduler.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,10 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
261261
const char** get_tracked_conf_keys() const final;
262262
void handle_conf_change(const ConfigProxy& conf,
263263
const std::set<std::string> &changed) final;
264+
265+
// Returns the scheduler's osd_bandwidth_cost_per_io member. Replaces the
// asserting default inherited from OpScheduler::get_cost_per_io().
// NOTE(review): unlike the neighbouring overrides (declared 'final'), this
// one carries no 'override'/'final' specifier; adding one would let the
// compiler catch a signature drift from the base class.
double get_cost_per_io() const {
266+
return osd_bandwidth_cost_per_io;
267+
}
264268
private:
265269
// Enqueue the op to the high priority queue
266270
void enqueue_high(unsigned prio, OpSchedulerItem &&item, bool front = false);

src/osd/scrubber/pg_scrubber.cc

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -876,8 +876,11 @@ int PgScrubber::get_whoami() const
876876
* - m_max_end
877877
* - end
878878
* - start
879+
* returns:
880+
* - std::nullopt if the range is blocked
881+
* - otherwise, the number of objects in the selected range
879882
*/
880-
bool PgScrubber::select_range()
883+
std::optional<uint64_t> PgScrubber::select_range()
881884
{
882885
m_be->new_chunk();
883886

@@ -959,7 +962,7 @@ bool PgScrubber::select_range()
959962
// we'll be requeued by whatever made us unavailable for scrub
960963
dout(10) << __func__ << ": scrub blocked somewhere in range "
961964
<< "[" << m_start << ", " << candidate_end << ")" << dendl;
962-
return false;
965+
return std::nullopt;
963966
}
964967

965968
m_end = candidate_end;
@@ -972,20 +975,20 @@ bool PgScrubber::select_range()
972975
// debug: be 'blocked' if told so by the 'pg scrub_debug block' asok command
973976
if (m_debug_blockrange > 0) {
974977
m_debug_blockrange--;
975-
return false;
978+
return std::nullopt;
976979
}
977-
return true;
980+
return objects.size();
978981
}
979982

980983
void PgScrubber::select_range_n_notify()
981984
{
982985
get_counters_set().inc(scrbcnt_chunks_selected);
983-
984-
if (select_range()) {
986+
auto num_chunk_objects = select_range();
987+
if (num_chunk_objects.has_value()) {
985988
// the next chunk to handle is not blocked
986989
dout(20) << __func__ << ": selection OK" << dendl;
987-
m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
988-
990+
auto cost = get_scrub_cost(num_chunk_objects.value());
991+
m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority, cost);
989992
} else {
990993
// we will wait for the objects range to become available for scrubbing
991994
dout(10) << __func__ << ": selected chunk is busy" << dendl;
@@ -994,6 +997,28 @@ void PgScrubber::select_range_n_notify()
994997
}
995998
}
996999

1000+
/**
 * Computes the scheduler cost to attach to a queued scrub chunk.
 *
 * - When the OSD op queue is WeightedPriorityQueue, the configured
 *   'osd_scrub_cost' is returned and num_chunk_objects is ignored.
 * - Otherwise the cost is 'osd_scrub_event_cost' plus a per-object term
 *   multiplied by num_chunk_objects. The per-object term is the scheduler's
 *   cost per I/O for a shallow scrub, and the cost per I/O plus the PG's
 *   average object size for a deep scrub (m_is_deep).
 *
 * The per-I/O cost is a double, so the sum is evaluated in floating point
 * and truncated when stored into the uint64_t result.
 *
 * @param num_chunk_objects number of objects the chunk is costed for
 * @return the cost value passed on to the op scheduler
 */
uint64_t PgScrubber::get_scrub_cost(uint64_t num_chunk_objects)
1001+
{
1002+
const auto& conf = m_pg->get_cct()->_conf;
1003+
if (op_queue_type_t::WeightedPriorityQueue == m_osds->osd->osd_op_queue_type()) {
1004+
// if the osd_op_queue is WPQ, we will use the default osd_scrub_cost value
1005+
return conf->osd_scrub_cost;
1006+
}
1007+
uint64_t cost = 0;
1008+
double scrub_metadata_cost = m_osds->get_cost_per_io();
1009+
if (m_is_deep) {
1010+
auto pg_avg_object_size = m_pg->get_average_object_size();
1011+
cost = conf->osd_scrub_event_cost + (num_chunk_objects
1012+
* (scrub_metadata_cost + pg_avg_object_size));
1013+
dout(20) << fmt::format("{} : deep-scrub cost = {}", __func__, cost) << dendl;
1014+
return cost;
1015+
} else {
1016+
cost = conf->osd_scrub_event_cost + (num_chunk_objects * scrub_metadata_cost);
1017+
dout(20) << fmt::format("{} : shallow-scrub cost = {}", __func__, cost) << dendl;
1018+
return cost;
1019+
}
1020+
}
1021+
9971022
bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
9981023
{
9991024
if (soid < m_start || soid >= m_end) {
@@ -1574,10 +1599,15 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
15741599

15751600
set_queued_or_active();
15761601
advance_token();
1602+
const auto& conf = m_pg->get_cct()->_conf;
1603+
const int max_from_conf = size_from_conf(
1604+
m_is_deep, conf, "osd_scrub_chunk_max", "osd_shallow_scrub_chunk_max");
1605+
auto cost = get_scrub_cost(max_from_conf);
15771606
m_osds->queue_for_rep_scrub(m_pg,
15781607
m_replica_request_priority,
15791608
m_flags.priority,
1580-
m_current_token);
1609+
m_current_token,
1610+
cost);
15811611
}
15821612

15831613
void PgScrubber::set_op_parameters(const requested_scrub_t& request)

src/osd/scrubber/pg_scrubber.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,8 @@ class PgScrubber : public ScrubPgIF,
729729
/// Returns epoch of current osdmap
730730
epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
731731

732+
uint64_t get_scrub_cost(uint64_t num_chunk_objects);
733+
732734
// collected statistics
733735
int m_shallow_errors{0};
734736
int m_deep_errors{0};
@@ -802,8 +804,11 @@ class PgScrubber : public ScrubPgIF,
802804
* - handling some head/clones issues
803805
*
804806
* The selected range is set directly into 'm_start' and 'm_end'
807+
*
808+
* Returns std::nullopt if the range is busy; otherwise returns the
809+
* number of objects in the range.
805810
*/
806-
bool select_range();
811+
std::optional<uint64_t> select_range();
807812

808813
std::list<Context*> m_callbacks;
809814

0 commit comments

Comments
 (0)