Skip to content

Commit 8f02b07

Browse files
committed
Merge PR ceph#52652 into main
* refs/pull/52652/head: PendingReleaseNotes: add note about new mdlog trimming configurations mds: drive mdlog trimming via a separate thread mds: allow runtime modification of mdlog trimming configuration mds: remove a bunch of heuristics from MDLog::trim() mds: add mdlog trimming threshold and decay counter Reviewed-by: Leonid Usov <[email protected]>
2 parents c9f71b1 + e579ac5 commit 8f02b07

File tree

5 files changed

+117
-31
lines changed

5 files changed

+117
-31
lines changed

PendingReleaseNotes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,10 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
117117
manager module. Users may choose between two new modes: ``upmap-read``, which
118118
offers upmap and read optimization simultaneously, or ``read``, which may be used
119119
to only optimize reads. For more detailed information see https://docs.ceph.com/en/latest/rados/operations/read-balancer/#online-optimization.
120+
* CephFS: MDS log trimming is now driven by a separate thread which tries to
121+
trim the log every second (`mds_log_trim_upkeep_interval` config). Also,
122+
a couple of configs govern how much time the MDS spends trimming its
123+
logs. These configs are `mds_log_trim_threshold` and `mds_log_trim_decay_rate`.
120124

121125
>=18.0.0
122126

src/common/options/mds.yaml.in

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1597,3 +1597,41 @@ options:
15971597
- mds
15981598
flags:
15991599
- runtime
1600+
- name: mds_log_trim_threshold
1601+
type: size
1602+
level: advanced
1603+
desc: MDS log trim threshold
1604+
long_desc: The threshold of the number of log segments that can be trimmed.
1605+
default: 128
1606+
min: 1
1607+
services:
1608+
- mds
1609+
see_also:
1610+
- mds_log_max_events
1611+
- mds_log_max_segments
1612+
flags:
1613+
- runtime
1614+
- name: mds_log_trim_decay_rate
1615+
type: float
1616+
level: advanced
1617+
desc: MDS log trim decay rate
1618+
long_desc: The decay rate for trimming the MDS log. Increasing this value leads to the MDS spending less time in trimming the log.
1619+
default: 1.0
1620+
min: 0.01
1621+
services:
1622+
- mds
1623+
see_also:
1624+
- mds_log_max_events
1625+
- mds_log_max_segments
1626+
flags:
1627+
- runtime
1628+
- name: mds_log_trim_upkeep_interval
1629+
type: millisecs
1630+
level: advanced
1631+
desc: MDS log trimming interval
1632+
long_desc: Interval, in milliseconds, at which the MDS attempts to trim its log.
1633+
default: 1000
1634+
services:
1635+
- mds
1636+
flags:
1637+
- runtime

src/mds/MDLog.cc

Lines changed: 59 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "common/entity_name.h"
2525
#include "common/perf_counters.h"
2626
#include "common/Cond.h"
27+
#include "common/ceph_time.h"
2728

2829
#include "events/ESubtreeMap.h"
2930
#include "events/ESegment.h"
@@ -45,7 +46,8 @@ MDLog::MDLog(MDSRank* m)
4546
mds(m),
4647
replay_thread(this),
4748
recovery_thread(this),
48-
submit_thread(this)
49+
submit_thread(this),
50+
log_trim_counter(DecayCounter(g_conf().get_val<double>("mds_log_trim_decay_rate")))
4951
{
5052
debug_subtrees = g_conf().get_val<bool>("mds_debug_subtrees");
5153
event_large_threshold = g_conf().get_val<uint64_t>("mds_log_event_large_threshold");
@@ -56,6 +58,7 @@ MDLog::MDLog(MDSRank* m)
5658
max_events = g_conf().get_val<int64_t>("mds_log_max_events");
5759
skip_corrupt_events = g_conf().get_val<bool>("mds_log_skip_corrupt_events");
5860
skip_unbounded_events = g_conf().get_val<bool>("mds_log_skip_unbounded_events");
61+
upkeep_thread = std::thread(&MDLog::log_trim_upkeep, this);
5962
}
6063

6164
MDLog::~MDLog()
@@ -68,7 +71,6 @@ MDLog::~MDLog()
6871
}
6972
}
7073

71-
7274
void MDLog::create_logger()
7375
{
7476
PerfCountersBuilder plb(g_ceph_context, "mds_log", l_mdl_first, l_mdl_last);
@@ -555,6 +557,13 @@ void MDLog::shutdown()
555557
}
556558
}
557559

560+
upkeep_log_trim_shutdown = true;
561+
cond.notify_one();
562+
563+
mds->mds_lock.unlock();
564+
upkeep_thread.join();
565+
mds->mds_lock.lock();
566+
558567
// Replay thread can be stuck inside e.g. Journaler::wait_for_readable,
559568
// so we need to shutdown the journaler first.
560569
if (journaler) {
@@ -605,11 +614,23 @@ void MDLog::try_to_commit_open_file_table(uint64_t last_seq)
605614
}
606615
}
607616

608-
void MDLog::trim(int m)
617+
void MDLog::log_trim_upkeep(void) {
618+
dout(10) << dendl;
619+
620+
std::unique_lock mds_lock(mds->mds_lock);
621+
while (!upkeep_log_trim_shutdown.load()) {
622+
if (mds->is_active() || mds->is_stopping()) {
623+
trim();
624+
}
625+
626+
cond.wait_for(mds_lock, g_conf().get_val<std::chrono::milliseconds>("mds_log_trim_upkeep_interval"));
627+
}
628+
dout(10) << __func__ << ": finished" << dendl;
629+
}
630+
631+
void MDLog::trim()
609632
{
610633
int max_ev = max_events;
611-
if (m >= 0)
612-
max_ev = m;
613634

614635
if (mds->mdcache->is_readonly()) {
615636
dout(10) << "trim, ignoring read-only FS" << dendl;
@@ -636,10 +657,6 @@ void MDLog::trim(int m)
636657
return;
637658
}
638659

639-
// hack: only trim for a few seconds at a time
640-
utime_t stop = ceph_clock_now();
641-
stop += 2.0;
642-
643660
int op_prio = CEPH_MSG_PRIO_LOW +
644661
(CEPH_MSG_PRIO_HIGH - CEPH_MSG_PRIO_LOW) *
645662
expiring_segments.size() / max_segments;
@@ -648,32 +665,43 @@ void MDLog::trim(int m)
648665

649666
unsigned new_expiring_segments = 0;
650667

651-
unsigned max_expiring_segments = 0;
652-
if (pre_segments_size > 0){
653-
max_expiring_segments = max_segments/2;
668+
if (pre_segments_size > 0) {
654669
ceph_assert(segments.size() >= pre_segments_size);
655-
max_expiring_segments = std::max<unsigned>(max_expiring_segments,segments.size() - pre_segments_size);
656670
}
657-
671+
658672
map<uint64_t,LogSegment*>::iterator p = segments.begin();
673+
674+
auto trim_start = ceph::coarse_mono_clock::now();
675+
std::optional<ceph::coarse_mono_time> trim_end;
676+
677+
auto log_trim_counter_start = log_trim_counter.get();
678+
auto log_trim_threshold = g_conf().get_val<Option::size_t>("mds_log_trim_threshold");
679+
659680
while (p != segments.end()) {
660-
if (stop < ceph_clock_now())
681+
// throttle - break out of trimming if we've hit the threshold
682+
if (log_trim_counter_start + new_expiring_segments >= log_trim_threshold) {
683+
auto time_spent = std::chrono::duration<double>::zero();
684+
if (trim_end) {
685+
time_spent = std::chrono::duration<double>(*trim_end - trim_start);
686+
}
687+
dout(10) << __func__ << ": breaking out of trim loop - trimmed "
688+
<< new_expiring_segments << " segment(s) in " << time_spent.count()
689+
<< "s" << dendl;
661690
break;
691+
}
662692

663693
unsigned num_remaining_segments = (segments.size() - expired_segments.size() - expiring_segments.size());
664-
if ((num_remaining_segments <= max_segments) &&
665-
(max_ev < 0 || (num_events - expiring_events - expired_events) <= (uint64_t)max_ev))
666-
break;
694+
dout(10) << __func__ << ": new_expiring_segments=" << new_expiring_segments
695+
<< ", num_remaining_segments=" << num_remaining_segments
696+
<< ", max_segments=" << max_segments << dendl;
667697

668-
// Do not trim too many segments at once for peak workload. If mds keeps creating N segments each tick,
669-
// the upper bound of 'num_remaining_segments - max_segments' is '2 * N'
670-
if (new_expiring_segments * 2 > num_remaining_segments)
698+
if ((num_remaining_segments <= max_segments) &&
699+
(max_ev < 0 || (num_events - expiring_events - expired_events) <= (uint64_t)max_ev)) {
700+
dout(10) << __func__ << ": breaking out of trim loop - segments/events fell below ceiling"
701+
<< " max_segments/max_ev" << dendl;
671702
break;
703+
}
672704

673-
if (max_expiring_segments > 0 &&
674-
expiring_segments.size() >= max_expiring_segments)
675-
break;
676-
677705
// look at first segment
678706
LogSegment *ls = p->second;
679707
ceph_assert(ls);
@@ -699,6 +727,8 @@ void MDLog::trim(int m)
699727

700728
uint64_t last_seq = ls->seq;
701729
try_expire(ls, op_prio);
730+
log_trim_counter.hit();
731+
trim_end = ceph::coarse_mono_clock::now();
702732

703733
submit_mutex.lock();
704734
p = segments.lower_bound(last_seq + 1);
@@ -784,6 +814,7 @@ int MDLog::trim_all()
784814

785815
void MDLog::try_expire(LogSegment *ls, int op_prio)
786816
{
817+
ceph_assert(ceph_mutex_is_locked(mds->mds_lock));
787818
MDSGatherBuilder gather_bld(g_ceph_context);
788819
ls->try_to_expire(mds, gather_bld, op_prio);
789820

@@ -1581,4 +1612,7 @@ void MDLog::handle_conf_change(const std::set<std::string>& changed, const MDSMa
15811612
if (changed.count("mds_log_skip_unbounded_events")) {
15821613
skip_unbounded_events = g_conf().get_val<bool>("mds_log_skip_unbounded_events");
15831614
}
1615+
if (changed.count("mds_log_trim_decay_rate")){
1616+
log_trim_counter = DecayCounter(g_conf().get_val<double>("mds_log_trim_decay_rate"));
1617+
}
15841618
}

src/mds/MDLog.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ enum {
4646

4747
#include "MDSContext.h"
4848
#include "common/Cond.h"
49+
#include "common/DecayCounter.h"
4950
#include "common/Finisher.h"
5051
#include "common/Thread.h"
5152

@@ -65,6 +66,7 @@ class ESubtreeMap;
6566

6667
class MDLog {
6768
public:
69+
6870
MDLog(MDSRank *m);
6971
~MDLog();
7072

@@ -145,7 +147,6 @@ class MDLog {
145147
}
146148

147149
void trim_expired_segments();
148-
void trim(int max=-1);
149150
int trim_all();
150151

151152
void create(MDSContext *onfinish); // fresh, empty log!
@@ -287,6 +288,9 @@ class MDLog {
287288
void _trim_expired_segments();
288289
void write_head(MDSContext *onfinish);
289290

291+
void trim();
292+
void log_trim_upkeep(void);
293+
290294
bool debug_subtrees;
291295
std::atomic_uint64_t event_large_threshold; // accessed by submit thread
292296
uint64_t events_per_segment;
@@ -301,5 +305,14 @@ class MDLog {
301305
std::set<LogSegment*> expired_segments;
302306
std::set<LogSegment*> expiring_segments;
303307
uint64_t events_since_last_major_segment = 0;
308+
309+
// log trimming decay counter
310+
DecayCounter log_trim_counter;
311+
312+
// log trimming upkeeper thread
313+
std::thread upkeep_thread;
314+
// guarded by mds_lock
315+
std::condition_variable_any cond;
316+
std::atomic<bool> upkeep_log_trim_shutdown{false};
304317
};
305318
#endif

src/mds/MDSRank.cc

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -743,10 +743,6 @@ void MDSRankDispatcher::tick()
743743
// update average session uptime
744744
sessionmap.update_average_session_age();
745745

746-
if (is_active() || is_stopping()) {
747-
mdlog->trim(); // NOT during recovery!
748-
}
749-
750746
// ...
751747
if (is_clientreplay() || is_active() || is_stopping()) {
752748
server->clear_laggy_clients();
@@ -789,7 +785,6 @@ void MDSRankDispatcher::tick()
789785

790786
// shut down?
791787
if (is_stopping()) {
792-
mdlog->trim();
793788
if (mdcache->shutdown_pass()) {
794789
uint64_t pq_progress = 0 ;
795790
uint64_t pq_total = 0;
@@ -3880,6 +3875,8 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
38803875
"mds_session_max_caps_throttle_ratio",
38813876
"mds_symlink_recovery",
38823877
"mds_session_metadata_threshold",
3878+
"mds_log_trim_threshold",
3879+
"mds_log_trim_decay_rate",
38833880
NULL
38843881
};
38853882
return KEYS;

0 commit comments

Comments
 (0)