Skip to content

Commit 94d1759

Browse files
author
zhangjianwei
committed
src/msg: fix high CPU consumption of msgr worker thread
Problem analysis:
- time_events is a std::multimap<clock_type::time_point, TimeEvent>, and its time precision is nanoseconds.
- In the EventCenter::process_events function:
  - `end_time > now` is a nanosecond-precision comparison,
  - but std::chrono::duration_cast<std::chrono::microseconds>(end_time - now) converts the difference to microseconds,
  - so whenever the difference is less than one microsecond, timeout_microseconds = 0,
  - and epoll_wait(..., 0) returns immediately without sleeping.
- rados bench, count: 6000
  - Proportion of passes that processed 0 events: 41898337 / 44796903 = 93.52%
  - CPU usage of a single OSD msgr worker thread climbs to 100%.

Solution:
- Because the epoll_wait timeout is expressed in milliseconds,
- add ms_time_events_min_wait_interval to control the minimum time to wait for time_events,
- with its default value aligned to 1000 microseconds (1 ms).
- rados bench, count: 6000
  - Proportion of passes that processed 0 events: 424466 / 4489181 = 9.45%
  - CPU usage of a single OSD msgr worker thread drops to 30~40%.

Issue: https://tracker.ceph.com/issues/62512
Co-author: yanghonggang <[email protected]>
Signed-off-by: zhangjianwei <[email protected]>
1 parent 08d35a8 commit 94d1759

File tree

4 files changed

+27
-6
lines changed

4 files changed

+27
-6
lines changed

src/common/options/global.yaml.in

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,6 +1276,23 @@ options:
12761276
desc: Inject a network congestions that stuck with N times operations
12771277
default: 0
12781278
with_legacy: true
1279+
- name: ms_time_events_min_wait_interval
1280+
type: uint
1281+
level: dev
1282+
desc: In microseconds, msgr-worker's time_events min wait time for epoll_wait timeout
1283+
default: 1000
1284+
min: 0
1285+
max: 60000000
1286+
with_legacy: true
1287+
- name: ms_client_throttle_retry_time_interval
1288+
type: uint
1289+
level: dev
1290+
desc: In microseconds, user client, the time interval between the next retry
1291+
when the throttle get_or_fail.
1292+
default: 5000
1293+
min: 1000
1294+
max: 60000000
1295+
with_legacy: true
12791296
- name: ms_blackhole_osd
12801297
type: bool
12811298
level: dev

src/msg/async/Event.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,8 @@ int EventCenter::process_events(unsigned timeout_microseconds, ceph::timespan *
404404

405405
if (end_time > now) {
406406
timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end_time - now).count();
407+
timeout_microseconds = std::max<unsigned>(timeout_microseconds,
408+
cct->_conf->ms_time_events_min_wait_interval);
407409
} else {
408410
timeout_microseconds = 0;
409411
}

src/msg/async/ProtocolV1.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,7 +677,7 @@ CtPtr ProtocolV1::throttle_message() {
677677
// short time, so we can wait a ms.
678678
if (connection->register_time_events.empty()) {
679679
connection->register_time_events.insert(
680-
connection->center->create_time_event(1000,
680+
connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
681681
connection->wakeup_handler));
682682
}
683683
return nullptr;
@@ -710,7 +710,8 @@ CtPtr ProtocolV1::throttle_bytes() {
710710
if (connection->register_time_events.empty()) {
711711
connection->register_time_events.insert(
712712
connection->center->create_time_event(
713-
1000, connection->wakeup_handler));
713+
cct->_conf->ms_client_throttle_retry_time_interval,
714+
connection->wakeup_handler));
714715
}
715716
return nullptr;
716717
}
@@ -737,7 +738,7 @@ CtPtr ProtocolV1::throttle_dispatch_queue() {
737738
// short time, so we can wait a ms.
738739
if (connection->register_time_events.empty()) {
739740
connection->register_time_events.insert(
740-
connection->center->create_time_event(1000,
741+
connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
741742
connection->wakeup_handler));
742743
}
743744
return nullptr;

src/msg/async/ProtocolV2.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,7 +1552,7 @@ CtPtr ProtocolV2::throttle_message() {
15521552
// short time, so we can wait a ms.
15531553
if (connection->register_time_events.empty()) {
15541554
connection->register_time_events.insert(
1555-
connection->center->create_time_event(1000,
1555+
connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
15561556
connection->wakeup_handler));
15571557
}
15581558
return nullptr;
@@ -1584,7 +1584,8 @@ CtPtr ProtocolV2::throttle_bytes() {
15841584
if (connection->register_time_events.empty()) {
15851585
connection->register_time_events.insert(
15861586
connection->center->create_time_event(
1587-
1000, connection->wakeup_handler));
1587+
cct->_conf->ms_client_throttle_retry_time_interval,
1588+
connection->wakeup_handler));
15881589
}
15891590
return nullptr;
15901591
}
@@ -1612,7 +1613,7 @@ CtPtr ProtocolV2::throttle_dispatch_queue() {
16121613
// short time, so we can wait a ms.
16131614
if (connection->register_time_events.empty()) {
16141615
connection->register_time_events.insert(
1615-
connection->center->create_time_event(1000,
1616+
connection->center->create_time_event(cct->_conf->ms_client_throttle_retry_time_interval,
16161617
connection->wakeup_handler));
16171618
}
16181619
return nullptr;

0 commit comments

Comments
 (0)