Skip to content

Commit e774b60

Browse files
authored
Merge pull request ClickHouse#79052 from ClickHouse/alex-dc
Make settings controlling connection drop on overloaded CPU hot-reloadable
2 parents 25f2e13 + 0618c8f commit e774b60

File tree

5 files changed

+77
-6
lines changed

5 files changed

+77
-6
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
---
2+
description: 'Controlling behavior on server CPU overload.'
3+
sidebar_label: 'Server overload'
4+
slug: /operations/settings/server-overload
5+
title: 'Server overload'
6+
---
7+
8+
# Server overload
9+
10+
## Overview {#overview}
11+
12+
Sometimes the server can become overloaded for various reasons. To determine the current CPU overload,
13+
ClickHouse server calculates the ratio of CPU wait time (`OSCPUWaitMicroseconds` metric) to busy time
14+
(`OSCPUVirtualTimeMicroseconds` metric). When the server is overloaded above a certain ratio,
15+
it makes sense to discard some queries or even drop connection requests so as not to increase the load further.
16+
17+
There's a server setting `os_cpu_busy_time_threshold` which controls the minimum busy time to consider CPU
18+
doing some useful work. If the current value of `OSCPUVirtualTimeMicroseconds` metric is below this value,
19+
CPU overload is assumed to be 0.
20+
21+
## Rejecting queries {#rejecting-queries}
22+
23+
The behavior of rejecting queries is controlled by query-level settings `min_os_cpu_wait_time_ratio_to_throw` and
24+
`max_os_cpu_wait_time_ratio_to_throw`. If those settings are set and `min_os_cpu_wait_time_ratio_to_throw` is less
25+
than `max_os_cpu_wait_time_ratio_to_throw`, then the query is rejected and a `SERVER_OVERLOADED` error is thrown
26+
with some probability if the overload ratio is at least `min_os_cpu_wait_time_ratio_to_throw`. The probability
27+
is determined as a linear interpolation between min and max ratios. For example, if `min_os_cpu_wait_time_ratio_to_throw = 2`,
28+
`max_os_cpu_wait_time_ratio_to_throw = 6`, and `cpu_overload = 4`, then the query will be rejected with a probability of `0.5`.
29+
30+
## Dropping connections {#dropping-connections}
31+
32+
Dropping connections is controlled by server-level settings `min_os_cpu_wait_time_ratio_to_drop_connection` and
33+
`max_os_cpu_wait_time_ratio_to_drop_connection`. Those settings can be changed without server restart. The idea behind
34+
those settings is similar to the one with rejecting queries. The only difference is that, when the server is overloaded,
35+
the connection attempt will be rejected from the server side.

programs/server/Server.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1961,6 +1961,8 @@ try
19611961
global_context->setMaxPendingMutationsExecutionTimeToWarn(new_server_settings[ServerSetting::max_pending_mutations_execution_time_to_warn]);
19621962
global_context->getAccessControl().setAllowTierSettings(new_server_settings[ServerSetting::allow_feature_tier]);
19631963

1964+
global_context->setOSCPUOverloadSettings(new_server_settings[ServerSetting::min_os_cpu_wait_time_ratio_to_drop_connection], new_server_settings[ServerSetting::max_os_cpu_wait_time_ratio_to_drop_connection]);
1965+
19641966
size_t read_bandwidth = new_server_settings[ServerSetting::max_remote_read_network_bandwidth_for_server];
19651967
size_t write_bandwidth = new_server_settings[ServerSetting::max_remote_write_network_bandwidth_for_server];
19661968

@@ -2869,10 +2871,9 @@ void Server::createServers(
28692871

28702872
const TCPServerConnectionFilter::Ptr & connection_filter = new TCPServerConnectionFilter{[&]()
28712873
{
2872-
const auto & server_settings = global_context->getServerSettings();
2873-
return !ProfileEvents::checkCPUOverload(server_settings[ServerSetting::os_cpu_busy_time_threshold],
2874-
server_settings[ServerSetting::min_os_cpu_wait_time_ratio_to_drop_connection],
2875-
server_settings[ServerSetting::max_os_cpu_wait_time_ratio_to_drop_connection],
2874+
return !ProfileEvents::checkCPUOverload(global_context->getServerSettings()[ServerSetting::os_cpu_busy_time_threshold],
2875+
global_context->getMinOSCPUWaitTimeRatioToDropConnection(),
2876+
global_context->getMaxOSCPUWaitTimeRatioToDropConnection(),
28762877
/*should_throw*/ false);
28772878
}};
28782879

src/Core/ServerSettings.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,8 +1058,14 @@ The policy on how to perform a scheduling of CPU slots specified by `concurrent_
10581058
DECLARE(Bool, storage_shared_set_join_use_inner_uuid, true, "If enabled, an inner UUID is generated during the creation of SharedSet and SharedJoin. ClickHouse Cloud only", 0) \
10591059
DECLARE(UInt64, startup_mv_delay_ms, 0, R"(Debug parameter to simulate materizlied view creation delay)", 0) \
10601060
DECLARE(UInt64, os_cpu_busy_time_threshold, 1'000'000, "Threshold of OS CPU busy time in microseconds (OSCPUVirtualTimeMicroseconds metric) to consider CPU doing some useful work, no CPU overload would be considered if busy time was below this value.", 0) \
1061-
DECLARE(Float, min_os_cpu_wait_time_ratio_to_drop_connection, 0, "Min ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider dropping connections. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 0 at this point.", 0) \
1062-
DECLARE(Float, max_os_cpu_wait_time_ratio_to_drop_connection, 0, "Max ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider dropping connections. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 1 at this point.", 0) \
1061+
DECLARE(Float, min_os_cpu_wait_time_ratio_to_drop_connection, 0, R"(
1062+
Min ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider dropping connections. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 0 at this point.
1063+
See [Controlling behavior on server CPU overload](/operations/settings/server-overload) for more details.
1064+
)", 0) \
1065+
DECLARE(Float, max_os_cpu_wait_time_ratio_to_drop_connection, 0, R"(
1066+
Max ratio between OS CPU wait (OSCPUWaitMicroseconds metric) and busy (OSCPUVirtualTimeMicroseconds metric) times to consider dropping connections. Linear interpolation between min and max ratio is used to calculate the probability, the probability is 1 at this point.
1067+
See [Controlling behavior on server CPU overload](/operations/settings/server-overload) for more details.
1068+
)", 0) \
10631069
DECLARE(Float, distributed_cache_keep_up_free_connections_ratio, 0.1f, "Soft limit for number of active connection distributed cache will try to keep free. After the number of free connections goes below distributed_cache_keep_up_free_connections_ratio * max_connections, connections with oldest activity will be closed until the number goes above the limit.", 0) \
10641070

10651071

@@ -1172,6 +1178,9 @@ void ServerSettings::dumpToSystemServerSettingsColumns(ServerSettingColumnsParam
11721178
{"max_pending_mutations_execution_time_to_warn", {std::to_string(context->getMaxPendingMutationsExecutionTimeToWarn()), ChangeableWithoutRestart::Yes}},
11731179
{"max_partition_size_to_drop", {std::to_string(context->getMaxPartitionSizeToDrop()), ChangeableWithoutRestart::Yes}},
11741180

1181+
{"min_os_cpu_wait_time_ratio_to_drop_connection", {std::to_string(context->getMinOSCPUWaitTimeRatioToDropConnection()), ChangeableWithoutRestart::Yes}},
1182+
{"max_os_cpu_wait_time_ratio_to_drop_connection", {std::to_string(context->getMaxOSCPUWaitTimeRatioToDropConnection()), ChangeableWithoutRestart::Yes}},
1183+
11751184
{"max_concurrent_queries", {std::to_string(context->getProcessList().getMaxSize()), ChangeableWithoutRestart::Yes}},
11761185
{"max_concurrent_insert_queries",
11771186
{std::to_string(context->getProcessList().getMaxInsertQueriesAmount()), ChangeableWithoutRestart::Yes}},

src/Interpreters/Context.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,9 @@ struct ContextSharedPart : boost::noncopyable
529529
/// Only for system.server_settings, actually value stored in reloader itself
530530
std::atomic_size_t config_reload_interval_ms = ConfigReloader::DEFAULT_RELOAD_INTERVAL.count();
531531

532+
double min_os_cpu_wait_time_ratio_to_drop_connection = 15.0;
533+
double max_os_cpu_wait_time_ratio_to_drop_connection = 30.0;
534+
532535
String format_schema_path; /// Path to a directory that contains schema files used by input formats.
533536
String google_protos_path; /// Path to a directory that contains the proto files for the well-known Protobuf types.
534537
mutable OnceFlag action_locks_manager_initialized;
@@ -4566,6 +4569,25 @@ void Context::setMaxDatabaseNumToWarn(size_t max_database_to_warn)
45664569
shared->max_database_num_to_warn = max_database_to_warn;
45674570
}
45684571

4572+
double Context::getMinOSCPUWaitTimeRatioToDropConnection() const
4573+
{
4574+
SharedLockGuard lock(shared->mutex);
4575+
return shared->min_os_cpu_wait_time_ratio_to_drop_connection;
4576+
}
4577+
4578+
double Context::getMaxOSCPUWaitTimeRatioToDropConnection() const
4579+
{
4580+
SharedLockGuard lock(shared->mutex);
4581+
return shared->max_os_cpu_wait_time_ratio_to_drop_connection;
4582+
}
4583+
4584+
void Context::setOSCPUOverloadSettings(double min_os_cpu_wait_time_ratio_to_drop_connection, double max_os_cpu_wait_time_ratio_to_drop_connection)
4585+
{
4586+
SharedLockGuard lock(shared->mutex);
4587+
shared->min_os_cpu_wait_time_ratio_to_drop_connection = min_os_cpu_wait_time_ratio_to_drop_connection;
4588+
shared->max_os_cpu_wait_time_ratio_to_drop_connection = max_os_cpu_wait_time_ratio_to_drop_connection;
4589+
}
4590+
45694591
std::shared_ptr<Cluster> Context::getCluster(const std::string & cluster_name) const
45704592
{
45714593
if (auto res = tryGetCluster(cluster_name))

src/Interpreters/Context.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,10 @@ class Context: public ContextData, public std::enable_shared_from_this<Context>
10401040
void setMaxPendingMutationsToWarn(size_t max_pending_mutations_to_warn);
10411041
void setMaxPendingMutationsExecutionTimeToWarn(size_t max_pending_mutations_execution_time_to_warn);
10421042

1043+
double getMinOSCPUWaitTimeRatioToDropConnection() const;
1044+
double getMaxOSCPUWaitTimeRatioToDropConnection() const;
1045+
void setOSCPUOverloadSettings(double min_os_cpu_wait_time_ratio_to_drop_connection, double max_os_cpu_wait_time_ratio_to_drop_connection);
1046+
10431047
/// The port that the server listens for executing SQL queries.
10441048
UInt16 getTCPPort() const;
10451049

0 commit comments

Comments
 (0)