Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 8e94f55

Browse files
authored
Merge pull request #233 from grafana/add-grpc-config
Fine-tuned gRPC keepalive pings settings
2 parents 7001a48 + 965967f commit 8e94f55

9 files changed

+63
-43
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
- Cortex / Queries: added "Lazy loaded index-headers" and "Index-header lazy load duration"
1616
- Cortex / Compactor: added "Tenants compaction progress", "Average blocks / tenant" and "Tenants with largest number of blocks"
1717
- Alerts: added "CortexMemoryMapAreasTooHigh"
18+
* [ENHANCEMENT] Fine-tuned gRPC keepalive pings to work nicely with Cortex default settings.
19+
- `-server.grpc.keepalive.min-time-between-pings=10s`
20+
- `-server.grpc.keepalive.ping-without-stream-allowed=true`
1821
* [BUGFIX] Fixed workingset memory panel while rolling out a StatefulSet. #229
1922
* [BUGFIX] Fixed `CortexRequestErrors` alert to not include `ready` route. #230
2023

cortex/alertmanager.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
else [],
1818

1919
alertmanager_args::
20+
$._config.grpcConfig +
2021
{
2122
target: 'alertmanager',
2223
'log.level': 'debug',

cortex/config.libsonnet

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@
139139
}
140140
else {},
141141

142+
grpcConfig:: {
143+
'server.grpc.keepalive.min-time-between-pings': '10s',
144+
'server.grpc.keepalive.ping-without-stream-allowed': true,
145+
},
146+
142147
storageConfig:
143148
$._config.client_configs.aws +
144149
$._config.client_configs.cassandra +

cortex/distributor.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
local containerPort = $.core.v1.containerPort,
44

55
distributor_args::
6+
$._config.grpcConfig +
67
$._config.ringConfig +
78
$._config.distributorConfig +
89
{

cortex/ingester.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
ingester_args::
3+
$._config.grpcConfig +
34
$._config.ringConfig +
45
$._config.storeConfig +
56
$._config.storageConfig +

cortex/querier.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
local container = $.core.v1.container,
33

44
querier_args::
5+
$._config.grpcConfig +
56
$._config.ringConfig +
67
$._config.storeConfig +
78
$._config.storageConfig +

cortex/query-frontend.libsonnet

Lines changed: 48 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,54 @@
11
{
22
local container = $.core.v1.container,
33

4-
query_frontend_args:: {
5-
target: 'query-frontend',
6-
7-
// Need log.level=debug so all queries are logged, needed for analyse.py.
8-
'log.level': 'debug',
9-
10-
// Increase HTTP server response write timeout, as we were seeing some
11-
// queries that return a lot of data timing out.
12-
'server.http-write-timeout': '1m',
13-
14-
// Split long queries up into multiple day-long queries.
15-
'querier.split-queries-by-interval': '24h',
16-
17-
// Cache query results.
18-
'querier.align-querier-with-step': true,
19-
'querier.cache-results': true,
20-
'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace,
21-
'frontend.memcached.service': 'memcached-client',
22-
'frontend.memcached.timeout': '500ms',
23-
24-
// So that exporters like cloudwatch can still send in data and be un-cached.
25-
'frontend.max-cache-freshness': '10m',
26-
27-
// Compress HTTP responses; improves latency for very big results and slow
28-
// connections.
29-
'querier.compress-http-responses': true,
30-
31-
// So it can recieve big responses from the querier.
32-
'server.grpc-max-recv-msg-size-bytes': 100 << 20,
33-
34-
// Limit queries to 500 days, allow this to be overridden per-user.
35-
'store.max-query-length': '12000h', // 500 Days
36-
'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
37-
} + if $._config.queryFrontend.sharded_queries_enabled then {
38-
'querier.parallelise-shardable-queries': 'true',
39-
40-
// In-process tenant queues on frontends. We divide by the number of frontends (2 in this case) in order to apply the global limit in aggregate.
41-
// Basically: base * shard_factor * query_split_factor / num_frontends.
42-
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
43-
44-
'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
45-
} + $._config.storageConfig
46-
else {},
4+
query_frontend_args::
5+
$._config.grpcConfig +
6+
{
7+
target: 'query-frontend',
8+
9+
// Need log.level=debug so all queries are logged, needed for analyse.py.
10+
'log.level': 'debug',
11+
12+
// Increase HTTP server response write timeout, as we were seeing some
13+
// queries that return a lot of data timing out.
14+
'server.http-write-timeout': '1m',
15+
16+
// Split long queries up into multiple day-long queries.
17+
'querier.split-queries-by-interval': '24h',
18+
19+
// Cache query results.
20+
'querier.align-querier-with-step': true,
21+
'querier.cache-results': true,
22+
'frontend.memcached.hostname': 'memcached-frontend.%s.svc.cluster.local' % $._config.namespace,
23+
'frontend.memcached.service': 'memcached-client',
24+
'frontend.memcached.timeout': '500ms',
25+
26+
// So that exporters like cloudwatch can still send in data and be un-cached.
27+
'frontend.max-cache-freshness': '10m',
28+
29+
// Compress HTTP responses; improves latency for very big results and slow
30+
// connections.
31+
'querier.compress-http-responses': true,
32+
33+
// So it can receive big responses from the querier.
34+
'server.grpc-max-recv-msg-size-bytes': 100 << 20,
35+
36+
// Limit queries to 500 days, allow this to be overridden per-user.
37+
'store.max-query-length': '12000h', // 500 Days
38+
'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
39+
} + (
40+
if $._config.queryFrontend.sharded_queries_enabled then
41+
{
42+
'querier.parallelise-shardable-queries': 'true',
43+
44+
// In-process tenant queues on frontends. We divide by the number of frontends (2 in this case) in order to apply the global limit in aggregate.
45+
// Basically: base * shard_factor * query_split_factor / num_frontends.
46+
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
47+
48+
'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
49+
} + $._config.storageConfig
50+
else {}
51+
),
4752

4853
query_frontend_container::
4954
container.new('query-frontend', $._images.query_frontend) +

cortex/ruler.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
local container = $.core.v1.container,
33

44
ruler_args::
5+
$._config.grpcConfig +
56
$._config.ringConfig +
67
$._config.storeConfig +
78
$._config.storageConfig +

cortex/tsdb.libsonnet

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
pvc.mixin.metadata.withName('compactor-data'),
124124

125125
compactor_args::
126+
$._config.grpcConfig +
126127
$._config.storageConfig +
127128
$._config.blocksStorageConfig +
128129
{
@@ -178,6 +179,7 @@
178179
pvc.mixin.metadata.withName('store-gateway-data'),
179180

180181
store_gateway_args::
182+
$._config.grpcConfig +
181183
$._config.storageConfig +
182184
$._config.blocksStorageConfig +
183185
{

0 commit comments

Comments
 (0)