Skip to content
This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Commit 2c29f32

Browse files
authored
Merge branch 'master' into compactor-alert
2 parents 6c26c63 + 4f603c7 commit 2c29f32

File tree

10 files changed

+277
-10
lines changed

10 files changed

+277
-10
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,21 @@
33
## master / unreleased
44

55
* [CHANGE] Add the default preset 'extra_small_user' and reference it in the CLI flags. This will raise the limits of the 'small_user' preset to the defaults for `ingester.max-samples-per-query` and `ingester.max-series-per-query`. #200
6+
* [CHANGE] Removed the config option `$._config.ingester.statefulset_replicas` which was used only when running Cortex chunks storage with WAL enabled. To configure the number of ingester replicas you should now use the following: #210
7+
```
8+
ingester_statefulset+:
9+
statefulSet.mixin.spec.withReplicas(6),
10+
```
611
* [ENHANCEMENT] Add the Ruler to the read resources dashboard #205
712
* [ENHANCEMENT] Read dashboards now use `cortex_querier_request_duration_seconds` metrics to allow for accurate dashboards when deploying Cortex as a single-binary. #199
813
* [ENHANCEMENT] Improved Ruler dashboard. Includes information about notifications, reads/writes, and per user per rule group evaluation. #197, #205
914
* [ENHANCEMENT] Add new `CortexCompactorRunFailed` alert when compactor run fails. #206
15+
* [ENHANCEMENT] Add `flusher-job-blocks.libsonnet` with support for flushing blocks disks. #187
16+
* [ENHANCEMENT] Add more alerts on failure conditions for ingesters when running the blocks storage. #208
1017
* [FEATURE] Latency recording rules for the metric `cortex_querier_request_duration_seconds` are now part of a `cortex_querier_api` rule group. #199
1118
* [FEATURE] Add overrides-exporter as optional deployment to expose configured runtime overrides and presets. #198
19+
* [FEATURE] Add a dashboard for the alertmanager. #207
20+
* [BUGFIX] Added `ingester-blocks` to ingester's job label matcher, in order to correctly get metrics when migrating a Cortex cluster from chunks to blocks. #203
1221

1322
## 1.4.0 / 2020-10-02
1423

cortex-mixin/alerts/blocks.libsonnet

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,79 @@
5555
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.',
5656
},
5757
},
58+
{
59+
alert: 'CortexIngesterTSDBHeadTruncationFailed',
60+
expr: |||
61+
rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
62+
|||,
63+
labels: {
64+
severity: 'critical',
65+
},
66+
annotations: {
67+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.',
68+
},
69+
},
70+
{
71+
alert: 'CortexIngesterTSDBCheckpointCreationFailed',
72+
expr: |||
73+
rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
74+
|||,
75+
labels: {
76+
severity: 'critical',
77+
},
78+
annotations: {
79+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.',
80+
},
81+
},
82+
{
83+
alert: 'CortexIngesterTSDBCheckpointDeletionFailed',
84+
expr: |||
85+
rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
86+
|||,
87+
labels: {
88+
severity: 'critical',
89+
},
90+
annotations: {
91+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.',
92+
},
93+
},
94+
{
95+
alert: 'CortexIngesterTSDBWALTruncationFailed',
96+
expr: |||
97+
rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
98+
|||,
99+
labels: {
100+
severity: 'warning',
101+
},
102+
annotations: {
103+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.',
104+
},
105+
},
106+
{
107+
alert: 'CortexIngesterTSDBWALCorrupted',
108+
expr: |||
109+
rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0
110+
|||,
111+
labels: {
112+
severity: 'critical',
113+
},
114+
annotations: {
115+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.',
116+
},
117+
},
118+
{
119+
alert: 'CortexIngesterTSDBWALWritesFailed',
120+
'for': '3m',
121+
expr: |||
122+
rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
123+
|||,
124+
labels: {
125+
severity: 'critical',
126+
},
127+
annotations: {
128+
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.',
129+
},
130+
},
58131
{
59132
// Alert if the querier is not successfully scanning the bucket.
60133
alert: 'CortexQuerierHasNotScanTheBucket',

cortex-mixin/config.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
// These are used by the dashboards and allow for the simultaneous display of
2727
// microservice and single binary cortex clusters.
2828
job_names: {
29-
ingester: '(ingester|cortex$)',
29+
ingester: '(ingester.*|cortex$)', // Match also ingester-blocks, which is used during the migration from chunks to blocks.
3030
distributor: '(distributor|cortex$)',
3131
querier: '(querier|cortex$)',
3232
ruler: '(ruler|cortex$)',

cortex-mixin/dashboards.libsonnet

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
(import 'dashboards/queries.libsonnet') +
55
(import 'dashboards/reads.libsonnet') +
66
(import 'dashboards/ruler.libsonnet') +
7+
(import 'dashboards/alertmanager.libsonnet') +
78
(import 'dashboards/scaling.libsonnet') +
89
(import 'dashboards/writes.libsonnet') +
910

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
local utils = import 'mixin-utils/utils.libsonnet';
2+
3+
(import 'dashboard-utils.libsonnet') {
4+
5+
'alertmanager.json':
6+
($.dashboard('Cortex / Alertmanager') + { uid: 'a76bee5913c97c918d9e56a3cc88cc28' })
7+
.addClusterSelectorTemplates()
8+
.addRow(
9+
($.row('Headlines') + {
10+
height: '100px',
11+
showTitle: false,
12+
})
13+
.addPanel(
14+
$.panel('Total Alerts') +
15+
$.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short')
16+
)
17+
.addPanel(
18+
$.panel('Total Silences') +
19+
$.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short')
20+
)
21+
)
22+
.addRow(
23+
$.row('Alerts Received')
24+
.addPanel(
25+
$.panel('APS') +
26+
$.queryPanel(
27+
[
28+
|||
29+
sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__interval]))
30+
-
31+
sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval]))
32+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
33+
'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'),
34+
],
35+
['success', 'failed']
36+
)
37+
)
38+
)
39+
.addRow(
40+
$.row('Alert Notifications')
41+
.addPanel(
42+
$.panel('NPS') +
43+
$.queryPanel(
44+
[
45+
|||
46+
sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval]))
47+
-
48+
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval]))
49+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
50+
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval]))' % $.jobMatcher('alertmanager'),
51+
],
52+
['success', 'failed']
53+
)
54+
)
55+
.addPanel(
56+
$.panel('NPS by integration') +
57+
$.queryPanel(
58+
[
59+
|||
60+
(
61+
sum(rate(cortex_alertmanager_notifications_total{%s}[$__interval])) by(integration)
62+
-
63+
sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration)
64+
) > 0
65+
or on () vector(0)
66+
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
67+
'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__interval])) by(integration)' % $.jobMatcher('alertmanager'),
68+
],
69+
['success - {{ integration }}', 'failed - {{ integration }}']
70+
)
71+
)
72+
.addPanel(
73+
$.panel('Latency') +
74+
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher('alertmanager'))
75+
)
76+
)
77+
.addRow(
78+
$.row('Configuration API (gateway) + Alertmanager UI')
79+
.addPanel(
80+
$.panel('QPS') +
81+
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway))
82+
)
83+
.addPanel(
84+
$.panel('Latency') +
85+
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')])
86+
)
87+
),
88+
}

cortex-mixin/dashboards/ruler.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
9292
$.rulerQueries.ruleEvaluations.success % [$.jobMatcher('ruler'), $.jobMatcher('ruler')],
9393
$.rulerQueries.ruleEvaluations.failure % $.jobMatcher('ruler'),
9494
],
95-
['sucess', 'failed'],
95+
['success', 'failed'],
9696
),
9797
)
9898
.addPanel(

cortex-mixin/docs/playbooks.md

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ This alert occurs when a ruler is unable to validate whether or not it should cl
8282

8383
This alert fires when a Cortex ingester is not uploading any block to the long-term storage. An ingester is expected to upload a block to the storage every block range period (defaults to 2h) and if a longer time elapses since the last successful upload it means something is not working correctly.
8484

85-
How to investigate:
85+
How to **investigate**:
8686
- Ensure the ingester is receiving write-path traffic (samples to ingest)
8787
- Look for any upload error in the ingester logs (ie. networking or authentication issues)
8888

@@ -115,33 +115,81 @@ The cause triggering this alert could **lead to**:
115115
How to **investigate**:
116116
- Look for details in the ingester logs
117117

118+
### CortexIngesterTSDBHeadTruncationFailed
119+
120+
This alert fires when a Cortex ingester fails to truncate the TSDB head.
121+
122+
The TSDB head is the in-memory store used to keep series and samples not compacted into a block yet. If head truncation fails for a long time, the ingester disk might get full as it won't continue to the WAL truncation stage and the subsequent ingester restart may take a long time or even go into an OOMKilled crash loop because of the huge WAL to replay. For this reason, it's important to investigate and address the issue as soon as it happens.
123+
124+
How to **investigate**:
125+
- Look for details in the ingester logs
126+
127+
### CortexIngesterTSDBCheckpointCreationFailed
128+
129+
This alert fires when a Cortex ingester fails to create a TSDB checkpoint.
130+
131+
How to **investigate**:
132+
- Look for details in the ingester logs
133+
- If the checkpoint fails because of a `corruption in segment`, you can restart the ingester because at the next startup TSDB will try to "repair" it. After restart, if the issue is repaired and the ingester is running, you should also get paged by `CortexIngesterTSDBWALCorrupted` to signal that the WAL was corrupted and manual investigation is required.
134+
135+
### CortexIngesterTSDBCheckpointDeletionFailed
136+
137+
This alert fires when a Cortex ingester fails to delete a TSDB checkpoint.
138+
139+
Generally, this is not an urgent issue, but manual investigation is required to find the root cause of the issue and fix it.
140+
141+
How to **investigate**:
142+
- Look for details in the ingester logs
143+
144+
### CortexIngesterTSDBWALTruncationFailed
145+
146+
This alert fires when a Cortex ingester fails to truncate the TSDB WAL.
147+
148+
How to **investigate**:
149+
- Look for details in the ingester logs
150+
151+
### CortexIngesterTSDBWALCorrupted
152+
153+
This alert fires when a Cortex ingester finds a corrupted TSDB WAL (stored on disk) while replaying it at ingester startup or when creation of a checkpoint comes across a WAL corruption.
154+
155+
If this alert fires during an **ingester startup**, the WAL should have been auto-repaired, but manual investigation is required. The WAL repair mechanism causes data loss because all WAL records after the corrupted segment are discarded and so their samples are lost while replaying the WAL. If this issue happens only on 1 ingester then Cortex doesn't suffer any data loss because of the replication factor, while if it happens on multiple ingesters then some data loss is possible.
156+
157+
If this alert fires during a **checkpoint creation**, you should have also been paged with `CortexIngesterTSDBCheckpointCreationFailed`, and you can follow the steps under that alert.
158+
159+
### CortexIngesterTSDBWALWritesFailed
160+
161+
This alert fires when a Cortex ingester is failing to log records to the TSDB WAL on disk.
162+
163+
How to **investigate**:
164+
- Look for details in the ingester logs
165+
118166
### CortexQuerierHasNotScanTheBucket
119167

120168
This alert fires when a Cortex querier is not successfully scanning blocks in the storage (bucket). A querier is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results.
121169

122-
How to investigate:
170+
How to **investigate**:
123171
- Look for any scan error in the querier logs (ie. networking or rate limiting issues)
124172

125173
### CortexQuerierHighRefetchRate
126174

127175
This alert fires when there's a high number of queries for which series have been refetched from a different store-gateway because of missing blocks. This could happen for a short time whenever a store-gateway ring resharding occurs (e.g. during/after an outage or while rolling out store-gateway) but store-gateways should reconcile in a short time. This alert fires if the issue persists for an unexpectedly long time and thus it should be investigated.
128176

129-
How to investigate:
177+
How to **investigate**:
130178
- Ensure there are no errors related to blocks scan or sync in the queriers and store-gateways
131179
- Check store-gateway logs to see if all store-gateway have successfully completed a blocks sync
132180

133181
### CortexStoreGatewayHasNotSyncTheBucket
134182

135183
This alert fires when a Cortex store-gateway is not successfully scanning blocks in the storage (bucket). A store-gateway is expected to periodically iterate the bucket to find new and deleted blocks (defaults to every 5m) and if it's not successfully synching the bucket for a long time, it may end up querying only a subset of blocks, thus leading to potentially partial results.
136184

137-
How to investigate:
185+
How to **investigate**:
138186
- Look for any scan error in the store-gateway logs (ie. networking or rate limiting issues)
139187

140188
### CortexCompactorHasNotSuccessfullyCleanedUpBlocks
141189

142190
This alert fires when a Cortex compactor is not successfully deleting blocks marked for deletion for a long time.
143191

144-
How to investigate:
192+
How to **investigate**:
145193
- Ensure the compactor is not crashing during compaction (ie. `OOMKilled`)
146194
- Look for any error in the compactor logs (ie. bucket Delete API errors)
147195

@@ -153,7 +201,7 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas
153201

154202
This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage for a long time.
155203

156-
How to investigate:
204+
How to **investigate**:
157205
- If the alerts `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first
158206
- If the alerts `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first
159207
- Ensure ingesters are successfully shipping blocks to the storage

cortex/config.libsonnet

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@
7575
ingester: {
7676
// These config options are only for the chunks storage.
7777
wal_dir: '/wal_data',
78-
statefulset_replicas: 3,
7978
statefulset_disk: '150Gi',
8079
},
8180

cortex/flusher-job-blocks.libsonnet

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
// Usage example:
3+
//
4+
// local flusher_job = import 'cortex/flusher-job-blocks.libsonnet';
5+
//
6+
// flusher_job {
7+
// 'flusher-25': $.flusher_job_func('flusher-25', 'ingester-data-ingester-25'),
8+
// }
9+
//
10+
// Where 'flusher-25' is a job name, and 'ingester-data-ingester-25' is the PVC to flush.
11+
12+
local container = $.core.v1.container,
13+
local job = $.batch.v1.job,
14+
local volumeMount = $.core.v1.volumeMount,
15+
local volume = $.core.v1.volume,
16+
17+
flusher_container::
18+
container.new('flusher', $._images.flusher) +
19+
container.withPorts($.util.defaultPorts) +
20+
container.withArgsMixin($.util.mapToFlags($.ingester_args {
21+
target: 'flusher',
22+
'blocks-storage.tsdb.retention-period': '10000h', // don't delete old blocks too soon.
23+
})) +
24+
$.util.resourcesRequests('4', '15Gi') +
25+
$.util.resourcesLimits(null, '25Gi') +
26+
$.util.readinessProbe +
27+
$.jaeger_mixin,
28+
29+
flusher_job_func(jobName, pvcName)::
30+
job.new() +
31+
job.mixin.spec.template.spec.withContainers([
32+
$.flusher_container +
33+
container.withVolumeMountsMixin([
34+
volumeMount.new('flusher-data', '/data'),
35+
]),
36+
]) +
37+
job.mixin.spec.template.spec.withRestartPolicy('Never') +
38+
job.mixin.spec.template.spec.withVolumes([
39+
volume.fromPersistentVolumeClaim('flusher-data', pvcName),
40+
]) +
41+
job.mixin.metadata.withName(jobName) +
42+
job.mixin.metadata.withNamespace($._config.namespace) +
43+
job.mixin.metadata.withLabels({ name: 'flusher' }) +
44+
job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) +
45+
job.mixin.spec.template.spec.securityContext.withRunAsUser(0) +
46+
job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) +
47+
$.util.configVolumeMount('overrides', '/etc/cortex') +
48+
$.util.podPriority('high'),
49+
}

cortex/ingester.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191

9292
ingester_statefulset:
9393
if $._config.ingester_deployment_without_wal == false then
94-
statefulSet.new('ingester', $._config.ingester.statefulset_replicas, [$.ingester_statefulset_container], ingester_pvc) +
94+
statefulSet.new('ingester', 3, [$.ingester_statefulset_container], ingester_pvc) +
9595
statefulSet.mixin.spec.withServiceName('ingester') +
9696
statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) +
9797
statefulSet.mixin.metadata.withNamespace($._config.namespace) +

0 commit comments

Comments
 (0)