Skip to content

Commit 5bdb1b9

Browse files
committed
update monitoring-mixins
Signed-off-by: Weifeng Wang <[email protected]>

update mimir-mixin
Signed-off-by: Weifeng Wang <[email protected]>

update pyroscope-mixin
Signed-off-by: Weifeng Wang <[email protected]>

update loki-mixin
Signed-off-by: Weifeng Wang <[email protected]>

update tempo-mixin
Signed-off-by: Weifeng Wang <[email protected]>

add tempo-mixin
Signed-off-by: Weifeng Wang <[email protected]>
1 parent df0e697 commit 5bdb1b9

File tree

75 files changed

+2903
-916
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+2903
-916
lines changed

compose.override.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ services:
4545
volumes:
4646
- ./monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin
4747
- ./monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
48-
# - ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
48+
- ./monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
4949
environment:
5050
GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn}
5151
GF_DIAGNOSTICS_PROFILING_ENABLED: true
@@ -83,8 +83,8 @@ services:
8383
- ./monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
8484
- ./monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
8585
- ./monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml
86-
# - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
87-
# - ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
86+
- ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
87+
- ./monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
8888

8989
# override the labels and environment of the pyroscope service included from compose.yaml to enable traces data collection
9090
pyroscope:

docker-compose/microservices-mode/traces/grafana.override.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ services:
44
grafana:
55
volumes:
66
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
7-
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
7+
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
88

docker-compose/microservices-mode/traces/mimirtool.override.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ services:
55
volumes:
66
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
77
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
8-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
9-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
8+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
9+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml

docker-compose/monolithic-mode/all-in-one/grafana.override.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ services:
1212
volumes:
1313
- ../../../monitoring-mixins/pyroscope-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/pyroscope-mixin
1414
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
15-
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
15+
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
1616
environment:
1717
GF_LOG_LEVEL: ${GF_LOG_LEVEL:-warn}
1818
GF_DIAGNOSTICS_PROFILING_ENABLED: true

docker-compose/monolithic-mode/all-in-one/mimirtool.override.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ services:
66
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
77
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
88
- ../../../monitoring-mixins/pyroscope-mixin/deploy/pyroscope-mixin-rules.yaml:/rules/pyroscope-mixin-rules.yaml
9-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
10-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
9+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
10+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml

docker-compose/monolithic-mode/traces/grafana.override.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ services:
44
grafana:
55
volumes:
66
- ../../../monitoring-mixins/loki-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/loki-mixin
7-
# - ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
7+
- ../../../monitoring-mixins/tempo-mixin/deploy/dashboards_out:/var/lib/grafana/dashboards/tempo-mixin
88

docker-compose/monolithic-mode/traces/mimirtool.override.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ services:
55
volumes:
66
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-rules.yaml:/rules/loki-mixin-rules.yaml
77
- ../../../monitoring-mixins/loki-mixin/deploy/loki-mixin-alerts.yaml:/rules/loki-mixin-alerts.yaml
8-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
9-
# - ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml
8+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-rules.yaml:/rules/tempo-mixin-rules.yaml
9+
- ../../../monitoring-mixins/tempo-mixin/deploy/tempo-mixin-alerts.yaml:/rules/tempo-mixin-alerts.yaml

monitoring-mixins/agent-flow-mixin/deploy/agent-flow-mixin-alerts.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@ groups:
33
rules:
44
- alert: ClusterNotConverging
55
annotations:
6-
message: Cluster is not converging.
6+
message: 'Cluster is not converging: nodes report different number of peers in the cluster.'
77
expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0
88
for: 10m
9-
- alert: ClusterSplitBrain
9+
- alert: ClusterNodeCountMismatch
1010
annotations:
11-
message: Cluster nodes have entered a split brain state.
11+
message: Nodes report different number of peers vs. the count of observed agent metrics. Some agent metrics may be missing or the cluster is in a split brain state.
1212
expr: |
1313
sum without (state) (cluster_node_peers) !=
1414
on (cluster, namespace) group_left
1515
count by (cluster, namespace) (cluster_node_info)
16-
for: 10m
16+
for: 15m
1717
- alert: ClusterNodeUnhealthy
1818
annotations:
19-
message: Cluster node is reporting a health score > 0.
19+
message: Cluster node is reporting a gossip protocol health score > 0.
2020
expr: |
2121
cluster_node_gossip_health_score > 0
2222
for: 10m

monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-opentelemetry.json

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,17 @@
2929
{
3030
"datasource": "${datasource}",
3131
"description": "Number of spans successfully pushed into the pipeline.\n",
32+
"fieldConfig": {
33+
"defaults": {
34+
"custom": {
35+
"fillOpacity": 20,
36+
"gradientMode": "hue",
37+
"stacking": {
38+
"mode": "normal"
39+
}
40+
}
41+
}
42+
},
3243
"gridPos": {
3344
"h": 10,
3445
"w": 8,
@@ -82,11 +93,6 @@
8293
{
8394
"datasource": "${datasource}",
8495
"description": "The duration of inbound RPCs.\n",
85-
"fieldConfig": {
86-
"defaults": {
87-
"unit": "milliseconds"
88-
}
89-
},
9096
"gridPos": {
9197
"h": 10,
9298
"w": 8,
@@ -115,7 +121,7 @@
115121
"yHistogram": true
116122
},
117123
"yAxis": {
118-
"unit": "s"
124+
"unit": "ms"
119125
}
120126
},
121127
"pluginVersion": "9.0.6",
@@ -129,7 +135,7 @@
129135
"range": true
130136
}
131137
],
132-
"title": "RPC server duration (traces)",
138+
"title": "RPC server duration",
133139
"type": "heatmap"
134140
},
135141
{
@@ -140,12 +146,17 @@
140146
"x": 0,
141147
"y": 10
142148
},
143-
"title": "Batching [otelcol.processor.batch]",
149+
"title": "Batching of logs, metrics, and traces [otelcol.processor.batch]",
144150
"type": "row"
145151
},
146152
{
147153
"datasource": "${datasource}",
148-
"description": "Number of units in the batch\n",
154+
"description": "Number of spans, metric datapoints, or log lines in a batch\n",
155+
"fieldConfig": {
156+
"defaults": {
157+
"unit": "short"
158+
}
159+
},
149160
"gridPos": {
150161
"h": 10,
151162
"w": 8,
@@ -174,7 +185,7 @@
174185
"yHistogram": true
175186
},
176187
"yAxis": {
177-
"unit": "s"
188+
"unit": "short"
178189
}
179190
},
180191
"pluginVersion": "9.0.6",
@@ -247,6 +258,17 @@
247258
{
248259
"datasource": "${datasource}",
249260
"description": "Number of spans successfully sent to destination.\n",
261+
"fieldConfig": {
262+
"defaults": {
263+
"custom": {
264+
"fillOpacity": 20,
265+
"gradientMode": "hue",
266+
"stacking": {
267+
"mode": "normal"
268+
}
269+
}
270+
}
271+
},
250272
"gridPos": {
251273
"h": 10,
252274
"w": 8,
@@ -268,6 +290,17 @@
268290
{
269291
"datasource": "${datasource}",
270292
"description": "Number of spans in failed attempts to send to destination.\n",
293+
"fieldConfig": {
294+
"defaults": {
295+
"custom": {
296+
"fillOpacity": 20,
297+
"gradientMode": "hue",
298+
"stacking": {
299+
"mode": "normal"
300+
}
301+
}
302+
}
303+
},
271304
"gridPos": {
272305
"h": 10,
273306
"w": 8,

monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1438,6 +1438,17 @@ data:
14381438
{
14391439
"datasource": "${datasource}",
14401440
"description": "Number of spans successfully pushed into the pipeline.\n",
1441+
"fieldConfig": {
1442+
"defaults": {
1443+
"custom": {
1444+
"fillOpacity": 20,
1445+
"gradientMode": "hue",
1446+
"stacking": {
1447+
"mode": "normal"
1448+
}
1449+
}
1450+
}
1451+
},
14411452
"gridPos": {
14421453
"h": 10,
14431454
"w": 8,
@@ -1491,11 +1502,6 @@ data:
14911502
{
14921503
"datasource": "${datasource}",
14931504
"description": "The duration of inbound RPCs.\n",
1494-
"fieldConfig": {
1495-
"defaults": {
1496-
"unit": "milliseconds"
1497-
}
1498-
},
14991505
"gridPos": {
15001506
"h": 10,
15011507
"w": 8,
@@ -1524,7 +1530,7 @@ data:
15241530
"yHistogram": true
15251531
},
15261532
"yAxis": {
1527-
"unit": "s"
1533+
"unit": "ms"
15281534
}
15291535
},
15301536
"pluginVersion": "9.0.6",
@@ -1538,7 +1544,7 @@ data:
15381544
"range": true
15391545
}
15401546
],
1541-
"title": "RPC server duration (traces)",
1547+
"title": "RPC server duration",
15421548
"type": "heatmap"
15431549
},
15441550
{
@@ -1549,12 +1555,17 @@ data:
15491555
"x": 0,
15501556
"y": 10
15511557
},
1552-
"title": "Batching [otelcol.processor.batch]",
1558+
"title": "Batching of logs, metrics, and traces [otelcol.processor.batch]",
15531559
"type": "row"
15541560
},
15551561
{
15561562
"datasource": "${datasource}",
1557-
"description": "Number of units in the batch\n",
1563+
"description": "Number of spans, metric datapoints, or log lines in a batch\n",
1564+
"fieldConfig": {
1565+
"defaults": {
1566+
"unit": "short"
1567+
}
1568+
},
15581569
"gridPos": {
15591570
"h": 10,
15601571
"w": 8,
@@ -1583,7 +1594,7 @@ data:
15831594
"yHistogram": true
15841595
},
15851596
"yAxis": {
1586-
"unit": "s"
1597+
"unit": "short"
15871598
}
15881599
},
15891600
"pluginVersion": "9.0.6",
@@ -1656,6 +1667,17 @@ data:
16561667
{
16571668
"datasource": "${datasource}",
16581669
"description": "Number of spans successfully sent to destination.\n",
1670+
"fieldConfig": {
1671+
"defaults": {
1672+
"custom": {
1673+
"fillOpacity": 20,
1674+
"gradientMode": "hue",
1675+
"stacking": {
1676+
"mode": "normal"
1677+
}
1678+
}
1679+
}
1680+
},
16591681
"gridPos": {
16601682
"h": 10,
16611683
"w": 8,
@@ -1677,6 +1699,17 @@ data:
16771699
{
16781700
"datasource": "${datasource}",
16791701
"description": "Number of spans in failed attempts to send to destination.\n",
1702+
"fieldConfig": {
1703+
"defaults": {
1704+
"custom": {
1705+
"fillOpacity": 20,
1706+
"gradientMode": "hue",
1707+
"stacking": {
1708+
"mode": "normal"
1709+
}
1710+
}
1711+
}
1712+
},
16801713
"gridPos": {
16811714
"h": 10,
16821715
"w": 8,
@@ -2807,37 +2840,27 @@ spec:
28072840
rules:
28082841
- alert: ClusterNotConverging
28092842
annotations:
2810-
message: Cluster is not converging.
2843+
message: 'Cluster is not converging: nodes report different number of peers
2844+
in the cluster.'
28112845
expr: stddev by (cluster, namespace) (sum without (state) (cluster_node_peers))
28122846
!= 0
2813-
for: 5m
2814-
- alert: ClusterSplitBrain
2847+
for: 10m
2848+
- alert: ClusterNodeCountMismatch
28152849
annotations:
2816-
message: Cluster nodes have entered a split brain state.
2850+
message: Nodes report different number of peers vs. the count of observed
2851+
agent metrics. Some agent metrics may be missing or the cluster is in a
2852+
split brain state.
28172853
expr: |
28182854
sum without (state) (cluster_node_peers) !=
28192855
on (cluster, namespace) group_left
28202856
count by (cluster, namespace) (cluster_node_info)
2821-
for: 5m
2822-
- alert: ClusterLamportClockDrift
2823-
annotations:
2824-
message: Cluster nodes' lamport clocks are not converging.
2825-
expr: stddev by (cluster, namespace) (cluster_node_lamport_time) > 4 * sqrt(count
2826-
by (cluster, namespace) (cluster_node_info))
2827-
for: 5m
2857+
for: 15m
28282858
- alert: ClusterNodeUnhealthy
28292859
annotations:
2830-
message: Cluster node is reporting a health score > 0.
2860+
message: Cluster node is reporting a gossip protocol health score > 0.
28312861
expr: |
28322862
cluster_node_gossip_health_score > 0
2833-
for: 5m
2834-
- alert: ClusterLamportClockStuck
2835-
annotations:
2836-
message: Cluster nodes's lamport clocks is not progressing.
2837-
expr: |
2838-
sum by (cluster, namespace, instance) (rate(cluster_node_lamport_time[2m])) == 0
2839-
and on (cluster, namespace, instance) (cluster_node_peers > 1)
2840-
for: 5m
2863+
for: 10m
28412864
- alert: ClusterNodeNameConflict
28422865
annotations:
28432866
message: A node tried to join the cluster with a name conflicting with an
@@ -2850,7 +2873,7 @@ spec:
28502873
message: Cluster node stuck in Terminating state.
28512874
expr: sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"})
28522875
> 0
2853-
for: 5m
2876+
for: 10m
28542877
- alert: ClusterConfigurationDrift
28552878
annotations:
28562879
message: Cluster nodes are not using the same configuration file.
@@ -2870,6 +2893,6 @@ spec:
28702893
- alert: UnhealthyComponents
28712894
annotations:
28722895
message: Unhealthy Flow components detected.
2873-
expr: sum(agent_component_controller_running_components{health_type!="healthy"})
2896+
expr: sum by (cluster, namespace) (agent_component_controller_running_components{health_type!="healthy"})
28742897
> 0
28752898
for: 15m

0 commit comments

Comments
 (0)