|
local g = import 'grafana-builder/grafana.libsonnet';
local template = import 'grafonnet/template.libsonnet';

// Envoy overview dashboard.
//
// Fields contributed by this object:
//   - grafanaDashboards (hidden, merged with `+::`): adds 'envoy-overview.json'.
//   - envoyQpsPanel(selector) (hidden): panel fragment plotting request rate by
//     response status class.
{
  // Label matchers shared by every query on the dashboard: restrict series to
  // the job and instance(s) currently selected in the dashboard variables.
  local baseSelector = 'job=~"$job",instance=~"$instance"',

  // Manually define the "instance" variable template in order to be able to
  // change the "refresh" setting and customise the "All" value.
  // The job matcher uses `=~`, the same operator as every other query here.
  local instanceTemplate =
    template.new(
      name='instance',
      datasource='$datasource',
      query='label_values(envoy_server_uptime{job=~"$job"}, instance)',
      allValues='.*',  // Make sure to always include all instances when "All" is selected.
      current='',
      hide='',
      refresh=2,  // Refresh on time range change.
      includeAll=true,
      sort=1
    ),

  // Builds one repeated detail row (QPS, latency, timeouts, active requests and
  // connections) for either the upstream or the downstream side.
  //
  //   title        - row title (may reference the repeat variable).
  //   metricPrefix - metric name prefix; the suffixes `_rq_xx`, `_rq_time`,
  //                  `_rq_timeout`, `_rq_active` and `_cx_active` are appended.
  //                  Used with 'envoy_cluster_upstream' and 'envoy_http_downstream'.
  //   selector     - label matchers (without braces) applied to every query.
  //   repeatVar    - name of the dashboard variable the row is repeated for.
  local detailRow(title, metricPrefix, selector, repeatVar) =
    g.row(title)
    .addPanel(
      g.panel('QPS') +
      $.envoyQpsPanel('%s_rq_xx{%s}' % [metricPrefix, selector])
    )
    .addPanel(
      g.panel('Latency') +
      // This metric is in ms, so we apply a multiplier=1
      g.latencyPanel(metricPrefix + '_rq_time', '{%s}' % selector, '1')
    )
    .addPanel(
      g.panel('Timeouts / sec') +
      g.queryPanel('sum(rate(%s_rq_timeout{%s}[$__interval]))' % [metricPrefix, selector], 'Timeouts') +
      { yaxes: g.yaxes('rps') }
    )
    .addPanel(
      g.panel('Active') +
      g.queryPanel('sum(%s_rq_active{%s})' % [metricPrefix, selector], 'Requests') +
      g.queryPanel('sum(%s_cx_active{%s})' % [metricPrefix, selector], 'Connections')
    ) +
    { repeat: repeatVar },

  // Envoy metrics:
  // - HTTP: https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/router_filter#statistics
  grafanaDashboards+:: {
    'envoy-overview.json':
      g.dashboard('Envoy Overview')
      .addTemplate('job', 'envoy_server_uptime', 'job')

      // Hidden variables to be able to repeat panels for each upstream/downstream.
      .addMultiTemplate('envoy_cluster', 'envoy_cluster_version{%s,envoy_cluster_name!="envoy-admin"}' % baseSelector, 'envoy_cluster_name', 2)
      .addMultiTemplate('envoy_listener_filter', 'envoy_http_downstream_rq_total{%s,envoy_http_conn_manager_prefix!~"admin|metrics"}' % baseSelector, 'envoy_http_conn_manager_prefix', 2)

      // Aggregate traffic across all listeners and clusters.
      .addRow(
        g.row('Traffic')
        .addPanel(
          g.panel('Connections / sec') +
          g.queryPanel('sum(rate(envoy_listener_downstream_cx_total{%s}[$__interval]))' % baseSelector, 'Downstream / Ingress') +
          g.queryPanel('sum(rate(envoy_cluster_upstream_cx_total{%s}[$__interval]))' % baseSelector, 'Upstream / Egress') +
          { yaxes: g.yaxes('cps') }
        )
        .addPanel(
          g.panel('QPS') +
          g.queryPanel('sum(rate(envoy_http_downstream_rq_total{%s}[$__interval]))' % baseSelector, 'Downstream / Ingress') +
          g.queryPanel('sum(rate(envoy_cluster_upstream_rq_total{%s}[$__interval]))' % baseSelector, 'Upstream / Egress') +
          { yaxes: g.yaxes('rps') }
        )
      )

      // Repeat this row for each Envoy upstream cluster.
      .addRow(
        detailRow(
          'Upstream / Egress: $envoy_cluster',
          'envoy_cluster_upstream',
          'envoy_cluster_name="$envoy_cluster",' + baseSelector,
          'envoy_cluster',
        )
      )

      // Repeat this row for each Envoy downstream filter.
      .addRow(
        detailRow(
          'Downstream / Ingress: $envoy_listener_filter',
          'envoy_http_downstream',
          'envoy_http_conn_manager_prefix="$envoy_listener_filter",' + baseSelector,
          'envoy_listener_filter',
        )
      ) + {
        // Append the hand-built "instance" variable to the dashboard's variables.
        templating+: {
          list+: [instanceTemplate],
        },
      },
  },

  // This is a custom function used to display QPS by response status class captured
  // through the Envoy label "envoy_response_code_class".
  //
  //   selector - full metric selector (metric name plus `{...}` matchers) whose
  //              rate is plotted.
  //
  // Returns a panel fragment (to be added to a `g.panel(...)`) with one target
  // that sums the rate by a synthetic `status` label. label_replace copies the
  // whole value of `envoy_response_code_class` into `status` and appends "xx",
  // which is what the alias colour keys below are matched against.
  envoyQpsPanel(selector):: {
    aliasColors: {
      '1xx': '#EAB839',
      '2xx': '#7EB26D',
      '3xx': '#6ED0E0',
      '4xx': '#EF843C',
      '5xx': '#E24D42',
    },
    targets: [
      {
        expr: 'sum by (status) (label_replace(rate(' + selector + '[$__interval]), "status", "${1}xx", "envoy_response_code_class", "(.*)"))',
        format: 'time_series',
        intervalFactor: 2,
        legendFormat: '{{status}}',
        refId: 'A',
        step: 10,
      },
    ],
  } + g.stack,
}
0 commit comments