diff --git a/dashboards/resources/cluster.libsonnet b/dashboards/resources/cluster.libsonnet index 71d647d9d..0140c0d03 100644 --- a/dashboards/resources/cluster.libsonnet +++ b/dashboards/resources/cluster.libsonnet @@ -1,10 +1,11 @@ +local defaultQueries = import './queries/cluster.libsonnet'; +local defaultVariables = import './variables/cluster.libsonnet'; local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; local prometheus = g.query.prometheus; local stat = g.panel.stat; local table = g.panel.table; local timeSeries = g.panel.timeSeries; -local var = g.dashboard.variable; { local statPanel(title, unit, query) = @@ -35,36 +36,15 @@ local var = g.dashboard.variable; grafanaDashboards+:: { 'k8s-resources-cluster.json': - local variables = { - datasource: - var.datasource.new('datasource', 'prometheus') - + var.datasource.withRegex($._config.datasourceFilterRegex) - + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() - + var.datasource.generalOptions.withLabel('Data source') - + { - current: { - selected: true, - text: $._config.datasourceName, - value: $._config.datasourceName, - }, - }, + // Allow overriding queries via $._queries.cluster (visible or hidden `::` field; objectHasAll sees both), otherwise use default + local queries = if std.objectHasAll($, '_queries') && std.objectHasAll($._queries, 'cluster') + then $._queries.cluster + else defaultQueries; - cluster: - var.query.new('cluster') - + var.query.withDatasourceFromVariable(self.datasource) - + var.query.queryTypes.withLabelValues( - $._config.clusterLabel, - 'up{%(cadvisorSelector)s}' % $._config, - ) - + var.query.generalOptions.withLabel('cluster') - + var.query.refresh.onTime() - + ( - if $._config.showMultiCluster - then var.query.generalOptions.showOnDashboard.withLabelAndValue() - else var.query.generalOptions.showOnDashboard.withNothing() - ) - + var.query.withSort(type='alphabetical'), - }; + // Allow overriding variables via $._variables.cluster(config) (visible or hidden `::` field; objectHasAll sees both), otherwise use default + local variables = if
std.objectHasAll($, '_variables') && std.objectHasAll($._variables, 'cluster') + then $._variables.cluster($._config) + else defaultVariables.cluster($._config); local links = { namespace: { @@ -80,7 +60,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Utilisation', 'percentunit', - 'cluster:node_cpu:ratio_rate5m{%(clusterLabel)s="$cluster"}' % $._config + queries.cpuUtilisation($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -88,7 +68,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Requests Commitment', 'percentunit', - 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config + queries.cpuRequestsCommitment($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -96,7 +76,7 @@ local var = g.dashboard.variable; statPanel( 'CPU Limits Commitment', 'percentunit', - 'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config + queries.cpuLimitsCommitment($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -104,7 +84,7 @@ local var = g.dashboard.variable; statPanel( 'Memory Utilisation', 'percentunit', - '1 - sum(:node_memory_MemAvailable_bytes:sum{%(clusterLabel)s="$cluster"}) / sum(node_memory_MemTotal_bytes{%(nodeExporterSelector)s,%(clusterLabel)s="$cluster"})' % $._config + queries.memoryUtilisation($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -112,7 +92,7 @@ local var = g.dashboard.variable; statPanel( 'Memory Requests Commitment', 'percentunit', - 'sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config +
queries.memoryRequestsCommitment($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -120,7 +100,7 @@ local var = g.dashboard.variable; statPanel( 'Memory Limits Commitment', 'percentunit', - 'sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config + queries.memoryLimitsCommitment($._config) ) + stat.gridPos.withW(4) + stat.gridPos.withH(3), @@ -129,38 +109,38 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace)' % $._config + queries.cpuUsageByNamespace($._config) ) + prometheus.withLegendFormat('__auto'), ]), table.new('CPU Quota') + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(kube_pod_owner{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.podsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'count(avg(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.workloadsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.cpuUsageByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 
'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.cpuRequestsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.cpuUsageVsRequests($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.cpuLimitsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.cpuUsageVsLimits($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) @@ -246,38 +226,38 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace)' % $._config + queries.memoryUsageByNamespace($._config) ) + prometheus.withLegendFormat('__auto'), ]), table.new('Memory Requests by Namespace') + table.queryOptions.withTargets([ - 
prometheus.new('${datasource}', 'sum(kube_pod_owner{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.podsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'count(avg(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.workloadsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.memoryUsageByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.memoryRequestsByNamespace($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.memoryUsageVsRequests($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.memoryLimitsByNamespace($._config)) + prometheus.withInstant(true) + 
prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.memoryUsageVsLimits($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) @@ -396,27 +376,27 @@ local var = g.dashboard.variable; table.new('Current Network Usage') + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum(rate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkReceiveBandwidth($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(rate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkTransmitBandwidth($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkReceivePackets($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkTransmitPackets($._config)) + 
prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(rate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkReceivePacketsDropped($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum(rate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config) + prometheus.new('${datasource}', queries.networkTransmitPacketsDropped($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) @@ -510,7 +490,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(rate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.networkReceiveBandwidth($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -520,7 +500,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(rate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.networkTransmitBandwidth($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -530,7 +510,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'avg(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.avgContainerReceiveBandwidth($._config) ) + 
prometheus.withLegendFormat('__auto'), ]), @@ -540,7 +520,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'avg(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.avgContainerTransmitBandwidth($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -550,7 +530,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.rateOfReceivedPackets($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -560,7 +540,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.rateOfTransmittedPackets($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -570,7 +550,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + queries.rateOfReceivedPacketsDropped($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -580,7 +560,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % $._config + 
queries.rateOfTransmittedPacketsDropped($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -590,7 +570,7 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'ceil(sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(diskDeviceSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s])))' % $._config + queries.iopsReadsWrites($._config) ) + prometheus.withLegendFormat('__auto'), ]), @@ -600,34 +580,34 @@ local var = g.dashboard.variable; + tsPanel.queryOptions.withTargets([ prometheus.new( '${datasource}', - 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(diskDeviceSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config + queries.throughputReadWrite($._config) ) + prometheus.withLegendFormat('__auto'), ]), table.new('Current Storage IO') + table.queryOptions.withTargets([ - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.iopsReads($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_writes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.iopsWrites($._config)) + 
prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.iopsReadsWritesCombined($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.throughputRead($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.throughputWrite($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), - prometheus.new('${datasource}', 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % $._config) + prometheus.new('${datasource}', queries.throughputReadWriteCombined($._config)) + prometheus.withInstant(true) + prometheus.withFormat('table'), ]) diff --git 
a/dashboards/resources/queries/cluster.libsonnet b/dashboards/resources/queries/cluster.libsonnet new file mode 100644 index 000000000..e60d2c99d --- /dev/null +++ b/dashboards/resources/queries/cluster.libsonnet @@ -0,0 +1,121 @@ +{ + // CPU Queries + cpuUtilisation(config):: + 'cluster:node_cpu:ratio_rate5m{%(clusterLabel)s="$cluster"}' % config, + + cpuRequestsCommitment(config):: + 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % config, + + cpuLimitsCommitment(config):: + 'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % config, + + cpuUsageByNamespace(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace)' % config, + + // CPU Quota Table Queries + podsByNamespace(config):: + 'sum(kube_pod_owner{%(kubeStateMetricsSelector)s, %(clusterLabel)s="$cluster"}) by (namespace)' % config, + + workloadsByNamespace(config):: + 'count(avg(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % config, + + cpuRequestsByNamespace(config):: + 'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + cpuUsageVsRequests(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + cpuLimitsByNamespace(config):: + 
'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + cpuUsageVsLimits(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m{%(clusterLabel)s="$cluster"})) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + // Memory Queries + memoryUtilisation(config):: + '1 - sum(:node_memory_MemAvailable_bytes:sum{%(clusterLabel)s="$cluster"}) / sum(node_memory_MemTotal_bytes{%(nodeExporterSelector)s,%(clusterLabel)s="$cluster"})' % config, + + memoryRequestsCommitment(config):: + 'sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % config, + + memoryLimitsCommitment(config):: + 'sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % config, + + memoryUsageByNamespace(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace)' % config, + + // Memory Quota Table Queries + memoryRequestsByNamespace(config):: + 'sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + memoryUsageVsRequests(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + memoryLimitsByNamespace(config):: + 
'sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + memoryUsageVsLimits(config):: + 'sum(max by (%(clusterLabel)s, %(namespaceLabel)s, pod, container)(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""})) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % config, + + // Network Queries + networkReceiveBandwidth(config):: + 'sum(rate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + networkTransmitBandwidth(config):: + 'sum(rate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + networkReceivePackets(config):: + 'sum(rate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + networkTransmitPackets(config):: + 'sum(rate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + networkReceivePacketsDropped(config):: + 'sum(rate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + networkTransmitPacketsDropped(config):: + 'sum(rate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + avgContainerReceiveBandwidth(config):: + 'avg(irate(container_network_receive_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) 
by (namespace)' % config, + + avgContainerTransmitBandwidth(config):: + 'avg(irate(container_network_transmit_bytes_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + rateOfReceivedPackets(config):: + 'sum(irate(container_network_receive_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + rateOfTransmittedPackets(config):: + 'sum(irate(container_network_transmit_packets_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + rateOfReceivedPacketsDropped(config):: + 'sum(irate(container_network_receive_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + rateOfTransmittedPacketsDropped(config):: + 'sum(irate(container_network_transmit_packets_dropped_total{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[%(grafanaIntervalVar)s])) by (namespace)' % config, + + // Storage Queries + iopsReadsWrites(config):: + 'ceil(sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(diskDeviceSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s])))' % config, + + throughputReadWrite(config):: + 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(diskDeviceSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + iopsReads(config):: + 
'sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + iopsWrites(config):: + 'sum by(namespace) (rate(container_fs_writes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + iopsReadsWritesCombined(config):: + 'sum by(namespace) (rate(container_fs_reads_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + throughputRead(config):: + 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + throughputWrite(config):: + 'sum by(namespace) (rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, + + throughputReadWriteCombined(config):: + 'sum by(namespace) (rate(container_fs_reads_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]) + rate(container_fs_writes_bytes_total{%(cadvisorSelector)s, %(diskDeviceSelector)s, %(containerfsSelector)s, %(clusterLabel)s="$cluster", namespace!=""}[%(grafanaIntervalVar)s]))' % config, +} diff --git a/dashboards/resources/variables/cluster.libsonnet b/dashboards/resources/variables/cluster.libsonnet new file mode 100644 index 000000000..c670ea07d --- /dev/null +++ b/dashboards/resources/variables/cluster.libsonnet @@ -0,0 
+1,37 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local var = g.dashboard.variable; + +{ + // Cluster dashboard variables + // Returns both datasource and cluster variables + cluster(config):: { + datasource: + var.datasource.new('datasource', 'prometheus') + + var.datasource.withRegex(config.datasourceFilterRegex) + + var.datasource.generalOptions.showOnDashboard.withLabelAndValue() + + var.datasource.generalOptions.withLabel('Data source') + + { + current: { + selected: true, + text: config.datasourceName, + value: config.datasourceName, + }, + }, + + cluster: + var.query.new('cluster') + + var.query.withDatasourceFromVariable(self.datasource) + + var.query.queryTypes.withLabelValues( + config.clusterLabel, + 'up{%(cadvisorSelector)s}' % config, + ) + + var.query.generalOptions.withLabel('cluster') + + var.query.refresh.onTime() + + ( + if config.showMultiCluster + then var.query.generalOptions.showOnDashboard.withLabelAndValue() + else var.query.generalOptions.showOnDashboard.withNothing() + ) + + var.query.withSort(type='alphabetical'), + }, +}