diff --git a/install/kubernetes/all_image.py b/install/kubernetes/all_image.py index 4de98bd5f..02a804bc0 100644 --- a/install/kubernetes/all_image.py +++ b/install/kubernetes/all_image.py @@ -39,7 +39,7 @@ 'quay.io/prometheus/prometheus:v2.3.1', "quay.io/prometheus/prometheus:v2.27.1", 'quay.io/coreos/kube-state-metrics:v1.3.1', - 'quay.io/prometheus/node-exporter:v0.15.2', + 'prom/node-exporter:v1.3.1', 'quay.io/coreos/kube-rbac-proxy:v0.3.1', 'quay.io/coreos/addon-resizer:1.0', "quay.io/prometheus-operator/prometheus-operator:v0.46.0", diff --git a/install/kubernetes/prometheus/grafana/dashboard/all-node.json b/install/kubernetes/prometheus/grafana/dashboard/all-node.json index ac797c40e..74f301f02 100644 --- a/install/kubernetes/prometheus/grafana/dashboard/all-node.json +++ b/install/kubernetes/prometheus/grafana/dashboard/all-node.json @@ -178,7 +178,7 @@ "targets": [ { "exemplar": true, - "expr": "DCGM_FI_DEV_GPU_UTIL{exported_pod!=''}", + "expr": "DCGM_FI_DEV_GPU_UTIL{pod!=''}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -441,7 +441,7 @@ "steppedLine": false, "targets": [ { - "expr": "100 - (avg by (instance) (irate(node_cpu{job=\"node-exporter\", mode=\"idle\"}[5m])) * 100)\n", + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[5m])) * 100)\n", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", @@ -540,7 +540,7 @@ "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal{job=\"node-exporter\"}\n- node_memory_MemFree{job=\"node-exporter\"}\n- node_memory_Buffers{job=\"node-exporter\"}\n- node_memory_Cached{job=\"node-exporter\"}\n", + "expr": "node_memory_MemTotal_bytes{job=\"node-exporter\"}\n- node_memory_MemFree_bytes{job=\"node-exporter\"}\n- node_memory_Buffers_bytes{job=\"node-exporter\"}\n- node_memory_Cached_bytes{job=\"node-exporter\"}\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -639,7 +639,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes{job=\"node-exporter\", device!~\"lo\"}[5m])", + "expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", device!~\"lo\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}:{{device}}", @@ -738,7 +738,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes{job=\"node-exporter\", device!~\"lo\"}[5m])", + "expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!~\"lo\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}:{{device}}", @@ -848,21 +848,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (instance) (rate(node_disk_bytes_read{job=\"node-exporter\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_read_bytes_total{job=\"node-exporter\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} read", "refId": "A" }, { - "expr": "sum by (instance) (rate(node_disk_bytes_written{job=\"node-exporter\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_written_bytes_total{job=\"node-exporter\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} written", "refId": "B" }, { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{job=\"node-exporter\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_io_time_seconds_total{job=\"node-exporter\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} io time", diff --git a/install/kubernetes/prometheus/grafana/dashboard/istio-service.json b/install/kubernetes/prometheus/grafana/dashboard/istio-service.json index 79a3fca86..0790eccb9 100644 --- a/install/kubernetes/prometheus/grafana/dashboard/istio-service.json +++ b/install/kubernetes/prometheus/grafana/dashboard/istio-service.json @@ -72,7 +72,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (destination_workload) (istio_requests_total{destination_service_namespace=\"$namespace\",destination_workload=~\"($service)\"})", + "expr": "sum by (destination_workload) (istio_requests_total{destination_service_namespace=~\"$namespace\",destination_workload=~\"($service)\"})", "format": "time_series", "hide": true, "interval": "", @@ -82,7 +82,7 @@ }, { "exemplar": true, - "expr": "sum by (destination_workload,response_code) (irate(istio_requests_total{destination_service_namespace=\"$namespace\",destination_workload=~\"($service)\"}[1m]))", + "expr": "sum by (destination_workload,response_code) (irate(istio_requests_total{destination_service_namespace=~\"$namespace\",destination_workload=~\"($service)\"}[1m]))", "hide": false, "interval": "", "legendFormat": "{{response_code}} , {{destination_workload}}", @@ -183,7 +183,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (destination_workload) (irate(istio_request_bytes_sum{destination_service_namespace=\"$namespace\",destination_workload=~\"($service)\"}[1m]))", + "expr": "sum by (destination_workload) (irate(istio_request_bytes_sum{destination_service_namespace=~\"$namespace\",destination_workload=~\"($service)\"}[1m]))", "format": "time_series", "hide": false, "interval": "", @@ -284,7 +284,7 @@ "targets": [ { "exemplar": true, - "expr": "avg by (destination_workload) (rate(istio_request_duration_milliseconds_sum{destination_service_namespace=\"$namespace\",destination_workload=~\"($service)\"}[1m])/rate(istio_request_duration_milliseconds_count{destination_service_namespace=\"$namespace\",destination_workload=~\"($service)\"}[1m]))", + "expr": "avg by (destination_workload) ((rate(istio_request_duration_milliseconds_sum{destination_service_namespace=~\"$namespace\",destination_workload=~\"($service)\"}[1m])/rate(istio_request_duration_milliseconds_count{destination_service_namespace=~\"$namespace\",destination_workload=~\"($service)\"}[1m]))>0)", "format": "time_series", "hide": false, "interval": "", @@ -606,7 +606,7 @@ "targets": [ { "exemplar": true, - "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{exported_pod=~\".*$service.*\"}", + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{pod=~\".*$service.*\"}", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -808,7 +808,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (destination_workload) (istio_requests_total{destination_service_namespace=\"$namespace\"})", + "expr": "sum by (destination_workload) (istio_requests_total{destination_service_namespace=~\"$namespace\"})", "format": "time_series", "hide": false, "interval": "", @@ -904,7 +904,7 @@ "value": "$__all" }, "datasource": "prometheus", - "definition": "label_values(istio_requests_total{destination_service_namespace=\"$namespace\"}, destination_service_name)", + "definition": "label_values(istio_requests_total{destination_service_namespace=~\"$namespace\"}, destination_service_name)", "description": null, "error": null, "hide": 0, @@ -914,7 +914,7 @@ "name": "service", "options": [], "query": { - "query": "label_values(istio_requests_total{destination_service_namespace=\"$namespace\"}, destination_service_name)", + "query": "label_values(istio_requests_total{destination_service_namespace=~\"$namespace\"}, destination_service_name)", "refId": "StandardVariableQuery" }, "refresh": 1, diff --git a/install/kubernetes/prometheus/grafana/dashboard/pod-info.json b/install/kubernetes/prometheus/grafana/dashboard/pod-info.json index 2912c28fe..ee1978920 100644 --- a/install/kubernetes/prometheus/grafana/dashboard/pod-info.json +++ b/install/kubernetes/prometheus/grafana/dashboard/pod-info.json @@ -105,7 +105,7 @@ }, { "exemplar": true, - "expr": "sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}[2m]))", + "expr": "sum by (node,namespace,pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod!=\"\",namespace=~\"pipeline|jupyter|service|katib\"}[3m]))", "format": "table", "hide": false, "instant": true, @@ -406,7 +406,7 @@ "targets": [ { "exemplar": true, - "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod=~\".*$pod.*\"}[2m]))", + "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod=~\".*$pod.*\"}[3m]))", "format": "time_series", "instant": false, "interval": "30s", diff --git a/install/kubernetes/prometheus/node-exporter/node-exporter-ds.yml b/install/kubernetes/prometheus/node-exporter/node-exporter-ds.yml index 4702af73a..1d7f1a054 100755 --- a/install/kubernetes/prometheus/node-exporter/node-exporter-ds.yml +++ b/install/kubernetes/prometheus/node-exporter/node-exporter-ds.yml @@ -27,7 +27,7 @@ spec: key: node-role.kubernetes.io/master containers: - name: node-exporter - image: quay.io/prometheus/node-exporter:v0.15.2 # quay.io/prometheus/node-exporter:v0.15.2 + image: prom/node-exporter:v1.3.1 # 原版本为 quay.io/prometheus/node-exporter:v0.15.2 ,若出现grafana报表不显示的情况,请退回旧版 args: - --web.listen-address=127.0.0.1:9101 - --path.procfs=/host/proc