diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/README-Manual.md b/Monitoring/monitor_fsxn_with_harvest_on_ec2/README-Manual.md index 9cd0124a..14ba0d31 100644 --- a/Monitoring/monitor_fsxn_with_harvest_on_ec2/README-Manual.md +++ b/Monitoring/monitor_fsxn_with_harvest_on_ec2/README-Manual.md @@ -244,7 +244,7 @@ services: #### 5.5. Download FSxN dashboards and import into Grafana container: The following commands will download the FSxN designed dashboards from this repo and replace the default Grafana dashboards with them: ```yaml -wget https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor_fsxn_with_grafana/fsx_dashboards.zip +wget https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip unzip fsx_dashboards.zip rm -rf grafana/dashboards mv dashboards grafana/dashboards diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_CW_Utilization.json b/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_CW_Utilization.json index d6cc1f21..bc19dda0 100644 --- a/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_CW_Utilization.json +++ b/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_CW_Utilization.json @@ -25,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 35, + "iteration": 1740487115678, "links": [ { "asDropdown": true, @@ -45,10 +45,6 @@ "liveNow": false, "panels": [ { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, "description": "Average latency in microseconds for the WAFL filesystem to process all the operations on the volume; not including request processing or network communication time.", "fieldConfig": { "defaults": { @@ -56,8 +52,6 @@ "mode": "palette-classic" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -131,7 +125,7 @@ }, "editorMode": "code", 
"exemplar": false, - "expr": "avg(aws_fsx_disk_read_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId)", + "expr": "avg(aws_fsx_disk_read_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId) /60", "interval": "", "legendFormat": "{{dimension_FileSystemId}}_read_operations", "range": true, @@ -143,9 +137,11 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "avg(aws_fsx_disk_write_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId)", + "exemplar": true, + "expr": "avg(aws_fsx_disk_write_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId) /60", "hide": false, "instant": false, + "interval": "", "legendFormat": "{{dimension_FileSystemId}}_write_operations", "range": true, "refId": "B" @@ -156,9 +152,11 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(aws_fsx_disk_write_operations_average{dimension_FileSystemId=~\"$Cluster\"} + aws_fsx_disk_read_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId)", + "exemplar": true, + "expr": "sum(aws_fsx_disk_write_operations_average{dimension_FileSystemId=~\"$Cluster\"} + aws_fsx_disk_read_operations_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId) /60", "hide": false, "instant": false, + "interval": "", "legendFormat": "{{dimension_FileSystemId}}_sum_operations", "range": true, "refId": "C" @@ -169,10 +167,6 @@ "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, "description": "Average latency in microseconds for the WAFL filesystem to process all the operations on the volume; not including request processing or network communication time.", "fieldConfig": { "defaults": { @@ -180,8 +174,6 @@ "mode": "palette-classic" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -267,9 +259,11 @@ "uid": 
"prometheus" }, "editorMode": "code", + "exemplar": true, "expr": "avg(aws_fsx_disk_read_bytes_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId) / 1048576", "hide": false, "instant": false, + "interval": "", "legendFormat": "{{dimension_FileSystemId}}_read_megabytes", "range": true, "refId": "B" @@ -280,9 +274,11 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(aws_fsx_disk_read_bytes_average{dimension_FileSystemId=~\"$Cluster\"} + aws_fsx_disk_write_bytes_average{dimension_FileSystemId=~\"$Cluster\"} ) by (dimension_FileSystemId)", + "exemplar": true, + "expr": "sum(aws_fsx_disk_read_bytes_average{dimension_FileSystemId=~\"$Cluster\"} + aws_fsx_disk_write_bytes_average{dimension_FileSystemId=~\"$Cluster\"} ) by (dimension_FileSystemId) / 1048576", "hide": false, "instant": false, + "interval": "", "legendFormat": "{{dimension_FileSystemId}}_sum_megabytes", "range": true, "refId": "C" @@ -293,10 +289,6 @@ "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, "description": "Average latency in microseconds for the WAFL filesystem to process all the operations on the volume; not including request processing or network communication time.", "fieldConfig": { "defaults": { @@ -304,8 +296,6 @@ "mode": "palette-classic" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, @@ -334,6 +324,7 @@ } }, "mappings": [], + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -353,7 +344,7 @@ "options": { "mode": "exclude", "names": [ - "fs-038c109c5e023e533_network_throughput_utilization" + "fs-09261fd9478d5a3e5_network_throughput_utilization" ], "prefix": "All except:", "readOnly": true @@ -429,9 +420,11 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(aws_fsx_file_server_disk_throughput_utilization_average{dimension_FileSystemId=~\"$Cluster\"} + 
aws_fsx_disk_write_bytes_average{dimension_FileSystemId=~\"$Cluster\"} ) by (dimension_FileSystemId)", + "exemplar": true, + "expr": "avg(aws_fsx_file_server_disk_throughput_utilization_average{dimension_FileSystemId=~\"$Cluster\"}) by (dimension_FileSystemId)", "hide": false, "instant": false, + "interval": "", "legendFormat": "{{dimension_FileSystemId}}_disk_throughput_utilization", "range": true, "refId": "C" @@ -443,20 +436,20 @@ "type": "timeseries" } ], - "refresh": false, - "schemaVersion": 38, + "refresh": "", + "schemaVersion": 34, "style": "dark", "tags": [], "templating": { "list": [ { "current": { - "selected": false, + "selected": true, "text": [ - "" + "fs-09261fd9478d5a3e5" ], "value": [ - "" + "fs-09261fd9478d5a3e5" ] }, "datasource": { @@ -504,6 +497,6 @@ "timezone": "", "title": "FSxN: Utilization", "uid": "c467bd93-f82b-4c34-8bc0-3a8b71e39898", - "version": 3, + "version": 2, "weekStart": "" } diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_Data_protection.json b/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_Data_protection.json index 3c33fe0f..4a2bc387 100644 --- a/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_Data_protection.json +++ b/Monitoring/monitor_fsxn_with_harvest_on_ec2/dashboards/FSxN_Data_protection.json @@ -140,10 +140,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{})) OR on() vector(0)", + "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{})) OR on() vector(0)", "instant": true, 
"interval": "", "legendFormat": "Volumes protected", @@ -152,10 +152,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy=~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}))) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy=~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}))) OR on() vector(0)", "hide": false, "instant": true, "interval": "", @@ -222,10 +222,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{})) OR on() vector(0)", + "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{})) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -241,7 +241,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Volumes Breaching Snapshot Copy Reserve Space.", "fieldConfig": { @@ -326,10 +326,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) 
(snapshot_policy_total_schedules{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)", + "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)", "instant": true, "interval": "", "legendFormat": "Volumes breached", @@ -338,10 +338,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "(count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{})) * on (volume, svm, cluster) group_right() (volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} <= volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)) + (count(volume_labels{datacenter=~\"$Datacenter\", 
cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} == 0)) OR on() vector(0))", + "expr": "(count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{})) * on (volume, svm, cluster) group_right() (volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} <= volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)) + (count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} == 0)) OR on() vector(0))", "instant": true, "interval": "", "legendFormat": "Volumes not breached", @@ -354,7 +354,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot size used is breached the snapshot reserve size.", "fieldConfig": { @@ -407,10 +407,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": 
"count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)", + "expr": "count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -426,7 +426,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes that are not protected.", "fieldConfig": { @@ -490,10 +490,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy=~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}))) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\", 
cluster=~\"$Cluster\",snapshot_policy=~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}))) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -509,7 +509,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot size used is not breached the snapshot reserve size.", "fieldConfig": { @@ -573,10 +573,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "(count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{})) * on (volume, svm, cluster) group_right() (volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} <= volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)) + (count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} == 0)) OR on() vector(0))", + "expr": "(count((volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{})) * on (volume, svm, cluster) group_right() 
(volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} > 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} <= volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"})) OR on() vector(0)) + (count(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} == 0)) OR on() vector(0))", "format": "time_series", "hide": false, "instant": true, @@ -592,7 +592,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "This panel displays volumes detail with protected status and snapshot policy.", "fieldConfig": { @@ -732,10 +732,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "label_replace(label_replace(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", volume!~\"MDV.*\"}, \"Status\", \"Protected\", \"snapshot_policy\", \"(.*)\") , \"Status\", \"Unprotected\", \"snapshot_policy\", \"(none.*)\") * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{})", + "expr": "label_replace(label_replace(volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", volume!~\"MDV.*\"}, \"Status\", \"Protected\", \"snapshot_policy\", \"(.*)\") , \"Status\", \"Unprotected\", \"snapshot_policy\", \"(none.*)\") * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{})", "format": "table", 
"instant": true, "interval": "", @@ -775,7 +775,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Volumes details of snapshot copy reserve space.", "fieldConfig": { @@ -965,10 +965,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0)", + "expr": "volume_labels{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\", snapshot_policy!=\"\", snapshot_policy!~\"none.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{}) * on (volume, svm, cluster) group_right() ( volume_snapshot_reserve_size{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0 and volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"} >= 0)", "format": "table", "hide": false, "instant": true, @@ -980,7 +980,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, "expr": "volume_snapshots_size_used{datacenter=~\"$Datacenter\", cluster=~\"$Cluster\"}", @@ -1083,7 +1083,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot count is < 10.", "fieldConfig": { @@ -1136,10 +1136,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", 
volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 0 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} < 10)) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 0 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} < 10)) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -1155,7 +1155,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot count is between 10 to 100.", "fieldConfig": { @@ -1208,10 +1208,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} >= 10 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} <= 100)) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by 
(snapshot_policy) (snapshot_policy_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} >= 10 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} <= 100)) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -1227,7 +1227,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot count is between 101 to 500.", "fieldConfig": { @@ -1280,10 +1280,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 100 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} <= 500)) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 100 and volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} <= 500)) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -1299,7 +1299,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "Total number of volumes whose snapshot count is > 500.", "fieldConfig": { @@ 
-1352,10 +1352,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 500)) OR on() vector(0)", + "expr": "count((volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 500)) OR on() vector(0)", "format": "time_series", "hide": false, "instant": true, @@ -1371,7 +1371,7 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "description": "This panel displays volumes detail with snapshot count.", "fieldConfig": { @@ -1586,10 +1586,10 @@ { "datasource": { "type": "prometheus", - "uid": "ee9vj35on6mtcb" + "uid": "prometheus" }, "exemplar": false, - "expr": "(volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) (snapshot_policy_total_schedules{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 0)", + "expr": "(volume_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",snapshot_policy!=\"\", snapshot_policy!~\"none.*\", volume!~\"MDV.*\"} * on (snapshot_policy) group_left () group by (snapshot_policy) 
(snapshot_policy_labels{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})) * on (volume,svm,cluster) (volume_snapshot_count{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"} > 0)", "format": "table", "instant": true, "interval": "", @@ -1658,7 +1658,7 @@ { "current": { "text": "prometheus", - "value": "ee9vj35on6mtcb" + "value": "prometheus" }, "hide": 2, "includeAll": false, diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip b/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip index a9df99f6..4dedadcf 100644 Binary files a/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip and b/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip differ diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/harvest-grafana-cf-template.yaml b/Monitoring/monitor_fsxn_with_harvest_on_ec2/harvest-grafana-cf-template.yaml index 296b7ad0..dbe2e176 100644 --- a/Monitoring/monitor_fsxn_with_harvest_on_ec2/harvest-grafana-cf-template.yaml +++ b/Monitoring/monitor_fsxn_with_harvest_on_ec2/harvest-grafana-cf-template.yaml @@ -160,8 +160,8 @@ Resources: sed -i 's|ghcr.io/netapp/harvest:latest|ghcr.io/tlvdevops/harvest-fsx:latest|g' harvest-compose.yml # Download grafana dashbaords - wget https://raw.githubusercontent.com/TLVDevOps/harvest-fsx/main/dashboards.zip - unzip dashboards.zip + wget https://raw.githubusercontent.com/NetApp/FSx-ONTAP-samples-scripts/main/Monitoring/monitor_fsxn_with_harvest_on_ec2/fsx_dashboards.zip + unzip fsx_dashboards.zip rm -rf grafana/dashboards && mv dashboards grafana/dashboards # Create yace-config.yaml @@ -189,8 +189,6 @@ Resources: statistics: [Average] - name: FileServerDiskThroughputUtilization statistics: [Average] - - name: CPUUtilization - statistics: [Average] EOF # Append YACE service to Docker Compose file diff --git a/Monitoring/monitor_fsxn_with_harvest_on_ec2/yace-config.yaml b/Monitoring/monitor_fsxn_with_harvest_on_ec2/yace-config.yaml index 4be472d0..f2023332 100644 --- 
a/Monitoring/monitor_fsxn_with_harvest_on_ec2/yace-config.yaml +++ b/Monitoring/monitor_fsxn_with_harvest_on_ec2/yace-config.yaml @@ -21,5 +21,3 @@ discovery: statistics: [Average] - name: FileServerDiskThroughputUtilization statistics: [Average] - - name: CPUUtilization - statistics: [Average]