File tree Expand file tree Collapse file tree 1 file changed +11
-9
lines changed
4.validation_and_observability/4.prometheus-grafana/1click-dashboards-deployment Expand file tree Collapse file tree 1 file changed +11
-9
lines changed Original file line number Diff line number Diff line change @@ -169,17 +169,17 @@ Resources:
169169 Value : !Sub
170170 - |
171171 global:
172- scrape_interval: 15s
173- evaluation_interval: 15s
174- scrape_timeout: 15s
172+ scrape_interval: 1m
173+ evaluation_interval: 2m
174+ scrape_timeout: 1m  # must not exceed scrape_interval, or Prometheus refuses to load the config
175175
176176 scrape_configs:
177177 - job_name: slurm_exporter
178- scrape_interval: 5s
178+ scrape_interval: 60s
179179 ec2_sd_configs:
180180 - port: 8080
181181 region: ${AWS::Region}
182- refresh_interval: 10s
182+ refresh_interval: 300s
183183 filters:
184184 - name: instance-state-name
185185 values:
@@ -192,11 +192,11 @@ Resources:
192192 - ${PARALLELCLUSTER_NAME}
193193
194194 - job_name: efa_node_exporter
195- scrape_interval: 5s
195+ scrape_interval: 60s
196196 ec2_sd_configs:
197197 - port: 9100
198198 region: ${AWS::Region}
199- refresh_interval: 10s
199+ refresh_interval: 300s
200200 filters:
201201 - name: instance-state-name
202202 values:
@@ -214,13 +214,14 @@ Resources:
214214 - p5.48xlarge
215215 - p5e.48xlarge
216216 - p5en.48xlarge
217+ - p6-b200.48xlarge
217218
218219 - job_name: dcgm_exporter
219- scrape_interval: 5s
220+ scrape_interval: 60s
220221 ec2_sd_configs:
221222 - port: 9400
222223 region: ${AWS::Region}
223- refresh_interval: 10s
224+ refresh_interval: 300s
224225 filters:
225226 - name: instance-state-name
226227 values:
@@ -238,6 +239,7 @@ Resources:
238239 - p5.48xlarge
239240 - p5e.48xlarge
240241 - p5en.48xlarge
242+ - p6-b200.48xlarge
241243
242244 relabel_configs:
243245 - source_labels: [__meta_ec2_tag_Name]
You can’t perform that action at this time.
0 commit comments