@@ -23,7 +23,7 @@ local servingOverviewRow = {
   type: 'row',
 };
 
-local modelRequestRatePanel = {
+local modelRequestRatePanel(matcher) = {
   datasource: promDatasource,
   description: 'Rate of requests over time for the selected model. Grouped by statuses.',
   fieldConfig: {
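The change repeated across the hunks below: each panel definition turns from a plain object into a function taking a `matcher` string, so the PromQL label selectors are injected by the caller instead of being hard-coded as `job=~"$job",instance=~"$instance"`. A minimal sketch of the pattern in isolation (the panel name and structure here are placeholders, not part of the patch):

```jsonnet
// Before: the selector was baked into every panel object.
// After: the panel is a function and the caller supplies the matcher.
// `examplePanel` is a placeholder, not a definition from this patch.
local examplePanel(matcher) = {
  type: 'timeseries',
  targets: [
    { expr: 'rate(:tensorflow:serving:request_count{' + matcher + '}[$__rate_interval])' },
  ],
};

examplePanel('job=~"$job", instance=~"$instance"')
```

Jsonnet evaluates the concatenation at build time, so the generated dashboard JSON ships with the fully assembled query string.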
@@ -80,7 +80,7 @@ local modelRequestRatePanel = {
   },
   targets: [
     prometheus.target(
-      'rate(:tensorflow:serving:request_count{job=~"$job",instance=~"$instance", model_name=~"$model_name"}[$__rate_interval])',
+      'rate(:tensorflow:serving:request_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
       datasource=promDatasource,
       legendFormat='model_name="{{model_name}}",status="{{status}}"',
     ),
@@ -90,7 +90,7 @@ local modelRequestRatePanel = {
   type: 'timeseries',
 };
 
-local modelPredictRequestLatencyPanel = {
+local modelPredictRequestLatencyPanel(matcher) = {
   datasource: promDatasource,
   description: 'Average request latency of predict requests for the selected model.',
   fieldConfig: {
@@ -147,7 +147,7 @@ local modelPredictRequestLatencyPanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:serving:request_latency_sum{job=~"$job",instance=~"$instance", model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:request_latency_count{job=~"$job",instance=~"$instance", model_name=~"$model_name"}[$__rate_interval])',
+      'increase(:tensorflow:serving:request_latency_sum{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:request_latency_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
       datasource=promDatasource,
       legendFormat='model_name="{{model_name}}"',
     ),
@@ -157,7 +157,7 @@ local modelPredictRequestLatencyPanel = {
   type: 'timeseries',
 };
 
-local modelPredictRuntimeLatencyPanel = {
+local modelPredictRuntimeLatencyPanel(matcher) = {
   datasource: promDatasource,
   description: 'Average runtime latency to fulfill a predict request for the selected model.',
   fieldConfig: {
@@ -214,7 +214,7 @@ local modelPredictRuntimeLatencyPanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:serving:runtime_latency_sum{job=~"$job",instance=~"$instance", model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:runtime_latency_count{job=~"$job",instance=~"$instance", model_name=~"$model_name"}[$__rate_interval])',
+      'increase(:tensorflow:serving:runtime_latency_sum{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:runtime_latency_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
       datasource=promDatasource,
       legendFormat='model_name="{{model_name}}"',
     ),
@@ -224,7 +224,7 @@ local modelPredictRuntimeLatencyPanel = {
   type: 'timeseries',
 };
 
-local graphBuildCallsPanel = {
+local graphBuildCallsPanel(matcher) = {
   datasource: promDatasource,
   description: 'Number of times TensorFlow Serving has created a new client graph.',
   fieldConfig: {
@@ -281,7 +281,7 @@ local graphBuildCallsPanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:core:graph_build_calls{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'increase(:tensorflow:core:graph_build_calls{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -290,7 +290,7 @@ local graphBuildCallsPanel = {
   type: 'timeseries',
 };
 
-local graphRunsPanel = {
+local graphRunsPanel(matcher) = {
   datasource: promDatasource,
   description: 'Number of graph executions.',
   fieldConfig: {
@@ -347,7 +347,7 @@ local graphRunsPanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:core:graph_runs{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'increase(:tensorflow:core:graph_runs{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -356,7 +356,7 @@ local graphRunsPanel = {
   type: 'timeseries',
 };
 
-local graphBuildTimePanel = {
+local graphBuildTimePanel(matcher) = {
   datasource: promDatasource,
   description: 'Amount of time Tensorflow has spent creating new client graphs.',
   fieldConfig: {
@@ -413,7 +413,7 @@ local graphBuildTimePanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:core:graph_build_time_usecs{job=~"$job",instance=~"$instance" }[$__rate_interval])/increase(:tensorflow:core:graph_build_calls{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'increase(:tensorflow:core:graph_build_time_usecs{' + matcher + ' }[$__rate_interval])/increase(:tensorflow:core:graph_build_calls{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -422,7 +422,7 @@ local graphBuildTimePanel = {
   type: 'timeseries',
 };
 
-local graphRunTimePanel = {
+local graphRunTimePanel(matcher) = {
   datasource: promDatasource,
   description: 'Amount of time spent executing graphs.',
   fieldConfig: {
@@ -479,7 +479,7 @@ local graphRunTimePanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:core:graph_run_time_usecs{job=~"$job",instance=~"$instance" }[$__rate_interval])/increase(:tensorflow:core:graph_runs{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'increase(:tensorflow:core:graph_run_time_usecs{' + matcher + ' }[$__rate_interval])/increase(:tensorflow:core:graph_runs{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -488,7 +488,7 @@ local graphRunTimePanel = {
   type: 'timeseries',
 };
 
-local batchQueuingLatencyPanel = {
+local batchQueuingLatencyPanel(matcher) = {
   datasource: promDatasource,
   description: 'Current latency in the batching queue.',
   fieldConfig: {
@@ -545,7 +545,7 @@ local batchQueuingLatencyPanel = {
   },
   targets: [
     prometheus.target(
-      'increase(:tensorflow:serving:batching_session:queuing_latency_sum{job=~"$job",instance=~"$instance" }[$__rate_interval])/increase(:tensorflow:serving:batching_session:queuing_latency_count{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'increase(:tensorflow:serving:batching_session:queuing_latency_sum{' + matcher + ' }[$__rate_interval])/increase(:tensorflow:serving:batching_session:queuing_latency_count{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -554,7 +554,7 @@ local batchQueuingLatencyPanel = {
   type: 'timeseries',
 };
 
-local batchQueueThroughputPanel = {
+local batchQueueThroughputPanel(matcher) = {
   datasource: promDatasource,
   description: 'Rate of batch queue throughput over time.',
   fieldConfig: {
@@ -611,7 +611,7 @@ local batchQueueThroughputPanel = {
   },
   targets: [
     prometheus.target(
-      'rate(:tensorflow:serving:batching_session:queuing_latency_count{job=~"$job",instance=~"$instance" }[$__rate_interval])',
+      'rate(:tensorflow:serving:batching_session:queuing_latency_count{' + matcher + ' }[$__rate_interval])',
       datasource=promDatasource,
     ),
   ],
@@ -620,7 +620,7 @@ local batchQueueThroughputPanel = {
   type: 'timeseries',
 };
 
-local containerLogsPanel = {
+local containerLogsPanel(matcher) = {
   datasource: lokiDatasource,
   description: 'Logs from the TensorFlow Serving Docker container.',
   options: {
@@ -637,7 +637,7 @@ local containerLogsPanel = {
     {
       datasource: lokiDatasource,
       editorMode: 'code',
-      expr: '{name="tensorflow",job=~"$job",instance=~"$instance" }',
+      expr: '{name="tensorflow",' + matcher + ' }',
      legendFormat: '',
       queryType: 'range',
       refId: 'A',
@@ -648,6 +648,8 @@ local containerLogsPanel = {
   type: 'logs',
 };
 
+local getMatcher(cfg) = '%(tensorflowSelector)s, instance=~"$instance"' % cfg;
+
 {
   grafanaDashboards+:: {
     'tensorflow-overview.json':
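`getMatcher` is the single place the selector fragment is assembled before being handed to every panel. A quick evaluation sketch, assuming `tensorflowSelector` is `job=~"$job"` (its real value lives in the mixin's config and is an assumption here):

```jsonnet
// cfg stands in for $._config; tensorflowSelector's value is assumed.
local cfg = { tensorflowSelector: 'job=~"$job"' };
local getMatcher(cfg) = '%(tensorflowSelector)s, instance=~"$instance"' % cfg;

getMatcher(cfg)
// => 'job=~"$job", instance=~"$instance"'
```

Note that `getMatcher` itself never appends `cluster=~"$cluster"`; for the new `$cluster` variable below to reach the queries, the cluster matcher presumably has to be folded into `tensorflowSelector` when `enableMultiCluster` is set in config.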
@@ -681,10 +683,22 @@ local containerLogsPanel = {
           allValues='.+',
           sort=1,
         ),
+        template.new(
+          'cluster',
+          promDatasource,
+          'label_values(:tensorflow:serving:request_count{}, cluster)' % $._config,
+          label='Cluster',
+          refresh=2,
+          includeAll=true,
+          multi=true,
+          allValues='.*',
+          hide=if $._config.enableMultiCluster then '' else 'variable',
+          sort=0
+        ),
         template.new(
           'instance',
           promDatasource,
-          'label_values(:tensorflow:serving:request_count{job=~"$job" }, instance)',
+          'label_values(:tensorflow:serving:request_count{%(tensorflowSelector)s }, instance)' % $._config,
           label='Instance',
           refresh='time',
           includeAll=true,
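The template queries lean on Jsonnet's `%`-formatting against `$._config`, and the cluster variable stays defined but invisible unless multi-cluster mode is on. A self-contained sketch of both mechanisms (the `cfg` values are illustrative assumptions):

```jsonnet
// cfg mimics the relevant $._config fields; values are assumptions.
local cfg = { tensorflowSelector: 'job=~"$job"', enableMultiCluster: false };

{
  // %-formatting substitutes the selector into the label_values query.
  query: 'label_values(:tensorflow:serving:request_count{%(tensorflowSelector)s }, instance)' % cfg,
  // => 'label_values(:tensorflow:serving:request_count{job=~"$job" }, instance)'

  // '' shows the variable in Grafana; 'variable' hides it from the UI.
  hide: if cfg.enableMultiCluster then '' else 'variable',
}
```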
@@ -695,7 +709,7 @@ local containerLogsPanel = {
         template.new(
           'model_name',
           promDatasource,
-          'label_values(:tensorflow:serving:request_count{job=~"$job",instance=~"$instance"}, model_name)',
+          'label_values(:tensorflow:serving:request_count{%(tensorflowSelector)s}, model_name)' % $._config,
           label='Model name',
           refresh='time',
           includeAll=true,
@@ -718,23 +732,23 @@ local containerLogsPanel = {
       std.flattenArrays([
         // Model Row
         [
-          modelRequestRatePanel { gridPos: { h: 8, w: 24, x: 0, y: 0 } },
-          modelPredictRequestLatencyPanel { gridPos: { h: 8, w: 12, x: 0, y: 8 } },
-          modelPredictRuntimeLatencyPanel { gridPos: { h: 8, w: 12, x: 12, y: 8 } },
+          modelRequestRatePanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 0 } },
+          modelPredictRequestLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } },
+          modelPredictRuntimeLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } },
         ],
         // Serving Overview Row
         [
           servingOverviewRow { gridPos: { h: 1, w: 24, x: 0, y: 16 } },
-          graphBuildCallsPanel { gridPos: { h: 8, w: 12, x: 0, y: 17 } },
-          graphRunsPanel { gridPos: { h: 8, w: 12, x: 12, y: 17 } },
-          graphBuildTimePanel { gridPos: { h: 8, w: 12, x: 0, y: 25 } },
-          graphRunTimePanel { gridPos: { h: 8, w: 12, x: 12, y: 25 } },
-          batchQueuingLatencyPanel { gridPos: { h: 8, w: 12, x: 0, y: 33 } },
-          batchQueueThroughputPanel { gridPos: { h: 8, w: 12, x: 12, y: 33 } },
+          graphBuildCallsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 17 } },
+          graphRunsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 17 } },
+          graphBuildTimePanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 25 } },
+          graphRunTimePanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 25 } },
+          batchQueuingLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 33 } },
+          batchQueueThroughputPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 33 } },
         ],
         // Optional Log Row
         if $._config.enableLokiLogs then [
-          containerLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 41 } },
+          containerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 41 } },
         ] else [],
       ]),
     ),