Skip to content

Commit b5e3f0e

Browse files
authored
Tensorflow k8s plugin support (#1184)
* added multicluster selector to config * updated tensorflow overview libsonnet for k8s support * fixed typo * removed tensorflow_cluster label * fixed typos and added back model_name selectors * removed unused parts in config * put back enableMultiCluster and tensorflowSelector
1 parent 88231c4 commit b5e3f0e

File tree

2 files changed

+48
-32
lines changed

2 files changed

+48
-32
lines changed

tensorflow-mixin/config.libsonnet

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
{
22
_config+:: {
3+
enableMultiCluster: false,
4+
tensorflowSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
35
dashboardTags: ['tensorflow-mixin'],
46
dashboardPeriod: 'now-30m',
57
dashboardTimezone: 'default',

tensorflow-mixin/dashboards/tensorflow-overview.libsonnet

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ local servingOverviewRow = {
2323
type: 'row',
2424
};
2525

26-
local modelRequestRatePanel = {
26+
local modelRequestRatePanel(matcher) = {
2727
datasource: promDatasource,
2828
description: 'Rate of requests over time for the selected model. Grouped by statuses.',
2929
fieldConfig: {
@@ -80,7 +80,7 @@ local modelRequestRatePanel = {
8080
},
8181
targets: [
8282
prometheus.target(
83-
'rate(:tensorflow:serving:request_count{job=~"$job",instance=~"$instance",model_name=~"$model_name"}[$__rate_interval])',
83+
'rate(:tensorflow:serving:request_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
8484
datasource=promDatasource,
8585
legendFormat='model_name="{{model_name}}",status="{{status}}"',
8686
),
@@ -90,7 +90,7 @@ local modelRequestRatePanel = {
9090
type: 'timeseries',
9191
};
9292

93-
local modelPredictRequestLatencyPanel = {
93+
local modelPredictRequestLatencyPanel(matcher) = {
9494
datasource: promDatasource,
9595
description: 'Average request latency of predict requests for the selected model.',
9696
fieldConfig: {
@@ -147,7 +147,7 @@ local modelPredictRequestLatencyPanel = {
147147
},
148148
targets: [
149149
prometheus.target(
150-
'increase(:tensorflow:serving:request_latency_sum{job=~"$job",instance=~"$instance",model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:request_latency_count{job=~"$job",instance=~"$instance",model_name=~"$model_name"}[$__rate_interval])',
150+
'increase(:tensorflow:serving:request_latency_sum{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:request_latency_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
151151
datasource=promDatasource,
152152
legendFormat='model_name="{{model_name}}"',
153153
),
@@ -157,7 +157,7 @@ local modelPredictRequestLatencyPanel = {
157157
type: 'timeseries',
158158
};
159159

160-
local modelPredictRuntimeLatencyPanel = {
160+
local modelPredictRuntimeLatencyPanel(matcher) = {
161161
datasource: promDatasource,
162162
description: 'Average runtime latency to fulfill a predict request for the selected model.',
163163
fieldConfig: {
@@ -214,7 +214,7 @@ local modelPredictRuntimeLatencyPanel = {
214214
},
215215
targets: [
216216
prometheus.target(
217-
'increase(:tensorflow:serving:runtime_latency_sum{job=~"$job",instance=~"$instance",model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:runtime_latency_count{job=~"$job",instance=~"$instance",model_name=~"$model_name"}[$__rate_interval])',
217+
'increase(:tensorflow:serving:runtime_latency_sum{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])/increase(:tensorflow:serving:runtime_latency_count{' + matcher + ', model_name=~"$model_name"}[$__rate_interval])',
218218
datasource=promDatasource,
219219
legendFormat='model_name="{{model_name}}"',
220220
),
@@ -224,7 +224,7 @@ local modelPredictRuntimeLatencyPanel = {
224224
type: 'timeseries',
225225
};
226226

227-
local graphBuildCallsPanel = {
227+
local graphBuildCallsPanel(matcher) = {
228228
datasource: promDatasource,
229229
description: 'Number of times TensorFlow Serving has created a new client graph.',
230230
fieldConfig: {
@@ -281,7 +281,7 @@ local graphBuildCallsPanel = {
281281
},
282282
targets: [
283283
prometheus.target(
284-
'increase(:tensorflow:core:graph_build_calls{job=~"$job",instance=~"$instance"}[$__rate_interval])',
284+
'increase(:tensorflow:core:graph_build_calls{' + matcher + '}[$__rate_interval])',
285285
datasource=promDatasource,
286286
),
287287
],
@@ -290,7 +290,7 @@ local graphBuildCallsPanel = {
290290
type: 'timeseries',
291291
};
292292

293-
local graphRunsPanel = {
293+
local graphRunsPanel(matcher) = {
294294
datasource: promDatasource,
295295
description: 'Number of graph executions.',
296296
fieldConfig: {
@@ -347,7 +347,7 @@ local graphRunsPanel = {
347347
},
348348
targets: [
349349
prometheus.target(
350-
'increase(:tensorflow:core:graph_runs{job=~"$job",instance=~"$instance"}[$__rate_interval])',
350+
'increase(:tensorflow:core:graph_runs{' + matcher + '}[$__rate_interval])',
351351
datasource=promDatasource,
352352
),
353353
],
@@ -356,7 +356,7 @@ local graphRunsPanel = {
356356
type: 'timeseries',
357357
};
358358

359-
local graphBuildTimePanel = {
359+
local graphBuildTimePanel(matcher) = {
360360
datasource: promDatasource,
361361
description: 'Amount of time Tensorflow has spent creating new client graphs.',
362362
fieldConfig: {
@@ -413,7 +413,7 @@ local graphBuildTimePanel = {
413413
},
414414
targets: [
415415
prometheus.target(
416-
'increase(:tensorflow:core:graph_build_time_usecs{job=~"$job",instance=~"$instance"}[$__rate_interval])/increase(:tensorflow:core:graph_build_calls{job=~"$job",instance=~"$instance"}[$__rate_interval])',
416+
'increase(:tensorflow:core:graph_build_time_usecs{' + matcher + '}[$__rate_interval])/increase(:tensorflow:core:graph_build_calls{' + matcher + '}[$__rate_interval])',
417417
datasource=promDatasource,
418418
),
419419
],
@@ -422,7 +422,7 @@ local graphBuildTimePanel = {
422422
type: 'timeseries',
423423
};
424424

425-
local graphRunTimePanel = {
425+
local graphRunTimePanel(matcher) = {
426426
datasource: promDatasource,
427427
description: 'Amount of time spent executing graphs.',
428428
fieldConfig: {
@@ -479,7 +479,7 @@ local graphRunTimePanel = {
479479
},
480480
targets: [
481481
prometheus.target(
482-
'increase(:tensorflow:core:graph_run_time_usecs{job=~"$job",instance=~"$instance"}[$__rate_interval])/increase(:tensorflow:core:graph_runs{job=~"$job",instance=~"$instance"}[$__rate_interval])',
482+
'increase(:tensorflow:core:graph_run_time_usecs{' + matcher + '}[$__rate_interval])/increase(:tensorflow:core:graph_runs{' + matcher + '}[$__rate_interval])',
483483
datasource=promDatasource,
484484
),
485485
],
@@ -488,7 +488,7 @@ local graphRunTimePanel = {
488488
type: 'timeseries',
489489
};
490490

491-
local batchQueuingLatencyPanel = {
491+
local batchQueuingLatencyPanel(matcher) = {
492492
datasource: promDatasource,
493493
description: 'Current latency in the batching queue.',
494494
fieldConfig: {
@@ -545,7 +545,7 @@ local batchQueuingLatencyPanel = {
545545
},
546546
targets: [
547547
prometheus.target(
548-
'increase(:tensorflow:serving:batching_session:queuing_latency_sum{job=~"$job",instance=~"$instance"}[$__rate_interval])/increase(:tensorflow:serving:batching_session:queuing_latency_count{job=~"$job",instance=~"$instance"}[$__rate_interval])',
548+
'increase(:tensorflow:serving:batching_session:queuing_latency_sum{' + matcher + '}[$__rate_interval])/increase(:tensorflow:serving:batching_session:queuing_latency_count{' + matcher + '}[$__rate_interval])',
549549
datasource=promDatasource,
550550
),
551551
],
@@ -554,7 +554,7 @@ local batchQueuingLatencyPanel = {
554554
type: 'timeseries',
555555
};
556556

557-
local batchQueueThroughputPanel = {
557+
local batchQueueThroughputPanel(matcher) = {
558558
datasource: promDatasource,
559559
description: 'Rate of batch queue throughput over time.',
560560
fieldConfig: {
@@ -611,7 +611,7 @@ local batchQueueThroughputPanel = {
611611
},
612612
targets: [
613613
prometheus.target(
614-
'rate(:tensorflow:serving:batching_session:queuing_latency_count{job=~"$job",instance=~"$instance"}[$__rate_interval])',
614+
'rate(:tensorflow:serving:batching_session:queuing_latency_count{' + matcher + '}[$__rate_interval])',
615615
datasource=promDatasource,
616616
),
617617
],
@@ -620,7 +620,7 @@ local batchQueueThroughputPanel = {
620620
type: 'timeseries',
621621
};
622622

623-
local containerLogsPanel = {
623+
local containerLogsPanel(matcher) = {
624624
datasource: lokiDatasource,
625625
description: 'Logs from the TensorFlow Serving Docker container.',
626626
options: {
@@ -637,7 +637,7 @@ local containerLogsPanel = {
637637
{
638638
datasource: lokiDatasource,
639639
editorMode: 'code',
640-
expr: '{name="tensorflow",job=~"$job",instance=~"$instance"}',
640+
expr: '{name="tensorflow",' + matcher + '}',
641641
legendFormat: '',
642642
queryType: 'range',
643643
refId: 'A',
@@ -648,6 +648,8 @@ local containerLogsPanel = {
648648
type: 'logs',
649649
};
650650

651+
local getMatcher(cfg) = '%(tensorflowSelector)s, instance=~"$instance"' % cfg;
652+
651653
{
652654
grafanaDashboards+:: {
653655
'tensorflow-overview.json':
@@ -681,10 +683,22 @@ local containerLogsPanel = {
681683
allValues='.+',
682684
sort=1,
683685
),
686+
template.new(
687+
'cluster',
688+
promDatasource,
689+
'label_values(:tensorflow:serving:request_count{}, cluster)',
690+
label='Cluster',
691+
refresh=2,
692+
includeAll=true,
693+
multi=true,
694+
allValues='.*',
695+
hide=if $._config.enableMultiCluster then '' else 'variable',
696+
sort=0
697+
),
684698
template.new(
685699
'instance',
686700
promDatasource,
687-
'label_values(:tensorflow:serving:request_count{job=~"$job"}, instance)',
701+
'label_values(:tensorflow:serving:request_count{%(tensorflowSelector)s}, instance)' % $._config,
688702
label='Instance',
689703
refresh='time',
690704
includeAll=true,
@@ -695,7 +709,7 @@ local containerLogsPanel = {
695709
template.new(
696710
'model_name',
697711
promDatasource,
698-
'label_values(:tensorflow:serving:request_count{job=~"$job",instance=~"$instance"}, model_name)',
712+
'label_values(:tensorflow:serving:request_count{%(tensorflowSelector)s}, model_name)' % $._config,
699713
label='Model name',
700714
refresh='time',
701715
includeAll=true,
@@ -718,23 +732,23 @@ local containerLogsPanel = {
718732
std.flattenArrays([
719733
// Model Row
720734
[
721-
modelRequestRatePanel { gridPos: { h: 8, w: 24, x: 0, y: 0 } },
722-
modelPredictRequestLatencyPanel { gridPos: { h: 8, w: 12, x: 0, y: 8 } },
723-
modelPredictRuntimeLatencyPanel { gridPos: { h: 8, w: 12, x: 12, y: 8 } },
735+
modelRequestRatePanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 0 } },
736+
modelPredictRequestLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } },
737+
modelPredictRuntimeLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } },
724738
],
725739
// Serving Overview Row
726740
[
727741
servingOverviewRow { gridPos: { h: 1, w: 24, x: 0, y: 16 } },
728-
graphBuildCallsPanel { gridPos: { h: 8, w: 12, x: 0, y: 17 } },
729-
graphRunsPanel { gridPos: { h: 8, w: 12, x: 12, y: 17 } },
730-
graphBuildTimePanel { gridPos: { h: 8, w: 12, x: 0, y: 25 } },
731-
graphRunTimePanel { gridPos: { h: 8, w: 12, x: 12, y: 25 } },
732-
batchQueuingLatencyPanel { gridPos: { h: 8, w: 12, x: 0, y: 33 } },
733-
batchQueueThroughputPanel { gridPos: { h: 8, w: 12, x: 12, y: 33 } },
742+
graphBuildCallsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 17 } },
743+
graphRunsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 17 } },
744+
graphBuildTimePanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 25 } },
745+
graphRunTimePanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 25 } },
746+
batchQueuingLatencyPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 33 } },
747+
batchQueueThroughputPanel(getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 33 } },
734748
],
735749
// Optional Log Row
736750
if $._config.enableLokiLogs then [
737-
containerLogsPanel { gridPos: { h: 8, w: 24, x: 0, y: 41 } },
751+
containerLogsPanel(getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 41 } },
738752
] else [],
739753
]),
740754
),

0 commit comments

Comments
 (0)