diff --git a/opentelemetry-collector-mixin/.lint b/opentelemetry-collector-mixin/.lint
new file mode 100644
index 000000000..bc18d39f7
--- /dev/null
+++ b/opentelemetry-collector-mixin/.lint
@@ -0,0 +1,7 @@
+exclusions:
+  template-job-rule:
+    reason: Allows the 'All' value to be '.*' to accommodate non-Kubernetes environments
+  template-instance-rule:
+    reason: Allows the 'All' value to be '.*' to accommodate non-Kubernetes environments
+  panel-datasource-rule:
+    reason: The new Grafonnet promotes the use of datasources at the query level. This should probably end up in the linter as a valid option.
diff --git a/opentelemetry-collector-mixin/Makefile b/opentelemetry-collector-mixin/Makefile
new file mode 100644
index 000000000..b4fdca560
--- /dev/null
+++ b/opentelemetry-collector-mixin/Makefile
@@ -0,0 +1 @@
+include ../Makefile_mixin
diff --git a/opentelemetry-collector-mixin/README.md b/opentelemetry-collector-mixin/README.md
new file mode 100644
index 000000000..d19c71264
--- /dev/null
+++ b/opentelemetry-collector-mixin/README.md
@@ -0,0 +1,115 @@
+# opentelemetry-collector-mixin
+
+Prometheus Monitoring Mixin for the OpenTelemetry Collector
+
+This mixin contains a set of Prometheus alert rules and Grafana dashboards
+based on the metrics exported by the OpenTelemetry Collector's [internal
+telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/).
+
+To use it, you need to have `jsonnet` (any sufficiently modern version should
+do, ideally v0.20+) and `jb` installed.
+
+If you have a working Go development environment, you can run the following to
+get started:
+```
+go install github.com/google/go-jsonnet/cmd/jsonnet@latest
+go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
+```
+
+### Usage
+
+First, install the dependencies by running the following command from the repo
+root:
+```
+$ jb install
+```
+
+You can then build the Prometheus alert and recording rules by running:
+```
+$ make prometheus_alerts.yaml
+$ make prometheus_rules.yaml
+```
+
+You can also render the Grafana dashboards as JSON files by running the
+following command. The results are stored in the `dashboards_out/` directory.
+```
+$ make dashboards_out
+```
+
+### OpenTelemetry Collector configuration
+
+By default, the OpenTelemetry Collector exposes its [internal
+telemetry](https://opentelemetry.io/docs/collector/internal-telemetry/) as
+Prometheus metrics on port 8888.
+
+The following configuration can be used as a starting point for scraping
+these metrics and sending them to a Prometheus-compatible store.
+
+```yaml
+extensions:
+  basicauth/remote_write:
+    client_auth:
+      username: "username"
+      password: "password"
+
+receivers:
+  prometheus:
+    config:
+      scrape_configs:
+        - job_name: 'otel-collector'
+          scrape_interval: 15s
+          static_configs:
+            - targets: ['0.0.0.0:8888']
+
+processors:
+  batch:
+
+exporters:
+  prometheusremotewrite:
+    endpoint: "http://prometheus/api/prom/push"
+    auth:
+      authenticator: basicauth/remote_write
+    resource_to_telemetry_conversion:
+      enabled: true # Convert resource attributes to metric labels
+
+service:
+  telemetry:
+    metrics:
+      level: "detailed"
+      readers:
+        - pull:
+            exporter:
+              prometheus:
+                host: '0.0.0.0'
+                port: 8888
+  extensions: [basicauth/remote_write]
+  pipelines:
+    metrics:
+      receivers: [prometheus]
+      processors: [batch]
+      exporters: [prometheusremotewrite]
+```
+
+### Other requirements
+
+The Makefile contains commands for formatting, linting and testing the mixin.
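+For example, a typical local iteration might look like this (the target names
+are assumptions here; the actual targets come from the shared `Makefile_mixin`
+at the repo root):
+```
+$ make fmt   # format the jsonnet sources
+$ make lint  # lint the dashboards and rules
+$ make test  # run the mixin tests
+```
+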
+For development purposes, you may also need one or more of the following:
+```
+go install github.com/google/go-jsonnet/cmd/jsonnet-lint@latest
+go install github.com/grafana/dashboard-linter@latest
+go install github.com/prometheus/prometheus/cmd/promtool@latest
+go install github.com/monitoring-mixins/mixtool/cmd/mixtool@main
+```
+
+### Contributing
+
+To contribute:
+
+1. Fork the repository
+2. Make your changes
+3. Run `make all` to verify your changes, and test them in a Prometheus/Grafana environment. Screenshots are welcome for new panels/dashboards.
+4. Submit a pull request
+
+If you want to make some parameter configurable, use `config.libsonnet` as the
+entrypoint.
+
diff --git a/opentelemetry-collector-mixin/alerts.jsonnet b/opentelemetry-collector-mixin/alerts.jsonnet
new file mode 100644
index 000000000..75e7c1b29
--- /dev/null
+++ b/opentelemetry-collector-mixin/alerts.jsonnet
@@ -0,0 +1 @@
+std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
diff --git a/opentelemetry-collector-mixin/alerts/alerts.libsonnet b/opentelemetry-collector-mixin/alerts/alerts.libsonnet
new file mode 100644
index 000000000..90a897be2
--- /dev/null
+++ b/opentelemetry-collector-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,25 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'otelcol',
+        rules: [
+          {
+            alert: 'OtelcolSendingQueueFull',
+            expr: |||
+              otelcol_exporter_queue_size >= otelcol_exporter_queue_capacity
+            |||,
+            'for': '30m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'The sending queue has filled up.',
+              description: 'The sending queue is full for {{ $labels.instance }}. The collector might start dropping data.',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/opentelemetry-collector-mixin/config.libsonnet b/opentelemetry-collector-mixin/config.libsonnet
new file mode 100644
index 000000000..5fa45b7a0
--- /dev/null
+++ b/opentelemetry-collector-mixin/config.libsonnet
@@ -0,0 +1,23 @@
+{
+  _config+:: {
+    // Grafana dashboard IDs are necessary for stable links to dashboards
+    grafanaDashboardIDs: {
+      'collector.json': std.md5('collector.json'),
+    },
+
+    // Config for the Grafana dashboards in the OpenTelemetry Collector mixin
+    grafana: {
+      // The default refresh interval for all dashboards; defaults to 10s
+      refresh: '10s',
+
+      // Timezone for Grafana dashboards: UTC, browser, ...
+ grafanaTimezone: 'UTC', + + // Tags for Grafana dashboards + dashboardTags: ['otelcol'], + }, + + // Default datasource name + datasourceName: 'default', + }, +} diff --git a/opentelemetry-collector-mixin/dashboards.jsonnet b/opentelemetry-collector-mixin/dashboards.jsonnet new file mode 100644 index 000000000..c0d94ffa6 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards.jsonnet @@ -0,0 +1,11 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; +local cfg = import 'config.libsonnet'; + +{ + [name]: dashboards[name] { + timezone: cfg._config.grafana.grafanaTimezone, + refresh: cfg._config.grafana.refresh, + tags: cfg._config.grafana.dashboardTags, + } + for name in std.objectFields(dashboards) +} diff --git a/opentelemetry-collector-mixin/dashboards/collector.libsonnet b/opentelemetry-collector-mixin/dashboards/collector.libsonnet new file mode 100644 index 000000000..14ec1dfc0 --- /dev/null +++ b/opentelemetry-collector-mixin/dashboards/collector.libsonnet @@ -0,0 +1,137 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local row = g.panel.row; +local variables = import './utils/variables.libsonnet'; +local panels = import './utils/panels.libsonnet'; +local queries = import './utils/queries.libsonnet'; +local cfg = import '../config.libsonnet'; + +{ + grafanaDashboards+:: { + 'collector.json': + g.dashboard.new( + 'OpenTelemetry Collector Health', + ) + + g.dashboard.withDescription('A dashboard for monitoring the health of OpenTelemetry Collector instances using their internal metrics.') + + g.dashboard.graphTooltip.withSharedCrosshair() + + g.dashboard.withVariables([ + variables.datasourceVariable, + variables.jobVariable, + variables.clusterVariable, + variables.namespaceVariable, + variables.instanceVariable, + ]) + + g.dashboard.withUid(cfg._config.grafanaDashboardIDs['collector.json']) + + g.dashboard.withPanels( + g.util.grid.wrapPanels([ + // Overview row + row.new('Overview'), + panels.stat.base('Running Collectors', [queries.runningCollectors]), + panels.table.uptime('Collector uptime', [queries.collectorUptime]), + + // Resources row + row.new('Resources'), + panels.timeSeries.cpuUsage('CPU usage', [queries.cpuUsage]) + + { gridPos: { w: 8 } }, + panels.timeSeries.memoryUsage('Memory (RSS)', queries.memUsageRSS) + + { gridPos: { w: 8 } }, + panels.timeSeries.memoryUsage('Memory (Heap Alloc)', queries.memUsageHeapAlloc) + + { gridPos: { w: 8 } }, + + // Receivers row + row.new('Receivers'), + panels.timeSeries.short('Accepted metric points', [queries.acceptedMetricPoints]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Accepted log records', [queries.acceptedLogRecords]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Accepted spans', [queries.acceptedSpans]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Total incoming items', [queries.incomingItems]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused metric points', [queries.refusedMetricPoints]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused log records', [queries.refusedLogRecords]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Refused spans', [queries.refusedSpans]) + + { gridPos: { w: 6 } }, + panels.timeSeries.short('Total outgoing items', [queries.outgoingItems]) + + { gridPos: { w: 6 } }, + + // Processors row + row.new('Processors'), + panels.heatmap.base('Number of units in the batch', [queries.batchSendSize]) + + { gridPos: { w: 8 } }, + panels.timeSeries.short('Batch cardinality', 
[queries.batchCardinality]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Queue current size vs capacity', [queries.queueSize, queries.queueCapacity]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Batch size send trigger', [queries.batchSizeSendTrigger]),
+        panels.timeSeries.short('Batch timeout send trigger', [queries.batchTimeoutSendTrigger]),
+
+        // Exporters row
+        row.new('Exporters'),
+        panels.timeSeries.short('Exported metrics', [queries.exportedMetrics]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Exported logs', [queries.exportedLogs]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Exported spans', [queries.exportedSpans]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Failed metrics', [queries.failedMetrics]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Failed logs', [queries.failedLogs]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Failed spans', [queries.failedSpans]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Enqueue failed metrics', [queries.enqueueFailedMetrics]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Enqueue failed logs', [queries.enqueueFailedLogs]) +
+        { gridPos: { w: 8 } },
+        panels.timeSeries.short('Enqueue failed spans', [queries.enqueueFailedSpans]) +
+        { gridPos: { w: 8 } },
+
+        // Network traffic row
+        row.new('Network traffic'),
+        // The rpc_* duration metrics are recorded in milliseconds, the
+        // http_* duration metrics in seconds; pick the matching unit.
+        panels.timeSeries.milliseconds('Inbound gRPC request duration percentiles', [
+          queries.grpcInboundDurationP50,
+          queries.grpcInboundDurationP90,
+          queries.grpcInboundDurationP99,
+        ]),
+        panels.timeSeries.seconds('Inbound HTTP request duration percentiles', [
+          queries.httpInboundDurationP50,
+          queries.httpInboundDurationP90,
+          queries.httpInboundDurationP99,
+        ]),
+        panels.timeSeries.bytes('Inbound gRPC request size percentiles', [
+          queries.grpcInboundSizeP50,
+          queries.grpcInboundSizeP90,
+          queries.grpcInboundSizeP99,
+        ]),
+        panels.timeSeries.bytes('Inbound HTTP request size percentiles', [
+          queries.httpInboundSizeP50,
+          queries.httpInboundSizeP90,
+          queries.httpInboundSizeP99,
+        ]),
+        panels.timeSeries.milliseconds('Outgoing gRPC request duration percentiles', [
+          queries.grpcOutboundDurationP50,
+          queries.grpcOutboundDurationP90,
+          queries.grpcOutboundDurationP99,
+        ]),
+        panels.timeSeries.seconds('Outgoing HTTP request duration percentiles', [
+          queries.httpOutboundDurationP50,
+          queries.httpOutboundDurationP90,
+          queries.httpOutboundDurationP99,
+        ]),
+        panels.timeSeries.bytes('Outgoing gRPC request size percentiles', [
+          queries.grpcOutboundSizeP50,
+          queries.grpcOutboundSizeP90,
+          queries.grpcOutboundSizeP99,
+        ]),
+        panels.timeSeries.bytes('Outgoing HTTP request size percentiles', [
+          queries.httpOutboundSizeP50,
+          queries.httpOutboundSizeP90,
+          queries.httpOutboundSizeP99,
+        ]),
+
+      ], panelWidth=12, panelHeight=8),
+    ),
+  },
+}
diff --git a/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet b/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 000000000..dba171f18
--- /dev/null
+++ b/opentelemetry-collector-mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1 @@
+(import 'collector.libsonnet')
diff --git a/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet
new file mode 100644
index 000000000..3958ef48b
--- /dev/null
+++ b/opentelemetry-collector-mixin/dashboards/utils/panels.libsonnet
@@ -0,0 +1,207 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+{
+  timeSeries: {
+    local timeSeries =
g.panel.timeSeries, + local fieldOverride = g.panel.timeSeries.fieldOverride, + local custom = timeSeries.fieldConfig.defaults.custom, + local options = timeSeries.options, + + base(title, targets): + timeSeries.new(title) + + timeSeries.queryOptions.withTargets(targets) + + timeSeries.queryOptions.withInterval('1m') + + options.legend.withDisplayMode('table') + + options.legend.withCalcs([ + 'lastNotNull', + 'max', + ]) + + custom.withFillOpacity(10) + + custom.withShowPoints('never') + + timeSeries.panelOptions.withDescription(title), + + short(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('short') + + timeSeries.standardOptions.withDecimals(0), + + seconds(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('s') + + custom.scaleDistribution.withType('log') + + custom.scaleDistribution.withLog(10), + + cpuUsage: self.seconds, + + bytes(title, targets): + self.base(title, targets,) + + timeSeries.standardOptions.withUnit('bytes') + + custom.scaleDistribution.withType('log') + + custom.scaleDistribution.withLog(2), + + memoryUsage(title, targets): + self.bytes(title, targets) + + timeSeries.standardOptions.withOverrides([ + fieldOverride.byRegexp.new('/(virtual|resident)/i') + + fieldOverride.byRegexp.withProperty( + 'custom.fillOpacity', + 0 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineWidth', + 2 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineStyle', + { + dash: [10, 10], + fill: 'dash', + } + ), + ]), + + durationQuantile(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('s') + + custom.withDrawStyle('bars') + + timeSeries.standardOptions.withOverrides([ + fieldOverride.byRegexp.new('/mean/i') + + fieldOverride.byRegexp.withProperty( + 'custom.fillOpacity', + 0 + ) + + fieldOverride.byRegexp.withProperty( + 'custom.lineStyle', + { + dash: [8, 10], + fill: 'dash', + } + ), + ]), + + milliseconds(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('ms'), + + cps(title, targets): + self.base(title, targets) + + timeSeries.standardOptions.withUnit('cps'), + }, + + heatmap: { + local heatmap = g.panel.heatmap, + local options = heatmap.options, + + base(title, targets): + heatmap.new(title) + + heatmap.queryOptions.withTargets(targets) + + heatmap.queryOptions.withInterval('1m') + + + options.withCalculate() + + options.calculation.xBuckets.withMode('size') + + options.calculation.xBuckets.withValue('1min') + + options.withCellGap(2) + + options.color.withMode('scheme') + + options.color.withScheme('Spectral') + + options.color.withSteps(128) + + options.yAxis.withDecimals(0) + + options.yAxis.withUnit('short') + + heatmap.panelOptions.withDescription(title), + }, + + stat: { + local stat = g.panel.stat, + local options = stat.options, + + base(title, targets): + stat.new(title) + + stat.queryOptions.withTargets(targets) + + + options.withColorMode('value') + + options.withGraphMode('none') + + options.withJustifyMode('center') + + options.withOrientation('auto') + + options.reduceOptions.withCalcs(['lastNotNull']) + + options.reduceOptions.withFields('') + + options.reduceOptions.withValues(false) + + options.withShowPercentChange(false) + + options.withTextMode('auto') + + options.withWideLayout(true) + + stat.standardOptions.withUnit('none') + + stat.panelOptions.withDescription(title), + }, + + table: { + local table = g.panel.table, + local options = table.options, + + base(title, targets): + table.new(title) + + 
table.queryOptions.withTargets(targets) +
+      table.queryOptions.withInterval('1m') +
+
+      options.withCellHeight('sm') +
+      options.withFrameIndex(0) +
+      options.withShowHeader(true) +
+      options.footer.withShow(false) +
+      options.footer.withCountRows(false) +
+      options.footer.withFields('') +
+      options.footer.withReducer(['sum']) +
+      table.panelOptions.withDescription(title),
+
+    uptime(title, targets):
+      self.base(title, targets) +
+      table.standardOptions.withUnit('s') +
+      table.queryOptions.withTransformations([
+        {
+          id: 'organize',
+          options: {
+            excludeByName: {
+              Time: true,
+              job: true,
+              __name__: true,
+            },
+            includeByName: {
+              cluster: true,
+              namespace: true,
+              instance: true,
+              service_version: true,
+              version: true,
+              Value: true,
+            },
+            indexByName: {
+              cluster: 0,
+              namespace: 1,
+              instance: 2,
+              service_version: 3,
+              version: 3,
+              Value: 4,
+            },
+            renameByName: {
+              Value: 'Uptime',
+            },
+          },
+        },
+      ]) +
+      table.standardOptions.withOverrides([
+        {
+          matcher: {
+            id: 'byName',
+            options: 'Uptime',
+          },
+          properties: [
+            {
+              id: 'custom.displayMode',
+              value: 'basic',
+            },
+          ],
+        },
+      ]) +
+      table.options.withSortBy([
+        {
+          displayName: 'Uptime',
+          desc: true,
+        },
+      ]),
+
+  },
+}
diff --git a/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet
new file mode 100644
index 000000000..48263a133
--- /dev/null
+++ b/opentelemetry-collector-mixin/dashboards/utils/queries.libsonnet
@@ -0,0 +1,505 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+
+local prometheusQuery = g.query.prometheus;
+local variables = import './variables.libsonnet';
+
+{
+  // Existing queries (modified to work with the instance variable)
+  cpuUsage:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by (job, cluster, namespace, instance) (
+          rate(
+            {
+              __name__=~"otelcol_process_cpu_seconds(_total)?",
+              job=~"$job",
+              cluster=~"$cluster",
+              namespace=~"$namespace",
+              instance=~"$instance"
+            }
+          [$__rate_interval])
+        )
+      |||
+    ) +
+    prometheusQuery.withIntervalFactor(2) +
+    prometheusQuery.withLegendFormat(|||
+      {{cluster}} - {{namespace}} - {{instance}}
+    |||),
+
+  memUsageRSS:
+    [
+      prometheusQuery.new(
+        '$' + variables.datasourceVariable.name,
+        |||
+          sum by (job, cluster, namespace, instance) (
+            {__name__=~"otelcol_process_memory_rss(_bytes)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}
+          )
+        |||
+      ) +
+      prometheusQuery.withIntervalFactor(2) +
+      prometheusQuery.withLegendFormat(|||
+        RSS - {{cluster}} - {{namespace}} - {{instance}}
+      |||),
+    ],
+
+  memUsageHeapAlloc:
+    [
+      prometheusQuery.new(
+        '$' + variables.datasourceVariable.name,
+        |||
+          sum by (job, cluster, namespace, instance) (
+            {__name__=~"otelcol_process_runtime_total_sys_memory_bytes(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}
+          )
+        |||
+      ) +
+      prometheusQuery.withIntervalFactor(2) +
+      prometheusQuery.withLegendFormat(|||
+        Heap alloc - {{cluster}} - {{namespace}} - {{instance}}
+      |||),
+    ],
+
+  // Fleet Overview queries
+  runningCollectors:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        count({__name__=~"otelcol_process_uptime(_seconds_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"})
+      |||
+    ),
+
+  collectorUptime:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        {__name__=~"otelcol_process_uptime(_seconds_total)?",
job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"} + ||| + ) + + prometheusQuery.withFormat('table') + + prometheusQuery.withInstant(true), + + // Receivers status queries + acceptedMetricPoints: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + acceptedLogRecords: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + acceptedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_accepted_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + incomingItems: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_incoming_items(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedMetricPoints: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedLogRecords: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + refusedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_receiver_refused_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + outgoingItems: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_outgoing_items(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + // Processors status queries + batchSendSize: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by (job, cluster, namespace, instance, 
le) (increase({__name__=~"otelcol_processor_batch_batch_send_size_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} - {{le}}'),
+
+  batchCardinality:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_processor_batch_metadata_cardinality(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"})
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  queueSize:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_exporter_queue_size(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"})
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} queue current size'),
+
+  queueCapacity:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) ({__name__=~"otelcol_exporter_queue_capacity(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"})
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}} queue capacity'),
+
+  batchSizeSendTrigger:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_batch_batch_size_trigger_send(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  batchTimeoutSendTrigger:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_processor_batch_timeout_trigger_send(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  // Exporters status queries
+  exportedMetrics:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  exportedLogs:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  exportedSpans:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_sent_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'),
+
+  failedMetrics:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_send_failed_metric_points(_total)?", job=~"$job", cluster=~"$cluster",
namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + failedLogs: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_send_failed_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + failedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_send_failed_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedMetrics: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_metric_points(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedLogs: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_log_records(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + enqueueFailedSpans: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + sum by(job, cluster, namespace, instance) (rate({__name__=~"otelcol_exporter_enqueue_failed_spans(_total)?", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])) + ||| + ) + + prometheusQuery.withLegendFormat('{{cluster}} - {{namespace}} - {{instance}}'), + + // Network traffic queries + grpcInboundDurationP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP50: + 
prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundDurationP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + grpcInboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_server_request_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP50: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP90: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval]))) + ||| + ) + + prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'), + + httpInboundSizeP99: + prometheusQuery.new( + '$' + variables.datasourceVariable.name, + ||| + 
histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_server_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundDurationP50:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundDurationP90:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundDurationP99:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_duration_milliseconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundDurationP50:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundDurationP90:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundDurationP99:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_duration_seconds_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundSizeP50:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundSizeP90:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  grpcOutboundSizeP99:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"rpc_client_request_size(_bytes)?_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundSizeP50:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.50, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p50 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundSizeP90:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.90, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p90 - {{cluster}} - {{namespace}} - {{instance}}'),
+
+  httpOutboundSizeP99:
+    prometheusQuery.new(
+      '$' + variables.datasourceVariable.name,
+      |||
+        histogram_quantile(0.99, sum by (job, cluster, namespace, instance, le) (rate({__name__=~"http_client_request_body_size_bytes_bucket", job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance"}[$__rate_interval])))
+      |||
+    ) +
+    prometheusQuery.withLegendFormat('p99 - {{cluster}} - {{namespace}} - {{instance}}'),
+}
diff --git a/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet
new file mode 100644
index 000000000..362ccd2de
--- /dev/null
+++ b/opentelemetry-collector-mixin/dashboards/utils/variables.libsonnet
@@ -0,0 +1,51 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local variable = g.dashboard.variable;
+local cfg = import '../../config.libsonnet';
+
+{
+  datasourceVariable:
+    variable.datasource.new('datasource', 'prometheus') +
+    variable.datasource.generalOptions.withLabel('Data source') +
+    variable.datasource.generalOptions.withCurrent(cfg._config.datasourceName) +
+    variable.datasource.generalOptions.showOnDashboard.withLabelAndValue(),
+
+  clusterVariable:
+    variable.query.new('cluster') +
+    variable.query.generalOptions.withLabel('Cluster') +
+    variable.query.withDatasourceFromVariable(self.datasourceVariable) +
+    variable.query.refresh.onTime() +
+    variable.query.withSort(type='alphabetical', asc=false) +
+    variable.query.selectionOptions.withIncludeAll(true, '.*') +
+    variable.query.selectionOptions.withMulti(true) +
+    variable.query.queryTypes.withLabelValues('cluster', metric='{__name__=~"otelcol_process_uptime.*"}'),
+
+  namespaceVariable:
+    variable.query.new('namespace') +
+    variable.query.generalOptions.withLabel('Namespace') +
+    variable.query.withDatasourceFromVariable(self.datasourceVariable) +
+    variable.query.refresh.onTime() +
variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('namespace', metric='{__name__=~"otelcol_process_uptime.*"}'), + + jobVariable: + variable.query.new('job') + + variable.query.generalOptions.withLabel('Job') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('job', metric='{__name__=~"otelcol_process_uptime.*"}'), + + instanceVariable: + variable.query.new('instance') + + variable.query.generalOptions.withLabel('Instance') + + variable.query.withDatasourceFromVariable(self.datasourceVariable) + + variable.query.refresh.onTime() + + variable.query.withSort(type='alphabetical', asc=false) + + variable.query.selectionOptions.withIncludeAll(true, '.*') + + variable.query.selectionOptions.withMulti(true) + + variable.query.queryTypes.withLabelValues('instance', metric='{__name__=~"otelcol_process_uptime.*"}'), +} diff --git a/opentelemetry-collector-mixin/jsonnetfile.json b/opentelemetry-collector-mixin/jsonnetfile.json new file mode 100644 index 000000000..2414c8671 --- /dev/null +++ b/opentelemetry-collector-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": "main" + } + ], + "legacyImports": true +} diff --git a/opentelemetry-collector-mixin/mixin.libsonnet b/opentelemetry-collector-mixin/mixin.libsonnet new file mode 100644 index 000000000..152721db9 --- /dev/null +++ b/opentelemetry-collector-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') + +(import 'config.libsonnet') diff --git a/opentelemetry-collector-mixin/rules.jsonnet b/opentelemetry-collector-mixin/rules.jsonnet new file mode 100644 index 000000000..dbe13f417 --- /dev/null +++ b/opentelemetry-collector-mixin/rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) diff --git a/opentelemetry-collector-mixin/rules/rules.libsonnet b/opentelemetry-collector-mixin/rules/rules.libsonnet new file mode 100644 index 000000000..137f46c58 --- /dev/null +++ b/opentelemetry-collector-mixin/rules/rules.libsonnet @@ -0,0 +1,11 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'otelcol-rules', + rules: [ + ], + }, + ], + }, +}
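
A minimal consumption sketch to go with the README's note about using
`config.libsonnet` as the entrypoint, mirroring the shape of `dashboards.jsonnet`
above. The file name `build.jsonnet` and the override values below are
illustrative assumptions, not part of this patch:

```
// build.jsonnet -- render the mixin's dashboards with local overrides.
local dashboards = (import 'opentelemetry-collector-mixin/mixin.libsonnet').grafanaDashboards;

{
  // One output object per dashboard; jsonnet -m writes each field as a JSON file.
  [name]: dashboards[name] {
    timezone: 'browser',        // the mixin's config default is 'UTC'
    refresh: '30s',             // the mixin's config default is '10s'
    tags: ['otelcol', 'prod'],  // extra tag for locally built copies
  }
  for name in std.objectFields(dashboards)
}
```

Run from the repo root with something like
`jsonnet -J . -J vendor -m dashboards_out build.jsonnet` to write one dashboard
JSON file per field, ready for import into Grafana.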