Skip to content

Commit cb61f8d

Browse files
authored
Add OTel HTTP support (#853)
* Add OTel HTTP support * temporal_sdk_temporal_num_pollers works, need to use more specific query for test * added tests and CI action to run docker and run new test * Remove stale println's, move variables to #[values()], assert specific metrics
1 parent 52d1bb6 commit cb61f8d

File tree

10 files changed

+217
-18
lines changed

10 files changed

+217
-18
lines changed

.github/workflows/per-pr.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,31 @@ jobs:
8484

8585
- uses: Swatinem/rust-cache@v2
8686
- run: cargo integ-test
87+
88+
docker-integ-tests:
89+
name: Docker integ tests
90+
env:
91+
TEMPORAL_CLOUD_ADDRESS: https://${{ vars.TEMPORAL_CLIENT_NAMESPACE }}.tmprl.cloud:7233
92+
TEMPORAL_CLOUD_NAMESPACE: ${{ vars.TEMPORAL_CLIENT_NAMESPACE }}
93+
TEMPORAL_CLIENT_CERT: ${{ secrets.TEMPORAL_CLIENT_CERT }}
94+
TEMPORAL_CLIENT_KEY: ${{ secrets.TEMPORAL_CLIENT_KEY }}
95+
DOCKER_PROMETHEUS_RUNNING: true
96+
timeout-minutes: 20
97+
runs-on: ubuntu-latest
98+
steps:
99+
- uses: actions/checkout@v4
100+
- uses: dtolnay/rust-toolchain@stable
101+
with:
102+
toolchain: 1.80.0
103+
- name: Install protoc
104+
uses: arduino/setup-protoc@v3
105+
with:
106+
# TODO: Upgrade proto once https://github.com/arduino/setup-protoc/issues/99 is fixed
107+
version: '23.x'
108+
repo-token: ${{ secrets.GITHUB_TOKEN }}
109+
- name: Start container for otel-collector and prometheus
110+
uses: hoverkraft-tech/compose-action@v2.0.1
111+
with:
112+
compose-file: ./docker/docker-compose-ci.yaml
113+
- uses: Swatinem/rust-cache@v2
114+
- run: cargo integ-test test_docker_

core-api/src/telemetry.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ pub struct OtelCollectorOptions {
7171
/// Overrides for histogram buckets. Units depend on the value of `use_seconds_for_durations`.
7272
#[builder(default)]
7373
pub histogram_bucket_overrides: HistogramBucketOverrides,
74+
/// Protocol to use for communication with the collector
75+
#[builder(default = "OtlpProtocol::Grpc")]
76+
pub protocol: OtlpProtocol,
7477
}
7578

7679
/// Options for exporting metrics to Prometheus
@@ -143,6 +146,15 @@ pub enum MetricTemporality {
143146
Delta,
144147
}
145148

149+
/// Protocol used to communicate with the OTel collector
150+
#[derive(Debug, Clone, Copy)]
151+
pub enum OtlpProtocol {
152+
/// Use gRPC to communicate with the collector
153+
Grpc,
154+
/// Use HTTP to communicate with the collector
155+
Http,
156+
}
157+
146158
impl Default for TelemetryOptions {
147159
fn default() -> Self {
148160
TelemetryOptionsBuilder::default().build().unwrap()

core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ lru = "0.12"
4444
mockall = "0.13"
4545
opentelemetry = { workspace = true, features = ["metrics"], optional = true }
4646
opentelemetry_sdk = { version = "0.26", features = ["rt-tokio", "metrics"], optional = true }
47-
opentelemetry-otlp = { version = "0.26", features = ["tokio", "metrics", "tls"], optional = true }
47+
opentelemetry-otlp = { version = "0.26", features = ["tokio", "metrics", "tls", "http-proto", "reqwest-client",], optional = true }
4848
opentelemetry-prometheus = { git = "https://github.com/open-telemetry/opentelemetry-rust.git", rev = "e911383", optional = true }
4949
parking_lot = { version = "0.12", features = ["send_guard"] }
5050
pid = "4.0"

core/src/telemetry/otel.rs

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ use temporal_sdk_core_api::telemetry::{
3030
CoreMeter, Counter, Gauge, GaugeF64, Histogram, HistogramDuration, HistogramF64,
3131
MetricAttributes, MetricParameters, NewAttributes,
3232
},
33-
HistogramBucketOverrides, MetricTemporality, OtelCollectorOptions, PrometheusExporterOptions,
33+
HistogramBucketOverrides, MetricTemporality, OtelCollectorOptions, OtlpProtocol,
34+
PrometheusExporterOptions,
3435
};
3536
use tokio::task::AbortHandle;
3637
use tonic::{metadata::MetadataMap, transport::ClientTlsConfig};
@@ -120,16 +121,26 @@ pub(super) fn augment_meter_provider_with_defaults(
120121
pub fn build_otlp_metric_exporter(
121122
opts: OtelCollectorOptions,
122123
) -> Result<CoreOtelMeter, anyhow::Error> {
123-
let mut exporter =
124-
opentelemetry_otlp::TonicExporterBuilder::default().with_endpoint(opts.url.to_string());
125-
if opts.url.scheme() == "https" || opts.url.scheme() == "grpcs" {
126-
exporter = exporter.with_tls_config(ClientTlsConfig::new().with_native_roots());
127-
}
128-
let exporter = exporter
129-
.with_metadata(MetadataMap::from_headers((&opts.headers).try_into()?))
130-
.build_metrics_exporter(Box::new(metric_temporality_to_selector(
131-
opts.metric_temporality,
132-
)))?;
124+
let exporter = match opts.protocol {
125+
OtlpProtocol::Grpc => {
126+
let mut exporter = opentelemetry_otlp::TonicExporterBuilder::default()
127+
.with_endpoint(opts.url.to_string());
128+
if opts.url.scheme() == "https" || opts.url.scheme() == "grpcs" {
129+
exporter = exporter.with_tls_config(ClientTlsConfig::new().with_native_roots());
130+
}
131+
exporter
132+
.with_metadata(MetadataMap::from_headers((&opts.headers).try_into()?))
133+
.build_metrics_exporter(Box::new(metric_temporality_to_selector(
134+
opts.metric_temporality,
135+
)))?
136+
}
137+
OtlpProtocol::Http => opentelemetry_otlp::HttpExporterBuilder::default()
138+
.with_endpoint(opts.url.to_string())
139+
.with_headers(opts.headers)
140+
.build_metrics_exporter(Box::new(metric_temporality_to_selector(
141+
opts.metric_temporality,
142+
)))?,
143+
};
133144
let reader = PeriodicReader::builder(exporter, runtime::Tokio)
134145
.with_interval(opts.metric_periodicity)
135146
.build();

docker/docker-compose-ci.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
version: '3.5'
2+
3+
services:
4+
otel-collector:
5+
image: otel/opentelemetry-collector:latest
6+
command: [ '--config=/etc/otel-collector-ci.yaml' ]
7+
volumes:
8+
- ../etc/otel-collector-ci.yaml:/etc/otel-collector-ci.yaml
9+
ports:
10+
# - "1888:1888" # pprof extension
11+
# It's useful to be able to manually inspect metrics during dev
12+
- '8888:8888' # Prometheus metrics exposed by the collector
13+
- '8889:8889' # Prometheus exporter metrics
14+
# - "13133:13133" # health_check extension
15+
- '4317:4317' # OTLP gRPC receiver
16+
- '4318:4318' # OTLP HTTP receiver
17+
# - "55679:55679" # zpages extension
18+
19+
prometheus:
20+
container_name: prometheus
21+
image: prom/prometheus:latest
22+
volumes:
23+
- ../etc/prometheus.yaml:/etc/prometheus/prometheus.yml
24+
ports:
25+
- '9090:9090'

etc/otel-collector-ci.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
receivers:
2+
otlp:
3+
protocols:
4+
grpc:
5+
endpoint: 0.0.0.0:4317
6+
http:
7+
endpoint: 0.0.0.0:4318
8+
9+
exporters:
10+
prometheus:
11+
endpoint: '0.0.0.0:8889'
12+
namespace: temporal_sdk
13+
14+
debug:
15+
16+
processors:
17+
batch:
18+
19+
extensions:
20+
health_check:
21+
pprof:
22+
endpoint: :1888
23+
zpages:
24+
endpoint: :55679
25+
26+
service:
27+
extensions: [ pprof, zpages, health_check ]
28+
pipelines:
29+
traces:
30+
receivers: [ otlp ]
31+
processors: [ batch ]
32+
exporters: [ debug, ]
33+
metrics:
34+
receivers: [ otlp ]
35+
processors: [ batch ]
36+
exporters: [ debug, prometheus ]

etc/otel-collector-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ exporters:
1111
endpoint: '0.0.0.0:8889'
1212
namespace: temporal_sdk
1313

14-
logging:
14+
debug:
1515

1616
otlp/jaeger:
1717
endpoint: jaeger:14250
@@ -32,8 +32,8 @@ service:
3232
traces:
3333
receivers: [ otlp ]
3434
processors: [ batch ]
35-
exporters: [ logging, otlp/jaeger ]
35+
exporters: [ debug, otlp/jaeger ]
3636
metrics:
3737
receivers: [ otlp ]
3838
processors: [ batch ]
39-
exporters: [ logging, prometheus ]
39+
exporters: [ debug, prometheus ]

etc/prometheus.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
scrape_configs:
22
- job_name: 'otel-collector'
3-
scrape_interval: 3s
3+
scrape_interval: 1s
44
static_configs:
55
- targets: ['otel-collector:8889']
66
- targets: ['otel-collector:8888']

test-utils/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ pub static SEARCH_ATTR_INT: &str = "CustomIntField";
8080
pub const OTEL_URL_ENV_VAR: &str = "TEMPORAL_INTEG_OTEL_URL";
8181
/// If set, enable direct scraping of prom metrics on the specified port
8282
pub const PROM_ENABLE_ENV_VAR: &str = "TEMPORAL_INTEG_PROM_PORT";
83+
/// URL of the Prometheus HTTP query API. The port should match the prometheus port exposed in docker-compose-ci.yaml
84+
pub const PROMETHEUS_QUERY_API: &str = "http://localhost:9090/api/v1/query";
8385
#[macro_export]
8486
macro_rules! prost_dur {
8587
($dur_call:ident $args:tt) => {

tests/integ_tests/metrics_tests.rs

Lines changed: 87 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use anyhow::anyhow;
22
use assert_matches::assert_matches;
3+
use std::string::ToString;
34
use std::{collections::HashMap, env, net::SocketAddr, sync::Arc, time::Duration};
45
use temporal_client::{
56
WorkflowClientTrait, WorkflowOptions, WorkflowService, REQUEST_LATENCY_HISTOGRAM_NAME,
@@ -15,8 +16,9 @@ use temporal_sdk_core::{
1516
use temporal_sdk_core_api::{
1617
telemetry::{
1718
metrics::{CoreMeter, MetricAttributes, MetricParameters},
18-
HistogramBucketOverrides, OtelCollectorOptionsBuilder, PrometheusExporterOptions,
19-
PrometheusExporterOptionsBuilder, TelemetryOptions, TelemetryOptionsBuilder,
19+
HistogramBucketOverrides, OtelCollectorOptionsBuilder, OtlpProtocol,
20+
PrometheusExporterOptions, PrometheusExporterOptionsBuilder, TelemetryOptions,
21+
TelemetryOptionsBuilder,
2022
},
2123
worker::WorkerConfigBuilder,
2224
Worker,
@@ -43,6 +45,7 @@ use temporal_sdk_core_protos::{
4345
};
4446
use temporal_sdk_core_test_utils::{
4547
get_integ_server_options, get_integ_telem_options, CoreWfStarter, NAMESPACE, OTEL_URL_ENV_VAR,
48+
PROMETHEUS_QUERY_API,
4649
};
4750
use tokio::{join, sync::Barrier, task::AbortHandle};
4851
use url::Url;
@@ -651,6 +654,88 @@ async fn request_fail_codes_otel() {
651654
}
652655
}
653656

657+
// Tests that rely on Prometheus running in a docker container must have names starting
658+
// with `docker_`. They return early unless the `DOCKER_PROMETHEUS_RUNNING` env variable is set.
659+
#[rstest::rstest]
660+
#[tokio::test]
661+
async fn docker_metrics_with_prometheus(
662+
#[values(
663+
("http://localhost:4318/v1/metrics", OtlpProtocol::Http),
664+
("http://localhost:4317", OtlpProtocol::Grpc)
665+
)]
666+
otel_collector: (&str, OtlpProtocol),
667+
) {
668+
if std::env::var("DOCKER_PROMETHEUS_RUNNING").is_err() {
669+
return;
670+
}
671+
let (otel_collector_addr, otel_protocol) = otel_collector;
672+
let test_uid = format!(
673+
"test_{}_",
674+
uuid::Uuid::new_v4().to_string().replace("-", "")
675+
);
676+
677+
// Configure the OTLP exporter with the protocol under test (HTTP or gRPC)
678+
let opts = OtelCollectorOptionsBuilder::default()
679+
.url(otel_collector_addr.parse().unwrap())
680+
.protocol(otel_protocol)
681+
.global_tags(HashMap::from([("test_id".to_string(), test_uid.clone())]))
682+
.build()
683+
.unwrap();
684+
let exporter = Arc::new(build_otlp_metric_exporter(opts).unwrap());
685+
let telemopts = TelemetryOptionsBuilder::default()
686+
.metrics(exporter as Arc<dyn CoreMeter>)
687+
.metric_prefix(test_uid.clone())
688+
.build()
689+
.unwrap();
690+
let rt = CoreRuntime::new_assume_tokio(telemopts).unwrap();
691+
let test_name = "docker_metrics_with_prometheus";
692+
let mut starter = CoreWfStarter::new_with_runtime(test_name, rt);
693+
let worker = starter.get_worker().await;
694+
starter.start_wf().await;
695+
696+
// Immediately finish the workflow
697+
let task = worker.poll_workflow_activation().await.unwrap();
698+
worker
699+
.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
700+
task.run_id,
701+
CompleteWorkflowExecution { result: None }.into(),
702+
))
703+
.await
704+
.unwrap();
705+
706+
let client = starter.get_client().await;
707+
client.list_namespaces().await.unwrap();
708+
709+
// Give Prometheus time to scrape metrics
710+
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
711+
712+
// Query Prometheus API for metrics
713+
let client = reqwest::Client::new();
714+
let query = format!("temporal_sdk_{}num_pollers", test_uid.clone());
715+
let response = client
716+
.get(PROMETHEUS_QUERY_API)
717+
.query(&[("query", query)])
718+
.send()
719+
.await
720+
.unwrap()
721+
.json::<serde_json::Value>()
722+
.await
723+
.unwrap();
724+
725+
// Validate the Prometheus response
726+
if let Some(data) = response["data"]["result"].as_array() {
727+
assert!(!data.is_empty(), "No metrics found for query: {test_uid}");
728+
assert_eq!(data[0]["metric"]["exported_job"], "temporal-core-sdk");
729+
assert_eq!(data[0]["metric"]["job"], "otel-collector");
730+
assert!(data[0]["metric"]["task_queue"]
731+
.as_str()
732+
.unwrap()
733+
.starts_with(test_name));
734+
} else {
735+
panic!("Invalid Prometheus response: {:?}", response);
736+
}
737+
}
738+
654739
#[tokio::test]
655740
async fn activity_metrics() {
656741
let (telemopts, addr, _aborter) = prom_metrics(None);

0 commit comments

Comments
 (0)