Skip to content

Commit 7c99d15

Browse files
authored
feat(outbound): Add response metrics to policy router (#3086)
The outbound policy router includes a requests counter that measures the number of requests dispatched to each route-backend; but this does not provide visibility into success rate or response time. Before introducing timeouts and retries on outbound routes, this change introduces visibility into per-route response metrics. The route_request_statuses counters measure responses from the application's point of view. Once retries are introduced, this will provide visibility into the _effective_ success rate of each route. outbound_http_route_request_statuses_total{parent...,route...,http_status="200",error="TIMEOUT"} 0 outbound_grpc_route_request_statuses_total{parent...,route...,grpc_status="NOT_FOUND",error="TIMEOUT"} 0 A coarse histogram is introduced at this scope to track the total duration of requests dispatched to each route, covering all retries and all response stream processing: outbound_http_route_request_duration_seconds_sum{parent...,route...} 0 outbound_http_route_request_duration_seconds_count{parent...,route...} 0 outbound_http_route_request_duration_seconds_bucket{le="0.05",parent...,route...} 0 outbound_http_route_request_duration_seconds_bucket{le="0.5",parent...,route...} 0 outbound_http_route_request_duration_seconds_bucket{le="1.0",parent...,route...} 0 outbound_http_route_request_duration_seconds_bucket{le="10.0",parent...,route...} 0 outbound_http_route_request_duration_seconds_bucket{le="+Inf",parent...,route...} 0 The route_backend_response_statuses counters measure the responses from individual backends. This reflects the _actual_ success rate of each route as served by the backend services. 
outbound_http_route_backend_response_statuses_total{parent...,route...,backend...,http_status="...",error="..."} 0 outbound_grpc_route_backend_response_statuses_total{parent...,route...,backend...,grpc_status="...",error="..."} 0 A slightly more detailed histogram is introduced at this scope to track the time spent processing responses from each backend (i.e. after the request has been fully dispatched): outbound_http_route_backend_response_duration_seconds_sum{parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_count{parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="0.025",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="0.05",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="0.1",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="0.25",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="0.5",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="1.0",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="10.0",parent...,route...,backend...} 0 outbound_http_route_backend_response_duration_seconds_bucket{le="+Inf",parent...,route...,backend...} 0 Note that duration histograms omit status code labels, as they needlessly inflate metrics cardinality. The histograms that we have introduced here are generally much more constrained, as we must choose broadly applicable buckets and want to avoid cardinality explosion when many routes are used.
1 parent b310d63 commit 7c99d15

File tree

22 files changed

+2339
-149
lines changed

22 files changed

+2339
-149
lines changed

Cargo.lock

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,6 +1277,7 @@ dependencies = [
12771277
"linkerd-app-test",
12781278
"linkerd-distribute",
12791279
"linkerd-http-classify",
1280+
"linkerd-http-prom",
12801281
"linkerd-http-retry",
12811282
"linkerd-http-route",
12821283
"linkerd-identity",
@@ -1493,6 +1494,24 @@ dependencies = [
14931494
"tracing",
14941495
]
14951496

1497+
[[package]]
1498+
name = "linkerd-http-prom"
1499+
version = "0.1.0"
1500+
dependencies = [
1501+
"futures",
1502+
"http",
1503+
"http-body",
1504+
"linkerd-error",
1505+
"linkerd-http-box",
1506+
"linkerd-metrics",
1507+
"linkerd-stack",
1508+
"parking_lot",
1509+
"pin-project",
1510+
"prometheus-client",
1511+
"thiserror",
1512+
"tokio",
1513+
]
1514+
14961515
[[package]]
14971516
name = "linkerd-http-retry"
14981517
version = "0.1.0"

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ members = [
2929
"linkerd/http/classify",
3030
"linkerd/http/h2",
3131
"linkerd/http/metrics",
32+
"linkerd/http/prom",
3233
"linkerd/http/retry",
3334
"linkerd/http/route",
3435
"linkerd/identity",

linkerd/app/outbound/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ linkerd-app-core = { path = "../core" }
3535
linkerd-app-test = { path = "../test", optional = true }
3636
linkerd-distribute = { path = "../../distribute" }
3737
linkerd-http-classify = { path = "../../http/classify" }
38+
linkerd-http-prom = { path = "../../http/prom" }
3839
linkerd-http-retry = { path = "../../http/retry" }
3940
linkerd-http-route = { path = "../../http/route" }
4041
linkerd-identity = { path = "../../identity" }
@@ -49,6 +50,7 @@ linkerd-tonic-watch = { path = "../../tonic-watch" }
4950
[dev-dependencies]
5051
hyper = { version = "0.14", features = ["http1", "http2"] }
5152
linkerd-app-test = { path = "../test", features = ["client-policy"] }
53+
linkerd-http-prom = { path = "../../http/prom", features = ["test-util"] }
5254
linkerd-io = { path = "../../io", features = ["tokio-test"] }
5355
linkerd-meshtls = { path = "../../meshtls", features = ["rustls"] }
5456
linkerd-meshtls-rustls = { path = "../../meshtls/rustls", features = [

linkerd/app/outbound/src/http.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ pub struct Http<T>(T);
3232
#[derive(Clone, Debug, Default)]
3333
pub struct HttpMetrics {
3434
balancer: concrete::BalancerMetrics,
35-
http_route: policy::RouteMetrics,
36-
grpc_route: policy::RouteMetrics,
35+
http_route: policy::HttpRouteMetrics,
36+
grpc_route: policy::GrpcRouteMetrics,
3737
}
3838

3939
pub fn spawn_routes<T>(
@@ -132,12 +132,12 @@ where
132132
impl HttpMetrics {
133133
pub fn register(registry: &mut prom::Registry) -> Self {
134134
let http = registry.sub_registry_with_prefix("http");
135-
let http_route = policy::RouteMetrics::register(http.sub_registry_with_prefix("route"));
135+
let http_route = policy::HttpRouteMetrics::register(http.sub_registry_with_prefix("route"));
136136
let balancer =
137137
concrete::BalancerMetrics::register(http.sub_registry_with_prefix("balancer"));
138138

139139
let grpc = registry.sub_registry_with_prefix("grpc");
140-
let grpc_route = policy::RouteMetrics::register(grpc.sub_registry_with_prefix("route"));
140+
let grpc_route = policy::GrpcRouteMetrics::register(grpc.sub_registry_with_prefix("route"));
141141

142142
Self {
143143
balancer,

linkerd/app/outbound/src/http/logical/policy.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ mod router;
88
mod tests;
99

1010
pub use self::{
11-
route::{errors, RouteMetrics},
11+
route::{errors, GrpcRouteMetrics, HttpRouteMetrics},
1212
router::{GrpcParams, HttpParams},
1313
};
1414
pub use linkerd_proxy_client_policy::{ClientPolicy, FailureAccrual};
@@ -50,8 +50,8 @@ where
5050
/// routing configurations to route requests over cached inner backend
5151
/// services.
5252
pub(super) fn layer<N, S>(
53-
http_metrics: route::RouteMetrics,
54-
grpc_metrics: route::RouteMetrics,
53+
http_metrics: route::HttpRouteMetrics,
54+
grpc_metrics: route::GrpcRouteMetrics,
5555
) -> impl svc::Layer<N, Service = svc::ArcNewCloneHttp<Self>> + Clone
5656
where
5757
// Inner stack.

linkerd/app/outbound/src/http/logical/policy/route.rs

Lines changed: 64 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
11
use super::super::Concrete;
2-
use crate::RouteRef;
3-
use linkerd_app_core::{classify, metrics::prom, proxy::http, svc, Addr, Error, Result};
2+
use crate::{ParentRef, RouteRef};
3+
use linkerd_app_core::{classify, proxy::http, svc, Addr, Error, Result};
44
use linkerd_distribute as distribute;
55
use linkerd_http_route as http_route;
66
use linkerd_proxy_client_policy as policy;
77
use std::{fmt::Debug, hash::Hash, sync::Arc};
88

99
pub(crate) mod backend;
1010
pub(crate) mod filters;
11+
pub(crate) mod metrics;
1112

1213
pub(crate) use self::backend::{Backend, MatchedBackend};
1314
pub use self::filters::errors;
15+
use self::metrics::labels::Route as RouteLabels;
1416

15-
#[derive(Clone, Debug, Default)]
16-
pub struct RouteMetrics {
17-
backend: backend::RouteBackendMetrics,
18-
}
17+
pub use self::metrics::{GrpcRouteMetrics, HttpRouteMetrics};
1918

2019
/// A target type that includes a summary of exactly how a request was matched.
2120
/// This match state is required to apply route filters.
@@ -31,6 +30,7 @@ pub(crate) struct Matched<M, P> {
3130
pub(crate) struct Route<T, F, E> {
3231
pub(super) parent: T,
3332
pub(super) addr: Addr,
33+
pub(super) parent_ref: ParentRef,
3434
pub(super) route_ref: RouteRef,
3535
pub(super) filters: Arc<[F]>,
3636
pub(super) distribution: BackendDistribution<T, F>,
@@ -55,6 +55,11 @@ pub(crate) type Grpc<T> = MatchedRoute<
5555
pub(crate) type BackendDistribution<T, F> = distribute::Distribution<Backend<T, F>>;
5656
pub(crate) type NewDistribute<T, F, N> = distribute::NewDistribute<Backend<T, F>, (), N>;
5757

58+
pub type Metrics<R, B> = metrics::RouteMetrics<
59+
<R as metrics::MkStreamLabel>::StreamLabel,
60+
<B as metrics::MkStreamLabel>::StreamLabel,
61+
>;
62+
5863
/// Wraps errors with route metadata.
5964
#[derive(Debug, thiserror::Error)]
6065
#[error("route {}: {source}", route.0)]
@@ -64,28 +69,6 @@ struct RouteError {
6469
source: Error,
6570
}
6671

67-
// === impl RouteMetrics ===
68-
69-
impl RouteMetrics {
70-
pub fn register(reg: &mut prom::Registry) -> Self {
71-
Self {
72-
backend: backend::RouteBackendMetrics::register(
73-
reg.sub_registry_with_prefix("backend"),
74-
),
75-
}
76-
}
77-
78-
#[cfg(test)]
79-
pub(crate) fn request_count(
80-
&self,
81-
p: crate::ParentRef,
82-
r: RouteRef,
83-
b: crate::BackendRef,
84-
) -> backend::RequestCount {
85-
self.backend.request_count(p, r, b)
86-
}
87-
}
88-
8972
// === impl MatchedRoute ===
9073

9174
impl<T, M, F, E> MatchedRoute<T, M, F, E>
@@ -103,13 +86,15 @@ where
10386
// Assert that filters can be applied.
10487
Self: filters::Apply,
10588
Self: svc::Param<classify::Request>,
89+
Self: metrics::MkStreamLabel,
10690
MatchedBackend<T, M, F>: filters::Apply,
91+
MatchedBackend<T, M, F>: metrics::MkStreamLabel,
10792
{
10893
/// Builds a route stack that applies policy filters to requests and
10994
/// distributes requests over each route's backends. These [`Concrete`]
11095
/// backends are expected to be cached/shared by the inner stack.
11196
pub(crate) fn layer<N, S>(
112-
metrics: RouteMetrics,
97+
metrics: Metrics<Self, MatchedBackend<T, M, F>>,
11398
) -> impl svc::Layer<N, Service = svc::ArcNewCloneHttp<Self>> + Clone
11499
where
115100
// Inner stack.
@@ -134,10 +119,11 @@ where
134119
// consideration, so we must eagerly fail requests to prevent
135120
// leaking tasks onto the runtime.
136121
.push_on_service(svc::LoadShed::layer())
137-
// TODO(ver) attach the `E` typed failure policy to requests.
138122
.push(filters::NewApplyFilters::<Self, _, _>::layer())
139-
// Sets an optional request timeout.
140123
.push(http::NewTimeout::layer())
124+
.push(metrics::layer(&metrics.requests))
125+
// Configure a classifier to use in the endpoint stack.
126+
// FIXME(ver) move this into NewSetExtensions
141127
.push(classify::NewClassify::layer())
142128
.push(svc::NewMapErr::layer_with(|rt: &Self| {
143129
let route = rt.params.route_ref.clone();
@@ -152,18 +138,29 @@ where
152138
}
153139
}
154140

155-
impl<T: Clone, M, F, E> svc::Param<BackendDistribution<T, F>> for MatchedRoute<T, M, F, E> {
141+
impl<T: Clone, M, F, P> svc::Param<BackendDistribution<T, F>> for MatchedRoute<T, M, F, P> {
156142
fn param(&self) -> BackendDistribution<T, F> {
157143
self.params.distribution.clone()
158144
}
159145
}
160146

161-
impl<T, M, F, E> svc::Param<http::timeout::ResponseTimeout> for MatchedRoute<T, M, F, E> {
147+
impl<T: Clone, M, F, P> svc::Param<RouteLabels> for MatchedRoute<T, M, F, P> {
148+
fn param(&self) -> RouteLabels {
149+
RouteLabels(
150+
self.params.parent_ref.clone(),
151+
self.params.route_ref.clone(),
152+
)
153+
}
154+
}
155+
156+
impl<T, M, F, P> svc::Param<http::timeout::ResponseTimeout> for MatchedRoute<T, M, F, P> {
162157
fn param(&self) -> http::timeout::ResponseTimeout {
163158
http::timeout::ResponseTimeout(self.params.request_timeout)
164159
}
165160
}
166161

162+
// === impl Http ===
163+
167164
impl<T> filters::Apply for Http<T> {
168165
#[inline]
169166
fn apply_request<B>(&self, req: &mut ::http::Request<B>) -> Result<()> {
@@ -176,14 +173,30 @@ impl<T> filters::Apply for Http<T> {
176173
}
177174
}
178175

176+
impl<T> metrics::MkStreamLabel for Http<T> {
177+
type StatusLabels = metrics::labels::HttpRouteRsp;
178+
type DurationLabels = metrics::labels::Route;
179+
type StreamLabel = metrics::LabelHttpRouteRsp;
180+
181+
fn mk_stream_labeler<B>(&self, _: &::http::Request<B>) -> Option<Self::StreamLabel> {
182+
let parent = self.params.parent_ref.clone();
183+
let route = self.params.route_ref.clone();
184+
Some(metrics::LabelHttpRsp::from(metrics::labels::Route::from((
185+
parent, route,
186+
))))
187+
}
188+
}
189+
179190
impl<T> svc::Param<classify::Request> for Http<T> {
180191
fn param(&self) -> classify::Request {
181192
classify::Request::ClientPolicy(classify::ClientPolicy::Http(
182-
self.params.failure_policy.clone(),
193+
policy::http::StatusRanges::default(),
183194
))
184195
}
185196
}
186197

198+
// === impl Grpc ===
199+
187200
impl<T> filters::Apply for Grpc<T> {
188201
#[inline]
189202
fn apply_request<B>(&self, req: &mut ::http::Request<B>) -> Result<()> {
@@ -196,10 +209,24 @@ impl<T> filters::Apply for Grpc<T> {
196209
}
197210
}
198211

212+
impl<T> metrics::MkStreamLabel for Grpc<T> {
213+
type StatusLabels = metrics::labels::GrpcRouteRsp;
214+
type DurationLabels = metrics::labels::Route;
215+
type StreamLabel = metrics::LabelGrpcRouteRsp;
216+
217+
fn mk_stream_labeler<B>(&self, _: &::http::Request<B>) -> Option<Self::StreamLabel> {
218+
let parent = self.params.parent_ref.clone();
219+
let route = self.params.route_ref.clone();
220+
Some(metrics::LabelGrpcRsp::from(metrics::labels::Route::from((
221+
parent, route,
222+
))))
223+
}
224+
}
225+
199226
impl<T> svc::Param<classify::Request> for Grpc<T> {
200227
fn param(&self) -> classify::Request {
201-
classify::Request::ClientPolicy(classify::ClientPolicy::Grpc(
202-
self.params.failure_policy.clone(),
203-
))
228+
classify::Request::ClientPolicy(
229+
classify::ClientPolicy::Grpc(policy::grpc::Codes::default()),
230+
)
204231
}
205232
}

linkerd/app/outbound/src/http/logical/policy/route/backend.rs

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
use super::{super::Concrete, filters};
22
use crate::{BackendRef, ParentRef, RouteRef};
33
use linkerd_app_core::{proxy::http, svc, Error, Result};
4+
use linkerd_http_prom::record_response::MkStreamLabel;
45
use linkerd_http_route as http_route;
56
use linkerd_proxy_client_policy as policy;
67
use std::{fmt::Debug, hash::Hash, sync::Arc};
78

8-
mod count_reqs;
9-
mod metrics;
10-
11-
pub use self::count_reqs::RequestCount;
12-
pub use self::metrics::RouteBackendMetrics;
9+
pub(super) mod metrics;
1310

1411
#[derive(Debug, PartialEq, Eq, Hash)]
1512
pub(crate) struct Backend<T, F> {
@@ -25,6 +22,8 @@ pub(crate) type Http<T> =
2522
pub(crate) type Grpc<T> =
2623
MatchedBackend<T, http_route::grpc::r#match::RouteMatch, policy::grpc::Filter>;
2724

25+
pub type Metrics<T> = metrics::RouteBackendMetrics<<T as MkStreamLabel>::StreamLabel>;
26+
2827
/// Wraps errors with backend metadata.
2928
#[derive(Debug, thiserror::Error)]
3029
#[error("backend {}: {source}", backend.0)]
@@ -71,15 +70,15 @@ where
7170
F: Clone + Send + Sync + 'static,
7271
// Assert that filters can be applied.
7372
Self: filters::Apply,
74-
RouteBackendMetrics: svc::ExtractParam<RequestCount, Self>,
73+
Self: metrics::MkStreamLabel,
7574
{
7675
/// Builds a stack that applies per-route-backend policy filters over an
7776
/// inner [`Concrete`] stack.
7877
///
7978
/// This [`MatchedBackend`] must implement [`filters::Apply`] to apply these
8079
/// filters.
8180
pub(crate) fn layer<N, S>(
82-
metrics: RouteBackendMetrics,
81+
metrics: Metrics<Self>,
8382
) -> impl svc::Layer<N, Service = svc::ArcNewCloneHttp<Self>> + Clone
8483
where
8584
// Inner stack.
@@ -103,7 +102,7 @@ where
103102
)
104103
.push(filters::NewApplyFilters::<Self, _, _>::layer())
105104
.push(http::NewTimeout::layer())
106-
.push(count_reqs::NewCountRequests::layer_via(metrics.clone()))
105+
.push(metrics::layer(&metrics))
107106
.push(svc::NewMapErr::layer_with(|t: &Self| {
108107
let backend = t.params.concrete.backend_ref.clone();
109108
move |source| {
@@ -155,6 +154,21 @@ impl<T> filters::Apply for Http<T> {
155154
}
156155
}
157156

157+
impl<T> metrics::MkStreamLabel for Http<T> {
158+
type StatusLabels = metrics::labels::HttpRouteBackendRsp;
159+
type DurationLabels = metrics::labels::RouteBackend;
160+
type StreamLabel = metrics::LabelHttpRouteBackendRsp;
161+
162+
fn mk_stream_labeler<B>(&self, _: &::http::Request<B>) -> Option<Self::StreamLabel> {
163+
let parent = self.params.concrete.parent_ref.clone();
164+
let route = self.params.route_ref.clone();
165+
let backend = self.params.concrete.backend_ref.clone();
166+
Some(metrics::LabelHttpRsp::from(
167+
metrics::labels::RouteBackend::from((parent, route, backend)),
168+
))
169+
}
170+
}
171+
158172
impl<T> filters::Apply for Grpc<T> {
159173
#[inline]
160174
fn apply_request<B>(&self, req: &mut ::http::Request<B>) -> Result<()> {
@@ -165,3 +179,18 @@ impl<T> filters::Apply for Grpc<T> {
165179
filters::apply_grpc_response(&self.params.filters, rsp)
166180
}
167181
}
182+
183+
impl<T> metrics::MkStreamLabel for Grpc<T> {
184+
type StatusLabels = metrics::labels::GrpcRouteBackendRsp;
185+
type DurationLabels = metrics::labels::RouteBackend;
186+
type StreamLabel = metrics::LabelGrpcRouteBackendRsp;
187+
188+
fn mk_stream_labeler<B>(&self, _: &::http::Request<B>) -> Option<Self::StreamLabel> {
189+
let parent = self.params.concrete.parent_ref.clone();
190+
let route = self.params.route_ref.clone();
191+
let backend = self.params.concrete.backend_ref.clone();
192+
Some(metrics::LabelGrpcRsp::from(
193+
metrics::labels::RouteBackend::from((parent, route, backend)),
194+
))
195+
}
196+
}

0 commit comments

Comments
 (0)