Skip to content

Commit fbb1fb8

Browse files
author
Chao Xu
committed
add metrics and traces for egress dials
1 parent cd0057c commit fbb1fb8

File tree

2 files changed

+140
-8
lines changed

2 files changed

+140
-8
lines changed

staging/src/k8s.io/apiserver/pkg/server/egressselector/egress_selector.go

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,20 @@ import (
2222
"crypto/tls"
2323
"crypto/x509"
2424
"fmt"
25-
"google.golang.org/grpc"
2625
"io/ioutil"
27-
utilnet "k8s.io/apimachinery/pkg/util/net"
28-
"k8s.io/apiserver/pkg/apis/apiserver"
29-
"k8s.io/klog"
3026
"net"
3127
"net/http"
3228
"net/url"
33-
client "sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client"
3429
"strings"
30+
"time"
31+
32+
"google.golang.org/grpc"
33+
utilnet "k8s.io/apimachinery/pkg/util/net"
34+
"k8s.io/apiserver/pkg/apis/apiserver"
35+
egressmetrics "k8s.io/apiserver/pkg/server/egressselector/metrics"
36+
"k8s.io/klog"
37+
utiltrace "k8s.io/utils/trace"
38+
client "sigs.k8s.io/apiserver-network-proxy/konnectivity-client/pkg/client"
3539
)
3640

3741
var directDialer utilnet.DialFunc = http.DefaultTransport.(*http.Transport).DialContext
@@ -152,35 +156,57 @@ func createConnectTCPDialer(tcpTransport *apiserver.TCPTransport) (utilnet.DialF
152156
certPool = nil
153157
}
154158
contextDialer := func(ctx context.Context, network, addr string) (net.Conn, error) {
159+
trace := utiltrace.New("Proxy via HTTP Connect over TCP", utiltrace.Field{Key: "address", Value: addr})
160+
defer trace.LogIfLong(500 * time.Millisecond)
155161
klog.V(4).Infof("Sending request to %q.", addr)
162+
start := time.Now()
156163
proxyConn, err := tls.Dial("tcp", proxyAddress,
157164
&tls.Config{
158165
Certificates: []tls.Certificate{clientCerts},
159166
RootCAs: certPool,
160167
},
161168
)
162169
if err != nil {
170+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportTCP, egressmetrics.StageDial)
163171
return nil, fmt.Errorf("dialing proxy %q failed: %v", proxyAddress, err)
164172
}
165-
return tunnelHTTPConnect(proxyConn, proxyAddress, addr)
173+
ret, err := tunnelHTTPConnect(proxyConn, proxyAddress, addr)
174+
if err != nil {
175+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportTCP, egressmetrics.StageProxy)
176+
return nil, err
177+
}
178+
egressmetrics.Metrics.ObserveDialLatency(time.Since(start), egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportTCP)
179+
return ret, nil
166180
}
167181
return contextDialer, nil
168182
}
169183

170184
func createConnectUDSDialer(udsConfig *apiserver.UDSTransport) (utilnet.DialFunc, error) {
171185
contextDialer := func(ctx context.Context, network, addr string) (net.Conn, error) {
186+
trace := utiltrace.New("Proxy via HTTP Connect over UDS", utiltrace.Field{Key: "address", Value: addr})
187+
defer trace.LogIfLong(500 * time.Millisecond)
188+
start := time.Now()
172189
proxyConn, err := net.Dial("unix", udsConfig.UDSName)
173190
if err != nil {
191+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportUDS, egressmetrics.StageDial)
174192
return nil, fmt.Errorf("dialing proxy %q failed: %v", udsConfig.UDSName, err)
175193
}
176-
return tunnelHTTPConnect(proxyConn, udsConfig.UDSName, addr)
194+
ret, err := tunnelHTTPConnect(proxyConn, udsConfig.UDSName, addr)
195+
if err != nil {
196+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportUDS, egressmetrics.StageProxy)
197+
return nil, err
198+
}
199+
egressmetrics.Metrics.ObserveDialLatency(time.Since(start), egressmetrics.ProtocolHTTPConnect, egressmetrics.TransportUDS)
200+
return ret, nil
177201
}
178202
return contextDialer, nil
179203
}
180204

181205
func createGRPCUDSDialer(udsName string) (utilnet.DialFunc, error) {
182206
contextDialer := func(ctx context.Context, network, addr string) (net.Conn, error) {
183-
207+
trace := utiltrace.New("Proxy via GRPC over UDS", utiltrace.Field{Key: "address", Value: addr})
208+
defer trace.LogIfLong(500 * time.Millisecond)
209+
start := time.Now()
184210
dialOption := grpc.WithContextDialer(func(context.Context, string) (net.Conn, error) {
185211
c, err := net.Dial("unix", udsName)
186212
if err != nil {
@@ -191,13 +217,16 @@ func createGRPCUDSDialer(udsName string) (utilnet.DialFunc, error) {
191217

192218
tunnel, err := client.CreateGrpcTunnel(udsName, dialOption, grpc.WithInsecure())
193219
if err != nil {
220+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolGRPC, egressmetrics.TransportUDS, egressmetrics.StageDial)
194221
return nil, err
195222
}
196223

197224
proxyConn, err := tunnel.Dial("tcp", addr)
198225
if err != nil {
226+
egressmetrics.Metrics.ObserveDialFailure(egressmetrics.ProtocolGRPC, egressmetrics.TransportUDS, egressmetrics.StageProxy)
199227
return nil, err
200228
}
229+
egressmetrics.Metrics.ObserveDialLatency(time.Since(start), egressmetrics.ProtocolGRPC, egressmetrics.TransportUDS)
201230
return proxyConn, nil
202231
}
203232
return contextDialer, nil
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
Copyright 2017 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"time"
21+
22+
"k8s.io/component-base/metrics"
23+
"k8s.io/component-base/metrics/legacyregistry"
24+
)
25+
26+
const (
27+
namespace = "apiserver"
28+
subsystem = "egress_dialer"
29+
30+
// ProtocolHTTPConnect means that the proxy protocol is http-connect.
31+
ProtocolHTTPConnect = "http_connect"
32+
// ProtocolHTTPGRPC means that the proxy protocol is the GRPC protocol.
33+
ProtocolGRPC = "grpc"
34+
// TransportTCP means that the transport is TCP.
35+
TransportTCP = "tcp"
36+
// TransportUDS means that the transport is UDS.
37+
TransportUDS = "uds"
38+
// StageTransport indicates that the dial failed at dialing to the proxy server.
39+
StageDial = "dial"
40+
// StageProtocol indicates that the dial failed at requesting the proxy server to proxy.
41+
StageProxy = "proxy"
42+
)
43+
44+
var (
45+
// Use buckets ranging from 5 ms to 12.5 seconds.
46+
latencyBuckets = []float64{0.005, 0.025, 0.1, 0.5, 2.5, 12.5}
47+
latencySummaryMaxAge = 5 * time.Hour
48+
49+
// Metrics provides access to all dial metrics.
50+
Metrics = newDialMetrics()
51+
)
52+
53+
// DialMetrics instruments dials to proxy server with prometheus metrics.
54+
type DialMetrics struct {
55+
latencies *metrics.HistogramVec
56+
failures *metrics.CounterVec
57+
}
58+
59+
// newDialMetrics create a new DialMetrics, configured with default metric names.
60+
func newDialMetrics() *DialMetrics {
61+
latencies := metrics.NewHistogramVec(
62+
&metrics.HistogramOpts{
63+
Namespace: namespace,
64+
Subsystem: subsystem,
65+
Name: "dial_duration_seconds",
66+
Help: "Dial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)",
67+
Buckets: latencyBuckets,
68+
StabilityLevel: metrics.ALPHA,
69+
},
70+
[]string{"protocol", "transport"},
71+
)
72+
73+
failures := metrics.NewCounterVec(
74+
&metrics.CounterOpts{
75+
Namespace: namespace,
76+
Subsystem: subsystem,
77+
Name: "dial_failure_count",
78+
Help: "Dial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (dial or proxy). The stage indicates at which stage the dial failed",
79+
StabilityLevel: metrics.ALPHA,
80+
},
81+
[]string{"protocol", "transport", "stage"},
82+
)
83+
84+
legacyregistry.MustRegister(latencies)
85+
legacyregistry.MustRegister(failures)
86+
return &DialMetrics{latencies: latencies, failures: failures}
87+
}
88+
89+
// Reset resets the metrics.
90+
func (m *DialMetrics) Reset() {
91+
m.latencies.Reset()
92+
m.failures.Reset()
93+
}
94+
95+
// ObserveDialLatency records the latency of a dial, labeled by protocol, transport.
96+
func (m *DialMetrics) ObserveDialLatency(elapsed time.Duration, protocol, transport string) {
97+
m.latencies.WithLabelValues(protocol, transport).Observe(elapsed.Seconds())
98+
}
99+
100+
// ObserverDialFailure records a failed dial, labeled by protocol, transport, and the stage the dial failed at.
101+
func (m *DialMetrics) ObserveDialFailure(protocol, transport, stage string) {
102+
m.failures.WithLabelValues(protocol, transport, stage).Inc()
103+
}

0 commit comments

Comments
 (0)