Skip to content

Commit c0a5022

Browse files
committed
Add automatic TLS certificate reloading for EPP
Enables the server to reload certificates without a restart when they are rotated, which is particularly useful in Kubernetes environments where certificate rotation is automated. Adds the --enable-cert-reload flag (default: false) to control this behavior. Uses file watching with debouncing to handle rapid file system events during certificate updates. Signed-off-by: Pierangelo Di Pilato <[email protected]>
1 parent 0fc7cfb commit c0a5022

File tree

6 files changed

+480
-6
lines changed

6 files changed

+480
-6
lines changed

cmd/epp/runner/runner.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ var (
106106
certPath = flag.String("cert-path", runserver.DefaultCertPath, "The path to the certificate for secure serving. The certificate and private key files "+
107107
"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
108108
"then a self-signed certificate is used.")
109+
enableCertReload = flag.Bool("enable-cert-reload", runserver.DefaultCertReload, "Enables certificate reloading of the certificates specified in --cert-path")
109110
// metric flags
110111
totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.")
111112
kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
@@ -345,6 +346,7 @@ func (r *Runner) Run(ctx context.Context) error {
345346
SecureServing: *secureServing,
346347
HealthChecking: *healthChecking,
347348
CertPath: *certPath,
349+
EnableCertReload: *enableCertReload,
348350
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
349351
MetricsStalenessThreshold: *metricsStalenessThreshold,
350352
Director: director,

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/cespare/xxhash/v2 v2.3.0
77
github.com/elastic/crd-ref-docs v0.2.0
88
github.com/envoyproxy/go-control-plane/envoy v1.35.0
9+
github.com/fsnotify/fsnotify v1.9.0
910
github.com/go-logr/logr v1.4.3
1011
github.com/google/go-cmp v0.7.0
1112
github.com/google/uuid v1.6.0
@@ -58,7 +59,6 @@ require (
5859
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
5960
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
6061
github.com/felixge/httpsnoop v1.0.4 // indirect
61-
github.com/fsnotify/fsnotify v1.9.0 // indirect
6262
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
6363
github.com/go-logr/stdr v1.2.2 // indirect
6464
github.com/go-logr/zapr v1.3.0 // indirect

pkg/common/certs.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
20+
"context"
21+
"crypto/tls"
22+
"fmt"
23+
"sync/atomic"
24+
"time"
25+
26+
"github.com/fsnotify/fsnotify"
27+
"sigs.k8s.io/controller-runtime/pkg/log"
28+
29+
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
30+
)
31+
32+
// debounceDelay is how long to wait after the last relevant file system event
// before reloading the certificate, so that a burst of events triggers a
// single reload.
33+
const debounceDelay = 250 * time.Millisecond
34+
35+
// CertReloader holds a TLS certificate behind an atomic pointer so that it can
// be swapped by one goroutine and read by others without extra locking.
type CertReloader struct {
36+
// cert points at the certificate currently in effect. It is stored into by
// the reload callback set up in NewCertReloader and loaded by Get.
cert *atomic.Pointer[tls.Certificate]
37+
}
38+
39+
func NewCertReloader(ctx context.Context, path string, init *tls.Certificate) (*CertReloader, error) {
40+
certPtr := &atomic.Pointer[tls.Certificate]{}
41+
certPtr.Store(init)
42+
43+
w, err := fsnotify.NewWatcher()
44+
if err != nil {
45+
return nil, fmt.Errorf("failed to create cert watcher: %w", err)
46+
}
47+
48+
logger := log.FromContext(ctx).
49+
WithName("cert-reloader").
50+
WithValues("path", path)
51+
traceLogger := logger.V(logutil.TRACE)
52+
53+
go func() {
54+
defer w.Close()
55+
56+
var debounceTimer *time.Timer
57+
58+
for {
59+
select {
60+
case ev := <-w.Events:
61+
traceLogger.Info("Cert changed", "event", ev)
62+
63+
if ev.Op&(fsnotify.Write|fsnotify.Create) == 0 {
64+
continue
65+
}
66+
67+
// Debounce: reset the timer if we get another event
68+
if debounceTimer != nil {
69+
debounceTimer.Stop()
70+
}
71+
72+
debounceTimer = time.AfterFunc(debounceDelay, func() {
73+
// This runs after the delay with no new events
74+
cert, err := tls.LoadX509KeyPair(path+"/tls.crt", path+"/tls.key")
75+
if err != nil {
76+
logger.Error(err, "Failed to reload TLS certificate")
77+
return
78+
}
79+
certPtr.Store(&cert)
80+
traceLogger.Info("Reloaded TLS certificate")
81+
})
82+
83+
case err := <-w.Errors:
84+
if err != nil {
85+
logger.Error(err, "cert watcher failed")
86+
}
87+
case <-ctx.Done():
88+
return
89+
}
90+
}
91+
}()
92+
93+
if err := w.Add(path); err != nil {
94+
return nil, fmt.Errorf("failed to watch %q: %w", path, err)
95+
}
96+
97+
return &CertReloader{cert: certPtr}, nil
98+
}
99+
100+
func (r *CertReloader) Get() *tls.Certificate {
101+
return r.cert.Load()
102+
}

0 commit comments

Comments
 (0)