Skip to content

Commit 07ff68c

Browse files
feat: fsnotify cert watching for live rotation (#29)
* feat: add fsnotify cert watching for live rotation

  Add CatchCertChange() to watch parent directories of PKI cert files for filesystem changes (K8s Secret volume symlink swaps). Events are debounced at 500ms before triggering config reload via the existing ReloadConfig path.

  Also fix the TODO at interface.go:336 — reloadFirewall now checks both firewall and pki config changes, ensuring firewall rules are re-evaluated when certificates are rotated.

  Closes #27

* bump go directive to 1.26.1 to fix govulncheck stdlib vulns

  Fixes GO-2026-4599, GO-2026-4600, GO-2026-4601, GO-2026-4602: crypto/x509, net/url, and os vulnerabilities in go1.25.7.

* build golangci-lint from source to support Go 1.26.1

  golangci-lint v2.5 binary was built with Go 1.25, which rejects linting code targeting Go 1.26.1. Switch to install-mode: goinstall with v2.11.2 so the binary is built using the project's Go toolchain.

---------

Co-authored-by: privsim <excaliberswake@pm.me>
1 parent 25b0ea3 commit 07ff68c

File tree

7 files changed

+312
-6
lines changed

7 files changed

+312
-6
lines changed

.github/workflows/test.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ jobs:
3333
- name: golangci-lint
3434
uses: golangci/golangci-lint-action@v9
3535
with:
36-
version: v2.5
36+
version: v2.11.2
37+
install-mode: goinstall
3738

3839
- name: Test
3940
run: make test
@@ -122,7 +123,8 @@ jobs:
122123
- name: golangci-lint
123124
uses: golangci/golangci-lint-action@v9
124125
with:
125-
version: v2.5
126+
version: v2.11.2
127+
install-mode: goinstall
126128

127129
- name: Test
128130
run: make test

config/config.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"time"
1717

1818
"dario.cat/mergo"
19+
"github.com/fsnotify/fsnotify"
1920
"github.com/sirupsen/logrus"
2021
"go.yaml.in/yaml/v3"
2122
)
@@ -143,6 +144,107 @@ func (c *C) CatchHUP(ctx context.Context) {
143144
}()
144145
}
145146

147+
// CatchCertChange watches the parent directories of the configured pki.cert, pki.key, and pki.ca
148+
// file paths for filesystem changes (e.g., K8s Secret volume atomic symlink swaps) and triggers
149+
// a config reload. Changes are debounced: after the last filesystem event, the method waits
150+
// for the specified debounce duration before reloading to coalesce rapid updates (e.g., cert and
151+
// key updated in quick succession). The watcher is stopped when ctx is cancelled.
152+
func (c *C) CatchCertChange(ctx context.Context, debounce time.Duration) {
153+
if c.path == "" {
154+
return
155+
}
156+
157+
certPaths := c.GetCertPaths()
158+
if len(certPaths) == 0 {
159+
c.l.Warn("No certificate file paths configured, cert file watching disabled")
160+
return
161+
}
162+
163+
watcher, err := fsnotify.NewWatcher()
164+
if err != nil {
165+
c.l.WithError(err).Error("Failed to create fsnotify watcher for cert files")
166+
return
167+
}
168+
169+
// Watch parent directories because K8s uses atomic symlink swaps at the
170+
// directory level, not individual file writes.
171+
watched := make(map[string]bool)
172+
for _, p := range certPaths {
173+
dir := filepath.Dir(p)
174+
absDir, err := filepath.Abs(dir)
175+
if err != nil {
176+
c.l.WithError(err).WithField("path", dir).Warn("Failed to resolve absolute path for cert directory")
177+
continue
178+
}
179+
if watched[absDir] {
180+
continue
181+
}
182+
if err := watcher.Add(absDir); err != nil {
183+
c.l.WithError(err).WithField("dir", absDir).Warn("Failed to watch cert directory")
184+
continue
185+
}
186+
watched[absDir] = true
187+
c.l.WithField("dir", absDir).Info("Watching directory for certificate changes")
188+
}
189+
190+
if len(watched) == 0 {
191+
c.l.Warn("No cert directories could be watched, cert file watching disabled")
192+
watcher.Close()
193+
return
194+
}
195+
196+
go func() {
197+
defer watcher.Close()
198+
var timer *time.Timer
199+
for {
200+
select {
201+
case <-ctx.Done():
202+
if timer != nil {
203+
timer.Stop()
204+
}
205+
return
206+
case event, ok := <-watcher.Events:
207+
if !ok {
208+
return
209+
}
210+
c.l.WithField("event", event).Debug("Cert directory fsnotify event")
211+
// Reset the debounce timer on every event
212+
if timer != nil {
213+
timer.Stop()
214+
}
215+
timer = time.AfterFunc(debounce, func() {
216+
c.l.Info("Certificate file change detected, reloading config")
217+
c.ReloadConfig()
218+
})
219+
case err, ok := <-watcher.Errors:
220+
if !ok {
221+
return
222+
}
223+
c.l.WithError(err).Error("fsnotify watcher error")
224+
}
225+
}
226+
}()
227+
}
228+
229+
// GetCertPaths returns the file paths for pki.cert, pki.key, and pki.ca from the current
230+
// config settings, filtering out inline PEM data and empty values.
231+
func (c *C) GetCertPaths() []string {
232+
keys := []string{"pki.cert", "pki.key", "pki.ca"}
233+
var paths []string
234+
for _, k := range keys {
235+
v := c.GetString(k, "")
236+
if v == "" {
237+
continue
238+
}
239+
// Skip inline PEM data and PKCS#11 URIs — only watch actual file paths
240+
if strings.Contains(v, "-----BEGIN") || strings.HasPrefix(v, "pkcs11:") {
241+
continue
242+
}
243+
paths = append(paths, v)
244+
}
245+
return paths
246+
}
247+
146248
func (c *C) ReloadConfig() {
147249
c.reloadLock.Lock()
148250
defer c.reloadLock.Unlock()

config/config_test.go

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package config
22

33
import (
4+
"context"
45
"os"
56
"path/filepath"
7+
"sync/atomic"
68
"testing"
79
"time"
810

@@ -220,3 +222,195 @@ firewall:
220222
}
221223
assert.Equal(t, expected, m)
222224
}
225+
226+
func TestConfig_GetCertPaths(t *testing.T) {
227+
l := test.NewLogger()
228+
229+
// File paths should be returned
230+
c := NewC(l)
231+
c.Settings["pki"] = map[string]any{
232+
"cert": "/etc/nebula/host.crt",
233+
"key": "/etc/nebula/host.key",
234+
"ca": "/etc/nebula/ca.crt",
235+
}
236+
paths := c.GetCertPaths()
237+
assert.Len(t, paths, 3)
238+
assert.Contains(t, paths, "/etc/nebula/host.crt")
239+
assert.Contains(t, paths, "/etc/nebula/host.key")
240+
assert.Contains(t, paths, "/etc/nebula/ca.crt")
241+
242+
// Inline PEM data should be skipped
243+
c = NewC(l)
244+
c.Settings["pki"] = map[string]any{
245+
"cert": "-----BEGIN NEBULA CERTIFICATE-----\ndata\n-----END NEBULA CERTIFICATE-----",
246+
"key": "/etc/nebula/host.key",
247+
"ca": "-----BEGIN NEBULA CERTIFICATE-----\ndata\n-----END NEBULA CERTIFICATE-----",
248+
}
249+
paths = c.GetCertPaths()
250+
assert.Len(t, paths, 1)
251+
assert.Equal(t, "/etc/nebula/host.key", paths[0])
252+
253+
// PKCS#11 URIs should be skipped
254+
c = NewC(l)
255+
c.Settings["pki"] = map[string]any{
256+
"cert": "/etc/nebula/host.crt",
257+
"key": "pkcs11:token=mytoken",
258+
"ca": "/etc/nebula/ca.crt",
259+
}
260+
paths = c.GetCertPaths()
261+
assert.Len(t, paths, 2)
262+
assert.Contains(t, paths, "/etc/nebula/host.crt")
263+
assert.Contains(t, paths, "/etc/nebula/ca.crt")
264+
265+
// Empty or missing settings should return empty
266+
c = NewC(l)
267+
paths = c.GetCertPaths()
268+
assert.Empty(t, paths)
269+
}
270+
271+
func TestConfig_CatchCertChange(t *testing.T) {
272+
l := test.NewLogger()
273+
274+
// Set up a temp directory with config and cert files
275+
configDir, err := os.MkdirTemp("", "config-certwatch-test")
276+
require.NoError(t, err)
277+
defer os.RemoveAll(configDir)
278+
279+
certDir, err := os.MkdirTemp("", "certs-certwatch-test")
280+
require.NoError(t, err)
281+
defer os.RemoveAll(certDir)
282+
283+
certPath := filepath.Join(certDir, "host.crt")
284+
keyPath := filepath.Join(certDir, "host.key")
285+
caPath := filepath.Join(certDir, "ca.crt")
286+
287+
require.NoError(t, os.WriteFile(certPath, []byte("cert-data"), 0644))
288+
require.NoError(t, os.WriteFile(keyPath, []byte("key-data"), 0644))
289+
require.NoError(t, os.WriteFile(caPath, []byte("ca-data"), 0644))
290+
291+
// Create config that references those cert paths
292+
configYaml := "pki:\n cert: " + certPath + "\n key: " + keyPath + "\n ca: " + caPath + "\n"
293+
require.NoError(t, os.WriteFile(filepath.Join(configDir, "config.yaml"), []byte(configYaml), 0644))
294+
295+
c := NewC(l)
296+
require.NoError(t, c.Load(configDir))
297+
298+
// Track reload calls via callback
299+
var reloadCount atomic.Int32
300+
c.RegisterReloadCallback(func(c *C) {
301+
reloadCount.Add(1)
302+
})
303+
304+
ctx, cancel := context.WithCancel(context.Background())
305+
defer cancel()
306+
307+
// Start watching with a short debounce for testing
308+
c.CatchCertChange(ctx, 100*time.Millisecond)
309+
310+
// Give the watcher time to start
311+
time.Sleep(50 * time.Millisecond)
312+
313+
// Modify a cert file
314+
require.NoError(t, os.WriteFile(certPath, []byte("cert-data-updated"), 0644))
315+
316+
// Wait for debounce + processing
317+
time.Sleep(500 * time.Millisecond)
318+
319+
assert.GreaterOrEqual(t, reloadCount.Load(), int32(1), "ReloadConfig should have been called at least once")
320+
321+
// Reset count and test debouncing: rapid writes should coalesce into one reload
322+
startCount := reloadCount.Load()
323+
require.NoError(t, os.WriteFile(certPath, []byte("cert-v2"), 0644))
324+
time.Sleep(20 * time.Millisecond)
325+
require.NoError(t, os.WriteFile(keyPath, []byte("key-v2"), 0644))
326+
time.Sleep(20 * time.Millisecond)
327+
require.NoError(t, os.WriteFile(caPath, []byte("ca-v2"), 0644))
328+
329+
// Wait for debounce to fire
330+
time.Sleep(500 * time.Millisecond)
331+
332+
endCount := reloadCount.Load()
333+
// The debounced rapid writes should result in at most 2 reload calls
334+
// (ideally 1, but filesystem event timing can vary)
335+
assert.LessOrEqual(t, endCount-startCount, int32(2),
336+
"Debouncing should coalesce rapid writes into few reloads")
337+
assert.GreaterOrEqual(t, endCount-startCount, int32(1),
338+
"At least one reload should occur for the rapid writes")
339+
}
340+
341+
func TestConfig_CatchCertChange_ContextCancel(t *testing.T) {
342+
l := test.NewLogger()
343+
344+
certDir, err := os.MkdirTemp("", "certs-cancel-test")
345+
require.NoError(t, err)
346+
defer os.RemoveAll(certDir)
347+
348+
certPath := filepath.Join(certDir, "host.crt")
349+
require.NoError(t, os.WriteFile(certPath, []byte("cert-data"), 0644))
350+
351+
configDir, err := os.MkdirTemp("", "config-cancel-test")
352+
require.NoError(t, err)
353+
defer os.RemoveAll(configDir)
354+
355+
configYaml := "pki:\n cert: " + certPath + "\n"
356+
require.NoError(t, os.WriteFile(filepath.Join(configDir, "config.yaml"), []byte(configYaml), 0644))
357+
358+
c := NewC(l)
359+
require.NoError(t, c.Load(configDir))
360+
361+
var reloadCount atomic.Int32
362+
c.RegisterReloadCallback(func(c *C) {
363+
reloadCount.Add(1)
364+
})
365+
366+
ctx, cancel := context.WithCancel(context.Background())
367+
c.CatchCertChange(ctx, 100*time.Millisecond)
368+
time.Sleep(50 * time.Millisecond)
369+
370+
// Cancel the context to stop the watcher
371+
cancel()
372+
time.Sleep(50 * time.Millisecond)
373+
374+
// Write to cert file after cancellation — should NOT trigger reload
375+
beforeCount := reloadCount.Load()
376+
require.NoError(t, os.WriteFile(certPath, []byte("cert-after-cancel"), 0644))
377+
time.Sleep(300 * time.Millisecond)
378+
379+
assert.Equal(t, beforeCount, reloadCount.Load(),
380+
"No reload should occur after context cancellation")
381+
}
382+
383+
func TestConfig_CatchCertChange_NoPath(t *testing.T) {
384+
l := test.NewLogger()
385+
c := NewC(l)
386+
387+
// No path set — CatchCertChange should return immediately without panic
388+
ctx, cancel := context.WithCancel(context.Background())
389+
defer cancel()
390+
c.CatchCertChange(ctx, 100*time.Millisecond)
391+
}
392+
393+
func TestConfig_CatchCertChange_InlinePEM(t *testing.T) {
394+
l := test.NewLogger()
395+
396+
configDir, err := os.MkdirTemp("", "config-inline-test")
397+
require.NoError(t, err)
398+
defer os.RemoveAll(configDir)
399+
400+
// All inline PEM — no files to watch
401+
configYaml := `pki:
402+
cert: "-----BEGIN NEBULA CERTIFICATE-----\ndata\n-----END NEBULA CERTIFICATE-----"
403+
key: "-----BEGIN NEBULA X25519 PRIVATE KEY-----\ndata\n-----END NEBULA X25519 PRIVATE KEY-----"
404+
ca: "-----BEGIN NEBULA CERTIFICATE-----\ndata\n-----END NEBULA CERTIFICATE-----"
405+
`
406+
require.NoError(t, os.WriteFile(filepath.Join(configDir, "config.yaml"), []byte(configYaml), 0644))
407+
408+
c := NewC(l)
409+
require.NoError(t, c.Load(configDir))
410+
411+
ctx, cancel := context.WithCancel(context.Background())
412+
defer cancel()
413+
414+
// Should not panic or error — just logs a warning and returns
415+
c.CatchCertChange(ctx, 100*time.Millisecond)
416+
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/slackhq/nebula
22

3-
go 1.25.7
3+
go 1.26.1
44

55
require (
66
dario.cat/mergo v1.0.2
@@ -10,6 +10,7 @@ require (
1010
github.com/cloudflare/circl v1.6.3
1111
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432
1212
github.com/flynn/noise v1.1.0
13+
github.com/fsnotify/fsnotify v1.9.0
1314
github.com/gaissmai/bart v0.26.1
1415
github.com/gogo/protobuf v1.3.2
1516
github.com/google/gopacket v1.1.19

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
2828
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
2929
github.com/flynn/noise v1.1.0 h1:KjPQoQCEFdZDiP03phOvGi11+SVVhBG2wOWAorLsstg=
3030
github.com/flynn/noise v1.1.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag=
31+
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
32+
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
3133
github.com/gaissmai/bart v0.26.1 h1:+w4rnLGNlA2GDVn382Tfe3jOsK5vOr5n4KmigJ9lbTo=
3234
github.com/gaissmai/bart v0.26.1/go.mod h1:GREWQfTLRWz/c5FTOsIw+KkscuFkIV5t8Rp7Nd1Td5c=
3335
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=

interface.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,11 +333,15 @@ func (f *Interface) reloadDisconnectInvalid(c *config.C) {
333333
}
334334

335335
func (f *Interface) reloadFirewall(c *config.C) {
336-
//TODO: need to trigger/detect if the certificate changed too
337-
if c.HasChanged("firewall") == false {
338-
f.l.Debug("No firewall config change detected")
336+
firewallChanged := c.HasChanged("firewall")
337+
certChanged := c.HasChanged("pki")
338+
if !firewallChanged && !certChanged {
339+
f.l.Debug("No firewall or certificate config change detected")
339340
return
340341
}
342+
if certChanged {
343+
f.l.Info("Certificate change detected, re-evaluating firewall rules")
344+
}
341345

342346
fw, err := NewFirewallFromConfig(f.l, f.pki.getCertState(), c)
343347
if err != nil {

main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ func Main(c *config.C, configTest bool, buildVersion string, logger *logrus.Logg
126126
var tun overlay.Device
127127
if !configTest {
128128
c.CatchHUP(ctx)
129+
c.CatchCertChange(ctx, 500*time.Millisecond)
129130

130131
if deviceFactory == nil {
131132
deviceFactory = overlay.NewDeviceFromConfig

0 commit comments

Comments
 (0)