Skip to content

Commit 48435b0

Browse files
committed
Restart when SystemCertPool should change
This fixes a downstream bug There was a problem downstream where the OpenShift servivce-ca was not yet available, and due to the way the manifests were set up, the service-ca was considered to be part of the SystemCertPool. The problem is that the SystemCertPool, once initialized, will never reload itself. We can get into this situation when we use SSL_CERT_DIR and SSL_CERT_FILE to provide OpenShift CAs to be used by containers/image for pulling. These environment variables change the source of the SystemCertPool. The CertPoolWatcher then watches these locations, and tries to update the pool it provides to the HTTPS client connecting to catalogd. But the SystemCertPool is never updated. (It did not help that there was no explicit CertPoolWatcher for the pull CAs.) I tried to fix this downstream by removing SSL_CERT_DIR, and specifying the `--pull-cas-dir` option. This means that containers/image would directly use certificates that we specify, rather than the default location. But this breaks the use of custom CAs for local image registries. The containers/image package does not provide a way to manipulate the certificate locations beyond a simple directory setting, and we need to leave that directory setting as the default in downstream because it (i.e. /etc/docker/certs.d) is a host- mounted directory that contains certificates for local image registries. And it is possible to configure a custom CA for a local image registry, so that directory must be included, ALONG with the OpenShift provided CAs and service-ca, which is defined by SSL_CERT_DIR. But because of the use of SSL_CERT_DIR to include the OpenShift service-ca, if the service-ca was not available at startup, but became available later, it was not possible to reload the SystemCertPool. Which could cause problems in operator-controller when it tried to connect to catalogd. The fundamental problem is that there's no way to refresh the SystemCertPool, which will become more and more of an issue as certificate lifetimes decrease. Using SSL_CERT_DIR allows us to use the CertPoolWatcher to notice changes to the SystemCertPool. This will allow us to restart the process when certificates change (e.g. OpenShift service-ca becomes available). Changes: * Update CertPoolWatcher to restart on changes to SSL_CERT_DIR and SSL_CERT_FILE * Update CertPoolWatcher to use a Runnable interface, so that it can be added to the manager, and started later, which may improve the changes that the service-ca is ready. * Update CertPoolWatcher to not be created when there's nothing to watch. * Add CertPoolWatcher to catalogd for pull CAs * Add CertPoolWatcher to operator-controller for pull CAs With this, my downstream manifest change should be reverted. Signed-off-by: Todd Short <[email protected]> Assisted-by: Claude Code (alternatives)
1 parent d95f426 commit 48435b0

File tree

4 files changed

+259
-62
lines changed

4 files changed

+259
-62
lines changed

cmd/catalogd/main.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ import (
6060
"github.com/operator-framework/operator-controller/internal/catalogd/webhook"
6161
sharedcontrollers "github.com/operator-framework/operator-controller/internal/shared/controllers"
6262
fsutil "github.com/operator-framework/operator-controller/internal/shared/util/fs"
63+
httputil "github.com/operator-framework/operator-controller/internal/shared/util/http"
6364
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
6465
"github.com/operator-framework/operator-controller/internal/shared/util/pullsecretcache"
6566
sautil "github.com/operator-framework/operator-controller/internal/shared/util/sa"
@@ -291,6 +292,17 @@ func run(ctx context.Context) error {
291292
return err
292293
}
293294

295+
// This watches the pullCasDir and the SSL_CERT_DIR, and SSL_CERT_FILE for changes
296+
cpwPull, err := httputil.NewCertPoolWatcher(cfg.pullCasDir, ctrl.Log.WithName("pull-ca-pool"))
297+
if err != nil {
298+
setupLog.Error(err, "unable to create pull-ca-pool watcher")
299+
return err
300+
}
301+
if err = mgr.Add(cpwPull); err != nil {
302+
setupLog.Error(err, "unable to add pull-ca-pool watcher to manager")
303+
return err
304+
}
305+
294306
if cfg.systemNamespace == "" {
295307
cfg.systemNamespace = podNamespace()
296308
}

cmd/operator-controller/main.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,24 @@ func run() error {
336336
return err
337337
}
338338

339-
certPoolWatcher, err := httputil.NewCertPoolWatcher(cfg.catalogdCasDir, ctrl.Log.WithName("cert-pool"))
339+
cpwCatalogd, err := httputil.NewCertPoolWatcher(cfg.catalogdCasDir, ctrl.Log.WithName("catalogd-ca-pool"))
340340
if err != nil {
341-
setupLog.Error(err, "unable to create CA certificate pool")
341+
setupLog.Error(err, "unable to create catalogd-ca-pool watcher")
342+
return err
343+
}
344+
if err = mgr.Add(cpwCatalogd); err != nil {
345+
setupLog.Error(err, "unable to add catalogd-ca-pool watcher to manager")
346+
return err
347+
}
348+
349+
// This watches the pullCasDir and the SSL_CERT_DIR, and SSL_CERT_FILE for changes
350+
cpwPull, err := httputil.NewCertPoolWatcher(cfg.pullCasDir, ctrl.Log.WithName("pull-ca-pool"))
351+
if err != nil {
352+
setupLog.Error(err, "unable to create pull-ca-pool watcher")
353+
return err
354+
}
355+
if err = mgr.Add(cpwPull); err != nil {
356+
setupLog.Error(err, "unable to add pull-ca-pool watcher to manager")
342357
return err
343358
}
344359

@@ -392,7 +407,7 @@ func run() error {
392407
}
393408
catalogClientBackend := cache.NewFilesystemCache(catalogsCachePath)
394409
catalogClient := catalogclient.New(catalogClientBackend, func() (*http.Client, error) {
395-
return httputil.BuildHTTPClient(certPoolWatcher)
410+
return httputil.BuildHTTPClient(cpwCatalogd)
396411
})
397412

398413
resolver := &resolve.CatalogResolver{

internal/shared/util/http/certpoolwatcher.go

Lines changed: 97 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package http
22

33
import (
4+
"context"
45
"crypto/x509"
56
"fmt"
67
"os"
@@ -14,13 +15,15 @@ import (
1415
)
1516

1617
type CertPoolWatcher struct {
17-
generation int
18-
dir string
19-
mx sync.RWMutex
20-
pool *x509.CertPool
21-
log logr.Logger
22-
watcher *fsnotify.Watcher
23-
done chan bool
18+
generation int
19+
dir string
20+
sslCertPaths []string
21+
mx sync.RWMutex
22+
pool *x509.CertPool
23+
log logr.Logger
24+
watcher *fsnotify.Watcher
25+
done chan bool
26+
restart func(int)
2427
}
2528

2629
// Returns the current CertPool and the generation number
@@ -33,75 +36,111 @@ func (cpw *CertPoolWatcher) Get() (*x509.CertPool, int, error) {
3336
return cpw.pool.Clone(), cpw.generation, nil
3437
}
3538

36-
func (cpw *CertPoolWatcher) Done() {
37-
cpw.done <- true
39+
// Change the restart behavior
40+
// The default is to os.Exit(0) when a change to SSL_CERT_DIR or SSL_CERT_FILE
41+
// is detected, this allows a regeneration of the SystemCertPool.
42+
func (cpw *CertPoolWatcher) Restart(f func(int)) {
43+
cpw.restart = f
3844
}
3945

40-
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
41-
pool, err := NewCertPool(caDir, log)
42-
if err != nil {
43-
return nil, err
46+
// Indicate that you're done with the CertPoolWatcher so it can terminate
47+
// the watcher go func
48+
func (cpw *CertPoolWatcher) Done() {
49+
if cpw.watcher != nil {
50+
cpw.done <- true
4451
}
45-
watcher, err := fsnotify.NewWatcher()
52+
}
53+
54+
func (cpw *CertPoolWatcher) Start(ctx context.Context) error {
55+
var err error
56+
cpw.pool, err = NewCertPool(cpw.dir, cpw.log)
4657
if err != nil {
47-
return nil, err
58+
return err
4859
}
4960

50-
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
51-
// specified, this means that we have some control over the system root
52-
// location, thus they may change, thus we should watch those locations.
53-
sslCertDir := os.Getenv("SSL_CERT_DIR")
54-
sslCertFile := os.Getenv("SSL_CERT_FILE")
55-
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
61+
watchPaths := append(cpw.sslCertPaths, cpw.dir)
62+
watchPaths = slices.DeleteFunc(watchPaths, deleteEmptyStrings)
5663

57-
watchPaths := strings.Split(sslCertDir, ":")
58-
watchPaths = append(watchPaths, caDir, sslCertFile)
59-
watchPaths = slices.DeleteFunc(watchPaths, func(p string) bool {
60-
if p == "" {
61-
return true
62-
}
63-
if _, err := os.Stat(p); err != nil {
64-
return true
65-
}
66-
return false
67-
})
64+
// Nothing was configured to be watched, which means this is
65+
// using the SystemCertPool, so we still need to no error out
66+
if len(watchPaths) == 0 {
67+
cpw.log.Info("No paths to watch")
68+
return nil
69+
}
70+
71+
cpw.watcher, err = fsnotify.NewWatcher()
72+
if err != nil {
73+
return err
74+
}
6875

6976
for _, p := range watchPaths {
70-
if err := watcher.Add(p); err != nil {
71-
return nil, err
77+
if err := cpw.watcher.Add(p); err != nil {
78+
cpw.watcher = nil
79+
return err
7280
}
73-
logPath(p, "watching certificate", log)
81+
logPath(p, "watching certificate", cpw.log)
7482
}
7583

76-
cpw := &CertPoolWatcher{
77-
generation: 1,
78-
dir: caDir,
79-
pool: pool,
80-
log: log,
81-
watcher: watcher,
82-
done: make(chan bool),
83-
}
8484
go func() {
8585
for {
8686
select {
87-
case <-watcher.Events:
87+
case e := <-cpw.watcher.Events:
88+
cpw.checkForRestart(e.Name)
8889
cpw.drainEvents()
8990
cpw.update()
90-
case err := <-watcher.Errors:
91-
log.Error(err, "error watching certificate dir")
91+
case err := <-cpw.watcher.Errors:
92+
cpw.log.Error(err, "error watching certificate dir")
9293
os.Exit(1)
94+
case <-ctx.Done():
95+
cpw.Done()
9396
case <-cpw.done:
94-
err := watcher.Close()
97+
err := cpw.watcher.Close()
9598
if err != nil {
96-
log.Error(err, "error closing watcher")
99+
cpw.log.Error(err, "error closing watcher")
97100
}
98101
return
99102
}
100103
}
101104
}()
105+
return nil
106+
}
107+
108+
func NewCertPoolWatcher(caDir string, log logr.Logger) (*CertPoolWatcher, error) {
109+
// If the SSL_CERT_DIR or SSL_CERT_FILE environment variables are
110+
// specified, this means that we have some control over the system root
111+
// location, thus they may change, thus we should watch those locations.
112+
//
113+
// BECAUSE THE SYSTEM POOL WILL NOT UPDATE, WE HAVE TO RESTART IF THERE
114+
// CHANGES TO EITHER OF THESE LOCATIONS: SSL_CERT_DIR, SSL_CERT_FILE
115+
//
116+
sslCertDir := os.Getenv("SSL_CERT_DIR")
117+
sslCertFile := os.Getenv("SSL_CERT_FILE")
118+
log.V(defaultLogLevel).Info("SSL environment", "SSL_CERT_DIR", sslCertDir, "SSL_CERT_FILE", sslCertFile)
119+
120+
sslCertPaths := append(strings.Split(sslCertDir, ":"), sslCertFile)
121+
sslCertPaths = slices.DeleteFunc(sslCertPaths, deleteEmptyStrings)
122+
123+
cpw := &CertPoolWatcher{
124+
generation: 1,
125+
dir: caDir,
126+
sslCertPaths: sslCertPaths,
127+
log: log,
128+
done: make(chan bool),
129+
restart: os.Exit,
130+
}
102131
return cpw, nil
103132
}
104133

134+
func deleteEmptyStrings(p string) bool {
135+
if p == "" {
136+
return true
137+
}
138+
if _, err := os.Stat(p); err != nil {
139+
return true
140+
}
141+
return false
142+
}
143+
105144
func (cpw *CertPoolWatcher) update() {
106145
cpw.log.Info("updating certificate pool")
107146
pool, err := NewCertPool(cpw.dir, cpw.log)
@@ -115,6 +154,14 @@ func (cpw *CertPoolWatcher) update() {
115154
cpw.generation++
116155
}
117156

157+
func (cpw *CertPoolWatcher) checkForRestart(name string) {
158+
for _, p := range cpw.sslCertPaths {
159+
if strings.Contains(name, p) {
160+
cpw.restart(0)
161+
}
162+
}
163+
}
164+
118165
// Drain as many events as possible before doing anything
119166
// Otherwise, we will be hit with an event for _every_ entry in the
120167
// directory, and end up doing an update for each one
@@ -124,7 +171,8 @@ func (cpw *CertPoolWatcher) drainEvents() {
124171
select {
125172
case <-drainTimer.C:
126173
return
127-
case <-cpw.watcher.Events:
174+
case e := <-cpw.watcher.Events:
175+
cpw.checkForRestart(e.Name)
128176
}
129177
if !drainTimer.Stop() {
130178
<-drainTimer.C

0 commit comments

Comments
 (0)