Skip to content

Commit 24095e4

Browse files
authored
cmd/containerboot: serve health on local endpoint (tailscale#14246)
* cmd/containerboot: serve health on local endpoint We introduced stable (user) metrics in tailscale#14035, and `TS_LOCAL_ADDR_PORT` with it. Rather than requiring users to specify a new addr/port combination for each new local endpoint they want the container to serve, this combines the health check endpoint onto the local addr/port used by metrics if `TS_ENABLE_HEALTH_CHECK` is used instead of `TS_HEALTHCHECK_ADDR_PORT`. `TS_LOCAL_ADDR_PORT` now defaults to binding to all interfaces on 9002 so that it works more seamlessly and with less configuration in environments other than Kubernetes, where the operator always overrides the default anyway. In particular, listening on localhost would not be accessible from outside the container, and many scripted container environments do not know the IP address of the container before it's started. Listening on all interfaces allows users to just set one env var (`TS_ENABLE_METRICS` or `TS_ENABLE_HEALTH_CHECK`) to get a fully functioning local endpoint they can query from outside the container. Updates tailscale#14035, tailscale#12898 Signed-off-by: Tom Proctor <[email protected]>
1 parent a68efe2 commit 24095e4

File tree

7 files changed

+251
-66
lines changed

7 files changed

+251
-66
lines changed

cmd/containerboot/healthz.go

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ package main
77

88
import (
99
"log"
10-
"net"
1110
"net/http"
1211
"sync"
1312
)
@@ -23,29 +22,29 @@ type healthz struct {
2322
// ServeHTTP answers a health probe. While holding h's lock it writes "ok"
// when h.hasAddrs is set, and otherwise replies with an error status and the
// message "node currently has no tailscale IPs". This change switches that
// error status from 500 Internal Server Error to 503 Service Unavailable.
func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
2423
h.Lock()
2524
defer h.Unlock()
25+
2626
if h.hasAddrs {
2727
w.Write([]byte("ok"))
2828
} else {
29-
http.Error(w, "node currently has no tailscale IPs", http.StatusInternalServerError)
29+
http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
3030
}
3131
}
3232

33-
// runHealthz runs a simple HTTP health endpoint on /healthz, listening on the
34-
// provided address. A containerized tailscale instance is considered healthy if
35-
// it has at least one tailnet IP address.
36-
func runHealthz(addr string, h *healthz) {
37-
lis, err := net.Listen("tcp", addr)
38-
if err != nil {
39-
log.Fatalf("error listening on the provided health endpoint address %q: %v", addr, err)
33+
// update records the latest health state in h.hasAddrs while holding h's
// lock. It logs only when the new value differs from the stored one, so
// repeated calls with an unchanged state stay quiet.
func (h *healthz) update(healthy bool) {
34+
h.Lock()
35+
defer h.Unlock()
36+
37+
if h.hasAddrs != healthy {
38+
log.Println("Setting healthy", healthy)
4039
}
41-
mux := http.NewServeMux()
40+
h.hasAddrs = healthy
41+
}
42+
43+
// healthHandlers registers a simple health handler at /healthz.
44+
// A containerized tailscale instance is considered healthy if
45+
// it has at least one tailnet IP address.
// The handler is registered for GET requests only and starts from a
// zero-value healthz, so it reports unhealthy until update(true) is called.
// The handler is returned so the caller can report state changes; serving
// the mux is left to the caller.
46+
func healthHandlers(mux *http.ServeMux) *healthz {
47+
h := &healthz{}
4248
mux.Handle("GET /healthz", h)
43-
log.Printf("Running healthcheck endpoint at %s/healthz", addr)
44-
hs := &http.Server{Handler: mux}
45-
46-
go func() {
47-
if err := hs.Serve(lis); err != nil {
48-
log.Fatalf("failed running health endpoint: %v", err)
49-
}
50-
}()
49+
return h
5150
}

cmd/containerboot/main.go

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,17 @@
5252
// ${TS_CERT_DOMAIN}, it will be replaced with the value of the available FQDN.
5353
// It cannot be used in conjunction with TS_DEST_IP. The file is watched for changes,
5454
// and will be re-applied when it changes.
55-
// - TS_HEALTHCHECK_ADDR_PORT: if specified, an HTTP health endpoint will be
56-
// served at /healthz at the provided address, which should be in form [<address>]:<port>.
57-
// If not set, no health check will be run. If set to :<port>, addr will default to 0.0.0.0
58-
// The health endpoint will return 200 OK if this node has at least one tailnet IP address,
59-
// otherwise returns 503.
55+
// - TS_HEALTHCHECK_ADDR_PORT: deprecated, use TS_ENABLE_HEALTH_CHECK instead and optionally
56+
// set TS_LOCAL_ADDR_PORT. Will be removed in 1.82.0.
57+
// - TS_LOCAL_ADDR_PORT: the address and port to serve local metrics and health
58+
// check endpoints if enabled via TS_ENABLE_METRICS and/or TS_ENABLE_HEALTH_CHECK.
59+
// Defaults to [::]:9002, serving on all available interfaces.
60+
// - TS_ENABLE_METRICS: if true, a metrics endpoint will be served at /metrics on
61+
// the address specified by TS_LOCAL_ADDR_PORT. See https://tailscale.com/kb/1482/client-metrics
62+
// for more information on the metrics exposed.
63+
// - TS_ENABLE_HEALTH_CHECK: if true, a health check endpoint will be served at /healthz on
64+
// the address specified by TS_LOCAL_ADDR_PORT. The health endpoint will return 200
65+
// OK if this node has at least one tailnet IP address, otherwise returns 503.
6066
// NB: the health criteria might change in the future.
6167
// - TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR: if specified, a path to a
6268
// directory that contains the tailscaled config file. The config file needs to be
@@ -99,6 +105,7 @@ import (
99105
"log"
100106
"math"
101107
"net"
108+
"net/http"
102109
"net/netip"
103110
"os"
104111
"os/signal"
@@ -178,12 +185,32 @@ func main() {
178185
}
179186
defer killTailscaled()
180187

181-
if cfg.LocalAddrPort != "" && cfg.MetricsEnabled {
182-
m := &metrics{
183-
lc: client,
184-
debugEndpoint: cfg.DebugAddrPort,
188+
var healthCheck *healthz
189+
if cfg.HealthCheckAddrPort != "" {
190+
mux := http.NewServeMux()
191+
192+
log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
193+
healthCheck = healthHandlers(mux)
194+
195+
close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
196+
defer close()
197+
}
198+
199+
if cfg.localMetricsEnabled() || cfg.localHealthEnabled() {
200+
mux := http.NewServeMux()
201+
202+
if cfg.localMetricsEnabled() {
203+
log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort)
204+
metricsHandlers(mux, client, cfg.DebugAddrPort)
185205
}
186-
runMetrics(cfg.LocalAddrPort, m)
206+
207+
if cfg.localHealthEnabled() {
208+
log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
209+
healthCheck = healthHandlers(mux)
210+
}
211+
212+
close := runHTTPServer(mux, cfg.LocalAddrPort)
213+
defer close()
187214
}
188215

189216
if cfg.EnableForwardingOptimizations {
@@ -328,9 +355,6 @@ authLoop:
328355

329356
certDomain = new(atomic.Pointer[string])
330357
certDomainChanged = make(chan bool, 1)
331-
332-
h = &healthz{} // http server for the healthz endpoint
333-
healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) })
334358
)
335359
if cfg.ServeConfigPath != "" {
336360
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
@@ -556,11 +580,8 @@ runLoop:
556580
}
557581
}
558582

559-
if cfg.HealthCheckAddrPort != "" {
560-
h.Lock()
561-
h.hasAddrs = len(addrs) != 0
562-
h.Unlock()
563-
healthzRunner()
583+
if healthCheck != nil {
584+
healthCheck.update(len(addrs) != 0)
564585
}
565586
if egressSvcsNotify != nil {
566587
egressSvcsNotify <- n
@@ -751,3 +772,22 @@ func tailscaledConfigFilePath() string {
751772
log.Printf("Using tailscaled config file %q to match current capability version %d", filePath, tailcfg.CurrentCapabilityVersion)
752773
return filePath
753774
}
775+
776+
// runHTTPServer starts serving mux on addr in a background goroutine and
// returns a function that shuts the server down again.
//
// Failing to bind addr, or the server stopping for any reason other than a
// requested shutdown, is fatal to the process. The returned close function
// waits for in-flight requests to complete (it uses a background context, so
// there is no deadline) and returns nil after a clean shutdown.
func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		log.Fatalf("failed to listen on addr %q: %v", addr, err)
	}
	srv := &http.Server{Handler: mux}

	go func() {
		// Serve always returns a non-nil error. http.ErrServerClosed only means
		// the close function was called, which must not take the process down.
		if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Fatalf("failed running server: %v", err)
		}
	}()

	return func() error {
		err := srv.Shutdown(context.Background())
		// Shutdown normally closes ln itself. The extra Close only matters if
		// Serve never got to register the listener, so an "already closed"
		// result is expected and is not reported as a failure.
		if cerr := ln.Close(); cerr != nil && !errors.Is(cerr, net.ErrClosed) {
			err = errors.Join(err, cerr)
		}
		return err
	}
}

cmd/containerboot/main_test.go

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,24 @@ func TestContainerBoot(t *testing.T) {
101101

102102
argFile := filepath.Join(d, "args")
103103
runningSockPath := filepath.Join(d, "tmp/tailscaled.sock")
104+
var localAddrPort, healthAddrPort int
105+
for _, p := range []*int{&localAddrPort, &healthAddrPort} {
106+
ln, err := net.Listen("tcp", ":0")
107+
if err != nil {
108+
t.Fatalf("Failed to open listener: %v", err)
109+
}
110+
if err := ln.Close(); err != nil {
111+
t.Fatalf("Failed to close listener: %v", err)
112+
}
113+
port := ln.Addr().(*net.TCPAddr).Port
114+
*p = port
115+
}
116+
metricsURL := func(port int) string {
117+
return fmt.Sprintf("http://127.0.0.1:%d/metrics", port)
118+
}
119+
healthURL := func(port int) string {
120+
return fmt.Sprintf("http://127.0.0.1:%d/healthz", port)
121+
}
104122

105123
type phase struct {
106124
// If non-nil, send this IPN bus notification (and remember it as the
@@ -119,6 +137,8 @@ func TestContainerBoot(t *testing.T) {
119137
// WantFatalLog is the fatal log message we expect from containerboot.
120138
// If set for a phase, the test will finish on that phase.
121139
WantFatalLog string
140+
141+
EndpointStatuses map[string]int
122142
}
123143
runningNotify := &ipn.Notify{
124144
State: ptr.To(ipn.Running),
@@ -147,6 +167,11 @@ func TestContainerBoot(t *testing.T) {
147167
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
148168
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
149169
},
170+
// No metrics or health by default.
171+
EndpointStatuses: map[string]int{
172+
metricsURL(9002): -1,
173+
healthURL(9002): -1,
174+
},
150175
},
151176
{
152177
Notify: runningNotify,
@@ -700,6 +725,104 @@ func TestContainerBoot(t *testing.T) {
700725
},
701726
},
702727
},
728+
{
729+
Name: "metrics_enabled",
730+
Env: map[string]string{
731+
"TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
732+
"TS_ENABLE_METRICS": "true",
733+
},
734+
Phases: []phase{
735+
{
736+
WantCmds: []string{
737+
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
738+
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
739+
},
740+
EndpointStatuses: map[string]int{
741+
metricsURL(localAddrPort): 200,
742+
healthURL(localAddrPort): -1,
743+
},
744+
}, {
745+
Notify: runningNotify,
746+
},
747+
},
748+
},
749+
{
750+
Name: "health_enabled",
751+
Env: map[string]string{
752+
"TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
753+
"TS_ENABLE_HEALTH_CHECK": "true",
754+
},
755+
Phases: []phase{
756+
{
757+
WantCmds: []string{
758+
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
759+
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
760+
},
761+
EndpointStatuses: map[string]int{
762+
metricsURL(localAddrPort): -1,
763+
healthURL(localAddrPort): 503, // Doesn't start passing until the next phase.
764+
},
765+
}, {
766+
Notify: runningNotify,
767+
EndpointStatuses: map[string]int{
768+
metricsURL(localAddrPort): -1,
769+
healthURL(localAddrPort): 200,
770+
},
771+
},
772+
},
773+
},
774+
{
775+
Name: "metrics_and_health_on_same_port",
776+
Env: map[string]string{
777+
"TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
778+
"TS_ENABLE_METRICS": "true",
779+
"TS_ENABLE_HEALTH_CHECK": "true",
780+
},
781+
Phases: []phase{
782+
{
783+
WantCmds: []string{
784+
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
785+
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
786+
},
787+
EndpointStatuses: map[string]int{
788+
metricsURL(localAddrPort): 200,
789+
healthURL(localAddrPort): 503, // Doesn't start passing until the next phase.
790+
},
791+
}, {
792+
Notify: runningNotify,
793+
EndpointStatuses: map[string]int{
794+
metricsURL(localAddrPort): 200,
795+
healthURL(localAddrPort): 200,
796+
},
797+
},
798+
},
799+
},
800+
{
801+
Name: "local_metrics_and_deprecated_health",
802+
Env: map[string]string{
803+
"TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
804+
"TS_ENABLE_METRICS": "true",
805+
"TS_HEALTHCHECK_ADDR_PORT": fmt.Sprintf("[::]:%d", healthAddrPort),
806+
},
807+
Phases: []phase{
808+
{
809+
WantCmds: []string{
810+
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
811+
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
812+
},
813+
EndpointStatuses: map[string]int{
814+
metricsURL(localAddrPort): 200,
815+
healthURL(healthAddrPort): 503, // Doesn't start passing until the next phase.
816+
},
817+
}, {
818+
Notify: runningNotify,
819+
EndpointStatuses: map[string]int{
820+
metricsURL(localAddrPort): 200,
821+
healthURL(healthAddrPort): 200,
822+
},
823+
},
824+
},
825+
},
703826
}
704827

705828
for _, test := range tests {
@@ -796,7 +919,26 @@ func TestContainerBoot(t *testing.T) {
796919
return nil
797920
})
798921
if err != nil {
799-
t.Fatal(err)
922+
t.Fatalf("phase %d: %v", i, err)
923+
}
924+
925+
for url, want := range p.EndpointStatuses {
926+
err := tstest.WaitFor(2*time.Second, func() error {
927+
resp, err := http.Get(url)
928+
if err != nil && want != -1 {
929+
return fmt.Errorf("GET %s: %v", url, err)
930+
}
931+
if want > 0 && resp.StatusCode != want {
932+
defer resp.Body.Close()
933+
body, _ := io.ReadAll(resp.Body)
934+
return fmt.Errorf("GET %s, want %d, got %d\n%s", url, want, resp.StatusCode, string(body))
935+
}
936+
937+
return nil
938+
})
939+
if err != nil {
940+
t.Fatalf("phase %d: %v", i, err)
941+
}
800942
}
801943
}
802944
waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal")
@@ -955,6 +1097,12 @@ func (l *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
9551097
if r.Method != "GET" {
9561098
panic(fmt.Sprintf("unsupported method %q", r.Method))
9571099
}
1100+
case "/localapi/v0/usermetrics":
1101+
if r.Method != "GET" {
1102+
panic(fmt.Sprintf("unsupported method %q", r.Method))
1103+
}
1104+
w.Write([]byte("fake metrics"))
1105+
return
9581106
default:
9591107
panic(fmt.Sprintf("unsupported path %q", r.URL.Path))
9601108
}

0 commit comments

Comments
 (0)