Skip to content

Commit 27fd0ad

Browse files
cameronmeissnerCameron Meissner
andauthored
feat(client): tracing enhancements, add better error handling for missing userAssignedIdentityID in cloud provider config (#138)
Co-authored-by: Cameron Meissner <[email protected]>
1 parent 4184e86 commit 27fd0ad

File tree

15 files changed

+320
-25
lines changed

15 files changed

+320
-25
lines changed

client/cmd/client/main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,15 +88,16 @@ func run(ctx context.Context) int {
8888
bootstrapCtx, cancel := context.WithDeadline(bootstrapCtx, bootstrapDeadline)
8989
defer cancel()
9090

91-
finalErr, errs, traces := bootstrap.Bootstrap(bootstrapCtx, bootstrapClient, bootstrapConfig)
91+
finalErr, errLog, traces := bootstrap.Bootstrap(bootstrapCtx, bootstrapClient, bootstrapConfig)
9292
bootstrapEndTime := time.Now()
9393

9494
var exitCode int
9595
bootstrapResult := &bootstrap.Result{
9696
Status: bootstrap.StatusSuccess,
9797
ElapsedMilliseconds: bootstrapEndTime.Sub(bootstrapStartTime).Milliseconds(),
98-
Errors: errs,
99-
Traces: traces,
98+
Errors: errLog,
99+
Traces: traces.GetLastNTraces(5), // only keep the last 5 traces to avoid truncating guest agent event data
100+
TraceSummary: traces.GetTraceSummary(),
100101
}
101102

102103
if finalErr != nil {

client/hack/upload.sh

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,24 @@
11
#!/bin/bash
22
set -euxo pipefail
33

4+
SUBSCRIPTION="${SUBSCRIPTION:-}"
5+
RESOURCE_GROUP="${RESOURCE_GROUP:-}"
46
STORAGE_ACCOUNT_NAME="${STORAGE_ACCOUNT_NAME:-}"
57
VERSION="${VERSION:-}"
8+
OS="${OS:-linux}"
69

10+
[ -z "$SUBSCRIPTION" ] && echo "SUBSCRIPTION must be specified" && exit 1
11+
[ -z "$RESOURCE_GROUP" ] && echo "RESOURCE_GROUP must be specified" && exit 1
712
[ -z "$STORAGE_ACCOUNT_NAME" ] && echo "STORAGE_ACCOUNT_NAME must be specified" && exit 1
813
[ -z "$VERSION" ] && echo "VERSION must be specified" && exit 1
914

10-
function build_and_upload_linux_amd64() {
11-
rm -rf "bin/"
15+
display_download_url() {
16+
local relpath="$1"
17+
web_endpoint=$(az storage account show -g $RESOURCE_GROUP -n $STORAGE_ACCOUNT_NAME --subscription $SUBSCRIPTION --query primaryEndpoints.web | tr -d \")
18+
echo "client download URL: ${web_endpoint}${relpath}"
19+
}
20+
21+
build_and_upload_linux_amd64() {
1222
make build OS=linux ARCH=amd64
1323

1424
if [ ! -f "bin/aks-secure-tls-bootstrap-client-amd64" ]; then
@@ -21,8 +31,37 @@ function build_and_upload_linux_amd64() {
2131
mv "${client_path}-amd64" "$client_path"
2232
tar_path="linux-amd64.tar.gz"
2333
tar -czvf "$tar_path" "$client_path"
24-
az storage blob upload -f "$tar_path" --auth-mode login --blob-url "https://${STORAGE_ACCOUNT_NAME}.blob.core.windows.net/\$web/client/linux/amd64/${VERSION}"
34+
az storage blob upload -f "$tar_path" --auth-mode login --blob-url "https://${STORAGE_ACCOUNT_NAME}.blob.core.windows.net/\$web/client/linux/amd64/${VERSION}.tar.gz"
2535
popd
36+
37+
display_download_url "client/linux/amd64/${VERSION}.tar.gz"
2638
}
2739

28-
build_and_upload_linux_amd64
40+
build_and_upload_windows_amd64() {
41+
make build OS=windows ARCH=amd64 EXTENSION=.exe
42+
43+
if [ ! -f "bin/aks-secure-tls-bootstrap-client-amd64.exe" ]; then
44+
echo "could not find client binary aks-secure-tls-bootstrap-client-amd64.exe for upload within bin/"
45+
exit 1
46+
fi
47+
48+
pushd "bin/"
49+
client_path="aks-secure-tls-bootstrap-client-amd64.exe"
50+
mv "${client_path}" "aks-secure-tls-bootstrap-client.exe"
51+
zip -r "windows-amd64.zip" "aks-secure-tls-bootstrap-client.exe"
52+
az storage blob upload -f "windows-amd64.zip" --auth-mode login --blob-url "https://${STORAGE_ACCOUNT_NAME}.blob.core.windows.net/\$web/client/windows/amd64/${VERSION}.zip"
53+
popd
54+
55+
display_download_url "client/windows/amd64/${VERSION}.zip"
56+
}
57+
58+
rm -rf "bin/"
59+
60+
if [ "${OS}" == "linux" ]; then
61+
build_and_upload_linux_amd64
62+
elif [ "${OS}" == "windows" ]; then
63+
build_and_upload_windows_amd64
64+
else
65+
echo "unsupported OS: $OS, must be \"linux\" or \"windows\""
66+
exit 1
67+
fi

client/internal/bootstrap/auth.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ const (
2020
certificateSecretPrefix = "certificate:"
2121
)
2222

23+
const (
24+
clientIDForMSI = "msi"
25+
)
26+
2327
const (
2428
maxMSIRefreshAttempts = 3
2529
)
@@ -55,12 +59,16 @@ func (c *Client) getAccessToken(ctx context.Context, customClientID, resource st
5559
if err != nil {
5660
return "", fmt.Errorf("generating MSI access token: %w", err)
5761
}
58-
// to avoid falling too deep into exponential backoff implemented by adal, which follows the public retry guidance
62+
// to avoid falling too deep into exponential backoff implemented by adal, which follows the public retry guidance:
5963
// https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token#retry-guidance
6064
token.MaxMSIRefreshAttempts = maxMSIRefreshAttempts
6165
return c.extractAccessTokenFunc(token)
6266
}
6367

68+
if cloudProviderConfig.ClientID == clientIDForMSI {
69+
return "", fmt.Errorf("client ID within cloud provider config indicates usage of a managed identity, though no user-assigned identity ID was provided")
70+
}
71+
6472
env, err := azure.EnvironmentFromName(cloudProviderConfig.CloudName)
6573
if err != nil {
6674
return "", fmt.Errorf("getting azure environment config for cloud %q: %w", cloudProviderConfig.CloudName, err)

client/internal/bootstrap/auth_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ func TestGetAccessToken(t *testing.T) {
113113
name: "UserAssignedIdentityID is specified in cloud provider config",
114114
setupCloudProviderConfig: func(t *testing.T, config *cloud.ProviderConfig) {
115115
config.UserAssignedIdentityID = "kubelet-identity-id"
116+
config.ClientID = clientIDForMSI
116117
},
117118
setupExtractAccessTokenFunc: func(t *testing.T) extractAccessTokenFunc {
118119
return func(token *adal.ServicePrincipalToken) (string, error) {
@@ -123,11 +124,26 @@ func TestGetAccessToken(t *testing.T) {
123124
expectedToken: "token",
124125
expectedErr: nil,
125126
},
127+
{
128+
name: "UserAssignedIdentityID is not specified in cloud provider config, but client ID indicates MSI usage",
129+
setupCloudProviderConfig: func(t *testing.T, config *cloud.ProviderConfig) {
130+
config.UserAssignedIdentityID = ""
131+
config.ClientID = clientIDForMSI
132+
},
133+
setupExtractAccessTokenFunc: func(t *testing.T) extractAccessTokenFunc {
134+
return func(token *adal.ServicePrincipalToken) (string, error) {
135+
return "token", nil
136+
}
137+
},
138+
expectedToken: "",
139+
expectedErr: errors.New("client ID within cloud provider config indicates usage of a managed identity, though no user-assigned identity ID was provided"),
140+
},
126141
{
127142
name: "a custom client ID is specified",
128143
customClientID: "custom",
129144
setupCloudProviderConfig: func(t *testing.T, config *cloud.ProviderConfig) {
130145
config.UserAssignedIdentityID = "kubelet-identity-id"
146+
config.ClientID = clientIDForMSI
131147
},
132148
setupExtractAccessTokenFunc: func(t *testing.T) extractAccessTokenFunc {
133149
return func(token *adal.ServicePrincipalToken) (string, error) {

client/internal/bootstrap/bootstrap.go

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
14
package bootstrap
25

36
import (
47
"context"
58
"errors"
69
"fmt"
10+
"os"
11+
"path/filepath"
712
"time"
813

914
"github.com/Azure/aks-secure-tls-bootstrap/client/internal/telemetry"
@@ -18,16 +23,14 @@ import (
1823
// In any case, a record of all errors encountered during the bootstrap process will be returned in errs, where error type is mapped to the corresponding occurrence count.
1924
// Additionally, a map of traces is returned in traces, which records how long each bootstrapping step took, mapping task name to a corresponding time.Duration.
2025
// Trace data is separately recorded for each retry attempt.
21-
func Bootstrap(ctx context.Context, client *Client, config *Config) (finalErr error, errs map[ErrorType]int, traces map[int]telemetry.Trace) {
22-
errs = map[ErrorType]int{}
23-
traces = map[int]telemetry.Trace{}
24-
var retryCount int
26+
func Bootstrap(ctx context.Context, client *Client, config *Config) (finalErr error, errLog map[ErrorType]int, traces *telemetry.TraceStore) {
27+
errLog = make(map[ErrorType]int)
28+
traces = telemetry.NewTraceStore()
2529

2630
finalErr = retry.Do(
2731
func() error {
2832
defer func() {
29-
traces[retryCount] = telemetry.MustGetTracer(ctx).GetTrace()
30-
retryCount++
33+
traces.Add(telemetry.MustGetTracer(ctx).GetTrace())
3134
}()
3235

3336
kubeconfigData, err := client.BootstrapKubeletClientCredential(ctx, config)
@@ -44,10 +47,11 @@ func Bootstrap(ctx context.Context, client *Client, config *Config) (finalErr er
4447
},
4548
retry.RetryIf(func(err error) bool {
4649
var bootstrapErr *BootstrapError
47-
if errors.As(err, &bootstrapErr) {
48-
errs[bootstrapErr.Type()]++
50+
if !errors.As(err, &bootstrapErr) {
51+
return false
4952
}
50-
return true
53+
errLog[bootstrapErr.Type()]++
54+
return bootstrapErr.Retryable()
5155
}),
5256
retry.Context(ctx),
5357
retry.WrapContextErrorWithLastError(true),
@@ -58,7 +62,7 @@ func Bootstrap(ctx context.Context, client *Client, config *Config) (finalErr er
5862
retry.Attempts(0), // retry indefinitely according to the context deadline
5963
)
6064

61-
return finalErr, errs, traces
65+
return finalErr, errLog, traces
6266
}
6367

6468
func writeKubeconfig(ctx context.Context, config *clientcmdapi.Config, path string) error {
@@ -67,6 +71,13 @@ func writeKubeconfig(ctx context.Context, config *clientcmdapi.Config, path stri
6771
tracer.StartSpan(traceName)
6872
defer tracer.EndSpan(traceName)
6973

74+
if err := os.MkdirAll(filepath.Dir(path), 0600); err != nil {
75+
return &BootstrapError{
76+
errorType: ErrorTypeWriteKubeconfigFailure,
77+
inner: fmt.Errorf("creating parent directories for kubeconfig path: %w", err),
78+
}
79+
}
80+
7081
if err := clientcmd.WriteToFile(*config, path); err != nil {
7182
return &BootstrapError{
7283
errorType: ErrorTypeWriteKubeconfigFailure,

client/internal/bootstrap/errors.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
14
package bootstrap
25

36
import "fmt"
@@ -21,6 +24,19 @@ type BootstrapError struct {
2124
inner error
2225
}
2326

27+
func (e *BootstrapError) Retryable() bool {
28+
switch e.errorType {
29+
case ErrorTypeGetAccessTokenFailure,
30+
ErrorTypeGetInstanceDataFailure,
31+
ErrorTypeGetAttestedDataFailure,
32+
ErrorTypeGetNonceFailure,
33+
ErrorTypeGetCredentialFailure:
34+
return true
35+
default:
36+
return false
37+
}
38+
}
39+
2440
func (e *BootstrapError) Error() string {
2541
return fmt.Sprintf("%s: %s", e.errorType, e.inner.Error())
2642
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
package bootstrap
5+
6+
import (
7+
"errors"
8+
"testing"
9+
10+
"github.com/stretchr/testify/assert"
11+
)
12+
13+
func TestBootstrapErrorRetryable(t *testing.T) {
14+
cases := []struct {
15+
name string
16+
err *BootstrapError
17+
expected bool
18+
}{
19+
{
20+
name: "get access token failure should be retryable",
21+
err: &BootstrapError{errorType: ErrorTypeGetAccessTokenFailure, inner: errors.New("get access token error")},
22+
expected: true,
23+
},
24+
{
25+
name: "get instance data failure should be retryable",
26+
err: &BootstrapError{errorType: ErrorTypeGetInstanceDataFailure, inner: errors.New("get instance data error")},
27+
expected: true,
28+
},
29+
{
30+
name: "get attested data failure should be retryable",
31+
err: &BootstrapError{errorType: ErrorTypeGetAttestedDataFailure, inner: errors.New("get attested data error")},
32+
expected: true,
33+
},
34+
{
35+
name: "get nonce failure should be retryable",
36+
err: &BootstrapError{errorType: ErrorTypeGetNonceFailure, inner: errors.New("get nonce error")},
37+
expected: true,
38+
},
39+
{
40+
name: "get credential failure should be retryable",
41+
err: &BootstrapError{errorType: ErrorTypeGetCredentialFailure, inner: errors.New("get credential error")},
42+
expected: true,
43+
},
44+
{
45+
name: "get service client error should not be retryable",
46+
err: &BootstrapError{errorType: ErrorTypeGetServiceClientFailure, inner: errors.New("get service client error")},
47+
expected: false,
48+
},
49+
{
50+
name: "get CSR failure should not be retryable",
51+
err: &BootstrapError{errorType: ErrorTypeGetCSRFailure, inner: errors.New("get CSR error")},
52+
expected: false,
53+
},
54+
{
55+
name: "generate kubeconfig error should not be retryable",
56+
err: &BootstrapError{errorType: ErrorTypeGenerateKubeconfigFailure, inner: errors.New("generate kubeconfig error")},
57+
expected: false,
58+
},
59+
{
60+
name: "write kubeconfig error should not be retryable",
61+
err: &BootstrapError{errorType: ErrorTypeWriteKubeconfigFailure, inner: errors.New("write kubeconfig error")},
62+
expected: false,
63+
},
64+
}
65+
66+
for _, c := range cases {
67+
t.Run(c.name, func(t *testing.T) {
68+
assert.Equal(t, c.expected, c.err.Retryable())
69+
})
70+
}
71+
}

client/internal/bootstrap/event.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
14
package bootstrap
25

36
import (
@@ -55,11 +58,19 @@ const (
5558
)
5659

5760
type Result struct {
58-
Status Status `json:"Status"`
59-
ElapsedMilliseconds int64 `json:"ElapsedMilliseconds"`
60-
Errors map[ErrorType]int `json:"Errors,omitempty"`
61-
Traces map[int]telemetry.Trace `json:"Traces,omitempty"`
62-
FinalError string `json:"FinalError,omitempty"`
61+
// Status is terminal status of the bootstrapping event.
62+
Status Status `json:"Status"`
63+
// ElapsedMilliseconds measures how long the bootstrapping event took to execute, in milliseconds.
64+
ElapsedMilliseconds int64 `json:"ElapsedMilliseconds"`
65+
// Errors is a mapping from top-level bootstrapping error type of the number of times it occurred during the event.
66+
Errors map[ErrorType]int `json:"Errors,omitempty"`
67+
// Traces is a mapping from retry attempt to corresponding Trace. A Trace maps span names to their respective durations.
68+
// This will only ever contain data for the last 3 retries to avoid truncating guest agent event data.
69+
Traces map[int]telemetry.Trace `json:"Traces,omitempty"`
70+
// TraceSummary is a special Trace which maps span names to their total durations across all retry attempts.
71+
TraceSummary telemetry.Trace `json:"TraceSummary,omitempty"`
72+
// FinalError is the the error returned by the last retry attempt, assuming the overall bootstrapping event failed.
73+
FinalError string `json:"FinalError,omitempty"`
6374
}
6475

6576
type Event struct {

client/internal/bootstrap/event_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
14
package bootstrap
25

36
import (

client/internal/build/build.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
14
package build
25

36
// version holds the current version string, provided via go ldflags.

0 commit comments

Comments
 (0)