Skip to content

Commit c26cfbb

Browse files
authored
[APMLP-876] Add gradual rollout bucket calculation in imageresolver.Config (#45448)
### What does this PR do?

Adds the gradual rollout bucket ID to the `imageresolver.Config` type. The value is calculated from the primary API key (the `api_key` value in the DD config). This PR only adds the value to `imageresolver.Config`; it does not yet update any of the image resolution behavior to depend on it. That will come in a subsequent PR.

### Motivation

In order to implement the tag-based gradual rollout, we need to be able to calculate which rollout bucket a given cluster belongs to. For our first iteration, we've decided to go with the API key in order to have an even distribution among the total number of buckets.

### Describe how you validated your changes

Added unit tests to validate the behavior as well as the evenness of the distribution with the current hashing implementation.

### Additional Notes

Co-authored-by: erika.yasuda <erika.yasuda@datadoghq.com>
1 parent 25a11d4 commit c26cfbb

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,19 @@
1010
package imageresolver
1111

1212
import (
13+
"crypto/sha256"
14+
"encoding/binary"
15+
"strconv"
1316
"time"
1417

1518
"github.com/DataDog/datadog-agent/comp/core/config"
1619
"github.com/DataDog/datadog-agent/pkg/remoteconfig/state"
1720
)
1821

22+
// rolloutBucketCount is the maximum number of buckets used for gradual rollout.
const rolloutBucketCount = 10
25+
1926
// RemoteConfigClient defines the interface we need for remote config operations
2027
type RemoteConfigClient interface {
2128
GetConfigs(product string) map[string]state.RawConfig
@@ -29,6 +36,14 @@ type Config struct {
2936
RCClient RemoteConfigClient
3037
MaxInitRetries int
3138
InitRetryDelay time.Duration
39+
BucketID string
40+
}
41+
42+
func calculateRolloutBucket(apiKey string) string {
43+
// DEV: If the API key is empty for whatever reason, resolves to bucket 2
44+
hash := sha256.Sum256([]byte(apiKey))
45+
hashInt := binary.BigEndian.Uint64(hash[:8])
46+
return strconv.Itoa(int(hashInt % rolloutBucketCount))
3247
}
3348

3449
// NewConfig creates a new Config
@@ -39,5 +54,6 @@ func NewConfig(cfg config.Component, rcClient RemoteConfigClient) Config {
3954
RCClient: rcClient,
4055
MaxInitRetries: 5,
4156
InitRetryDelay: 1 * time.Second,
57+
BucketID: calculateRolloutBucket(cfg.GetString("api_key")),
4258
}
4359
}

pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
package imageresolver
99

1010
import (
11+
"fmt"
12+
"math"
1113
"testing"
1214
"time"
1315

@@ -35,6 +37,7 @@ func TestNewConfig(t *testing.T) {
3537
RCClient: nil,
3638
MaxInitRetries: 5,
3739
InitRetryDelay: 1 * time.Second,
40+
BucketID: "2",
3841
},
3942
},
4043
{
@@ -51,6 +54,7 @@ func TestNewConfig(t *testing.T) {
5154
RCClient: nil,
5255
MaxInitRetries: 5,
5356
InitRetryDelay: 1 * time.Second,
57+
BucketID: "2",
5458
},
5559
},
5660
{
@@ -66,6 +70,24 @@ func TestNewConfig(t *testing.T) {
6670
RCClient: nil,
6771
MaxInitRetries: 5,
6872
InitRetryDelay: 1 * time.Second,
73+
BucketID: "2",
74+
},
75+
},
76+
{
77+
name: "bucket_id_based_on_api_key",
78+
configFactory: func(t *testing.T) config.Component {
79+
mockConfig := config.NewMock(t)
80+
mockConfig.SetWithoutSource("site", "datadoghq.com")
81+
mockConfig.SetWithoutSource("api_key", "1234567890abcdef")
82+
return mockConfig
83+
},
84+
expectedState: Config{
85+
Site: "datadoghq.com",
86+
DDRegistries: map[string]struct{}{"gcr.io/datadoghq": {}, "docker.io/datadog": {}, "public.ecr.aws/datadog": {}},
87+
RCClient: nil,
88+
MaxInitRetries: 5,
89+
InitRetryDelay: 1 * time.Second,
90+
BucketID: "0",
6991
},
7092
},
7193
}
@@ -79,3 +101,33 @@ func TestNewConfig(t *testing.T) {
79101
})
80102
}
81103
}
104+
105+
func TestCalculateRolloutBucket_EvenlyDistributed(t *testing.T) {
106+
bucketCounts := make(map[string]int)
107+
108+
numSamples := 10000
109+
for i := 0; i < numSamples; i++ {
110+
apiKey := fmt.Sprintf("api-key-%d", i)
111+
bucket := calculateRolloutBucket(apiKey)
112+
bucketCounts[bucket]++
113+
}
114+
115+
require.Len(t, bucketCounts, rolloutBucketCount, "Should use all %d buckets", rolloutBucketCount)
116+
117+
expectedPerBucket := float64(numSamples) / float64(rolloutBucketCount)
118+
p := 1.0 / float64(rolloutBucketCount)
119+
stdDev := math.Sqrt(float64(numSamples) * p * (1.0 - p))
120+
tolerance := 4.0 // 4 std devs give 99.99% confidence
121+
122+
minCount := int(expectedPerBucket - tolerance*stdDev)
123+
maxCount := int(expectedPerBucket + tolerance*stdDev)
124+
125+
for bucket, count := range bucketCounts {
126+
require.GreaterOrEqual(t, count, minCount,
127+
"Bucket %s has too few samples: %d (expected between %d and %d)",
128+
bucket, count, minCount, maxCount)
129+
require.LessOrEqual(t, count, maxCount,
130+
"Bucket %s has too many samples: %d (expected between %d and %d)",
131+
bucket, count, minCount, maxCount)
132+
}
133+
}

0 commit comments

Comments
 (0)