[APMLP-876] Add gradual rollout bucket calculation in imageresolver.Config (#45448)

erikayasuda · web-flow · commit c26cfbbf7863 · 2026-01-23T22:03:22.000Z
### What does this PR do?
Adds the gradual rollout bucket ID to the `imageresolver.Config` type. The value is derived from a SHA-256 hash of the primary API key (the `api_key` value in the Datadog config), reduced modulo the number of rollout buckets (currently 10).

This PR only adds the value to `imageresolver.Config`; it does not yet change any image resolution behavior to depend on it. That will follow in a subsequent PR.

### Motivation
In order to implement the tag-based gradual rollout, we need to be able to calculate which rollout bucket a given cluster belongs to. For our first iteration, we've decided to go with the API key in order to have an even distribution among the total number of buckets.

### Describe how you validated your changes
Added unit tests to validate behavior as well as the evenness of the distribution with the current hashing implementation.

### Additional Notes


Co-authored-by: erika.yasuda <erika.yasuda@datadoghq.com>
diff --git a/pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config.go b/pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config.go
@@ -10,12 +10,19 @@
 package imageresolver
 
 import (
+	"crypto/sha256"
+	"encoding/binary"
+	"strconv"
 	"time"
 
 	"github.com/DataDog/datadog-agent/comp/core/config"
 	"github.com/DataDog/datadog-agent/pkg/remoteconfig/state"
 )
 
+const (
+	rolloutBucketCount = 10 // Number of gradual rollout buckets; calculateRolloutBucket reduces the key hash modulo this value
+)
+
 // RemoteConfigClient defines the interface we need for remote config operations
 type RemoteConfigClient interface {
 	GetConfigs(product string) map[string]state.RawConfig
@@ -29,6 +36,14 @@ type Config struct {
 	RCClient       RemoteConfigClient
 	MaxInitRetries int
 	InitRetryDelay time.Duration
+	BucketID       string
+}
+
+// calculateRolloutBucket returns apiKey's rollout bucket: the first 8 bytes of its SHA-256
+// digest (big-endian) modulo rolloutBucketCount. DEV: an empty API key resolves to bucket 2.
+func calculateRolloutBucket(apiKey string) string {
+	digest := sha256.Sum256([]byte(apiKey))
+	return strconv.FormatUint(binary.BigEndian.Uint64(digest[:8])%rolloutBucketCount, 10)
 }
 
 // NewConfig creates a new Config
@@ -39,5 +54,6 @@ func NewConfig(cfg config.Component, rcClient RemoteConfigClient) Config {
 		RCClient:       rcClient,
 		MaxInitRetries: 5,
 		InitRetryDelay: 1 * time.Second,
+		BucketID:       calculateRolloutBucket(cfg.GetString("api_key")),
 	}
 }
diff --git a/pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config_test.go b/pkg/clusteragent/admission/mutate/autoinstrumentation/imageresolver/config_test.go
@@ -8,6 +8,8 @@
 package imageresolver
 
 import (
+	"fmt"
+	"math"
 	"testing"
 	"time"
 
@@ -35,6 +37,7 @@ func TestNewConfig(t *testing.T) {
 				RCClient:       nil,
 				MaxInitRetries: 5,
 				InitRetryDelay: 1 * time.Second,
+				BucketID:       "2",
 			},
 		},
 		{
@@ -51,6 +54,7 @@ func TestNewConfig(t *testing.T) {
 				RCClient:       nil,
 				MaxInitRetries: 5,
 				InitRetryDelay: 1 * time.Second,
+				BucketID:       "2",
 			},
 		},
 		{
@@ -66,6 +70,24 @@ func TestNewConfig(t *testing.T) {
 				RCClient:       nil,
 				MaxInitRetries: 5,
 				InitRetryDelay: 1 * time.Second,
+				BucketID:       "2",
+			},
+		},
+		{
+			name: "bucket_id_based_on_api_key",
+			configFactory: func(t *testing.T) config.Component {
+				mockConfig := config.NewMock(t)
+				mockConfig.SetWithoutSource("site", "datadoghq.com")
+				mockConfig.SetWithoutSource("api_key", "1234567890abcdef")
+				return mockConfig
+			},
+			expectedState: Config{
+				Site:           "datadoghq.com",
+				DDRegistries:   map[string]struct{}{"gcr.io/datadoghq": {}, "docker.io/datadog": {}, "public.ecr.aws/datadog": {}},
+				RCClient:       nil,
+				MaxInitRetries: 5,
+				InitRetryDelay: 1 * time.Second,
+				BucketID:       "0",
 			},
 		},
 	}
@@ -79,3 +101,33 @@ func TestNewConfig(t *testing.T) {
 		})
 	}
 }
+
+func TestCalculateRolloutBucket_EvenlyDistributed(t *testing.T) {
+	bucketCounts := make(map[string]int)
+
+	numSamples := 10000
+	for i := 0; i < numSamples; i++ {
+		apiKey := fmt.Sprintf("api-key-%d", i)
+		bucket := calculateRolloutBucket(apiKey)
+		bucketCounts[bucket]++
+	}
+
+	require.Len(t, bucketCounts, rolloutBucketCount, "Should use all %d buckets", rolloutBucketCount)
+
+	expectedPerBucket := float64(numSamples) / float64(rolloutBucketCount)
+	p := 1.0 / float64(rolloutBucketCount)
+	stdDev := math.Sqrt(float64(numSamples) * p * (1.0 - p))
+	tolerance := 4.0 // 4 std devs give 99.99% confidence
+
+	minCount := int(expectedPerBucket - tolerance*stdDev)
+	maxCount := int(expectedPerBucket + tolerance*stdDev)
+
+	for bucket, count := range bucketCounts {
+		require.GreaterOrEqual(t, count, minCount,
+			"Bucket %s has too few samples: %d (expected between %d and %d)",
+			bucket, count, minCount, maxCount)
+		require.LessOrEqual(t, count, maxCount,
+			"Bucket %s has too many samples: %d (expected between %d and %d)",
+			bucket, count, minCount, maxCount)
+	}
+}