Skip to content

Commit 0344b0c

Browse files
authored
add storage driver support for built-in dind (#4371)
* add storage driver support for built-in dind Signed-off-by: Min Min <[email protected]> * fix dind not upgrading Signed-off-by: Min Min <[email protected]> * add sts support Signed-off-by: Min Min <[email protected]> * logic update and fix bug Signed-off-by: Min Min <[email protected]> * fix bug Signed-off-by: Min Min <[email protected]> * add cluster ID hex Signed-off-by: Min Min <[email protected]> * debug Signed-off-by: Min Min <[email protected]> * added debug logs Signed-off-by: Min Min <[email protected]> * fix panic bug Signed-off-by: Min Min <[email protected]> * fix panic Signed-off-by: Min Min <[email protected]> * debug Signed-off-by: Min Min <[email protected]> * debug logs Signed-off-by: Min Min <[email protected]> * possibly fix bugs Signed-off-by: Min Min <[email protected]> * restore debug logs Signed-off-by: Min Min <[email protected]> * remove unecessary code and change logic on how we update dind Signed-off-by: Min Min <[email protected]> * add retry logic so the service does not panic on conflict Signed-off-by: Min Min <[email protected]> --------- Signed-off-by: Min Min <[email protected]>
1 parent cd64c5c commit 0344b0c

File tree

8 files changed

+340
-57
lines changed

8 files changed

+340
-57
lines changed

pkg/microservice/aslan/core/common/repository/models/k8s_cluster.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,11 @@ type Resources struct {
108108
}
109109

110110
type DindCfg struct {
111-
Replicas int `json:"replicas" bson:"replicas"`
112-
Resources *Resources `json:"resources" bson:"resources"`
113-
Storage *DindStorage `json:"storage" bson:"storage"`
114-
StrategyID string `json:"strategy_id" bson:"strategy_id"`
111+
Replicas int `json:"replicas" bson:"replicas"`
112+
Resources *Resources `json:"resources" bson:"resources"`
113+
Storage *DindStorage `json:"storage" bson:"storage"`
114+
StrategyID string `json:"strategy_id" bson:"strategy_id"`
115+
StorageDriver string `json:"storage_driver" bson:"storage_driver"`
115116
}
116117

117118
type DindStorageType string

pkg/microservice/aslan/core/common/service/kube/helm.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,6 @@ func EnsureDeleteZadigServiceBySvcName(ctx context.Context, env *commonmodels.Pr
540540
}
541541
return nil
542542
}
543-
544543
func DeploySingleHelmRelease(product *commonmodels.Product, productSvc *commonmodels.ProductService,
545544
svcTemp *commonmodels.Service, images []string, maxHistory, timeout int, user string) error {
546545
chartInfo := productSvc.GetServiceRender()

pkg/microservice/aslan/core/common/service/kube/service.go

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ func (s *Service) GetYaml(id, agentImage, aslanURL, hubURI string, useDeployment
305305
return nil, err
306306
}
307307

308-
dindReplicas, dindLimitsCPU, dindLimitsMemory, dindEnablePV, dindSCName, dindStorageSizeInGiB := getDindCfg(cluster)
308+
dindReplicas, dindLimitsCPU, dindLimitsMemory, dindEnablePV, dindSCName, dindStorageSizeInGiB, dindStorageDriver := getDindCfg(cluster)
309309

310310
yaml := agentYaml
311311
if cluster.AdvancedConfig != nil {
@@ -356,6 +356,7 @@ func (s *Service) GetYaml(id, agentImage, aslanURL, hubURI string, useDeployment
356356
DindEnablePV: dindEnablePV,
357357
DindStorageClassName: dindSCName,
358358
DindStorageSizeInGiB: dindStorageSizeInGiB,
359+
DindStorageDriver: dindStorageDriver,
359360
ScheduleWorkflow: scheduleWorkflow,
360361
EnableIRSA: cluster.AdvancedConfig.EnableIRSA,
361362
IRSARoleARN: cluster.AdvancedConfig.IRSARoleARM,
@@ -380,6 +381,7 @@ func (s *Service) GetYaml(id, agentImage, aslanURL, hubURI string, useDeployment
380381
DindEnablePV: dindEnablePV,
381382
DindStorageClassName: dindSCName,
382383
DindStorageSizeInGiB: dindStorageSizeInGiB,
384+
DindStorageDriver: dindStorageDriver,
383385
EnableIRSA: cluster.AdvancedConfig.EnableIRSA,
384386
NodeSelector: cluster.AdvancedConfig.AgentNodeSelector,
385387
Toleration: cluster.AdvancedConfig.AgentToleration,
@@ -406,13 +408,14 @@ func (s *Service) UpdateUpgradeAgentInfo(id, updateHubagentErrorMsg string) erro
406408
return err
407409
}
408410

409-
func getDindCfg(cluster *models.K8SCluster) (replicas int, limitsCPU, limitsMemory string, enablePV bool, scName string, storageSizeInGiB int) {
411+
func getDindCfg(cluster *models.K8SCluster) (replicas int, limitsCPU, limitsMemory string, enablePV bool, scName string, storageSizeInGiB int, storageDriver string) {
410412
replicas = DefaultDindReplicas
411413
limitsCPU = strconv.Itoa(DefaultDindLimitsCPU) + setting.CpuUintM
412414
limitsMemory = strconv.Itoa(DefaultDindLimitsMemory) + setting.MemoryUintMi
413415
enablePV = DefaultDindEnablePV
414416
scName = DefaultDindStorageClassName
415417
storageSizeInGiB = DefaultDindStorageSizeInGiB
418+
storageDriver = ""
416419

417420
if cluster.DindCfg != nil {
418421
if cluster.DindCfg.Replicas > 0 {
@@ -433,6 +436,10 @@ func getDindCfg(cluster *models.K8SCluster) (replicas int, limitsCPU, limitsMemo
433436
scName = cluster.DindCfg.Storage.StorageClass
434437
storageSizeInGiB = int(cluster.DindCfg.Storage.StorageSizeInGiB)
435438
}
439+
440+
if cluster.DindCfg.StorageDriver != "" {
441+
storageDriver = cluster.DindCfg.StorageDriver
442+
}
436443
}
437444

438445
return
@@ -676,6 +683,7 @@ type TemplateSchema struct {
676683
DindEnablePV bool
677684
DindStorageClassName string
678685
DindStorageSizeInGiB int
686+
DindStorageDriver string
679687
ScheduleWorkflow bool
680688
EnableIRSA bool
681689
IRSARoleARN string
@@ -1047,6 +1055,10 @@ spec:
10471055
containers:
10481056
- name: dind
10491057
image: {{.DindImage}}
1058+
{{- if .DindStorageDriver }}
1059+
args:
1060+
- --storage-driver={{.DindStorageDriver}}
1061+
{{- end }}
10501062
env:
10511063
- name: DOCKER_TLS_CERTDIR
10521064
value: ""
@@ -1188,6 +1200,10 @@ spec:
11881200
containers:
11891201
- name: dind
11901202
image: {{.DindImage}}
1203+
{{- if .DindStorageDriver }}
1204+
args:
1205+
- --storage-driver={{.DindStorageDriver}}
1206+
{{- end }}
11911207
env:
11921208
- name: DOCKER_TLS_CERTDIR
11931209
value: ""

pkg/microservice/aslan/core/multicluster/service/clusters.go

Lines changed: 91 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,26 +1172,43 @@ func UpgradeDind(kclient client.Client, cluster *commonmodels.K8SCluster, ns str
11721172
}
11731173

11741174
ctx := context.TODO()
1175-
dindSts := &appsv1.StatefulSet{}
1176-
err := kclient.Get(ctx, client.ObjectKey{
1177-
Name: types.DindStatefulSetName,
1178-
Namespace: ns,
1179-
}, dindSts)
1180-
if err != nil {
1181-
return fmt.Errorf("failed to get dind StatefulSet in local cluster: %s", err)
1182-
}
11831175

1184-
scaleupd := false
1185-
if *dindSts.Spec.Replicas < int32(cluster.DindCfg.Replicas) {
1186-
scaleupd = true
1176+
// Retry logic for handling concurrent modifications
1177+
err := retryOnConflict(func() error {
1178+
dindSts := &appsv1.StatefulSet{}
1179+
err := kclient.Get(ctx, client.ObjectKey{
1180+
Name: types.DindStatefulSetName,
1181+
Namespace: ns,
1182+
}, dindSts)
1183+
if err != nil {
1184+
return fmt.Errorf("failed to get dind StatefulSet in local cluster: %s", err)
1185+
}
1186+
1187+
originalSts := new(appsv1.StatefulSet)
1188+
err = util.DeepCopy(originalSts, dindSts)
1189+
if err != nil {
1190+
return fmt.Errorf("failed to deep copy original dind statefulset, error: %s", err)
1191+
}
1192+
1193+
return applyDindUpgrade(kclient, ctx, dindSts, originalSts, cluster, ns)
1194+
})
1195+
1196+
if err != nil {
1197+
return err
11871198
}
11881199

1189-
originalSts := new(appsv1.StatefulSet)
1190-
err = util.DeepCopy(originalSts, dindSts)
1200+
// Sync registry configuration after successful update
1201+
err = commonutil.SyncDinDForRegistries()
11911202
if err != nil {
1192-
return fmt.Errorf("failed to deep copy original dind statefulset, error: %s", err)
1203+
log.Errorf("SyncDinDForRegistries error: %v", err)
11931204
}
11941205

1206+
return nil
1207+
}
1208+
1209+
func applyDindUpgrade(kclient client.Client, ctx context.Context, dindSts, originalSts *appsv1.StatefulSet, cluster *commonmodels.K8SCluster, ns string) error {
1210+
var err error
1211+
11951212
dindSts.Spec.Replicas = util.GetInt32Pointer(int32(cluster.DindCfg.Replicas))
11961213

11971214
if cluster.DindCfg.Resources != nil && cluster.DindCfg.Resources.Limits != nil {
@@ -1311,6 +1328,28 @@ func UpgradeDind(kclient client.Client, cluster *commonmodels.K8SCluster, ns str
13111328
dindSts.Spec.Template.Spec.Affinity = commonutil.AddNodeAffinity(cluster.AdvancedConfig, cluster.DindCfg.StrategyID)
13121329
}
13131330

1331+
// Update storage driver argument if configured
1332+
if len(dindSts.Spec.Template.Spec.Containers) > 0 {
1333+
currentArgs := dindSts.Spec.Template.Spec.Containers[0].Args
1334+
finalArgs := make([]string, 0)
1335+
1336+
// Filter out existing storage-driver arguments
1337+
for _, arg := range currentArgs {
1338+
if strings.HasPrefix(arg, "--storage-driver=") {
1339+
continue
1340+
}
1341+
finalArgs = append(finalArgs, arg)
1342+
}
1343+
1344+
// Add storage driver arg if configured
1345+
if cluster.DindCfg.StorageDriver != "" {
1346+
expectedStorageDriverArg := fmt.Sprintf("--storage-driver=%s", cluster.DindCfg.StorageDriver)
1347+
finalArgs = append(finalArgs, expectedStorageDriverArg)
1348+
}
1349+
1350+
dindSts.Spec.Template.Spec.Containers[0].Args = finalArgs
1351+
}
1352+
13141353
if stsHasImmutableFieldChanged(originalSts, dindSts) {
13151354
log.Infof("dind has immutable field changed, recreating dind.")
13161355
err = kclient.Delete(ctx, dindSts)
@@ -1347,24 +1386,53 @@ func UpgradeDind(kclient client.Client, cluster *commonmodels.K8SCluster, ns str
13471386
return err
13481387
}
13491388
} else {
1350-
err = kclient.Update(ctx, dindSts)
1351-
if err != nil {
1352-
err = fmt.Errorf("failed to update StatefulSet `dind`: %s", err)
1353-
log.Error(err)
1354-
return err
1389+
// Check if StatefulSet is stuck (e.g., due to wrong storage driver) and handle it
1390+
isStuck := kube.IsStatefulSetStuckInUpdate(dindSts, log.SugaredLogger())
1391+
if isStuck {
1392+
log.Warnf("StatefulSet %s/%s is stuck, attempting to fix by deleting stuck pods before update", ns, types.DindStatefulSetName)
1393+
clusterID := cluster.ID.Hex()
1394+
clientSet, err := clientmanager.NewKubeClientManager().GetKubernetesClientSet(clusterID)
1395+
if err != nil {
1396+
log.Warnf("Failed to get clientset for cluster %s to handle stuck StatefulSet: %v", clusterID, err)
1397+
// Continue with update even if we can't get clientset
1398+
} else {
1399+
if fixErr := kube.HandleStuckStatefulSet(dindSts, clientSet, log.SugaredLogger()); fixErr != nil {
1400+
log.Warnf("Failed to clean up stuck pods for StatefulSet %s/%s: %v", ns, types.DindStatefulSetName, fixErr)
1401+
// Continue with update even if cleanup fails
1402+
}
1403+
}
13551404
}
1356-
}
13571405

1358-
if scaleupd {
1359-
err = commonutil.SyncDinDForRegistries()
1406+
err := kclient.Update(ctx, dindSts)
13601407
if err != nil {
1361-
log.Errorf("SyncDinDForRegistries error: %v", err)
1408+
return fmt.Errorf("failed to update StatefulSet `dind`: %s", err)
13621409
}
13631410
}
13641411

13651412
return nil
13661413
}
13671414

1415+
// retryOnConflict retries the given function on conflict errors
1416+
func retryOnConflict(fn func() error) error {
1417+
for i := 0; i < 5; i++ {
1418+
err := fn()
1419+
if err == nil {
1420+
return nil
1421+
}
1422+
1423+
if apierrors.IsConflict(err) {
1424+
log.Warnf("Conflict updating StatefulSet (attempt %d/5), retrying: %s", i+1, err)
1425+
time.Sleep(time.Duration(100*(i+1)) * time.Millisecond)
1426+
continue
1427+
}
1428+
1429+
// Non-conflict error, return immediately
1430+
return err
1431+
}
1432+
1433+
return fmt.Errorf("failed to update after 5 retries due to conflicts")
1434+
}
1435+
13681436
func GetPVCName(prefix string, nfsProperties *types.NFSProperties) string {
13691437
name := fmt.Sprintf("%s-storage-%s-%d", prefix, nfsProperties.StorageClass, nfsProperties.StorageSizeInGiB)
13701438
return util.TruncateName(name, 63)

pkg/microservice/hubagent/server/server.go

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,11 @@ func Serve(ctx context.Context) error {
9696
}
9797

9898
func initResource() {
99-
client := aslan.NewExternal(config2.AslanBaseAddr(), "")
99+
token, err := login.GetInternalToken("hub-agent")
100+
if err != nil {
101+
log.Fatalf("failed to get internal token, err: %s", err)
102+
}
103+
client := aslan.NewExternal(config2.AslanBaseAddr(), token)
100104

101105
scheduleWorkflow := config2.ScheduleWorkflow()
102106
if scheduleWorkflow == "" {
@@ -110,13 +114,7 @@ func initResource() {
110114
}
111115

112116
if schedule {
113-
token, err := login.GetInternalToken("hub-agent")
114-
if err != nil {
115-
log.Fatalf("failed to get internal token, err: %s", err)
116-
}
117-
log.Infof("token: %s", token)
118-
119-
ls, err := client.ListRegistries(token)
117+
ls, err := client.ListRegistries()
120118
if err != nil {
121119
log.Fatalf("failed to list registries from zadig server, error: %s", err)
122120
}

pkg/shared/client/aslan/multicluster.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ type ClusterDetail struct {
9797
Provider int8 `json:"provider" bson:"provider"`
9898
Local bool `json:"local" bson:"local"`
9999
Cache types.Cache `json:"cache" bson:"cache"`
100+
DindCfg *DindCfg `json:"dind_cfg" bson:"dind_cfg"`
100101

101102
// new field in 1.14, intended to enable kubeconfig for cluster management
102103
Type string `json:"type" bson:"type"` // either agent or kubeconfig supported
@@ -117,6 +118,29 @@ type AdvancedConfig struct {
117118
IRSARoleARM string `json:"irsa_role_arn" bson:"irsa_role_arn"`
118119
}
119120

121+
type DindCfg struct {
122+
Replicas int `json:"replicas" bson:"replicas"`
123+
Resources *Resources `json:"resources" bson:"resources"`
124+
Storage *DindStorage `json:"storage" bson:"storage"`
125+
StrategyID string `json:"strategy_id" bson:"strategy_id"`
126+
StorageDriver string `json:"storage_driver" bson:"storage_driver"`
127+
}
128+
129+
type Resources struct {
130+
Limits *Limits `json:"limits" bson:"limits"`
131+
}
132+
133+
type Limits struct {
134+
CPU int `json:"cpu" bson:"cpu"`
135+
Memory int `json:"memory" bson:"memory"`
136+
}
137+
138+
type DindStorage struct {
139+
Type string `json:"type" bson:"type"`
140+
StorageClass string `json:"storage_class" bson:"storage_class"`
141+
StorageSizeInGiB int64 `json:"storage_size_in_gib" bson:"storage_size_in_gib"`
142+
}
143+
120144
func (c *Client) GetClusterInfo(clusterID string) (*ClusterDetail, error) {
121145
url := fmt.Sprintf("/cluster/clusters/%s", clusterID)
122146

pkg/shared/client/aslan/registries.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,11 @@ package aslan
1818

1919
import "github.com/koderover/zadig/v2/pkg/tool/httpclient"
2020

21-
func (c *Client) ListRegistries(token string) ([]*RegistryInfo, error) {
21+
func (c *Client) ListRegistries() ([]*RegistryInfo, error) {
2222
url := "/system/registry/project"
2323
res := make([]*RegistryInfo, 0)
2424

2525
_, err := c.Get(url,
26-
httpclient.SetHeader("Authorization", "Bearer "+token),
2726
httpclient.SetResult(&res),
2827
)
2928
if err != nil {

0 commit comments

Comments
 (0)