Skip to content

Commit 07040a0

Browse files
authored
Fix: Sidecar crashes during initialization lead to pods not being able to initialize [other implementation] (#570)
* PublishNotReadyAddresses & WithReadinessByContainer * fix style * Canonize * Return immediately if requested container is not ready
1 parent 3cc4be3 commit 07040a0

26 files changed

+60
-5
lines changed

pkg/components/master.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ func buildMasterOptions(resource *ytv1.Ytsaurus) []Option {
4545
ContainerPort: consts.MasterRPCPort,
4646
Protocol: corev1.ProtocolTCP,
4747
}),
48+
WithReadinessByContainer(consts.YTServerContainerName),
4849
}
4950

5051
if resource.Spec.PrimaryMasters.HydraPersistenceUploader != nil && resource.Spec.PrimaryMasters.HydraPersistenceUploader.Image != nil {

pkg/components/server.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ type serverImpl struct {
6464

6565
readinessProbePort intstr.IntOrString
6666
readinessProbeHTTPPath string
67+
readinessByContainers []string
6768
}
6869

6970
func newServer(
@@ -191,6 +192,7 @@ func newServerConfigured(
191192

192193
readinessProbePort: opts.readinessProbeEndpointPort,
193194
readinessProbeHTTPPath: opts.readinessProbeEndpointPath,
195+
readinessByContainers: opts.readinessByContainers,
194196
}
195197
}
196198

@@ -291,7 +293,7 @@ func (s *serverImpl) needUpdate() bool {
291293
}
292294

293295
func (s *serverImpl) arePodsReady(ctx context.Context) bool {
294-
return s.statefulSet.ArePodsReady(ctx, int(s.instanceSpec.InstanceCount), s.instanceSpec.MinReadyInstanceCount)
296+
return s.statefulSet.ArePodsReady(ctx, int(s.instanceSpec.InstanceCount), s.instanceSpec.MinReadyInstanceCount, s.readinessByContainers)
295297
}
296298

297299
func (s *serverImpl) buildStatefulSet() *appsv1.StatefulSet {

pkg/components/serveroptions.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ type options struct {
1212
readinessProbeEndpointPath string
1313

1414
sidecarImages map[string]string
15+
16+
readinessByContainers []string
1517
}
1618

1719
type Option func(opts *options)
@@ -42,3 +44,9 @@ func WithSidecarImage(name, image string) Option {
4244
opts.sidecarImages[name] = image
4345
}
4446
}
47+
48+
func WithReadinessByContainer(name string) Option {
49+
return func(opts *options) {
50+
opts.readinessByContainers = append(opts.readinessByContainers, name)
51+
}
52+
}

pkg/resources/headless_service.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ func NewHeadlessService(name string, labeller *labeller2.Labeller, apiProxy apip
2525
func (s *HeadlessService) Build() *corev1.Service {
2626
s.newObject.ObjectMeta = s.labeller.GetObjectMeta(s.name)
2727
s.newObject.Spec = corev1.ServiceSpec{
28-
ClusterIP: "None",
29-
Selector: s.labeller.GetSelectorLabelMap(),
28+
ClusterIP: "None",
29+
Selector: s.labeller.GetSelectorLabelMap(),
30+
PublishNotReadyAddresses: true,
3031
}
3132

3233
return s.newObject

pkg/resources/statefulset.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,23 @@ func (s *StatefulSet) ArePodsRemoved(ctx context.Context) bool {
8686
return true
8787
}
8888

89-
func (s *StatefulSet) ArePodsReady(ctx context.Context, instanceCount int, minReadyInstanceCount *int) bool {
89+
func checkReadinessByContainers(pod corev1.Pod, byContainerNames []string) bool {
90+
found := 0
91+
for _, containerNameToCheck := range byContainerNames {
92+
for _, containerStatus := range pod.Status.ContainerStatuses {
93+
if containerStatus.Name != containerNameToCheck {
94+
continue
95+
}
96+
if !containerStatus.Ready {
97+
return false
98+
}
99+
found++
100+
}
101+
}
102+
return found == len(byContainerNames)
103+
}
104+
105+
func (s *StatefulSet) ArePodsReady(ctx context.Context, instanceCount int, minReadyInstanceCount *int, byContainerNames []string) bool {
90106
logger := log.FromContext(ctx)
91107
podList := s.getPods(ctx)
92108
if podList == nil {
@@ -100,7 +116,13 @@ func (s *StatefulSet) ArePodsReady(ctx context.Context, instanceCount int, minRe
100116

101117
readyInstanceCount := 0
102118
for _, pod := range podList.Items {
103-
if pod.Status.Phase != corev1.PodRunning {
119+
var ready bool
120+
if len(byContainerNames) > 0 {
121+
ready = checkReadinessByContainers(pod, byContainerNames)
122+
} else {
123+
ready = pod.Status.Phase == corev1.PodRunning
124+
}
125+
if !ready {
104126
logger.Info("pod is not yet running", "podName", pod.Name, "phase", pod.Status.Phase)
105127
} else {
106128
readyInstanceCount += 1

test/r8r/canondata/Components reconciler Minimal Test/Service discovery.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ metadata:
2222
resourceVersion: "1"
2323
spec:
2424
clusterIP: None
25+
publishNotReadyAddresses: true
2526
selector:
2627
yt_component: test-ytsaurus-yt-discovery
2728
status:

test/r8r/canondata/Components reconciler Minimal Test/Service http-proxies.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ metadata:
2222
resourceVersion: "1"
2323
spec:
2424
clusterIP: None
25+
publishNotReadyAddresses: true
2526
selector:
2627
yt_component: test-ytsaurus-yt-http-proxy
2728
status:

test/r8r/canondata/Components reconciler Minimal Test/Service masters.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ metadata:
2222
resourceVersion: "1"
2323
spec:
2424
clusterIP: None
25+
publishNotReadyAddresses: true
2526
selector:
2627
yt_component: test-ytsaurus-yt-master
2728
status:

test/r8r/canondata/Components reconciler With CRI job environment - CRI-O Test/Service discovery.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ metadata:
2222
resourceVersion: "1"
2323
spec:
2424
clusterIP: None
25+
publishNotReadyAddresses: true
2526
selector:
2627
yt_component: test-ytsaurus-yt-discovery
2728
status:

test/r8r/canondata/Components reconciler With CRI job environment - CRI-O Test/Service exec-nodes.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ metadata:
2222
resourceVersion: "1"
2323
spec:
2424
clusterIP: None
25+
publishNotReadyAddresses: true
2526
selector:
2627
yt_component: test-ytsaurus-yt-exec-node
2728
status:

0 commit comments

Comments
 (0)