Skip to content

Commit 3df7818

Browse files
authored
[Test] add e2e tests for autoscaler v1 and v2 with GCS FT (#3888)
Signed-off-by: Rueian <[email protected]>
1 parent 1f98479 commit 3df7818

File tree

4 files changed

+169
-63
lines changed

4 files changed

+169
-63
lines changed

ray-operator/test/e2e/raycluster_gcs_ft_test.go

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,6 @@ import (
1717
. "github.com/ray-project/kuberay/ray-operator/test/support"
1818
)
1919

20-
const (
21-
redisPassword = "5241590000000000"
22-
redisAddress = "redis:6379"
23-
)
24-
2520
func TestRayClusterGCSFaultTolerance(t *testing.T) {
2621
test := With(t)
2722
g := NewWithT(t)
@@ -33,14 +28,14 @@ func TestRayClusterGCSFaultTolerance(t *testing.T) {
3328
g.Expect(err).NotTo(HaveOccurred())
3429

3530
test.T().Run("Test Detached Actor", func(_ *testing.T) {
36-
checkRedisDBSize := deployRedis(test, namespace.Name, redisPassword)
31+
checkRedisDBSize := DeployRedis(test, namespace.Name, RedisPassword)
3732
defer g.Eventually(checkRedisDBSize, time.Second*30, time.Second).Should(BeEquivalentTo("0"))
3833

3934
rayClusterSpecAC := rayv1ac.RayClusterSpec().
4035
WithGcsFaultToleranceOptions(
4136
rayv1ac.GcsFaultToleranceOptions().
42-
WithRedisAddress(redisAddress).
43-
WithRedisPassword(rayv1ac.RedisCredential().WithValue(redisPassword)),
37+
WithRedisAddress(RedisAddress).
38+
WithRedisPassword(rayv1ac.RedisCredential().WithValue(RedisPassword)),
4439
).
4540
WithRayVersion(GetRayVersion()).
4641
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
@@ -82,7 +77,7 @@ func TestRayClusterGCSFaultTolerance(t *testing.T) {
8277
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_1.py", rayNamespace})
8378

8479
// [Test 1: Kill GCS process to "restart" the head Pod]
85-
// Assertion is implement in python, so no furthur handling needed here, and so are other ExecPodCmd
80+
// Assertion is implement in python, so no further handling needed here, and so are other ExecPodCmd
8681
stdout, stderr := ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"pkill", "gcs_server"})
8782
LogWithTimestamp(test.T(), "pkill gcs_server output - stdout: %s, stderr: %s", stdout.String(), stderr.String())
8883

@@ -145,49 +140,49 @@ func TestGcsFaultToleranceOptions(t *testing.T) {
145140
return rayv1ac.RayCluster("raycluster-gcsft", namespace).WithSpec(
146141
newRayClusterSpec().WithGcsFaultToleranceOptions(
147142
rayv1ac.GcsFaultToleranceOptions().
148-
WithRedisAddress(redisAddress),
143+
WithRedisAddress(RedisAddress),
149144
),
150145
)
151146
},
152147
createSecret: false,
153148
},
154149
{
155150
name: "Redis Password",
156-
redisPassword: redisPassword,
151+
redisPassword: RedisPassword,
157152
rayClusterFn: func(namespace string) *rayv1ac.RayClusterApplyConfiguration {
158153
return rayv1ac.RayCluster("raycluster-gcsft", namespace).WithSpec(
159154
newRayClusterSpec().WithGcsFaultToleranceOptions(
160155
rayv1ac.GcsFaultToleranceOptions().
161-
WithRedisAddress(redisAddress).
162-
WithRedisPassword(rayv1ac.RedisCredential().WithValue(redisPassword)),
156+
WithRedisAddress(RedisAddress).
157+
WithRedisPassword(rayv1ac.RedisCredential().WithValue(RedisPassword)),
163158
),
164159
)
165160
},
166161
createSecret: false,
167162
},
168163
{
169164
name: "Redis Password and Username",
170-
redisPassword: redisPassword,
165+
redisPassword: RedisPassword,
171166
rayClusterFn: func(namespace string) *rayv1ac.RayClusterApplyConfiguration {
172167
return rayv1ac.RayCluster("raycluster-gcsft", namespace).WithSpec(
173168
newRayClusterSpec().WithGcsFaultToleranceOptions(
174169
rayv1ac.GcsFaultToleranceOptions().
175-
WithRedisAddress(redisAddress).
170+
WithRedisAddress(RedisAddress).
176171
WithRedisUsername(rayv1ac.RedisCredential().WithValue("default")).
177-
WithRedisPassword(rayv1ac.RedisCredential().WithValue(redisPassword)),
172+
WithRedisPassword(rayv1ac.RedisCredential().WithValue(RedisPassword)),
178173
),
179174
)
180175
},
181176
createSecret: false,
182177
},
183178
{
184179
name: "Redis Password In Secret",
185-
redisPassword: redisPassword,
180+
redisPassword: RedisPassword,
186181
rayClusterFn: func(namespace string) *rayv1ac.RayClusterApplyConfiguration {
187182
return rayv1ac.RayCluster("raycluster-gcsft", namespace).WithSpec(
188183
newRayClusterSpec().WithGcsFaultToleranceOptions(
189184
rayv1ac.GcsFaultToleranceOptions().
190-
WithRedisAddress(redisAddress).
185+
WithRedisAddress(RedisAddress).
191186
WithRedisPassword(rayv1ac.RedisCredential().
192187
WithValueFrom(corev1.EnvVarSource{
193188
SecretKeyRef: &corev1.SecretKeySelector{
@@ -209,7 +204,7 @@ func TestGcsFaultToleranceOptions(t *testing.T) {
209204
return rayv1ac.RayCluster("raycluster-with-a-very-long-name-exceeding-k8s-limit", namespace).WithSpec(
210205
newRayClusterSpec().WithGcsFaultToleranceOptions(
211206
rayv1ac.GcsFaultToleranceOptions().
212-
WithRedisAddress(redisAddress),
207+
WithRedisAddress(RedisAddress),
213208
),
214209
)
215210
},
@@ -223,7 +218,7 @@ func TestGcsFaultToleranceOptions(t *testing.T) {
223218
g := NewWithT(t)
224219
namespace := test.NewTestNamespace()
225220

226-
checkRedisDBSize := deployRedis(test, namespace.Name, tc.redisPassword)
221+
checkRedisDBSize := DeployRedis(test, namespace.Name, tc.redisPassword)
227222
defer g.Eventually(checkRedisDBSize, time.Second*30, time.Second).Should(BeEquivalentTo("0"))
228223

229224
if tc.createSecret {
@@ -283,18 +278,18 @@ func TestGcsFaultToleranceAnnotations(t *testing.T) {
283278
name: "GCS FT with redis password in ray start params",
284279
storageNS: "",
285280
redisPasswordEnv: "",
286-
redisPasswordInRayStartParams: redisPassword,
281+
redisPasswordInRayStartParams: RedisPassword,
287282
},
288283
{
289284
name: "GCS FT with redis password in ray start params referring to env",
290285
storageNS: "",
291-
redisPasswordEnv: redisPassword,
286+
redisPasswordEnv: RedisPassword,
292287
redisPasswordInRayStartParams: "$REDIS_PASSWORD",
293288
},
294289
{
295290
name: "GCS FT with storage namespace",
296291
storageNS: "test-storage-ns",
297-
redisPasswordEnv: redisPassword,
292+
redisPasswordEnv: RedisPassword,
298293
redisPasswordInRayStartParams: "$REDIS_PASSWORD",
299294
},
300295
}
@@ -315,13 +310,13 @@ func TestGcsFaultToleranceAnnotations(t *testing.T) {
315310
redisPassword = tc.redisPasswordInRayStartParams
316311
}
317312

318-
checkRedisDBSize := deployRedis(test, namespace.Name, redisPassword)
313+
checkRedisDBSize := DeployRedis(test, namespace.Name, redisPassword)
319314
defer g.Eventually(checkRedisDBSize, time.Second*30, time.Second).Should(BeEquivalentTo("0"))
320315

321316
// Prepare RayCluster ApplyConfiguration
322317
podTemplateAC := headPodTemplateApplyConfiguration()
323318
podTemplateAC.Spec.Containers[utils.RayContainerIndex].WithEnv(
324-
corev1ac.EnvVar().WithName("RAY_REDIS_ADDRESS").WithValue(redisAddress),
319+
corev1ac.EnvVar().WithName("RAY_REDIS_ADDRESS").WithValue(RedisAddress),
325320
)
326321
if tc.redisPasswordEnv != "" {
327322
podTemplateAC.Spec.Containers[utils.RayContainerIndex].WithEnv(

ray-operator/test/e2e/support.go

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package e2e
22

33
import (
44
"embed"
5-
"strings"
65

76
"github.com/stretchr/testify/require"
87
corev1 "k8s.io/api/core/v1"
@@ -176,40 +175,3 @@ func jobSubmitterPodTemplateApplyConfiguration() *corev1ac.PodTemplateSpecApplyC
176175
corev1.ResourceMemory: resource.MustParse("500Mi"),
177176
}))))
178177
}
179-
180-
func deployRedis(t Test, namespace string, password string) func() string {
181-
redisContainer := corev1ac.Container().WithName("redis").WithImage("redis:7.4").
182-
WithPorts(corev1ac.ContainerPort().WithContainerPort(6379))
183-
dbSizeCmd := []string{"redis-cli", "--no-auth-warning", "DBSIZE"}
184-
if password != "" {
185-
redisContainer.WithCommand("redis-server", "--requirepass", password)
186-
dbSizeCmd = []string{"redis-cli", "--no-auth-warning", "-a", password, "DBSIZE"}
187-
}
188-
189-
pod, err := t.Client().Core().CoreV1().Pods(namespace).Apply(
190-
t.Ctx(),
191-
corev1ac.Pod("redis", namespace).
192-
WithLabels(map[string]string{"app": "redis"}).
193-
WithSpec(corev1ac.PodSpec().WithContainers(redisContainer)),
194-
TestApplyOptions,
195-
)
196-
require.NoError(t.T(), err)
197-
198-
_, err = t.Client().Core().CoreV1().Services(namespace).Apply(
199-
t.Ctx(),
200-
corev1ac.Service("redis", namespace).
201-
WithSpec(corev1ac.ServiceSpec().
202-
WithSelector(map[string]string{"app": "redis"}).
203-
WithPorts(corev1ac.ServicePort().
204-
WithPort(6379),
205-
),
206-
),
207-
TestApplyOptions,
208-
)
209-
require.NoError(t.T(), err)
210-
211-
return func() string {
212-
stdout, stderr := ExecPodCmd(t, pod, "redis", dbSizeCmd)
213-
return strings.TrimSpace(stdout.String() + stderr.String())
214-
}
215-
}

ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"time"
88

99
"github.com/onsi/gomega"
10+
. "github.com/onsi/gomega"
1011
corev1 "k8s.io/api/core/v1"
1112
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1213
corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
@@ -500,3 +501,101 @@ func TestRayClusterAutoscalerPlacementGroup(t *testing.T) {
500501
}
501502
}
502503
}
504+
505+
func TestRayClusterAutoscalerGCSFT(t *testing.T) {
506+
for _, tc := range tests {
507+
t.Run(tc.name, func(t *testing.T) {
508+
test := With(t)
509+
g := gomega.NewWithT(t)
510+
511+
// Create a namespace
512+
namespace := test.NewTestNamespace()
513+
514+
// Scripts for creating and terminating detached actors to trigger autoscaling
515+
scriptsAC := newConfigMap(namespace.Name, files(test, "create_detached_actor.py", "terminate_detached_actor.py"))
516+
scripts, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), scriptsAC, TestApplyOptions)
517+
g.Expect(err).NotTo(gomega.HaveOccurred())
518+
LogWithTimestamp(test.T(), "Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name)
519+
520+
checkRedisDBSize := DeployRedis(test, namespace.Name, RedisPassword)
521+
defer g.Eventually(checkRedisDBSize, time.Second*60, time.Second).Should(BeEquivalentTo("0"))
522+
523+
rayClusterSpecAC := rayv1ac.RayClusterSpec().
524+
WithEnableInTreeAutoscaling(true).
525+
WithGcsFaultToleranceOptions(
526+
rayv1ac.GcsFaultToleranceOptions().
527+
WithRedisAddress(RedisAddress).
528+
WithRedisPassword(rayv1ac.RedisCredential().WithValue(RedisPassword)),
529+
).
530+
WithRayVersion(GetRayVersion()).
531+
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
532+
WithRayStartParams(map[string]string{
533+
"num-cpus": "0",
534+
}).
535+
WithTemplate(tc.HeadPodTemplateGetter()),
536+
).
537+
WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
538+
WithRayStartParams(map[string]string{
539+
"num-cpus": "1",
540+
}).
541+
WithGroupName("small-group").
542+
WithReplicas(0).
543+
WithMinReplicas(0).
544+
WithMaxReplicas(2).
545+
WithTemplate(tc.WorkerPodTemplateGetter()),
546+
)
547+
rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).
548+
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts")))
549+
550+
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions)
551+
g.Expect(err).NotTo(gomega.HaveOccurred())
552+
LogWithTimestamp(test.T(), "Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)
553+
554+
// Wait for RayCluster to become ready and verify the number of available worker replicas.
555+
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
556+
Should(gomega.WithTransform(RayClusterState, gomega.Equal(rayv1.Ready)))
557+
g.Expect(GetRayCluster(test, rayCluster.Namespace, rayCluster.Name)).To(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0))))
558+
559+
headPod, err := GetHeadPod(test, rayCluster)
560+
g.Expect(err).NotTo(gomega.HaveOccurred())
561+
LogWithTimestamp(test.T(), "Found head pod %s/%s", headPod.Namespace, headPod.Name)
562+
563+
// Create a detached actor, and a worker should be created.
564+
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "actor1"})
565+
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
566+
Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(1))))
567+
568+
// Terminate a detached actor, and a worker should be deleted.
569+
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "actor1"})
570+
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
571+
Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0))))
572+
573+
// Delete the head Pod
574+
err = test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{})
575+
g.Expect(err).NotTo(HaveOccurred())
576+
577+
PodUID := func(p *corev1.Pod) string { return string(p.UID) }
578+
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
579+
ShouldNot(WithTransform(PodUID, Equal(string(headPod.UID)))) // Use UID to check if the new head pod is created.
580+
581+
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
582+
Should(WithTransform(func(p *corev1.Pod) string { return string(p.Status.Phase) }, Equal("Running")))
583+
584+
headPod, err = GetHeadPod(test, rayCluster) // Replace the old head pod
585+
g.Expect(err).NotTo(HaveOccurred())
586+
587+
// Create a detached actor, and a worker should be created after the new head pod is ready.
588+
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "actor1"})
589+
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
590+
Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(1))))
591+
592+
// Terminate a detached actor, and a worker should be deleted.
593+
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "actor1"})
594+
g.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
595+
Should(gomega.WithTransform(RayClusterDesiredWorkerReplicas, gomega.Equal(int32(0))))
596+
597+
err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{})
598+
g.Expect(err).NotTo(HaveOccurred())
599+
})
600+
}
601+
}

ray-operator/test/support/redis.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package support
2+
3+
import (
4+
"strings"
5+
6+
"github.com/stretchr/testify/require"
7+
v1 "k8s.io/client-go/applyconfigurations/core/v1"
8+
)
9+
10+
func DeployRedis(t Test, namespace string, password string) func() string {
11+
redisContainer := v1.Container().WithName("redis").WithImage("redis:7.4").
12+
WithPorts(v1.ContainerPort().WithContainerPort(6379))
13+
dbSizeCmd := []string{"redis-cli", "--no-auth-warning", "DBSIZE"}
14+
if password != "" {
15+
redisContainer.WithCommand("redis-server", "--requirepass", password)
16+
dbSizeCmd = []string{"redis-cli", "--no-auth-warning", "-a", password, "DBSIZE"}
17+
}
18+
19+
pod, err := t.Client().Core().CoreV1().Pods(namespace).Apply(
20+
t.Ctx(),
21+
v1.Pod("redis", namespace).
22+
WithLabels(map[string]string{"app": "redis"}).
23+
WithSpec(v1.PodSpec().WithContainers(redisContainer)),
24+
TestApplyOptions,
25+
)
26+
require.NoError(t.T(), err)
27+
28+
_, err = t.Client().Core().CoreV1().Services(namespace).Apply(
29+
t.Ctx(),
30+
v1.Service("redis", namespace).
31+
WithSpec(v1.ServiceSpec().
32+
WithSelector(map[string]string{"app": "redis"}).
33+
WithPorts(v1.ServicePort().
34+
WithPort(6379),
35+
),
36+
),
37+
TestApplyOptions,
38+
)
39+
require.NoError(t.T(), err)
40+
41+
return func() string {
42+
stdout, stderr := ExecPodCmd(t, pod, "redis", dbSizeCmd)
43+
return strings.TrimSpace(stdout.String() + stderr.String())
44+
}
45+
}
46+
47+
const (
48+
RedisAddress = "redis:6379"
49+
RedisPassword = "5241590000000000"
50+
)

0 commit comments

Comments
 (0)