Skip to content

Commit 02a215c

Browse files
WIP
1 parent 4a5a050 commit 02a215c

File tree

8 files changed

+143
-99
lines changed

8 files changed

+143
-99
lines changed

Makefile

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,15 +118,11 @@ test/e2e/%: bin/cockroach bin/kubectl bin/helm build/self-signer test/cluster/up
118118
$(MAKE) test/cluster/down; \
119119
exit $${EXIT_CODE:-0}
120120

121-
test/e2e/multi-region: bin/cockroach bin/kubectl bin/helm build/self-signer test/single-cluster/up
122-
@PATH="$(PWD)/bin:${PATH}" go test -timeout 60m -v -test.run TestOperatorInMultiRegion ./tests/e2e/operator/multiRegion/... || EXIT_CODE=$$?; \
123-
$(MAKE) test/multi-cluster/down; \
124-
exit $${EXIT_CODE:-0}
121+
test/e2e/multi-region: bin/cockroach bin/kubectl bin/helm build/self-signer
122+
@PATH="$(PWD)/bin:${PATH}" go test -timeout 60m -v -test.run TestOperatorInMultiRegion ./tests/e2e/operator/multiRegion/... || EXIT_CODE=$$?;
125123

126-
test/e2e/single-region: bin/cockroach bin/kubectl bin/helm build/self-signer test/single-cluster/up
127-
@PATH="$(PWD)/bin:${PATH}" go test -timeout 60m -v -test.run TestOperatorInSingleRegion ./tests/e2e/operator/singleRegion/... || EXIT_CODE=$$?; \
128-
$(MAKE) test/multi-cluster/down; \
129-
exit $${EXIT_CODE:-0}
124+
test/e2e/single-region: bin/cockroach bin/kubectl bin/helm build/self-signer
125+
@PATH="$(PWD)/bin:${PATH}" go test -timeout 60m -v -test.run TestOperatorInSingleRegion ./tests/e2e/operator/singleRegion/... || EXIT_CODE=$$?;
130126

131127
test/e2e/migrate: bin/cockroach bin/kubectl bin/helm bin/migration-helper build/self-signer test/cluster/up/3
132128
@PATH="$(PWD)/bin:${PATH}" go test -timeout 30m -v ./tests/e2e/migrate/... || EXIT_CODE=$$?; \
@@ -137,7 +133,7 @@ test/single-cluster/up: bin/k3d
137133
./tests/k3d/dev-multi-cluster.sh up --name "$(K3D_CLUSTER)" --nodes $(MULTI_REGION_NODE_SIZE) --clusters 1
138134

139135
test/multi-cluster/down: bin/k3d
140-
./tests/k3d/dev-multi-cluster.sh down --name "$(K3D_CLUSTER)" --nodes $(MULTI_REGION_NODE_SIZE) --clusters $(REGIONS)
136+
./tests/k3d/dev-multi-cluster.sh down
141137

142138
test/nightly-e2e/single-region: bin/cockroach bin/kubectl bin/helm build/self-signer
143139
@PATH="$(PWD)/bin:${PATH}" go test -timeout 60m -v -test.run TestOperatorInSingleRegion ./tests/e2e/operator/singleRegion/... || EXIT_CODE=$$?; \

tests/e2e/operator/infra/common.go

Lines changed: 4 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package infra
22

33
import (
4-
"context"
54
"fmt"
65
"net"
76
"os/exec"
@@ -56,11 +55,11 @@ const (
5655

5756
// RegionCodes maps provider types to their region codes
5857
var RegionCodes = map[string][]string{
59-
ProviderK3D: {"us-east1", "us-east2", "us-west1"},
60-
ProviderKind: {"us-east1", "us-east2", "us-west1"},
58+
ProviderK3D: {"us-east1", "us-east2"},
59+
ProviderKind: {"us-east1", "us-east2"},
6160
ProviderGCP: {"us-central1", "us-east1", "us-west1"},
62-
ProviderAzure: {"centralus", "eastus", "westus"},
63-
ProviderAWS: {"us-east-1", "us-east-2", "us-west-1"},
61+
ProviderAzure: {"centralus", "eastus"},
62+
ProviderAWS: {"us-east-1", "us-east-2"},
6463
}
6564

6665
// LoadBalancerAnnotations contains provider-specific service annotations
@@ -355,23 +354,3 @@ func UpdateKubeconfigAzure(t *testing.T, resourceGroup, clusterName string) erro
355354
}
356355
return nil
357356
}
358-
359-
// WaitWithTimeout waits for a condition to be true with a timeout
360-
func WaitWithTimeout(ctx context.Context, interval, timeout time.Duration, condition func() (bool, error)) error {
361-
deadline := time.Now().Add(timeout)
362-
for time.Now().Before(deadline) {
363-
success, err := condition()
364-
if err != nil {
365-
return err
366-
}
367-
if success {
368-
return nil
369-
}
370-
select {
371-
case <-ctx.Done():
372-
return ctx.Err()
373-
case <-time.After(interval):
374-
}
375-
}
376-
return fmt.Errorf("timed out after %s", timeout)
377-
}

tests/e2e/operator/infra/gcp.go

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ const (
3333
webhookFirewallRuleName = "allow-9443-port-for-webhook"
3434
internalFirewallRuleName = "allow-internal"
3535
defaultNodePool = "default-pool" // default node pool name for GKE clusters
36-
subnetSuffix = "subnet-1" // suffix for dynamically created subnets
36+
subnetSuffix = "subnet" // suffix for dynamically created subnets
3737
)
3838

3939
// Helper functions to get network configuration from common.go
@@ -252,14 +252,60 @@ func (r *GcpRegion) TeardownInfra(t *testing.T) {
252252
computeService, err := createComputeServiceClient(ctx)
253253
require.NoError(t, err)
254254

255-
// 1) Delete GKE clusters via gcloud (ensures proper cleanup of node pools, etc.)
255+
gkeService, err := createGKEServiceClient(ctx)
256+
require.NoError(t, err)
257+
258+
// 1) Delete GKE clusters
256259
for _, cfg := range clusterConfigurations[:len(r.Clusters)] {
257260
t.Logf("[%s] Deleting GKE cluster '%s'", ProviderGCP, cfg.ClusterName)
261+
262+
// Check for ongoing operations
263+
clusterPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", projectID, cfg.Region, cfg.ClusterName)
264+
cluster, err := gkeService.Projects.Locations.Clusters.Get(clusterPath).Context(ctx).Do()
265+
if err != nil {
266+
if IsResourceNotFound(err) {
267+
t.Logf("[%s] Cluster %s already deleted", ProviderGCP, cfg.ClusterName)
268+
continue
269+
}
270+
t.Logf("[%s] Warning: error checking cluster %s status: %v", ProviderGCP, cfg.ClusterName, err)
271+
continue
272+
}
273+
274+
// If cluster is in a transitioning state, wait for current operation to complete
275+
if cluster.Status != "RUNNING" && cluster.Status != "ERROR" {
276+
t.Logf("[%s] Cluster %s is in %s state, waiting for operation to complete...", ProviderGCP, cfg.ClusterName, cluster.Status)
277+
if cluster.CurrentMasterVersion != "" { // Check if there's an ongoing operation
278+
err = waitForGKEOperation(gkeService, cluster.CurrentMasterVersion, cfg.Region, "")
279+
if err != nil {
280+
t.Logf("[%s] Warning: error waiting for operation on cluster %s: %v", ProviderGCP, cfg.ClusterName, err)
281+
}
282+
}
283+
}
284+
285+
// Now try to delete the cluster
258286
delCmd := exec.Command("gcloud", "container", "clusters", "delete", cfg.ClusterName,
259-
"--region", cfg.Region, "--project", projectID, "--quiet")
287+
"--region", cfg.Region, "--project", projectID, "--quiet", "--async")
260288
delCmd.Stdout = os.Stdout
261289
delCmd.Stderr = os.Stderr
262-
_ = delCmd.Run()
290+
if err := delCmd.Run(); err != nil {
291+
t.Logf("[%s] Warning: error initiating deletion of cluster %s: %v", ProviderGCP, cfg.ClusterName, err)
292+
}
293+
}
294+
295+
// Wait for all cluster deletions to complete
296+
for _, cfg := range clusterConfigurations[:len(r.Clusters)] {
297+
clusterPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", projectID, cfg.Region, cfg.ClusterName)
298+
for retries := 0; retries < 10; retries++ {
299+
_, err := gkeService.Projects.Locations.Clusters.Get(clusterPath).Context(ctx).Do()
300+
if IsResourceNotFound(err) {
301+
t.Logf("[%s] Confirmed deletion of cluster %s", ProviderGCP, cfg.ClusterName)
302+
break
303+
}
304+
if retries == 9 {
305+
t.Logf("[%s] Warning: timed out waiting for cluster %s deletion", ProviderGCP, cfg.ClusterName)
306+
}
307+
time.Sleep(30 * time.Second)
308+
}
263309
}
264310

265311
// 2) Delete static IPs (unreserve)
@@ -296,22 +342,23 @@ func (r *GcpRegion) TeardownInfra(t *testing.T) {
296342
t.Logf("[%s] Infrastructure teardown completed", ProviderGCP)
297343
}
298344

345+
// This is a no-op right now
299346
func (r *GcpRegion) ScaleNodePool(t *testing.T, location string, nodeCount, index int) {
300-
t.Logf("[%s] Scaling node pool for cluster '%s' to %d nodes", ProviderGCP, clusterConfigurations[index].ClusterName, nodeCount)
301-
302-
ctx := context.Background()
303-
gkeService, err := createGKEServiceClient(ctx)
304-
require.NoError(t, err, "failed to create GKE client")
305-
306-
scaleOp, err := scaleNodePool(ctx, gkeService, projectID, location, clusterConfigurations[index].ClusterName, defaultNodePool, int64(nodeCount))
307-
require.NoError(t, err, "error initiating scaling for node pool")
308-
309-
err = waitForGKEOperation(gkeService, scaleOp.Name, location, "")
310-
if err != nil {
311-
t.Logf("[%s] Error during scaling operation for node pool '%s': %v", ProviderGCP, defaultNodePool, err)
312-
} else {
313-
t.Logf("[%s] Successfully scaled node pool '%s' to %d nodes", ProviderGCP, defaultNodePool, nodeCount)
314-
}
347+
//t.Logf("[%s] Scaling node pool for cluster '%s' to %d nodes", ProviderGCP, clusterConfigurations[index].ClusterName, nodeCount)
348+
//
349+
//ctx := context.Background()
350+
//gkeService, err := createGKEServiceClient(ctx)
351+
//require.NoError(t, err, "failed to create GKE client")
352+
//
353+
//scaleOp, err := scaleNodePool(ctx, gkeService, projectID, location, clusterConfigurations[index].ClusterName, defaultNodePool, int64(1))
354+
//require.NoError(t, err, "error initiating scaling for node pool")
355+
//
356+
//err = waitForGKEOperation(gkeService, scaleOp.Name, location, "")
357+
//if err != nil {
358+
// t.Logf("[%s] Error during scaling operation for node pool '%s': %v", ProviderGCP, defaultNodePool, err)
359+
//} else {
360+
// t.Logf("[%s] Successfully scaled node pool '%s' to %d nodes", ProviderGCP, defaultNodePool, nodeCount)
361+
//}
315362
}
316363

317364
// getServiceAccountKeyPath returns the path to the service account key file
@@ -490,7 +537,10 @@ func createGKERegionalCluster(ctx context.Context, client *container.Service, se
490537
"--tags", strings.Join([]string{defaultNodeTag}, ","), // Join tags if there are multiple
491538
"--enable-master-authorized-networks",
492539
"--master-authorized-networks", strings.Join([]string{"0.0.0.0/0"}, ","),
493-
"--num-nodes", fmt.Sprint(DefaultNodesPerZone), // For regional, this is total nodes spread across 3 zones by default
540+
"--num-nodes", fmt.Sprint(DefaultNodesPerZone),
541+
"--min-nodes", fmt.Sprint(DefaultNodesPerZone),
542+
"--max-nodes", fmt.Sprint(DefaultNodesPerZone + 1), // Needed for scaling cluster
543+
"--enable-autoscaling", // Enable autoscaling
494544
"--autoprovisioning-network-tags", strings.Join([]string{defaultNodeTag}, ","),
495545
"--machine-type", GCPDefaultMachineType,
496546
"--quiet", // Suppress interactive prompts

tests/e2e/operator/infra/k3d.go

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,11 @@ import (
77
"testing"
88
"time"
99

10-
"github.com/gruntwork-io/terratest/modules/random"
11-
1210
"github.com/cockroachdb/errors"
1311
"github.com/cockroachdb/helm-charts/tests/e2e/calico"
1412
"github.com/cockroachdb/helm-charts/tests/e2e/coredns"
1513
"github.com/cockroachdb/helm-charts/tests/e2e/operator"
14+
"github.com/cockroachdb/helm-charts/tests/testutil"
1615
"github.com/gruntwork-io/terratest/modules/k8s"
1716
"github.com/gruntwork-io/terratest/modules/retry"
1817
"github.com/gruntwork-io/terratest/modules/shell"
@@ -32,7 +31,23 @@ type K3dRegion struct {
3231

3332
// TeardownInfra cleans up all resources created by SetUpInfra
3433
func (r *K3dRegion) TeardownInfra(t *testing.T) {
35-
t.Logf("[%s] K3D teardown not implemented - clusters will be cleaned up by the test framework", ProviderK3D)
34+
t.Logf("[%s] Tearing down K3D infrastructure", ProviderK3D)
35+
36+
cmd := shell.Command{
37+
Command: "make",
38+
Args: []string{
39+
"test/multi-cluster/down",
40+
},
41+
WorkingDir: testutil.GetGitRoot(),
42+
}
43+
44+
output, err := shell.RunCommandAndGetOutputE(t, cmd)
45+
if err != nil {
46+
t.Logf("[%s] Warning: Failed to tear down K3D clusters: %v\nOutput: %s",
47+
ProviderK3D, err, output)
48+
} else {
49+
t.Logf("[%s] Successfully tore down K3D clusters", ProviderK3D)
50+
}
3651
}
3752

3853
// ScaleNodePool scales the node pool in a K3D cluster
@@ -59,11 +74,10 @@ func (r *K3dRegion) SetUpInfra(t *testing.T) {
5974
for i, cluster := range r.Clusters {
6075
if _, ok := rawConfig.Contexts[cluster]; !ok {
6176
// Create cluster using shell command.
62-
err := createK3DCluster(t)
77+
err := createK3DCluster(t, cluster, r.NodeCount)
6378
require.NoError(t, err)
6479
}
6580

66-
r.Namespace[cluster] = fmt.Sprintf("%s-%s", operator.Namespace, strings.ToLower(random.UniqueId()))
6781
cfg, err := config.GetConfigWithContext(cluster)
6882
require.NoError(t, err)
6983
k8sClient, err := client.New(cfg, client.Options{})
@@ -212,14 +226,16 @@ func (r *K3dRegion) setupNetworking(t *testing.T, ctx context.Context, region st
212226
// createK3DCluster creates a new k3d cluster
213227
// by calling the make command which will create
214228
// single k3d cluster.
215-
func createK3DCluster(t *testing.T) error {
216-
t.Logf("[%s] Creating new K3D cluster", ProviderK3D)
229+
func createK3DCluster(t *testing.T, clusterName string, nodeCount int) error {
230+
t.Logf("[%s] Creating new K3D cluster: %s with %d nodes", ProviderK3D, clusterName, nodeCount)
217231
cmd := shell.Command{
218232
Command: "make",
219233
Args: []string{
220234
"test/single-cluster/up",
235+
fmt.Sprintf("name=%s", strings.TrimLeft(clusterName, "k3d-")),
236+
fmt.Sprintf("nodes=%d", nodeCount),
221237
},
222-
WorkingDir: "../../../../..",
238+
WorkingDir: testutil.GetGitRoot(),
223239
}
224240

225241
output, err := shell.RunCommandAndGetOutputE(t, cmd)
@@ -228,6 +244,6 @@ func createK3DCluster(t *testing.T) error {
228244
return fmt.Errorf("failed to create cluster: %v\nOutput: %s", err, output)
229245
}
230246

231-
t.Logf("[%s] Successfully created new K3D cluster", ProviderK3D)
247+
t.Logf("[%s] Successfully created new K3D cluster: %s", ProviderK3D, clusterName)
232248
return nil
233249
}

tests/e2e/operator/infra/provider.go

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,19 @@ import (
77
)
88

99
// CloudProvider defines the interface that all cloud providers must implement
10+
// Some methods are optional - providers that don't support certain operations
11+
// can implement them as no-ops with appropriate logging
1012
type CloudProvider interface {
1113
// SetUpInfra creates the necessary infrastructure for the tests
14+
// This is the only required method for all providers
1215
SetUpInfra(t *testing.T)
13-
}
1416

15-
// CloudProviderWithTeardown extends CloudProvider with teardown capability
16-
type CloudProviderWithTeardown interface {
17-
CloudProvider
1817
// TeardownInfra cleans up all resources created by SetUpInfra
18+
// Optional: providers that don't support teardown can implement as no-op
1919
TeardownInfra(t *testing.T)
20-
}
2120

22-
// CloudProviderWithScaling extends CloudProvider with scaling capability
23-
type CloudProviderWithScaling interface {
24-
CloudProvider
2521
// ScaleNodePool scales the node pool in a cluster
22+
// Optional: providers that don't support scaling can implement as no-op
2623
ScaleNodePool(t *testing.T, location string, nodeCount, index int)
2724
}
2825

@@ -55,17 +52,27 @@ func ProviderFactory(providerType string, region *operator.Region) CloudProvider
5552
}
5653

5754
// CanTeardown checks if the provider supports teardown
58-
func CanTeardown(provider CloudProvider) (CloudProviderWithTeardown, bool) {
59-
if p, ok := provider.(CloudProviderWithTeardown); ok {
60-
return p, true
55+
// This function is kept for backward compatibility
56+
func CanTeardown(provider CloudProvider) (CloudProvider, bool) {
57+
// Check if the TeardownInfra method is a no-op implementation
58+
// Kind and K3D providers have no-op implementations
59+
switch provider.(type) {
60+
case *KindRegion:
61+
return nil, false
62+
default:
63+
return provider, true
6164
}
62-
return nil, false
6365
}
6466

6567
// CanScale checks if the provider supports scaling
66-
func CanScale(provider CloudProvider) (CloudProviderWithScaling, bool) {
67-
if p, ok := provider.(CloudProviderWithScaling); ok {
68-
return p, true
68+
// This function is kept for backward compatibility
69+
func CanScale(provider CloudProvider) (CloudProvider, bool) {
70+
// Check if the ScaleNodePool method is a no-op implementation
71+
// GCP, Kind and K3D providers have no-op implementations
72+
switch provider.(type) {
73+
case *K3dRegion, *KindRegion, *GcpRegion:
74+
return nil, false
75+
default:
76+
return provider, true
6977
}
70-
return nil, false
7178
}

tests/e2e/operator/multiRegion/cockroachdb_multi_region_e2e_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ func TestOperatorInMultiRegion(t *testing.T) {
5252
provider := provider // Create new variable to avoid closure issues
5353
t.Run(provider, func(t *testing.T) {
5454
r.Provider = provider
55-
r.Clusters = append(r.Clusters, fmt.Sprintf("%s-%s", r.Provider, operator.Clusters[0]))
55+
for _, cluster := range operator.Clusters {
56+
r.Clusters = append(r.Clusters, fmt.Sprintf("%s-%s", r.Provider, cluster))
57+
}
5658

5759
t.Run("TestHelmInstall", r.TestHelmInstall)
5860
t.Run("TestHelmUpgrade", r.TestHelmUpgrade)
@@ -374,7 +376,7 @@ func (r *multiRegion) TestClusterScaleUp(t *testing.T) {
374376
// Modify the nodes in each region and apply helm upgrade.
375377
for i, cluster := range r.Clusters {
376378
kubectlOptions := k8s.NewKubectlOptions(cluster, kubeConfig, r.Namespace[cluster])
377-
r.NodeCount = 4
379+
r.NodeCount += 1
378380

379381
// Scale the node pool in the cloud infrastructure
380382
r.scaleNodePool(t, r.RegionCodes[i], r.NodeCount, i)
@@ -395,10 +397,6 @@ func (r *multiRegion) TestClusterScaleUp(t *testing.T) {
395397
DesiredNodes: r.NodeCount,
396398
}
397399
testutil.RequireCRDBClusterToBeReadyEventuallyTimeout(t, kubectlOptions, crdbCluster, 600*time.Second)
398-
pods := k8s.ListPods(t, kubectlOptions, metav1.ListOptions{
399-
LabelSelector: operator.LabelSelector,
400-
})
401-
require.True(t, len(pods) == 4)
402400
// Validate CockroachDB cluster.
403401
r.ValidateCRDB(t, cluster)
404402
}

0 commit comments

Comments (0)