diff --git a/.gopls.yaml b/.gopls.yaml new file mode 100644 index 0000000000..77391906da --- /dev/null +++ b/.gopls.yaml @@ -0,0 +1,5 @@ +build: + buildFlags: + - "-tags=e2e" + env: + CGO_ENABLED: "1" \ No newline at end of file diff --git a/Makefile b/Makefile index b04628791a..de0468ddf2 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,7 @@ TEST_E2E_DIR := test/e2e # Files E2E_DATA_DIR ?= $(REPO_ROOT)/test/e2e/data E2E_CONF_PATH ?= $(E2E_DATA_DIR)/e2e_conf.yaml +E2E_CONF_PATH_HCP ?= $(E2E_DATA_DIR)/e2e_conf_hcp.yaml KUBETEST_CONF_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/conformance.yaml) KUBETEST_FAST_CONF_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/conformance-fast.yaml) GO_INSTALL := ./scripts/go_install.sh @@ -184,7 +185,10 @@ e2e-templates: $(addprefix $(E2E_NO_ARTIFACT_TEMPLATES_DIR)/, \ cluster-template-flatcar.yaml \ cluster-template-k8s-upgrade.yaml \ cluster-template-flatcar-sysext.yaml \ - cluster-template-no-bastion.yaml) + cluster-template-no-bastion.yaml \ + cluster-template-hcp-management.yaml \ + cluster-template-hcp-workload.yaml \ + cluster-template-hcp-broken.yaml) # Currently no templates that require CI artifacts # $(addprefix $(E2E_TEMPLATES_DIR)/, add-templates-here.yaml) \ @@ -205,7 +209,7 @@ test-e2e: $(GINKGO) e2e-prerequisites ## Run e2e tests time $(GINKGO) -fail-fast -trace -timeout=3h -show-node-events -v -tags=e2e -nodes=$(E2E_GINKGO_PARALLEL) \ --output-dir="$(ARTIFACTS)" --junit-report="junit.e2e_suite.1.xml" \ -focus="$(E2E_GINKGO_FOCUS)" $(_SKIP_ARGS) $(E2E_GINKGO_ARGS) ./test/e2e/suites/e2e/... -- \ - -config-path="$(E2E_CONF_PATH)" -artifacts-folder="$(ARTIFACTS)" \ + -config-path="$(E2E_CONF_PATH_HCP)" -artifacts-folder="$(ARTIFACTS)" \ -data-folder="$(E2E_DATA_DIR)" $(E2E_ARGS) # Pre-compile tests @@ -215,7 +219,7 @@ build-e2e-tests: $(GINKGO) $(GINKGO) build -tags=e2e ./test/e2e/suites/e2e/... .PHONY: e2e-image -e2e-image: CONTROLLER_IMG_TAG = "gcr.io/k8s-staging-capi-openstack/capi-openstack-controller:e2e" +e2e-image: CONTROLLER_IMG_TAG = "ghcr.io/orkhanorganization/k8s-staging-capi-openstack/capi-openstack-controller:e2e" e2e-image: docker-build # Pull all the images references in test/e2e/data/e2e_conf.yaml @@ -236,6 +240,18 @@ test-conformance: $(GINKGO) e2e-prerequisites ## Run clusterctl based conformanc test-conformance-fast: ## Run clusterctl based conformance test on workload cluster (requires Docker) using a subset of the conformance suite in parallel. $(MAKE) test-conformance CONFORMANCE_E2E_ARGS="-kubetest.config-file=$(KUBETEST_FAST_CONF_PATH) -kubetest.ginkgo-nodes=5 $(E2E_ARGS)" +HCP_E2E_ARGS ?= +HCP_E2E_ARGS += $(E2E_ARGS) +.PHONY: test-hcp +test-hcp: $(GINKGO) e2e-prerequisites ## Run HCP (Hosted Control Plane) e2e tests + time $(GINKGO) -fail-fast -trace -timeout=3h -show-node-events -v -tags=e2e -nodes=$(E2E_GINKGO_PARALLEL) \ + --output-dir="$(ARTIFACTS)" --junit-report="junit.hcp_suite.1.xml" \ + -focus="$(E2E_GINKGO_FOCUS)" $(_SKIP_ARGS) $(E2E_GINKGO_ARGS) ./test/e2e/suites/hcp/... 
-- \ + -config-path="$(E2E_CONF_PATH_HCP)" -artifacts-folder="$(ARTIFACTS)" \ + -data-folder="$(E2E_DATA_DIR)" $(HCP_E2E_ARGS) + + + APIDIFF_OLD_COMMIT ?= $(shell git rev-parse origin/main) .PHONY: apidiff diff --git a/cleanup-hcp-test.sh b/cleanup-hcp-test.sh new file mode 100755 index 0000000000..764a0a646b --- /dev/null +++ b/cleanup-hcp-test.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Cleanup script for failed HCP test resources + +# Set OpenStack environment +export OS_CLOUD=openstack +export OS_CLOUD_YAML_FILE=/Users/bnr/work/openstack/clouds.yaml + +echo "🧹 Cleaning up HCP test resources..." + +# Delete specific leftover bastion instances from a previous failed test run (adjust names as needed) +echo "Deleting OpenStack instances..." +openstack server delete hcp-mgmt-hcp-mgmt-hcp-1752250951-al4m23-bastion +openstack server delete hcp-mgmt-hcp-mgmt-hcp-1752250951-al4m23-control-plane-sinpp5-bastion + +# Clean up any floating IPs that might be allocated +echo "Cleaning up floating IPs..." +openstack floating ip list --status DOWN -f value -c ID | xargs -r openstack floating ip delete + +# Clean up security groups (if any were created) +echo "Cleaning up security groups..." +openstack security group list --project $(openstack token issue -c project_id -f value) | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack security group delete + +# Clean up keypairs (if any were created) +echo "Cleaning up keypairs..." +openstack keypair list | grep "cluster-api-provider-openstack-sigs-k8s-io" | awk '{print $2}' | xargs -r openstack keypair delete + +# Clean up networks and subnets (if any were created) +echo "Cleaning up networks..." +openstack network list | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack network delete + +# Clean up any volumes +echo "Cleaning up volumes..." +openstack volume list --status available | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack volume delete + +echo "✅ Cleanup completed!" +echo "" +echo "Manual verification commands:" +echo "openstack server list" +echo "openstack floating ip list" +echo "openstack security group list" +echo "openstack network list" \ No newline at end of file diff --git a/cleanup-proper-order.sh b/cleanup-proper-order.sh new file mode 100755 index 0000000000..dbd63210d2 --- /dev/null +++ b/cleanup-proper-order.sh @@ -0,0 +1,100 @@ +#!/bin/bash +# Proper OpenStack cleanup script - deletes in correct dependency order + +export OS_CLOUD=openstack +export OS_CLOUD_YAML_FILE=/Users/bnr/work/openstack/clouds.yaml + +echo "🧹 Starting proper OpenStack cleanup (dependency order)..." + +# 1. First, delete servers (instances) +echo "=== Step 1: Deleting Servers ===" +openstack server list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting server: $name ($id)" + openstack server delete "$id" || true +done + +# Wait a bit for servers to be deleted +echo "Waiting for servers to be deleted..." +sleep 10 + +# 2. Delete floating IPs +echo "=== Step 2: Deleting Floating IPs ===" +openstack floating ip list -f value -c ID | xargs -r -I {} bash -c 'echo "Deleting floating IP: {}"; openstack floating ip delete {} || true' + +# 3.
Delete load balancers (if any) +echo "=== Step 3: Deleting Load Balancers ===" +openstack loadbalancer list -f value -c id -c name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting loadbalancer: $name ($id)" + openstack loadbalancer delete "$id" --cascade || true +done + +# Wait for load balancers to be deleted +echo "Waiting for load balancers to be deleted..." +sleep 15 + +# 4. Delete router interfaces and routers +echo "=== Step 4: Deleting Routers ===" +openstack router list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Processing router: $name ($id)" + + # First remove all interfaces from the router + echo " Removing interfaces from router $name" + openstack port list --router "$id" -f value -c ID | while read port_id; do + echo " Removing interface $port_id" + openstack router remove port "$id" "$port_id" || true + done + + # Then delete the router + echo " Deleting router $name" + openstack router delete "$id" || true +done + +# 5. Delete ports +echo "=== Step 5: Deleting Ports ===" +openstack port list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting port: $name ($id)" + openstack port delete "$id" || true +done + +# 6. Delete subnets +echo "=== Step 6: Deleting Subnets ===" +openstack subnet list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting subnet: $name ($id)" + openstack subnet delete "$id" || true +done + +# 7. Finally, delete networks +echo "=== Step 7: Deleting Networks ===" +openstack network list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting network: $name ($id)" + openstack network delete "$id" || true +done + +# 8. Delete security groups +echo "=== Step 8: Deleting Security Groups ===" +openstack security group list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting security group: $name ($id)" + openstack security group delete "$id" || true +done + +# 9. Delete keypairs +echo "=== Step 9: Deleting Keypairs ===" +openstack keypair list -f value -c Name | grep -E "(hcp|e2e|cluster-api)" | while read name; do + echo "Deleting keypair: $name" + openstack keypair delete "$name" || true +done + +# 10. Delete volumes +echo "=== Step 10: Deleting Volumes ===" +openstack volume list --status available -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do + echo "Deleting volume: $name ($id)" + openstack volume delete "$id" || true +done + +echo "โœ… Cleanup completed!" 
+echo "" +echo "Verification commands:" +echo "openstack server list" +echo "openstack network list | grep -E '(hcp|e2e|cluster-api)'" +echo "openstack router list | grep -E '(hcp|e2e|cluster-api)'" +echo "openstack security group list | grep -E '(hcp|e2e|cluster-api)'" \ No newline at end of file diff --git a/cleanup_openstack.sh b/cleanup_openstack.sh new file mode 100755 index 0000000000..5ea61ca7fc --- /dev/null +++ b/cleanup_openstack.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +echo "=== Cleaning up Security Groups ===" +openstack security group list -f value -c ID -c Name | grep -E "(hcp|k8s-cluster.*e2e)" | awk '{print $1}' | xargs -I {} openstack security group delete {} + +echo "=== Cleaning up Load Balancers ===" +openstack loadbalancer list -f value -c id -c name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack loadbalancer delete {} || true + +echo "=== Cleaning up Routers ===" +openstack router list -f value -c ID -c Name | grep -E "(hcp|e2e)" | while read router_id router_name; do + echo "Cleaning router: $router_name ($router_id)" + # Remove external gateway + openstack router unset --external-gateway $router_id 2>/dev/null || true + # Remove all ports + openstack port list --router $router_id -f value -c ID | xargs -I {} openstack router remove port $router_id {} 2>/dev/null || true + # Delete router + openstack router delete $router_id +done + +echo "=== Cleaning up Subnets ===" +openstack subnet list -f value -c ID -c Name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack subnet delete {} + +echo "=== Cleaning up Networks ===" +openstack network list -f value -c ID -c Name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack network delete {} + +echo "=== Cleaning up Floating IPs ===" +openstack floating ip list -f value -c ID -c Description | grep -E "(hcp|e2e|cluster)" | awk '{print $1}' | xargs -I {} openstack floating ip delete {} || true + +echo "=== Cleanup Complete ===" diff --git a/controllers/openstackcluster_controller.go b/controllers/openstackcluster_controller.go index 02ebb6dfd3..b9771f01f4 100644 --- a/controllers/openstackcluster_controller.go +++ b/controllers/openstackcluster_controller.go @@ -497,7 +497,10 @@ func (r *OpenStackClusterReconciler) reconcileBastionServer(ctx context.Context, } // If the bastion is found but the spec has changed, we need to delete it and reconcile. - bastionServerSpec := bastionToOpenStackServerSpec(openStackCluster) + bastionServerSpec, err := bastionToOpenStackServerSpec(openStackCluster) + if err != nil { + return nil, true, err + } if !bastionNotFound && server != nil && !apiequality.Semantic.DeepEqual(bastionServerSpec, &server.Spec) { scope.Logger().Info("Bastion spec has changed, re-creating the OpenStackServer object") if err := r.deleteBastion(ctx, scope, cluster, openStackCluster); err != nil { @@ -543,7 +546,10 @@ func (r *OpenStackClusterReconciler) getBastionServer(ctx context.Context, openS // createBastionServer creates the OpenStackServer object for the bastion server. // It returns the OpenStackServer object and an error if any. 
func (r *OpenStackClusterReconciler) createBastionServer(ctx context.Context, openStackCluster *infrav1.OpenStackCluster, cluster *clusterv1.Cluster) (*infrav1alpha1.OpenStackServer, error) { - bastionServerSpec := bastionToOpenStackServerSpec(openStackCluster) + bastionServerSpec, err := bastionToOpenStackServerSpec(openStackCluster) + if err != nil { + return nil, err + } bastionServer := &infrav1alpha1.OpenStackServer{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -571,7 +577,7 @@ func (r *OpenStackClusterReconciler) createBastionServer(ctx context.Context, op // bastionToOpenStackServerSpec converts the OpenStackMachineSpec for the bastion to an OpenStackServerSpec. // It returns the OpenStackServerSpec and an error if any. -func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) *infrav1alpha1.OpenStackServerSpec { +func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) (*infrav1alpha1.OpenStackServerSpec, error) { bastion := openStackCluster.Spec.Bastion if bastion == nil { bastion = &infrav1.Bastion{} @@ -586,9 +592,12 @@ func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) *i if bastion.AvailabilityZone != nil { az = *bastion.AvailabilityZone } - openStackServerSpec := openStackMachineSpecToOpenStackServerSpec(bastion.Spec, openStackCluster.Spec.IdentityRef, compute.InstanceTags(bastion.Spec, openStackCluster), az, nil, getBastionSecurityGroupID(openStackCluster), openStackCluster.Status.Network.ID) + openStackServerSpec, err := openStackMachineSpecToOpenStackServerSpec(bastion.Spec, openStackCluster.Spec.IdentityRef, compute.InstanceTags(bastion.Spec, openStackCluster), az, nil, getBastionSecurityGroupID(openStackCluster), openStackCluster.Status.Network) + if err != nil { + return nil, err + } - return openStackServerSpec + return openStackServerSpec, nil } func bastionName(clusterResourceName string) string { diff --git a/controllers/openstackmachine_controller.go b/controllers/openstackmachine_controller.go index e3ac95ecef..9bce56df4b 100644 --- a/controllers/openstackmachine_controller.go +++ b/controllers/openstackmachine_controller.go @@ -479,7 +479,18 @@ func (r *OpenStackMachineReconciler) getMachineServer(ctx context.Context, openS // openStackMachineSpecToOpenStackServerSpec converts an OpenStackMachineSpec to an OpenStackServerSpec. // It returns the OpenStackServerSpec object and an error if there is any. -func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.OpenStackMachineSpec, identityRef infrav1.OpenStackIdentityReference, tags []string, failureDomain string, userDataRef *corev1.LocalObjectReference, defaultSecGroup *string, defaultNetworkID string) *infrav1alpha1.OpenStackServerSpec { +func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.OpenStackMachineSpec, identityRef infrav1.OpenStackIdentityReference, tags []string, failureDomain string, userDataRef *corev1.LocalObjectReference, defaultSecGroup *string, clusterNetwork *infrav1.NetworkStatusWithSubnets) (*infrav1alpha1.OpenStackServerSpec, error) { + // Determine default network ID if the cluster status exposes one. + var defaultNetworkID string + if clusterNetwork != nil { + defaultNetworkID = clusterNetwork.ID + } + + // If no cluster network is available AND the machine spec did not define any ports with a network, we cannot choose a network. 
+ if defaultNetworkID == "" && len(openStackMachineSpec.Ports) == 0 { + return nil, capoerrors.Terminal(infrav1.InvalidMachineSpecReason, "no network configured: cluster network is missing and machine spec does not define ports with a network") + } + openStackServerSpec := &infrav1alpha1.OpenStackServerSpec{ AdditionalBlockDevices: openStackMachineSpec.AdditionalBlockDevices, ConfigDrive: openStackMachineSpec.ConfigDrive, @@ -521,25 +532,21 @@ func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.Ope serverPorts = make([]infrav1.PortOpts, 1) } for i := range serverPorts { - if serverPorts[i].Network == nil { - serverPorts[i].Network = &infrav1.NetworkParam{ - ID: &defaultNetworkID, - } - } - if len(serverPorts[i].SecurityGroups) == 0 && defaultSecGroup != nil { - serverPorts[i].SecurityGroups = []infrav1.SecurityGroupParam{ - { - ID: defaultSecGroup, - }, - } + // Only inject the default network when we actually have an ID. + if serverPorts[i].Network == nil && defaultNetworkID != "" { + serverPorts[i].Network = &infrav1.NetworkParam{ID: &defaultNetworkID} } if len(openStackMachineSpec.SecurityGroups) > 0 { - serverPorts[i].SecurityGroups = append(serverPorts[i].SecurityGroups, openStackMachineSpec.SecurityGroups...) + // Machine level security groups override any cluster defaults. + serverPorts[i].SecurityGroups = openStackMachineSpec.SecurityGroups + } else if len(serverPorts[i].SecurityGroups) == 0 && defaultSecGroup != nil { + // Fall back to cluster-managed security group when nothing else specified. + serverPorts[i].SecurityGroups = []infrav1.SecurityGroupParam{{ID: defaultSecGroup}} } } openStackServerSpec.Ports = serverPorts - return openStackServerSpec + return openStackServerSpec, nil } // reconcileMachineServer reconciles the OpenStackServer object for the OpenStackMachine. 
@@ -588,7 +595,10 @@ func (r *OpenStackMachineReconciler) getOrCreateMachineServer(ctx context.Contex } return openStackCluster.Spec.IdentityRef }() - machineServerSpec := openStackMachineSpecToOpenStackServerSpec(&openStackMachine.Spec, identityRef, compute.InstanceTags(&openStackMachine.Spec, openStackCluster), failureDomain, userDataRef, getManagedSecurityGroup(openStackCluster, machine), openStackCluster.Status.Network.ID) + machineServerSpec, err := openStackMachineSpecToOpenStackServerSpec(&openStackMachine.Spec, identityRef, compute.InstanceTags(&openStackMachine.Spec, openStackCluster), failureDomain, userDataRef, getManagedSecurityGroup(openStackCluster, machine), openStackCluster.Status.Network) + if err != nil { + return nil, err + } machineServer = &infrav1alpha1.OpenStackServer{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ diff --git a/controllers/openstackmachine_controller_test.go b/controllers/openstackmachine_controller_test.go index e66e9dc2b9..0e5975ca4b 100644 --- a/controllers/openstackmachine_controller_test.go +++ b/controllers/openstackmachine_controller_test.go @@ -82,9 +82,6 @@ func TestOpenStackMachineSpecToOpenStackServerSpec(t *testing.T) { ID: ptr.To(openStackCluster.Status.Network.ID), }, SecurityGroups: []infrav1.SecurityGroupParam{ - { - ID: ptr.To(openStackCluster.Status.WorkerSecurityGroup.ID), - }, { ID: ptr.To(extraSecurityGroupUUID), }, @@ -95,9 +92,10 @@ func TestOpenStackMachineSpecToOpenStackServerSpec(t *testing.T) { tags := []string{"tag1", "tag2"} userData := &corev1.LocalObjectReference{Name: "server-data-secret"} tests := []struct { - name string - spec *infrav1.OpenStackMachineSpec - want *infrav1alpha1.OpenStackServerSpec + name string + spec *infrav1.OpenStackMachineSpec + want *infrav1alpha1.OpenStackServerSpec + wantErr bool }{ { name: "Test a minimum OpenStackMachineSpec to OpenStackServerSpec conversion", @@ -174,12 +172,99 @@ func TestOpenStackMachineSpecToOpenStackServerSpec(t *testing.T) { UserDataRef: userData, }, }, + { + name: "Cluster network nil, machine defines port network and overrides SG", + spec: &infrav1.OpenStackMachineSpec{ + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + }}, + SecurityGroups: []infrav1.SecurityGroupParam{{ID: ptr.To(extraSecurityGroupUUID)}}, + }, + want: &infrav1alpha1.OpenStackServerSpec{ + IdentityRef: identityRef, + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + SecurityGroups: []infrav1.SecurityGroupParam{{ID: ptr.To(extraSecurityGroupUUID)}}, + }}, + Tags: tags, + UserDataRef: userData, + }, + }, + { + name: "Cluster network nil, machine defines port network and falls back to cluster SG", + spec: &infrav1.OpenStackMachineSpec{ + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + }}, + }, + want: &infrav1alpha1.OpenStackServerSpec{ + IdentityRef: identityRef, + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + SecurityGroups: []infrav1.SecurityGroupParam{{ID: ptr.To(workerSecurityGroupUUID)}}, + }}, + Tags: tags, + UserDataRef: userData, + }, + }, + { + name: "Error case: no cluster network and no machine ports", + spec: &infrav1.OpenStackMachineSpec{ + Flavor: ptr.To(flavorName), + Image: image, + SSHKeyName: sshKeyName, + // No ports defined + }, + want: nil, + wantErr: true, + }, + { + name: "Empty cluster network ID, machine defines explicit ports", + spec: &infrav1.OpenStackMachineSpec{ + Flavor: ptr.To(flavorName), + 
Image: image, + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + }}, + }, + want: &infrav1alpha1.OpenStackServerSpec{ + Flavor: ptr.To(flavorName), + IdentityRef: identityRef, + Image: image, + Ports: []infrav1.PortOpts{{ + Network: &infrav1.NetworkParam{ID: ptr.To(networkUUID)}, + SecurityGroups: []infrav1.SecurityGroupParam{{ID: ptr.To(workerSecurityGroupUUID)}}, + }}, + Tags: tags, + UserDataRef: userData, + }, + }, } for i := range tests { tt := tests[i] t.Run(tt.name, func(t *testing.T) { - spec := openStackMachineSpecToOpenStackServerSpec(tt.spec, identityRef, tags, "", userData, &openStackCluster.Status.WorkerSecurityGroup.ID, openStackCluster.Status.Network.ID) - if !reflect.DeepEqual(spec, tt.want) { + // Handle special test cases + var clusterNetwork *infrav1.NetworkStatusWithSubnets + switch { + case tt.wantErr && tt.name == "Error case: no cluster network and no machine ports": + clusterNetwork = nil // Simulate nil cluster network for HCP scenario + case tt.name == "Empty cluster network ID, machine defines explicit ports": + // Create a cluster network with empty ID + clusterNetwork = &infrav1.NetworkStatusWithSubnets{ + NetworkStatus: infrav1.NetworkStatus{ + ID: "", // Empty network ID + }, + } + default: + clusterNetwork = openStackCluster.Status.Network + } + + spec, err := openStackMachineSpecToOpenStackServerSpec(tt.spec, identityRef, tags, "", userData, &openStackCluster.Status.WorkerSecurityGroup.ID, clusterNetwork) + if (err != nil) != tt.wantErr { + t.Errorf("openStackMachineSpecToOpenStackServerSpec() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && !reflect.DeepEqual(spec, tt.want) { t.Errorf("openStackMachineSpecToOpenStackServerSpec() got = %+v, want %+v", spec, tt.want) } }) @@ -224,3 +309,4 @@ func TestGetPortIDs(t *testing.T) { }) } } + diff --git a/go.mod b/go.mod index cde6105ae0..bf453188ee 100644 --- a/go.mod +++ b/go.mod @@ -45,7 +45,7 @@ require ( github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver/v3 v3.3.0 // indirect github.com/Masterminds/sprig/v3 v3.3.0 // indirect - github.com/Microsoft/go-winio v0.5.0 // indirect + github.com/Microsoft/go-winio v0.6.0 // indirect github.com/NYTimes/gziphandler v1.1.1 // indirect github.com/ProtonMail/go-crypto v0.0.0-20230217124315-7d5c6f04bbb8 // indirect github.com/adrg/xdg v0.5.3 // indirect @@ -59,7 +59,7 @@ require ( github.com/distribution/reference v0.6.0 // indirect github.com/docker/docker v28.0.2+incompatible // indirect github.com/docker/go-connections v0.5.0 // indirect - github.com/docker/go-units v0.4.0 // indirect + github.com/docker/go-units v0.5.0 // indirect github.com/drone/envsubst/v2 v2.0.0-20210730161058-179042472c46 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect diff --git a/go.sum b/go.sum index 817e548a5b..7a6fe14ebe 100644 --- a/go.sum +++ b/go.sum @@ -16,8 +16,8 @@ github.com/Masterminds/semver/v3 v3.3.0 h1:B8LGeaivUe71a5qox1ICM/JLl0NqZSW5CHyL+ github.com/Masterminds/semver/v3 v3.3.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe3tPhs= github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0= -github.com/Microsoft/go-winio v0.5.0 h1:Elr9Wn+sGKPlkaBvwu4mTrxtmOp3F3yV9qhaHbXGjwU= -github.com/Microsoft/go-winio v0.5.0/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= +github.com/Microsoft/go-winio 
v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= +github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c= github.com/ProtonMail/go-crypto v0.0.0-20230217124315-7d5c6f04bbb8 h1:wPbRQzjjwFc0ih8puEVAOFGELsn1zoIIYdxvML7mDxA= @@ -63,8 +63,8 @@ github.com/docker/docker v28.0.2+incompatible h1:9BILleFwug5FSSqWBgVevgL3ewDJfWW github.com/docker/docker v28.0.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= -github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/drone/envsubst/v2 v2.0.0-20210730161058-179042472c46 h1:7QPwrLT79GlD5sizHf27aoY2RTvw62mO6x7mxkScNk0= github.com/drone/envsubst/v2 v2.0.0-20210730161058-179042472c46/go.mod h1:esf2rsHFNlZlxsqsZDojNBcnNs5REqIvRrWRHqX0vEU= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= @@ -247,7 +247,6 @@ github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsF github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= -github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= @@ -269,7 +268,6 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= @@ -362,10 +360,8 @@ golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/hcp-test-vars.sh b/hcp-test-vars.sh new file mode 100755 index 0000000000..b0f1fa30f8 --- /dev/null +++ b/hcp-test-vars.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Complete environment variables for CAPO HCP testing +# Usage: source hcp-test-vars.sh && make test-hcp + +# CRITICAL: Kubernetes version must match your image +# Check your ubuntu-22.04 image and set accordingly (probably v1.28.x, v1.29.x, or v1.30.x) +export KUBERNETES_VERSION="v1.30.2" # CHANGE THIS to match your image's k8s version + +# OpenStack Configuration +export OPENSTACK_CLOUD="openstack" +export OPENSTACK_CLOUD_ADMIN="openstack" # Same as OPENSTACK_CLOUD for most setups +export OPENSTACK_CLOUD_YAML_FILE="/Users/bnr/work/openstack/clouds.yaml" + +# Image Configuration - Use existing image with dummy URLs to pass validation +export OPENSTACK_IMAGE_NAME="ubuntu-2404-kube-v1.33.1" +export OPENSTACK_IMAGE_URL="file:///dev/null" # Dummy URL - image already exists +export OPENSTACK_BASTION_IMAGE_NAME="ubuntu-2404-kube-v1.33.1" # Use same image for bastion +export OPENSTACK_BASTION_IMAGE_URL="file:///dev/null" # Dummy URL - image already exists + +# Flavor Configuration - ADJUST based on your OpenStack +export OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR="m1.medium" +export OPENSTACK_NODE_MACHINE_FLAVOR="m1.small" +export OPENSTACK_BASTION_MACHINE_FLAVOR="m1.small" + +# Network Configuration - ADJUST based on your OpenStack +export OPENSTACK_EXTERNAL_NETWORK_NAME="public" # Check: openstack network list --external +export OPENSTACK_DNS_NAMESERVERS="8.8.8.8" # or your preferred DNS +export OPENSTACK_FAILURE_DOMAIN="nova" # Check: openstack availability zone list + +# SSH Key - CRITICAL +export OPENSTACK_SSH_KEY_NAME="cluster-api-provider-openstack-sigs-k8s-io" # Must exist in OpenStack + +# HCP Specific Configuration +export KAMAJI_VERSION="v0.15.3" # Stable version instead of edge +export KAMAJI_NAMESPACE="kamaji-system" +export CLUSTER_DATASTORE="default" +export HCP_SERVICE_TYPE="LoadBalancer" +export HCP_CPU_LIMIT="1000m" +export HCP_MEMORY_LIMIT="1Gi" +export HCP_CPU_REQUEST="100m" +export HCP_MEMORY_REQUEST="300Mi" + +# Test Configuration +export E2E_GINKGO_FOCUS="Management cluster verification" + +# Timeout adjustments for slower environments +export GINKGO_ARGS="-v --progress --timeout=45m" + +echo "โœ… Environment variables set for HCP testing" +echo "๐Ÿ” Key settings:" +echo " Kubernetes Version: $KUBERNETES_VERSION" +echo " Image Name: $OPENSTACK_IMAGE_NAME" +echo " Control Plane Flavor: $OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR" +echo " External Network: $OPENSTACK_EXTERNAL_NETWORK_NAME" +echo " SSH Key: $OPENSTACK_SSH_KEY_NAME" +echo "" +echo "โš ๏ธ VERIFY these match your OpenStack environment:" +echo " 1. Check image exists: openstack image show $OPENSTACK_IMAGE_NAME" +echo " 2. Check flavors exist: openstack flavor show $OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR" +echo " 3. Check SSH key exists: openstack keypair show $OPENSTACK_SSH_KEY_NAME" +echo " 4. 
Check external network: openstack network show $OPENSTACK_EXTERNAL_NETWORK_NAME" +echo "" +echo "๐Ÿš€ Run: make test-hcp" \ No newline at end of file diff --git a/pkg/cloud/services/networking/securitygroups.go b/pkg/cloud/services/networking/securitygroups.go index 8c6b30f443..27cef59cf0 100644 --- a/pkg/cloud/services/networking/securitygroups.go +++ b/pkg/cloud/services/networking/securitygroups.go @@ -20,6 +20,7 @@ import ( "errors" "fmt" "slices" + "strings" "github.com/gophercloud/gophercloud/v2/openstack/networking/v2/extensions/attributestags" "github.com/gophercloud/gophercloud/v2/openstack/networking/v2/extensions/security/groups" @@ -564,6 +565,11 @@ func (s *Service) createRule(securityGroupID string, r resolvedSecurityGroupRule s.scope.Logger().V(6).Info("Creating rule", "description", r.Description, "direction", dir, "portRangeMin", r.PortRangeMin, "portRangeMax", r.PortRangeMax, "proto", proto, "etherType", etherType, "remoteGroupID", r.RemoteGroupID, "remoteIPPrefix", r.RemoteIPPrefix, "securityGroupID", securityGroupID) _, err := s.client.CreateSecGroupRule(createOpts) if err != nil { + // Handle HTTP 409 (SecurityGroupRuleExists) as success - the rule already exists + if strings.Contains(err.Error(), "SecurityGroupRuleExists") || strings.Contains(err.Error(), "already exists") { + s.scope.Logger().V(4).Info("Security group rule already exists, treating as success", "description", r.Description, "securityGroupID", securityGroupID) + return nil + } return err } return nil diff --git a/test/e2e/data/ccm/cloud-controller-manager.yaml b/test/e2e/data/ccm/cloud-controller-manager.yaml index 90ef0035c8..75e1735c39 100644 --- a/test/e2e/data/ccm/cloud-controller-manager.yaml +++ b/test/e2e/data/ccm/cloud-controller-manager.yaml @@ -1,6 +1,23 @@ # From: https://raw.githubusercontent.com/kubernetes/cloud-provider-openstack/master/manifests/controller-manager/openstack-cloud-controller-manager-ds.yaml --- apiVersion: v1 +kind: Secret +metadata: + name: cloud-config + namespace: kube-system +stringData: + clouds.conf: | + [Global] + auth-url= + application-credential-id= + application-credential-secret= + region= + [LoadBalancer] + use-octavia=true + floating-network-id= + subnet-id= +--- +apiVersion: v1 kind: ServiceAccount metadata: name: cloud-controller-manager @@ -26,7 +43,6 @@ spec: spec: nodeSelector: node-role.kubernetes.io/control-plane: "" - # we need user root to read the cloud.conf from the host securityContext: runAsUser: 0 tolerations: @@ -40,17 +56,19 @@ spec: serviceAccountName: cloud-controller-manager containers: - name: openstack-cloud-controller-manager - image: >- - registry.k8s.io/provider-os/openstack-cloud-controller-manager:v1.32.0 + image: registry.k8s.io/provider-os/openstack-cloud-controller-manager:v1.32.0 args: - /bin/openstack-cloud-controller-manager - --v=1 - --cluster-name=$(CLUSTER_NAME) - - --cloud-config=$(CLOUD_CONFIG) + - --cloud-config=/etc/cloud/clouds.conf - --cloud-provider=openstack - --use-service-account-credentials=false - --bind-address=127.0.0.1 volumeMounts: + - mountPath: /etc/cloud + name: cloud-config + readOnly: true - mountPath: /etc/kubernetes name: k8s readOnly: true @@ -65,11 +83,14 @@ spec: cpu: 200m env: - name: CLOUD_CONFIG - value: /etc/kubernetes/cloud.conf + value: /etc/cloud/clouds.conf - name: CLUSTER_NAME value: kubernetes hostNetwork: true volumes: + - name: cloud-config + secret: + secretName: cloud-config - hostPath: path: /etc/kubernetes type: DirectoryOrCreate diff --git a/test/e2e/data/e2e_conf.yaml 
b/test/e2e/data/e2e_conf.yaml index 40e5b17a1b..0e3fbcf9bb 100644 --- a/test/e2e/data/e2e_conf.yaml +++ b/test/e2e/data/e2e_conf.yaml @@ -92,6 +92,24 @@ providers: new: "imagePullPolicy: IfNotPresent" - old: "--leader-elect" new: "--leader-elect=false\n - --sync-period=1m" + + + +- name: kamaji + type: ControlPlaneProvider + versions: + - name: v0.15.3 + value: "https://github.com/clastix/kamaji/releases/download/v0.15.3/kamaji-cluster-api-control-plane-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + + - name: openstack type: InfrastructureProvider versions: @@ -181,6 +199,15 @@ variables: CNI: "../../data/cni/calico.yaml" CCM: "../../data/ccm/cloud-controller-manager.yaml" EXP_CLUSTER_RESOURCE_SET: "true" + # HCP/Kamaji configuration + KAMAJI_VERSION: "edge-25.7.1" + KAMAJI_NAMESPACE: "kamaji-system" + CLUSTER_DATASTORE: "default" + HCP_SERVICE_TYPE: "LoadBalancer" + HCP_CPU_LIMIT: "1000m" + HCP_MEMORY_LIMIT: "1Gi" + HCP_CPU_REQUEST: "100m" + HCP_MEMORY_REQUEST: "300Mi" OPENSTACK_BASTION_IMAGE_NAME: "cirros-0.6.1-x86_64-disk" OPENSTACK_BASTION_IMAGE_URL: https://storage.googleapis.com/artifacts.k8s-staging-capi-openstack.appspot.com/test/cirros/2022-12-05/cirros-0.6.1-x86_64-disk.img OPENSTACK_BASTION_IMAGE_HASH: 0c839612eb3f2469420f2ccae990827f @@ -192,7 +219,7 @@ variables: OPENSTACK_CLOUD_CACERT_B64: "Cg==" OPENSTACK_CLOUD_YAML_FILE: '../../../../clouds.yaml' OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR: "m1.medium" - OPENSTACK_DNS_NAMESERVERS: "8.8.8.8" + OPENSTACK_FAILURE_DOMAIN: "testaz1" OPENSTACK_FAILURE_DOMAIN_ALT: "testaz2" OPENSTACK_IMAGE_NAME: "ubuntu-2404-kube-v1.33.1" @@ -222,7 +249,7 @@ intervals: conformance/wait-worker-nodes: ["30m", "10s"] default/wait-controllers: ["3m", "10s"] default/wait-bastion: ["5m", "10s"] - default/wait-cluster: ["20m", "10s"] + default/wait-cluster: ["60m", "10s"] default/wait-control-plane: ["30m", "10s"] default/wait-worker-nodes: ["30m", "10s"] default/wait-delete-cluster: ["5m", "10s"] @@ -233,3 +260,4 @@ intervals: default/wait-machine-remediation: ["10m", "10s"] default/wait-image-create: ["15m", "10s"] default/wait-image-delete: ["2m", "10s"] + default/wait-daemonset: ["10m", "30s"] diff --git a/test/e2e/data/e2e_conf_hcp.yaml b/test/e2e/data/e2e_conf_hcp.yaml new file mode 100644 index 0000000000..37bfa6f33b --- /dev/null +++ b/test/e2e/data/e2e_conf_hcp.yaml @@ -0,0 +1,236 @@ +# E2E test scenario using local dev images and manifests built from the source tree for following providers: +# - openstack + +# To run tests, run the following from the root of this repository. 
+# `OPENSTACK_CLOUD=capo-e2e OPENSTACK_CLOUD_YAML_FILE=/tmp/clouds.yaml make test-conformance` + +managementClusterName: capo-e2e + +images: +# Use local dev images built source tree; +- name: ghcr.io/orkhanorganization/k8s-staging-capi-openstack/capi-openstack-controller:e2e + loadBehavior: mustLoad +- name: quay.io/orc/openstack-resource-controller:v2.2.0 + loadBehavior: tryLoad + +providers: +- name: cluster-api + type: CoreProvider + versions: + - name: "{go://sigs.k8s.io/cluster-api@v1.10}" + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/{go://sigs.k8s.io/cluster-api@v1.10}/core-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" + # For clusterctl upgrade test + - name: "{go://sigs.k8s.io/cluster-api@v1.9}" + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/{go://sigs.k8s.io/cluster-api@v1.9}/core-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" +- name: kubeadm + type: BootstrapProvider + versions: + - name: "{go://sigs.k8s.io/cluster-api@v1.10}" + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/{go://sigs.k8s.io/cluster-api@v1.10}/bootstrap-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" + # For clusterctl upgrade test + - name: "{go://sigs.k8s.io/cluster-api@v1.9}" + value: "https://github.com/kubernetes-sigs/cluster-api/releases/download/{go://sigs.k8s.io/cluster-api@v1.9}/bootstrap-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" + + + +- name: kubeadm + type: ControlPlaneProvider + versions: + - name: v0.15.3 + value: "https://raw.githubusercontent.com/clastix/cluster-api-control-plane-provider-kamaji/v0.15.3/config/control-plane-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + + +- name: openstack + type: InfrastructureProvider + versions: + # This is only for clusterctl upgrade tests + - name: "{go://github.com/kubernetes-sigs/cluster-api-provider-openstack@v0.11}" + value: "https://github.com/kubernetes-sigs/cluster-api-provider-openstack/releases/download/{go://github.com/kubernetes-sigs/cluster-api-provider-openstack@v0.11}/infrastructure-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1_provider/metadata.yaml" + - sourcePath: "./infrastructure-openstack-no-artifact/cluster-template.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--v=2" + new: "--v=4" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" + # This 
is only for clusterctl upgrade tests + - name: "{go://github.com/kubernetes-sigs/cluster-api-provider-openstack@v0.12}" + value: "https://github.com/kubernetes-sigs/cluster-api-provider-openstack/releases/download/{go://github.com/kubernetes-sigs/cluster-api-provider-openstack@v0.12}/infrastructure-components.yaml" + type: url + contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1_provider/metadata.yaml" + - sourcePath: "./infrastructure-openstack-no-artifact/cluster-template.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--v=2" + new: "--v=4" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" + - name: v0.12.99 + value: ../../../config/default + # This is the upcoming version. + # Specify no contract so that upgrade tests that start from a specific contract won't pick it up. + # contract: v1beta1 + files: + - sourcePath: "../data/shared/v1beta1_provider/metadata.yaml" + - sourcePath: "./infrastructure-openstack-no-artifact/cluster-template.yaml" + - sourcePath: "./infrastructure-openstack-no-artifact/cluster-template-without-lb.yaml" + replacements: + - old: gcr.io/k8s-staging-capi-openstack/capi-openstack-controller:dev + new: ghcr.io/orkhanorganization/k8s-staging-capi-openstack/capi-openstack-controller:e2e + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + - old: "--v=2" + new: "--v=4" + - old: "--leader-elect" + new: "--leader-elect=false\n - --sync-period=1m" +- name: openstack-resource-controller + type: RuntimeExtensionProvider # ORC isn't a provider but we fake it so it can be handled by the clusterctl machinery. + versions: + - name: v2.2.0 + value: ../../../../cluster-api-provider-openstack/test/infrastructure/openstack-resource-controller/config/default + contract: v1beta1 + files: + - sourcePath: "../data/shared/openstack-resource-controller/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + # This is only for clusterctl upgrade tests, latest stable release of major version v1.0 + - name: v1.0.2 + value: ../../../../cluster-api-provider-openstack/test/infrastructure/openstack-resource-controller/config/upgrade-from + contract: v1beta1 + files: + - sourcePath: "../data/shared/openstack-resource-controller/metadata.yaml" + replacements: + - old: "imagePullPolicy: Always" + new: "imagePullPolicy: IfNotPresent" + + +# default variables for the e2e test; those values could be overridden via env variables, thus +# allowing the same e2e config file to be re-used in different prow jobs e.g. 
each one with a K8s version permutation +variables: + # used to ensure we deploy to the correct management cluster + KUBE_CONTEXT: "kind-capo-e2e" + KUBERNETES_VERSION: "v1.33.1" + KUBERNETES_VERSION_UPGRADE_FROM: "v1.32.5" + KUBERNETES_VERSION_UPGRADE_TO: "v1.33.1" + # NOTE: To see default images run kubeadm config images list (optionally with --kubernetes-version=vX.Y.Z) + ETCD_VERSION_UPGRADE_TO: "3.5.21-0" + COREDNS_VERSION_UPGRADE_TO: "v1.12.0" + CONTROL_PLANE_MACHINE_TEMPLATE_UPGRADE_TO: "upgrade-to-control-plane" + WORKERS_MACHINE_TEMPLATE_UPGRADE_TO: "upgrade-to-md-0" + CNI: "../../data/cni/calico.yaml" + CCM: "../../data/ccm/cloud-controller-manager.yaml" + EXP_CLUSTER_RESOURCE_SET: "true" + # HCP/Kamaji configuration + KAMAJI_VERSION: "edge-25.7.1" + KAMAJI_NAMESPACE: "kamaji-system" + CLUSTER_DATASTORE: "default" + HCP_SERVICE_TYPE: "LoadBalancer" + HCP_CPU_LIMIT: "1000m" + HCP_MEMORY_LIMIT: "1Gi" + HCP_CPU_REQUEST: "100m" + HCP_MEMORY_REQUEST: "300Mi" + OPENSTACK_BASTION_IMAGE_NAME: "cirros-0.6.1-x86_64-disk" + OPENSTACK_BASTION_IMAGE_URL: https://storage.googleapis.com/artifacts.k8s-staging-capi-openstack.appspot.com/test/cirros/2022-12-05/cirros-0.6.1-x86_64-disk.img + OPENSTACK_BASTION_IMAGE_HASH: 0c839612eb3f2469420f2ccae990827f + OPENSTACK_BASTION_IMAGE_HASH_ALGORITHM: "md5" + OPENSTACK_BASTION_MACHINE_FLAVOR: "m1.tiny" + OPENSTACK_BASTION_MACHINE_FLAVOR_ALT: "m1.tiny.alt" + OPENSTACK_CLOUD: "capo-e2e" + OPENSTACK_CLOUD_ADMIN: "capo-e2e-admin" + OPENSTACK_CLOUD_CACERT_B64: "Cg==" + OPENSTACK_CLOUD_YAML_FILE: '../../../../clouds.yaml' + OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR: "m1.medium" + OPENSTACK_FAILURE_DOMAIN: "testaz1" + OPENSTACK_FAILURE_DOMAIN_ALT: "testaz2" + OPENSTACK_IMAGE_NAME: "ubuntu-2404-kube-v1.33.1" + OPENSTACK_IMAGE_URL: https://storage.googleapis.com/artifacts.k8s-staging-capi-openstack.appspot.com/test/ubuntu/ubuntu-2404-kube-v1.33.1 + OPENSTACK_IMAGE_NAME_UPGRADE_FROM: "ubuntu-2404-kube-v1.32.5" + OPENSTACK_IMAGE_URL_UPGRADE_FROM: https://storage.googleapis.com/artifacts.k8s-staging-capi-openstack.appspot.com/test/ubuntu/ubuntu-2404-kube-v1.32.5 + OPENSTACK_NODE_MACHINE_FLAVOR: "m1.small" + OPENSTACK_SSH_KEY_NAME: "cluster-api-provider-openstack-sigs-k8s-io" + # The default external network created by devstack + OPENSTACK_EXTERNAL_NETWORK_NAME: "public" + OPENSTACK_VOLUME_TYPE_ALT: "test-volume-type" + CONFORMANCE_WORKER_MACHINE_COUNT: "5" + CONFORMANCE_CONTROL_PLANE_MACHINE_COUNT: "1" + E2E_IMAGE_URL: "http://10.0.3.15/capo-e2e-image.tar" + # The default user for SSH connections from bastion to machines + SSH_USER_MACHINE: "ubuntu" + EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true" + # The Flatcar image produced by the image-builder + OPENSTACK_FLATCAR_IMAGE_NAME: "flatcar-stable-4152.2.3-kube-v1.33.1" + OPENSTACK_FLATCAR_IMAGE_URL: "https://storage.googleapis.com/artifacts.k8s-staging-capi-openstack.appspot.com/test/flatcar/flatcar-stable-4152.2.3-kube-v1.33.1" + # A plain Flatcar from the Flatcar releases server + FLATCAR_IMAGE_NAME: "flatcar_production_openstack_image" + FLATCAR_IMAGE_URL: https://stable.release.flatcar-linux.net/amd64-usr/current/flatcar_production_openstack_image.img + +intervals: + conformance/wait-control-plane: ["30m", "10s"] + conformance/wait-worker-nodes: ["30m", "10s"] + default/wait-controllers: ["3m", "10s"] + default/wait-bastion: ["5m", "10s"] + default/wait-cluster: ["60m", "10s"] + default/wait-control-plane: ["30m", "10s"] + default/wait-worker-nodes: ["30m", "10s"] + default/wait-delete-cluster: ["5m", "10s"] + 
default/wait-delete-machine: ["30m", "10s"] + default/wait-alt-az: ["20m", "30s"] + default/wait-machine-upgrade: ["30m", "10s"] + default/wait-nodes-ready: ["15m", "10s"] + default/wait-machine-remediation: ["10m", "10s"] + default/wait-image-create: ["15m", "10s"] + default/wait-image-delete: ["2m", "10s"] + default/wait-daemonset: ["10m", "30s"] diff --git a/test/e2e/data/kustomize/hcp-broken/kustomization.yaml b/test/e2e/data/kustomize/hcp-broken/kustomization.yaml new file mode 100644 index 0000000000..5f1b14ee65 --- /dev/null +++ b/test/e2e/data/kustomize/hcp-broken/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: +- ../default + +namePrefix: hcp-broken- + +patchesStrategicMerge: +- patch-broken-networking.yaml \ No newline at end of file diff --git a/test/e2e/data/kustomize/hcp-broken/patch-broken-networking.yaml b/test/e2e/data/kustomize/hcp-broken/patch-broken-networking.yaml new file mode 100644 index 0000000000..5bb2fdcc98 --- /dev/null +++ b/test/e2e/data/kustomize/hcp-broken/patch-broken-networking.yaml @@ -0,0 +1,20 @@ +--- +# This patch removes networking configuration from OpenStackCluster +# to test graceful failure scenarios when cluster network is missing +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: OpenStackCluster +metadata: + name: ${CLUSTER_NAME} +spec: + # Remove managedSubnets to simulate missing network configuration + managedSubnets: [] + # Remove managedSecurityGroups to further test edge cases + managedSecurityGroups: null + # Keep other required configuration + apiServerLoadBalancer: + enabled: true + externalNetwork: + id: ${OPENSTACK_EXTERNAL_NETWORK_ID} + identityRef: + cloudName: ${OPENSTACK_CLOUD} + name: ${CLUSTER_NAME}-cloud-config \ No newline at end of file diff --git a/test/e2e/data/kustomize/hcp-management/kustomization.yaml b/test/e2e/data/kustomize/hcp-management/kustomization.yaml new file mode 100644 index 0000000000..4b26c2b995 --- /dev/null +++ b/test/e2e/data/kustomize/hcp-management/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Use same base resources as default but with our own component overrides +resources: +- ../../../../../kustomize/v1beta1/default + +# Override components to use existing images instead of downloading +components: +- ../common-patches/cluster +- ../common-patches/cni +- ../upgrade-patches +- ../common-patches/ccm +- ../common-patches/externalNetworkByName +- ../common-patches/images-without-ref # Use existing images instead of ORC download + +patchesStrategicMerge: +- patch-management-cluster.yaml \ No newline at end of file diff --git a/test/e2e/data/kustomize/hcp-management/patch-management-cluster.yaml b/test/e2e/data/kustomize/hcp-management/patch-management-cluster.yaml new file mode 100644 index 0000000000..35ccdb783d --- /dev/null +++ b/test/e2e/data/kustomize/hcp-management/patch-management-cluster.yaml @@ -0,0 +1,84 @@ +--- +# Patch the Cluster to ensure it has sufficient resources for HCP hosting +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: ${CLUSTER_NAME} +spec: + clusterNetwork: + pods: + cidrBlocks: + - 10.244.0.0/16 # Different CIDR to avoid conflicts with workload clusters + services: + cidrBlocks: + - 10.96.0.0/12 # Different service CIDR +--- +# Patch the OpenStackCluster to ensure robust networking for HCP +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: OpenStackCluster +metadata: + name: ${CLUSTER_NAME} +spec: + 
managedSubnets: + - cidr: 10.10.0.0/24 # Larger management network + dnsNameservers: + - ${OPENSTACK_DNS_NAMESERVERS} + managedSecurityGroups: + allNodesSecurityGroupRules: + - description: Created by cluster-api-provider-openstack - BGP (calico) + direction: ingress + etherType: IPv4 + name: BGP (Calico) + portRangeMax: 179 + portRangeMin: 179 + protocol: tcp + remoteManagedGroups: + - controlplane + - worker + - description: Created by cluster-api-provider-openstack - IP-in-IP (calico) + direction: ingress + etherType: IPv4 + name: IP-in-IP (calico) + protocol: "4" + remoteManagedGroups: + - controlplane + - worker + # Additional security group rules for HCP hosting + - description: Created by cluster-api-provider-openstack - Kamaji API Server + direction: ingress + etherType: IPv4 + name: Kamaji API Server + portRangeMax: 6443 + portRangeMin: 6443 + protocol: tcp + remoteManagedGroups: + - controlplane + - worker + - description: Created by cluster-api-provider-openstack - Kamaji etcd + direction: ingress + etherType: IPv4 + name: Kamaji etcd + portRangeMax: 2380 + portRangeMin: 2379 + protocol: tcp + remoteManagedGroups: + - controlplane + - worker +--- +# Patch the control plane to use larger instances for HCP hosting +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +kind: KubeadmControlPlane +metadata: + name: ${CLUSTER_NAME}-control-plane +spec: + replicas: ${CONTROL_PLANE_MACHINE_COUNT:-3} # Default to 3 for HA +--- +# Patch the control plane machine template for larger flavor +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: OpenStackMachineTemplate +metadata: + name: ${CLUSTER_NAME}-control-plane +spec: + template: + spec: + flavor: ${OPENSTACK_CONTROL_PLANE_MACHINE_FLAVOR:-m1.large} # Larger default for HCP hosting \ No newline at end of file diff --git a/test/e2e/data/kustomize/hcp-workload/hcp-workload-template.yaml b/test/e2e/data/kustomize/hcp-workload/hcp-workload-template.yaml new file mode 100644 index 0000000000..25e5f992e3 --- /dev/null +++ b/test/e2e/data/kustomize/hcp-workload/hcp-workload-template.yaml @@ -0,0 +1,166 @@ +--- +apiVersion: v1 +data: + cacert: ${OPENSTACK_CLOUD_CACERT_B64} + clouds.yaml: ${OPENSTACK_CLOUD_YAML_B64} +kind: Secret +metadata: + labels: + clusterctl.cluster.x-k8s.io/move: "true" + name: ${CLUSTER_NAME}-cloud-config +--- +apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 +kind: KubeadmConfigTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 +spec: + template: + spec: + files: + - content: | + #!/bin/bash + DOWNLOAD_E2E_IMAGE=${DOWNLOAD_E2E_IMAGE:=false} + if [ ! 
"${DOWNLOAD_E2E_IMAGE}" = true ]; then + echo "Not downloading E2E image, exiting" + exit 0 + fi + # Download the locally built CAPO controller image + echo "Downloading ${E2E_IMAGE_URL}" + wget "${E2E_IMAGE_URL}" -O "/tmp/capo-controller-manager.tar" + sudo ctr -n k8s.io images import "/tmp/capo-controller-manager.tar" || echo "* ignoring expected 'ctr images import' result" + owner: root:root + path: /usr/local/bin/ci-artifacts-openstack.sh + permissions: "0750" + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + provider-id: openstack:///'{{ instance_id }}' + name: '{{ local_hostname }}' + postKubeadmCommands: + - /usr/local/bin/ci-artifacts-openstack.sh +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: ${CLUSTER_NAME} +spec: + clusterNetwork: + pods: + cidrBlocks: + - 10.244.0.0/16 # Different from management cluster + serviceDomain: cluster.local + services: + cidrBlocks: + - 10.96.0.0/12 # Different from management cluster + controlPlaneRef: + apiVersion: controlplane.cluster.x-k8s.io/v1alpha1 + kind: KamajiControlPlane + name: ${CLUSTER_NAME}-control-plane + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: OpenStackCluster + name: ${CLUSTER_NAME} +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineDeployment +metadata: + name: ${CLUSTER_NAME}-md-0 +spec: + clusterName: ${CLUSTER_NAME} + replicas: ${WORKER_MACHINE_COUNT} + selector: + matchLabels: null + template: + spec: + bootstrap: + configRef: + apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 + kind: KubeadmConfigTemplate + name: ${CLUSTER_NAME}-md-0 + clusterName: ${CLUSTER_NAME} + failureDomain: ${OPENSTACK_FAILURE_DOMAIN} + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: OpenStackMachineTemplate + name: ${CLUSTER_NAME}-md-0 + version: ${KUBERNETES_VERSION} +--- +apiVersion: controlplane.cluster.x-k8s.io/v1alpha1 +kind: KamajiControlPlane +metadata: + name: ${CLUSTER_NAME}-control-plane +spec: + dataStoreName: ${CLUSTER_DATASTORE} + addons: + coreDNS: {} + kubeProxy: {} + network: + serviceType: ${HCP_SERVICE_TYPE} + kubernetesSemVer: ${KUBERNETES_VERSION} + deployment: + replicas: ${CONTROL_PLANE_MACHINE_COUNT} + resources: + limits: + cpu: ${HCP_CPU_LIMIT} + memory: ${HCP_MEMORY_LIMIT} + requests: + cpu: ${HCP_CPU_REQUEST} + memory: ${HCP_MEMORY_REQUEST} + nodeSelector: + kubernetes.io/os: linux +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: OpenStackCluster +metadata: + name: ${CLUSTER_NAME} +spec: + apiServerLoadBalancer: + enabled: true + bastion: + enabled: true + spec: + flavor: ${OPENSTACK_BASTION_MACHINE_FLAVOR} + sshKeyName: ${OPENSTACK_SSH_KEY_NAME} + controlPlaneAvailabilityZones: + - ${OPENSTACK_FAILURE_DOMAIN} + externalNetwork: + filter: + name: ${OPENSTACK_EXTERNAL_NETWORK_NAME} + identityRef: + cloudName: ${OPENSTACK_CLOUD} + name: ${CLUSTER_NAME}-cloud-config + managedSecurityGroups: + allNodesSecurityGroupRules: + - description: Created by cluster-api-provider-openstack - BGP (calico) + direction: ingress + etherType: IPv4 + name: BGP (Calico) + portRangeMax: 179 + portRangeMin: 179 + protocol: tcp + remoteManagedGroups: + - controlplane + - worker + - description: Created by cluster-api-provider-openstack - IP-in-IP (calico) + direction: ingress + etherType: IPv4 + name: IP-in-IP (calico) + protocol: "4" + remoteManagedGroups: + - controlplane + - worker + managedSubnets: + - cidr: 10.20.0.0/24 # Different from management cluster network + dnsNameservers: + - 
${OPENSTACK_DNS_NAMESERVERS} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: OpenStackMachineTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 +spec: + template: + spec: + flavor: ${OPENSTACK_NODE_MACHINE_FLAVOR} + sshKeyName: ${OPENSTACK_SSH_KEY_NAME} \ No newline at end of file diff --git a/test/e2e/data/kustomize/hcp-workload/kustomization.yaml b/test/e2e/data/kustomize/hcp-workload/kustomization.yaml new file mode 100644 index 0000000000..104e904727 --- /dev/null +++ b/test/e2e/data/kustomize/hcp-workload/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Build from scratch with our own template file +resources: +- hcp-workload-template.yaml + +components: +- ../common-patches/cluster +- ../common-patches/cni +- ../common-patches/ccm +- ../common-patches/externalNetworkByName +- ../common-patches/images-without-ref + +namePrefix: hcp-workload- \ No newline at end of file diff --git a/test/e2e/shared/defaults.go b/test/e2e/shared/defaults.go index daacc99c6b..5f0d10cf30 100644 --- a/test/e2e/shared/defaults.go +++ b/test/e2e/shared/defaults.go @@ -60,6 +60,9 @@ const ( FlavorFlatcar = "flatcar" FlavorKubernetesUpgrade = "k8s-upgrade" FlavorFlatcarSysext = "flatcar-sysext" + FlavorBrokenHCP = "broken-hcp" + FlavorHCPManagement = "hcp-management" + FlavorHCPWorkload = "hcp-workload" ) // DefaultScheme returns the default scheme to use for testing. diff --git a/test/e2e/suites/hcp/README.md b/test/e2e/suites/hcp/README.md new file mode 100644 index 0000000000..b3be764ea8 --- /dev/null +++ b/test/e2e/suites/hcp/README.md @@ -0,0 +1,104 @@ +# HCP (Hosted Control Plane) E2E Tests + +This directory contains end-to-end tests for Hosted Control Plane (HCP) functionality using Kamaji as the control plane provider. + +## Overview + +The HCP tests verify that: + +1. **Management Cluster**: A traditional CAPO cluster can be configured to host control planes for other clusters +2. **Workload Cluster**: Worker-only clusters can be created with external control planes managed by Kamaji +3. **Graceful Failures**: Broken configurations fail gracefully without panics or nil pointer exceptions + +## Test Structure + +### Test Suites + +- **Management Cluster Test**: Creates a CAPO cluster with HCP hosting capabilities +- **Workload Cluster Test**: Creates worker-only clusters using KamajiControlPlane +- **Graceful Failure Tests**: Validates error handling for broken configurations + +### Test Flow + +1. **Suite Setup**: Creates shared management cluster with Kamaji installed +2. **Workload Tests**: Creates multiple workload clusters using the shared management cluster +3. **Failure Tests**: Tests broken scenarios to ensure graceful error handling +4. 
**Suite Cleanup**: Cleans up shared resources
+
+## Running Tests
+
+### Prerequisites
+
+- OpenStack environment configured
+- `OPENSTACK_CLOUD_YAML_FILE` environment variable set
+- Docker installed
+
+### Run HCP Tests
+
+```bash
+# Run all HCP tests
+make test-hcp
+
+# Run with specific focus
+E2E_GINKGO_FOCUS="Management cluster" make test-hcp
+
+# Run with existing cluster
+E2E_ARGS="-use-existing-cluster=true" make test-hcp
+```
+
+### Test Configuration
+
+Tests follow the same layout as the other e2e suites, but use a dedicated config file:
+- Config: `test/e2e/data/e2e_conf_hcp.yaml` (`E2E_CONF_PATH_HCP`)
+- Templates: `test/e2e/data/kustomize/hcp-*`
+- Artifacts: `_artifacts/` directory
+
+## Template Structure
+
+### HCP Management (`hcp-management`)
+- Traditional CAPO cluster with larger worker nodes
+- Additional security rules for hosting control planes
+- Kamaji installation and configuration
+
+### HCP Workload (`hcp-workload`)
+- Worker-only cluster configuration
+- Uses `KamajiControlPlane` instead of `KubeadmControlPlane`
+- Different network CIDRs to avoid conflicts
+
+### HCP Broken (`hcp-broken`)
+- Intentionally broken networking configuration
+- Used to test graceful failure scenarios
+
+## Test Intervals
+
+HCP tests use dedicated intervals defined in `e2e_conf_hcp.yaml`:
+
+```yaml
+intervals:
+  hcp/wait-kamaji-install: ["10m", "30s"]
+  hcp/wait-kamaji-control-plane: ["15m", "30s"]
+  hcp/wait-cluster: ["25m", "10s"]
+  hcp/wait-control-plane: ["30m", "10s"]
+  hcp/wait-worker-nodes: ["30m", "10s"]
+```
+
+## Debugging
+
+### Log Collection
+Logs are automatically collected in `_artifacts/clusters/` for failed tests.
+
+### Manual Debug
+Use the `skip-cleanup` flag to preserve resources for investigation:
+
+```bash
+E2E_ARGS="-skip-cleanup=true" make test-hcp
+```
+
+### Kamaji Resources
+Check Kamaji-specific resources:
+
+```bash
+kubectl get kamajicontrolplane -A
+kubectl get datastore -A
+kubectl get -n kamaji-system pods
+```
\ No newline at end of file
diff --git a/test/e2e/suites/hcp/hcp_helpers.go b/test/e2e/suites/hcp/hcp_helpers.go
new file mode 100644
index 0000000000..a325f8b28e
--- /dev/null
+++ b/test/e2e/suites/hcp/hcp_helpers.go
@@ -0,0 +1,340 @@
+//go:build e2e
+// +build e2e
+
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package hcp
+
+import (
+	"context"
+	"fmt"
+	"path/filepath"
+	"time"
+
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/cluster-api/test/framework"
+	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"sigs.k8s.io/cluster-api-provider-openstack/test/e2e/shared"
+)
+
+// HCPTestContext holds context for HCP test isolation.
+type HCPTestContext struct { + ManagementCluster framework.ClusterProxy + ManagementClusterResources *clusterctl.ApplyClusterTemplateAndWaitResult + ManagementNamespace *corev1.Namespace + KamajiInstalled bool + IsolationID string +} + +// createHCPTestContext creates an isolated test context for HCP tests. +func createHCPTestContext(ctx context.Context) *HCPTestContext { + isolationID := fmt.Sprintf("hcp-%d", time.Now().Unix()) + shared.Logf("Creating HCP test context with isolation ID: %s", isolationID) + + return &HCPTestContext{ + IsolationID: isolationID, + KamajiInstalled: false, + } +} + +// setupSharedManagementCluster creates or reuses the management cluster for HCP tests. +func setupSharedManagementCluster(ctx context.Context, hcpCtx *HCPTestContext, e2eCtx *shared.E2EContext) { + shared.Logf("Setting up shared management cluster for HCP tests") + + // Create dedicated namespace for management cluster + hcpCtx.ManagementNamespace = shared.SetupSpecNamespace(ctx, "hcp-mgmt-"+hcpCtx.IsolationID, e2eCtx) + hcpCtx.ManagementClusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) + + // Create management cluster using clusterctl + managementClusterName := fmt.Sprintf("hcp-mgmt-%s", hcpCtx.ManagementNamespace.Name) + configCluster := clusterctl.ConfigClusterInput{ + LogFolder: filepath.Join(e2eCtx.Settings.ArtifactFolder, "clusters", e2eCtx.Environment.BootstrapClusterProxy.GetName()), + ClusterctlConfigPath: e2eCtx.Environment.ClusterctlConfigPath, + KubeconfigPath: e2eCtx.Environment.BootstrapClusterProxy.GetKubeconfigPath(), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + Flavor: shared.FlavorHCPManagement, + Namespace: hcpCtx.ManagementNamespace.Name, + ClusterName: managementClusterName, + KubernetesVersion: e2eCtx.E2EConfig.MustGetVariable(shared.KubernetesVersion), + ControlPlaneMachineCount: ptr.To(int64(1)), + WorkerMachineCount: ptr.To(int64(2)), + } + + shared.Logf("Creating management cluster %s", managementClusterName) + clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ + ClusterProxy: e2eCtx.Environment.BootstrapClusterProxy, + ConfigCluster: configCluster, + WaitForClusterIntervals: e2eCtx.E2EConfig.GetIntervals("hcp", "wait-cluster"), + WaitForControlPlaneIntervals: e2eCtx.E2EConfig.GetIntervals("hcp", "wait-control-plane"), + WaitForMachineDeployments: e2eCtx.E2EConfig.GetIntervals("hcp", "wait-worker-nodes"), + }, hcpCtx.ManagementClusterResources) + + hcpCtx.ManagementCluster = e2eCtx.Environment.BootstrapClusterProxy.GetWorkloadCluster(ctx, hcpCtx.ManagementNamespace.Name, managementClusterName) + shared.Logf("Management cluster %s created successfully", managementClusterName) +} + +// installKamajiUsingClusterctl installs Kamaji using clusterctl. 
+func installKamajiUsingClusterctl(ctx context.Context, hcpCtx *HCPTestContext, e2eCtx *shared.E2EContext) { + if hcpCtx.KamajiInstalled { + shared.Logf("Kamaji already installed, skipping") + return + } + + shared.Logf("Installing Kamaji using clusterctl") + + // Initialize Kamaji provider on management cluster + clusterctl.InitManagementClusterAndWatchControllerLogs(ctx, clusterctl.InitManagementClusterAndWatchControllerLogsInput{ + ClusterProxy: hcpCtx.ManagementCluster, + ClusterctlConfigPath: e2eCtx.Environment.ClusterctlConfigPath, + InfrastructureProviders: []string{"openstack"}, + BootstrapProviders: []string{"kubeadm"}, + ControlPlaneProviders: []string{"kubeadm"}, + CoreProvider: "", + LogFolder: filepath.Join(e2eCtx.Settings.ArtifactFolder, "clusters", hcpCtx.ManagementCluster.GetName()), + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-controllers")...) + + // Create default datastore for Kamaji + createDefaultDatastore(ctx, hcpCtx.ManagementCluster, e2eCtx) + + // Wait for Kamaji to be ready + waitForKamajiReady(ctx, hcpCtx.ManagementCluster, e2eCtx) + + hcpCtx.KamajiInstalled = true + shared.Logf("Kamaji installation completed successfully") +} + +// createDefaultDatastore creates the default etcd datastore for Kamaji. +func createDefaultDatastore(ctx context.Context, managementCluster framework.ClusterProxy, e2eCtx *shared.E2EContext) { + shared.Logf("Creating default datastore for Kamaji") + + datastore := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "apiVersion": "kamaji.clastix.io/v1alpha1", + "kind": "DataStore", + "metadata": map[string]interface{}{ + "name": e2eCtx.E2EConfig.MustGetVariable("CLUSTER_DATASTORE"), + }, + "spec": map[string]interface{}{ + "driver": "etcd", + "endpoints": []interface{}{ + "etcd-cluster.kamaji-system.svc.cluster.local:2379", + }, + }, + }, + } + + Eventually(func() error { + return managementCluster.GetClient().Create(ctx, datastore) + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-controllers")...).Should(Succeed()) + + shared.Logf("Default datastore created successfully") +} + +// waitForKamajiReady waits for Kamaji components to be ready. +func waitForKamajiReady(ctx context.Context, managementCluster framework.ClusterProxy, e2eCtx *shared.E2EContext) { + shared.Logf("Waiting for Kamaji components to be ready") + + Eventually(func() bool { + podList := &corev1.PodList{} + err := managementCluster.GetClient().List(ctx, podList, client.InNamespace(e2eCtx.E2EConfig.MustGetVariable("KAMAJI_NAMESPACE"))) + if err != nil { + return false + } + + if len(podList.Items) == 0 { + return false + } + + for _, pod := range podList.Items { + if pod.Status.Phase != corev1.PodRunning { + return false + } + } + return true + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-controllers")...).Should(BeTrue()) + + shared.Logf("Kamaji components are ready") +} + +// waitForKamajiControlPlane waits for a KamajiControlPlane to be ready. 
+func waitForKamajiControlPlane(ctx context.Context, managementCluster framework.ClusterProxy, namespace, name string, e2eCtx *shared.E2EContext) { + shared.Logf("Waiting for KamajiControlPlane %s/%s to be ready", namespace, name) + + kcpGVK := schema.GroupVersionKind{ + Group: "controlplane.cluster.x-k8s.io", + Version: "v1alpha1", + Kind: "KamajiControlPlane", + } + + Eventually(func() bool { + kcp := &unstructured.Unstructured{} + kcp.SetGroupVersionKind(kcpGVK) + + err := managementCluster.GetClient().Get(ctx, client.ObjectKey{ + Namespace: namespace, + Name: name, + }, kcp) + if err != nil { + return false + } + + conditions, found, err := unstructured.NestedSlice(kcp.Object, "status", "conditions") + if err != nil || !found { + return false + } + + for _, condition := range conditions { + condMap, ok := condition.(map[string]interface{}) + if !ok { + continue + } + + if condMap["type"] == "Ready" && condMap["status"] == "True" { + return true + } + } + return false + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-control-plane")...).Should(BeTrue()) + + shared.Logf("KamajiControlPlane %s/%s is ready", namespace, name) +} + +// verifyHCPWorkloadCluster performs comprehensive validation of HCP workload cluster. +func verifyHCPWorkloadCluster(ctx context.Context, workloadCluster framework.ClusterProxy, clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult, e2eCtx *shared.E2EContext) { + shared.Logf("Verifying HCP workload cluster") + + // Verify cluster is accessible + Eventually(func() error { + _, err := workloadCluster.GetClientSet().Discovery().ServerVersion() + return err + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-nodes-ready")...).Should(Succeed()) + + // Verify worker nodes are ready + Eventually(func() bool { + nodeList := &corev1.NodeList{} + err := workloadCluster.GetClient().List(ctx, nodeList) + if err != nil { + return false + } + + readyNodes := 0 + for _, node := range nodeList.Items { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue { + readyNodes++ + break + } + } + } + + expectedNodes := int(*clusterResources.MachineDeployments[0].Spec.Replicas) + return readyNodes == expectedNodes + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-nodes-ready")...).Should(BeTrue()) + + // Verify no control plane nodes (HCP workload should have only workers) + nodeList := &corev1.NodeList{} + Expect(workloadCluster.GetClient().List(ctx, nodeList)).To(Succeed()) + + for _, node := range nodeList.Items { + Expect(node.Labels).NotTo(HaveKey("node-role.kubernetes.io/control-plane")) + Expect(node.Labels).NotTo(HaveKey("node-role.kubernetes.io/master")) + } + + shared.Logf("HCP workload cluster verification completed successfully") +} + +// isolatedCleanup provides isolated cleanup for test resources. 
+func isolatedCleanup(ctx context.Context, hcpCtx *HCPTestContext, testNamespace *corev1.Namespace, clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult, e2eCtx *shared.E2EContext) { + shared.Logf("Starting isolated cleanup for test context %s", hcpCtx.IsolationID) + + // Clean up test-specific resources first + if clusterResources != nil && clusterResources.Cluster != nil { + shared.Logf("Cleaning up workload cluster %s", clusterResources.Cluster.Name) + shared.DumpSpecResourcesAndCleanup(ctx, "hcp-workload", testNamespace, e2eCtx) + } + + // Note: Management cluster cleanup is handled separately in suite cleanup + shared.Logf("Isolated cleanup completed for test context %s", hcpCtx.IsolationID) +} + +// cleanupSharedManagementCluster cleans up the shared management cluster (called once per suite). +func cleanupSharedManagementCluster(ctx context.Context, hcpCtx *HCPTestContext, e2eCtx *shared.E2EContext) { + if hcpCtx.ManagementCluster == nil { + return + } + + shared.Logf("Cleaning up shared management cluster for context %s", hcpCtx.IsolationID) + + // Clean up management cluster resources + if hcpCtx.ManagementClusterResources != nil && hcpCtx.ManagementClusterResources.Cluster != nil { + shared.DumpSpecResourcesAndCleanup(ctx, "hcp-management", hcpCtx.ManagementNamespace, e2eCtx) + } + + shared.Logf("Shared management cluster cleanup completed") +} + +// waitForTerminalError waits for a cluster to reach a terminal error state (for broken HCP tests). +func waitForTerminalError(ctx context.Context, managementCluster framework.ClusterProxy, namespace, clusterName string, e2eCtx *shared.E2EContext) { + shared.Logf("Waiting for cluster %s/%s to reach terminal error state", namespace, clusterName) + + Eventually(func() bool { + cluster := &unstructured.Unstructured{} + cluster.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "cluster.x-k8s.io", + Version: "v1beta1", + Kind: "Cluster", + }) + + err := managementCluster.GetClient().Get(ctx, client.ObjectKey{ + Namespace: namespace, + Name: clusterName, + }, cluster) + if err != nil { + return false + } + + // Check for terminal error conditions + conditions, found, err := unstructured.NestedSlice(cluster.Object, "status", "conditions") + if err != nil || !found { + return false + } + + for _, condition := range conditions { + condMap, ok := condition.(map[string]interface{}) + if !ok { + continue + } + + // Look for terminal error indicators + if condMap["type"] == "InfrastructureReady" && condMap["status"] == "False" { + if reason, ok := condMap["reason"].(string); ok && + (reason == "InvalidConfiguration" || reason == "ProvisioningFailed") { + return true + } + } + } + return false + }, e2eCtx.E2EConfig.GetIntervals("hcp", "wait-cluster")...).Should(BeTrue()) + + shared.Logf("Cluster %s/%s reached terminal error state as expected", namespace, clusterName) +} diff --git a/test/e2e/suites/hcp/hcp_suite_test.go b/test/e2e/suites/hcp/hcp_suite_test.go new file mode 100644 index 0000000000..b8fce1d0fc --- /dev/null +++ b/test/e2e/suites/hcp/hcp_suite_test.go @@ -0,0 +1,59 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package hcp + +import ( + "context" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + ctrl "sigs.k8s.io/controller-runtime" + + "sigs.k8s.io/cluster-api-provider-openstack/test/e2e/shared" +) + +var e2eCtx *shared.E2EContext + +func init() { + e2eCtx = shared.NewE2EContext() + shared.CreateDefaultFlags(e2eCtx) +} + +func TestHCP(t *testing.T) { + RegisterFailHandler(Fail) + ctrl.SetLogger(GinkgoLogr) + + suiteConfig, reporterConfig := GinkgoConfiguration() + + RunSpecs(t, "capo-hcp", suiteConfig, reporterConfig) +} + +var _ = SynchronizedBeforeSuite(func(ctx context.Context) []byte { + return shared.Node1BeforeSuite(ctx, e2eCtx) +}, func(data []byte) { + shared.AllNodesBeforeSuite(e2eCtx, data) +}) + +var _ = SynchronizedAfterSuite(func() { + shared.AllNodesAfterSuite(e2eCtx) +}, func(ctx context.Context) { + shared.Node1AfterSuite(ctx, e2eCtx) +}) diff --git a/test/e2e/suites/hcp/hcp_test.go b/test/e2e/suites/hcp/hcp_test.go new file mode 100644 index 0000000000..4be73faa00 --- /dev/null +++ b/test/e2e/suites/hcp/hcp_test.go @@ -0,0 +1,213 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package hcp + +import ( + "context" + "fmt" + "path/filepath" + "sync" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + "sigs.k8s.io/cluster-api/test/framework/clusterctl" + + "sigs.k8s.io/cluster-api-provider-openstack/test/e2e/shared" +) + +const specName = "hcp" + +var ( + sharedHCPContext *HCPTestContext + setupOnce sync.Once +) + +var _ = Describe("HCP (Hosted Control Plane) tests", func() { + Describe("Management cluster verification", func() { + It("should create and manage HCP-capable cluster", func(ctx context.Context) { + // Setup shared HCP infrastructure once + setupOnce.Do(func() { + shared.Logf("Setting up shared HCP test infrastructure") + + // Create isolated test context for this suite + sharedHCPContext = createHCPTestContext(ctx) + + // Set up shared management cluster + setupSharedManagementCluster(ctx, sharedHCPContext, e2eCtx) + + // Install Kamaji using clusterctl + installKamajiUsingClusterctl(ctx, sharedHCPContext, e2eCtx) + + shared.Logf("Shared HCP infrastructure ready with isolation ID: %s", sharedHCPContext.IsolationID) + }) + + shared.Logf("Verifying management cluster capabilities") + + // Verify management cluster is operational + Expect(sharedHCPContext.ManagementCluster).ToNot(BeNil()) + Expect(sharedHCPContext.KamajiInstalled).To(BeTrue()) + + // Verify cluster can schedule workloads + Eventually(func() error { + _, err := sharedHCPContext.ManagementCluster.GetClientSet().Discovery().ServerVersion() + return err + }, e2eCtx.E2EConfig.GetIntervals(specName, "wait-nodes-ready")...).Should(Succeed()) + + shared.Logf("Management cluster verification completed successfully") + }) + }) + + Describe("Workload cluster with hosted control plane", func() { + var ( + // Per-test isolation + testNamespace *corev1.Namespace + clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult + ) + + BeforeEach(func(ctx context.Context) { + // Ensure shared HCP infrastructure is ready + setupOnce.Do(func() { + shared.Logf("Setting up shared HCP test infrastructure") + sharedHCPContext = createHCPTestContext(ctx) + setupSharedManagementCluster(ctx, sharedHCPContext, e2eCtx) + installKamajiUsingClusterctl(ctx, sharedHCPContext, e2eCtx) + shared.Logf("Shared HCP infrastructure ready with isolation ID: %s", sharedHCPContext.IsolationID) + }) + + // Create isolated namespace for this specific test + testNamespace = shared.SetupSpecNamespace(ctx, specName+"-workload", e2eCtx) + clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) + }) + + AfterEach(func(ctx context.Context) { + isolatedCleanup(ctx, sharedHCPContext, testNamespace, clusterResources, e2eCtx) + }) + + It("should create workload cluster with external control plane", func(ctx context.Context) { + shared.Logf("Creating HCP workload cluster using shared management cluster") + + // Create HCP workload cluster configuration + clusterName := fmt.Sprintf("hcp-workload-%s", testNamespace.Name) + configCluster := clusterctl.ConfigClusterInput{ + LogFolder: filepath.Join(e2eCtx.Settings.ArtifactFolder, "clusters", e2eCtx.Environment.BootstrapClusterProxy.GetName()), + ClusterctlConfigPath: e2eCtx.Environment.ClusterctlConfigPath, + KubeconfigPath: e2eCtx.Environment.BootstrapClusterProxy.GetKubeconfigPath(), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + Flavor: shared.FlavorHCPWorkload, + Namespace: testNamespace.Name, + ClusterName: clusterName, + KubernetesVersion: e2eCtx.E2EConfig.MustGetVariable(shared.KubernetesVersion), + ControlPlaneMachineCount: ptr.To(int64(0)), // No control plane machines for HCP + 
WorkerMachineCount: ptr.To(int64(2)), + } + + shared.Logf("Creating HCP workload cluster: %s", clusterName) + clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ + ClusterProxy: e2eCtx.Environment.BootstrapClusterProxy, + ConfigCluster: configCluster, + WaitForClusterIntervals: e2eCtx.E2EConfig.GetIntervals(specName, "wait-cluster"), + WaitForControlPlaneIntervals: e2eCtx.E2EConfig.GetIntervals(specName, "wait-control-plane"), + WaitForMachineDeployments: e2eCtx.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), + }, clusterResources) + + // Wait for KamajiControlPlane to be ready + kcpName := fmt.Sprintf("%s-control-plane", clusterName) + waitForKamajiControlPlane(ctx, sharedHCPContext.ManagementCluster, testNamespace.Name, kcpName, e2eCtx) + + // Get workload cluster proxy + workloadCluster := e2eCtx.Environment.BootstrapClusterProxy.GetWorkloadCluster(ctx, testNamespace.Name, clusterName) + + // Verification of HCP workload cluster + verifyHCPWorkloadCluster(ctx, workloadCluster, clusterResources, e2eCtx) + + shared.Logf("HCP workload cluster %s created and verified successfully", clusterName) + }) + }) + + Describe("Graceful failure handling", func() { + var ( + testNamespace *corev1.Namespace + clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult + ) + + BeforeEach(func(ctx context.Context) { + // Ensure shared HCP infrastructure is ready + setupOnce.Do(func() { + shared.Logf("Setting up shared HCP test infrastructure") + sharedHCPContext = createHCPTestContext(ctx) + setupSharedManagementCluster(ctx, sharedHCPContext, e2eCtx) + installKamajiUsingClusterctl(ctx, sharedHCPContext, e2eCtx) + shared.Logf("Shared HCP infrastructure ready with isolation ID: %s", sharedHCPContext.IsolationID) + }) + + testNamespace = shared.SetupSpecNamespace(ctx, specName+"-broken", e2eCtx) + clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult) + }) + + AfterEach(func(ctx context.Context) { + isolatedCleanup(ctx, sharedHCPContext, testNamespace, clusterResources, e2eCtx) + }) + + It("should handle broken HCP configuration gracefully", func(ctx context.Context) { + shared.Logf("Testing graceful failure handling for broken HCP configuration") + + // Create broken HCP cluster configuration + clusterName := fmt.Sprintf("hcp-broken-%s", testNamespace.Name) + configCluster := clusterctl.ConfigClusterInput{ + LogFolder: filepath.Join(e2eCtx.Settings.ArtifactFolder, "clusters", e2eCtx.Environment.BootstrapClusterProxy.GetName()), + ClusterctlConfigPath: e2eCtx.Environment.ClusterctlConfigPath, + KubeconfigPath: e2eCtx.Environment.BootstrapClusterProxy.GetKubeconfigPath(), + InfrastructureProvider: clusterctl.DefaultInfrastructureProvider, + Flavor: shared.FlavorBrokenHCP, // This template has broken networking + Namespace: testNamespace.Name, + ClusterName: clusterName, + KubernetesVersion: e2eCtx.E2EConfig.MustGetVariable(shared.KubernetesVersion), + ControlPlaneMachineCount: ptr.To(int64(1)), + WorkerMachineCount: ptr.To(int64(1)), + } + + shared.Logf("Creating broken HCP cluster for graceful failure testing: %s", clusterName) + + // Apply broken template - this should fail gracefully + func() { + defer func() { + if r := recover(); r != nil { + shared.Logf("Graceful recovery from panic during broken cluster creation: %v", r) + } + }() + + clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{ + ClusterProxy: e2eCtx.Environment.BootstrapClusterProxy, + ConfigCluster: configCluster, + WaitForClusterIntervals: 
e2eCtx.E2EConfig.GetIntervals(specName, "wait-cluster"), + WaitForControlPlaneIntervals: e2eCtx.E2EConfig.GetIntervals(specName, "wait-control-plane"), + WaitForMachineDeployments: e2eCtx.E2EConfig.GetIntervals(specName, "wait-worker-nodes"), + }, clusterResources) + }() + + // Verify cluster reaches terminal error state gracefully + waitForTerminalError(ctx, e2eCtx.Environment.BootstrapClusterProxy, testNamespace.Name, clusterName, e2eCtx) + + shared.Logf("Graceful failure handling verified successfully") + }) + }) +})
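As a companion to the debugging notes in the README, here is a minimal sketch for exercising only the graceful-failure spec and inspecting the resulting broken cluster. It assumes the `E2E_GINKGO_FOCUS` and `E2E_ARGS` knobs from the Makefile; the namespace placeholder below is hypothetical and should be taken from the test logs (the cluster name is `hcp-broken-<namespace>` as constructed in `hcp_test.go`):

```bash
# Run only the "Graceful failure handling" spec and keep resources for inspection
E2E_GINKGO_FOCUS="Graceful failure handling" E2E_ARGS="-skip-cleanup=true" make test-hcp

# Inspect the broken cluster's conditions on the bootstrap cluster
# (replace <namespace> with the test namespace printed in the logs)
kubectl get cluster "hcp-broken-<namespace>" -n "<namespace>" -o jsonpath='{.status.conditions}'
```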