
🐛 Fix panic when OpenStackCluster.Status.Network is nil in HCP scenarios #2635

Draft — wants to merge 10 commits into base: main
5 changes: 5 additions & 0 deletions .gopls.yaml
@@ -0,0 +1,5 @@
build:
buildFlags:
- "-tags=e2e"
env:
CGO_ENABLED: "1"
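The e2e suites in this repository are compiled behind a build tag, so without these buildFlags gopls cannot load or type-check anything under test/e2e; CGO_ENABLED is forced on for the same loading step. A minimal sketch of the kind of tag-guarded file this config makes visible to the editor (the file and test here are hypothetical, not part of this diff):

```go
//go:build e2e

// This file is only built when -tags=e2e is passed, which is why gopls
// needs the matching buildFlags entry above to load and analyze it.
package e2e

import "testing"

func TestTagGuard(t *testing.T) {
	t.Skip("illustrative only: demonstrates an e2e-tagged file")
}
```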
22 changes: 19 additions & 3 deletions Makefile
@@ -43,6 +43,7 @@ TEST_E2E_DIR := test/e2e
# Files
E2E_DATA_DIR ?= $(REPO_ROOT)/test/e2e/data
E2E_CONF_PATH ?= $(E2E_DATA_DIR)/e2e_conf.yaml
E2E_CONF_PATH_HCP ?= $(E2E_DATA_DIR)/e2e_conf_hcp.yaml
KUBETEST_CONF_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/conformance.yaml)
KUBETEST_FAST_CONF_PATH ?= $(abspath $(E2E_DATA_DIR)/kubetest/conformance-fast.yaml)
GO_INSTALL := ./scripts/go_install.sh
@@ -184,7 +185,10 @@ e2e-templates: $(addprefix $(E2E_NO_ARTIFACT_TEMPLATES_DIR)/, \
cluster-template-flatcar.yaml \
cluster-template-k8s-upgrade.yaml \
cluster-template-flatcar-sysext.yaml \
cluster-template-no-bastion.yaml)
cluster-template-no-bastion.yaml \
cluster-template-hcp-management.yaml \
cluster-template-hcp-workload.yaml \
cluster-template-hcp-broken.yaml)
# Currently no templates that require CI artifacts
# $(addprefix $(E2E_TEMPLATES_DIR)/, add-templates-here.yaml) \

@@ -205,7 +209,7 @@ test-e2e: $(GINKGO) e2e-prerequisites ## Run e2e tests
time $(GINKGO) -fail-fast -trace -timeout=3h -show-node-events -v -tags=e2e -nodes=$(E2E_GINKGO_PARALLEL) \
--output-dir="$(ARTIFACTS)" --junit-report="junit.e2e_suite.1.xml" \
-focus="$(E2E_GINKGO_FOCUS)" $(_SKIP_ARGS) $(E2E_GINKGO_ARGS) ./test/e2e/suites/e2e/... -- \
-config-path="$(E2E_CONF_PATH)" -artifacts-folder="$(ARTIFACTS)" \
-config-path="$(E2E_CONF_PATH_HCP)" -artifacts-folder="$(ARTIFACTS)" \
-data-folder="$(E2E_DATA_DIR)" $(E2E_ARGS)

# Pre-compile tests
@@ -215,7 +219,7 @@ build-e2e-tests: $(GINKGO)
$(GINKGO) build -tags=e2e ./test/e2e/suites/e2e/...

.PHONY: e2e-image
e2e-image: CONTROLLER_IMG_TAG = "gcr.io/k8s-staging-capi-openstack/capi-openstack-controller:e2e"
e2e-image: CONTROLLER_IMG_TAG = "ghcr.io/orkhanorganization/k8s-staging-capi-openstack/capi-openstack-controller:e2e"
e2e-image: docker-build

# Pull all the image references in test/e2e/data/e2e_conf.yaml
@@ -236,6 +240,18 @@ test-conformance: $(GINKGO) e2e-prerequisites ## Run clusterctl based conformanc
test-conformance-fast: ## Run clusterctl based conformance test on workload cluster (requires Docker) using a subset of the conformance suite in parallel.
$(MAKE) test-conformance CONFORMANCE_E2E_ARGS="-kubetest.config-file=$(KUBETEST_FAST_CONF_PATH) -kubetest.ginkgo-nodes=5 $(E2E_ARGS)"

HCP_E2E_ARGS ?=
HCP_E2E_ARGS += $(E2E_ARGS)
.PHONY: test-hcp
test-hcp: $(GINKGO) e2e-prerequisites ## Run HCP (Hosted Control Plane) e2e tests
time $(GINKGO) -fail-fast -trace -timeout=3h -show-node-events -v -tags=e2e -nodes=$(E2E_GINKGO_PARALLEL) \
--output-dir="$(ARTIFACTS)" --junit-report="junit.hcp_suite.1.xml" \
-focus="$(E2E_GINKGO_FOCUS)" $(_SKIP_ARGS) $(E2E_GINKGO_ARGS) ./test/e2e/suites/hcp/... -- \
-config-path="$(E2E_CONF_PATH_HCP)" -artifacts-folder="$(ARTIFACTS)" \
-data-folder="$(E2E_DATA_DIR)" $(HCP_E2E_ARGS)



APIDIFF_OLD_COMMIT ?= $(shell git rev-parse origin/main)

.PHONY: apidiff
41 changes: 41 additions & 0 deletions cleanup-hcp-test.sh
@@ -0,0 +1,41 @@
#!/bin/bash
# Cleanup script for failed HCP test resources

# Set OpenStack environment
export OS_CLOUD=openstack
export OS_CLOUD_YAML_FILE=/Users/bnr/work/openstack/clouds.yaml

echo "🧹 Cleaning up HCP test resources..."

# Delete the specific leftover instances observed in the failed test run
echo "Deleting OpenStack instances..."
openstack server delete hcp-mgmt-hcp-mgmt-hcp-1752250951-al4m23-bastion
openstack server delete hcp-mgmt-hcp-mgmt-hcp-1752250951-al4m23-control-plane-sinpp5-bastion

# Clean up any floating IPs that might be allocated
echo "Cleaning up floating IPs..."
openstack floating ip list --status DOWN -f value -c ID | xargs -r openstack floating ip delete

# Clean up security groups (if any were created)
echo "Cleaning up security groups..."
openstack security group list --project $(openstack token issue -c project_id -f value) | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack security group delete

# Clean up keypairs (if any were created)
echo "Cleaning up keypairs..."
openstack keypair list | grep "cluster-api-provider-openstack-sigs-k8s-io" | awk '{print $2}' | xargs -r openstack keypair delete

# Clean up networks and subnets (if any were created)
echo "Cleaning up networks..."
openstack network list | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack network delete

# Clean up any volumes
echo "Cleaning up volumes..."
openstack volume list --status available | grep "hcp-mgmt\|cluster-api" | awk '{print $2}' | xargs -r openstack volume delete

echo "✅ Cleanup completed!"
echo ""
echo "Manual verification commands:"
echo "openstack server list"
echo "openstack floating ip list"
echo "openstack security group list"
echo "openstack network list"
100 changes: 100 additions & 0 deletions cleanup-proper-order.sh
@@ -0,0 +1,100 @@
#!/bin/bash
# Proper OpenStack cleanup script - deletes in correct dependency order

export OS_CLOUD=openstack
export OS_CLOUD_YAML_FILE=/Users/bnr/work/openstack/clouds.yaml

echo "🧹 Starting proper OpenStack cleanup (dependency order)..."

# 1. First, delete servers (instances)
echo "=== Step 1: Deleting Servers ==="
openstack server list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting server: $name ($id)"
openstack server delete "$id" || true
done

# Wait a bit for servers to be deleted
echo "Waiting for servers to be deleted..."
sleep 10

# 2. Delete floating IPs
echo "=== Step 2: Deleting Floating IPs ==="
openstack floating ip list -f value -c ID | xargs -r -I {} bash -c 'echo "Deleting floating IP: {}"; openstack floating ip delete {} || true'

# 3. Delete load balancers (if any)
echo "=== Step 3: Deleting Load Balancers ==="
openstack loadbalancer list -f value -c id -c name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting loadbalancer: $name ($id)"
openstack loadbalancer delete "$id" --cascade || true
done

# Wait for load balancers to be deleted
echo "Waiting for load balancers to be deleted..."
sleep 15

# 4. Delete router interfaces and routers
echo "=== Step 4: Deleting Routers ==="
openstack router list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Processing router: $name ($id)"

# First remove all interfaces from the router
echo " Removing interfaces from router $name"
openstack port list --router "$id" -f value -c ID | while read port_id; do
echo " Removing interface $port_id"
openstack router remove port "$id" "$port_id" || true
done

# Then delete the router
echo " Deleting router $name"
openstack router delete "$id" || true
done

# 5. Delete ports
echo "=== Step 5: Deleting Ports ==="
openstack port list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting port: $name ($id)"
openstack port delete "$id" || true
done

# 6. Delete subnets
echo "=== Step 6: Deleting Subnets ==="
openstack subnet list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting subnet: $name ($id)"
openstack subnet delete "$id" || true
done

# 7. Finally, delete networks
echo "=== Step 7: Deleting Networks ==="
openstack network list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting network: $name ($id)"
openstack network delete "$id" || true
done

# 8. Delete security groups
echo "=== Step 8: Deleting Security Groups ==="
openstack security group list -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting security group: $name ($id)"
openstack security group delete "$id" || true
done

# 9. Delete keypairs
echo "=== Step 9: Deleting Keypairs ==="
openstack keypair list -f value -c Name | grep -E "(hcp|e2e|cluster-api)" | while read name; do
echo "Deleting keypair: $name"
openstack keypair delete "$name" || true
done

# 10. Delete volumes
echo "=== Step 10: Deleting Volumes ==="
openstack volume list --status available -f value -c ID -c Name | grep -E "(hcp|e2e|cluster-api)" | while read id name; do
echo "Deleting volume: $name ($id)"
openstack volume delete "$id" || true
done

echo "✅ Cleanup completed!"
echo ""
echo "Verification commands:"
echo "openstack server list"
echo "openstack network list | grep -E '(hcp|e2e|cluster-api)'"
echo "openstack router list | grep -E '(hcp|e2e|cluster-api)'"
echo "openstack security group list | grep -E '(hcp|e2e|cluster-api)'"
30 changes: 30 additions & 0 deletions cleanup_openstack.sh
@@ -0,0 +1,30 @@
#!/bin/bash
set -e

echo "=== Cleaning up Security Groups ==="
openstack security group list -f value -c ID -c Name | grep -E "(hcp|k8s-cluster.*e2e)" | awk '{print $1}' | xargs -I {} openstack security group delete {}

echo "=== Cleaning up Load Balancers ==="
openstack loadbalancer list -f value -c id -c name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack loadbalancer delete {} || true

echo "=== Cleaning up Routers ==="
openstack router list -f value -c ID -c Name | grep -E "(hcp|e2e)" | while read router_id router_name; do
echo "Cleaning router: $router_name ($router_id)"
# Remove external gateway
openstack router unset --external-gateway $router_id 2>/dev/null || true
# Remove all ports
openstack port list --router $router_id -f value -c ID | xargs -I {} openstack router remove port $router_id {} 2>/dev/null || true
# Delete router
openstack router delete $router_id
done

echo "=== Cleaning up Subnets ==="
openstack subnet list -f value -c ID -c Name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack subnet delete {}

echo "=== Cleaning up Networks ==="
openstack network list -f value -c ID -c Name | grep -E "(hcp|e2e)" | awk '{print $1}' | xargs -I {} openstack network delete {}

echo "=== Cleaning up Floating IPs ==="
openstack floating ip list -f value -c ID -c Description | grep -E "(hcp|e2e|cluster)" | awk '{print $1}' | xargs -I {} openstack floating ip delete {} || true

echo "=== Cleanup Complete ==="
19 changes: 14 additions & 5 deletions controllers/openstackcluster_controller.go
@@ -497,7 +497,10 @@ func (r *OpenStackClusterReconciler) reconcileBastionServer(ctx context.Context,
}

// If the bastion is found but the spec has changed, we need to delete it and reconcile.
bastionServerSpec := bastionToOpenStackServerSpec(openStackCluster)
bastionServerSpec, err := bastionToOpenStackServerSpec(openStackCluster)
if err != nil {
return nil, true, err
}
if !bastionNotFound && server != nil && !apiequality.Semantic.DeepEqual(bastionServerSpec, &server.Spec) {
scope.Logger().Info("Bastion spec has changed, re-creating the OpenStackServer object")
if err := r.deleteBastion(ctx, scope, cluster, openStackCluster); err != nil {
@@ -543,7 +546,10 @@ func (r *OpenStackClusterReconciler) getBastionServer(ctx context.Context, openS
// createBastionServer creates the OpenStackServer object for the bastion server.
// It returns the OpenStackServer object and an error if any.
func (r *OpenStackClusterReconciler) createBastionServer(ctx context.Context, openStackCluster *infrav1.OpenStackCluster, cluster *clusterv1.Cluster) (*infrav1alpha1.OpenStackServer, error) {
bastionServerSpec := bastionToOpenStackServerSpec(openStackCluster)
bastionServerSpec, err := bastionToOpenStackServerSpec(openStackCluster)
if err != nil {
return nil, err
}
bastionServer := &infrav1alpha1.OpenStackServer{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
@@ -571,7 +577,7 @@ func (r *OpenStackClusterReconciler) createBastionServer(ctx context.Context, op

// bastionToOpenStackServerSpec converts the OpenStackMachineSpec for the bastion to an OpenStackServerSpec.
// It returns the OpenStackServerSpec and an error if any.
func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) *infrav1alpha1.OpenStackServerSpec {
func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) (*infrav1alpha1.OpenStackServerSpec, error) {
bastion := openStackCluster.Spec.Bastion
if bastion == nil {
bastion = &infrav1.Bastion{}
@@ -586,9 +592,12 @@ func bastionToOpenStackServerSpec(openStackCluster *infrav1.OpenStackCluster) *i
if bastion.AvailabilityZone != nil {
az = *bastion.AvailabilityZone
}
openStackServerSpec := openStackMachineSpecToOpenStackServerSpec(bastion.Spec, openStackCluster.Spec.IdentityRef, compute.InstanceTags(bastion.Spec, openStackCluster), az, nil, getBastionSecurityGroupID(openStackCluster), openStackCluster.Status.Network.ID)
openStackServerSpec, err := openStackMachineSpecToOpenStackServerSpec(bastion.Spec, openStackCluster.Spec.IdentityRef, compute.InstanceTags(bastion.Spec, openStackCluster), az, nil, getBastionSecurityGroupID(openStackCluster), openStackCluster.Status.Network)
if err != nil {
return nil, err
}

return openStackServerSpec
return openStackServerSpec, nil
}

func bastionName(clusterResourceName string) string {
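For a sense of the error path this introduces, a hedged test sketch in the controllers package (the test, its name, and the import alias are assumptions here; the PR may cover this case elsewhere) could look like:

```go
package controllers

import (
	"testing"

	infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1beta1"
)

// Sketch only: before this change, bastionToOpenStackServerSpec read
// openStackCluster.Status.Network.ID unguarded and panicked whenever
// Status.Network was nil (the HCP case).
func TestBastionSpecWithNilClusterNetwork(t *testing.T) {
	cluster := &infrav1.OpenStackCluster{
		Spec: infrav1.OpenStackClusterSpec{
			// A bastion with an empty machine spec: no ports, no network.
			Bastion: &infrav1.Bastion{Spec: &infrav1.OpenStackMachineSpec{}},
		},
	}
	if _, err := bastionToOpenStackServerSpec(cluster); err == nil {
		t.Fatal("expected a terminal error for a nil cluster network, got nil")
	}
}
```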
40 changes: 25 additions & 15 deletions controllers/openstackmachine_controller.go
@@ -479,7 +479,18 @@ func (r *OpenStackMachineReconciler) getMachineServer(ctx context.Context, openS

// openStackMachineSpecToOpenStackServerSpec converts an OpenStackMachineSpec to an OpenStackServerSpec.
// It returns the OpenStackServerSpec object and an error if there is any.
func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.OpenStackMachineSpec, identityRef infrav1.OpenStackIdentityReference, tags []string, failureDomain string, userDataRef *corev1.LocalObjectReference, defaultSecGroup *string, defaultNetworkID string) *infrav1alpha1.OpenStackServerSpec {
func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.OpenStackMachineSpec, identityRef infrav1.OpenStackIdentityReference, tags []string, failureDomain string, userDataRef *corev1.LocalObjectReference, defaultSecGroup *string, clusterNetwork *infrav1.NetworkStatusWithSubnets) (*infrav1alpha1.OpenStackServerSpec, error) {
// Determine default network ID if the cluster status exposes one.
var defaultNetworkID string
if clusterNetwork != nil {
defaultNetworkID = clusterNetwork.ID
}

// If no cluster network is available AND the machine spec did not define any ports with a network, we cannot choose a network.
if defaultNetworkID == "" && len(openStackMachineSpec.Ports) == 0 {
return nil, capoerrors.Terminal(infrav1.InvalidMachineSpecReason, "no network configured: cluster network is missing and machine spec does not define ports with a network")
}

openStackServerSpec := &infrav1alpha1.OpenStackServerSpec{
AdditionalBlockDevices: openStackMachineSpec.AdditionalBlockDevices,
ConfigDrive: openStackMachineSpec.ConfigDrive,
@@ -521,25 +532,21 @@ func openStackMachineSpecToOpenStackServerSpec(openStackMachineSpec *infrav1.Ope
serverPorts = make([]infrav1.PortOpts, 1)
}
for i := range serverPorts {
if serverPorts[i].Network == nil {
serverPorts[i].Network = &infrav1.NetworkParam{
ID: &defaultNetworkID,
}
}
if len(serverPorts[i].SecurityGroups) == 0 && defaultSecGroup != nil {
serverPorts[i].SecurityGroups = []infrav1.SecurityGroupParam{
{
ID: defaultSecGroup,
},
}
// Only inject the default network when we actually have an ID.
if serverPorts[i].Network == nil && defaultNetworkID != "" {
serverPorts[i].Network = &infrav1.NetworkParam{ID: &defaultNetworkID}
}
if len(openStackMachineSpec.SecurityGroups) > 0 {
serverPorts[i].SecurityGroups = append(serverPorts[i].SecurityGroups, openStackMachineSpec.SecurityGroups...)
// Machine level security groups override any cluster defaults.
serverPorts[i].SecurityGroups = openStackMachineSpec.SecurityGroups
} else if len(serverPorts[i].SecurityGroups) == 0 && defaultSecGroup != nil {
// Fall back to cluster-managed security group when nothing else specified.
serverPorts[i].SecurityGroups = []infrav1.SecurityGroupParam{{ID: defaultSecGroup}}
}
}
openStackServerSpec.Ports = serverPorts

return openStackServerSpec
return openStackServerSpec, nil
}
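Distilled into a self-contained sketch (the types and names below are simplified stand-ins, not the provider's API), the new defaulting rule is: take the cluster network's ID only when the status actually carries one, inject it only into ports that name no network, and fail terminally when neither source can supply a network:

```go
package main

import (
	"errors"
	"fmt"
)

// networkStatus stands in for infrav1.NetworkStatusWithSubnets.
type networkStatus struct{ ID string }

// port stands in for infrav1.PortOpts, reduced to the one field that matters here.
type port struct{ NetworkID string }

// resolvePortNetworks mirrors the fixed defaulting logic: only inject the
// cluster network when one actually exists, and return an error (terminal in
// the real controller) when no network can be determined at all.
func resolvePortNetworks(clusterNetwork *networkStatus, ports []port) ([]port, error) {
	var defaultID string
	if clusterNetwork != nil { // previously an unguarded clusterNetwork.ID dereference
		defaultID = clusterNetwork.ID
	}
	if defaultID == "" && len(ports) == 0 {
		return nil, errors.New("no network configured: cluster network is missing and machine spec does not define ports with a network")
	}
	if len(ports) == 0 {
		ports = make([]port, 1) // one port on the default network
	}
	for i := range ports {
		if ports[i].NetworkID == "" && defaultID != "" {
			ports[i].NetworkID = defaultID
		}
	}
	return ports, nil
}

func main() {
	if _, err := resolvePortNetworks(nil, nil); err != nil {
		fmt.Println("HCP case without ports:", err)
	}
	got, _ := resolvePortNetworks(nil, []port{{NetworkID: "net-from-machine-spec"}})
	fmt.Println("HCP case with explicit port:", got[0].NetworkID)
}
```

The same loop in the diff also changes security-group precedence: machine-level security groups now replace any port-level ones outright, and the cluster-managed default group is used only when neither is set.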

// reconcileMachineServer reconciles the OpenStackServer object for the OpenStackMachine.
@@ -588,7 +595,10 @@ func (r *OpenStackMachineReconciler) getOrCreateMachineServer(ctx context.Contex
}
return openStackCluster.Spec.IdentityRef
}()
machineServerSpec := openStackMachineSpecToOpenStackServerSpec(&openStackMachine.Spec, identityRef, compute.InstanceTags(&openStackMachine.Spec, openStackCluster), failureDomain, userDataRef, getManagedSecurityGroup(openStackCluster, machine), openStackCluster.Status.Network.ID)
machineServerSpec, err := openStackMachineSpecToOpenStackServerSpec(&openStackMachine.Spec, identityRef, compute.InstanceTags(&openStackMachine.Spec, openStackCluster), failureDomain, userDataRef, getManagedSecurityGroup(openStackCluster, machine), openStackCluster.Status.Network)
if err != nil {
return nil, err
}
machineServer = &infrav1alpha1.OpenStackServer{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{