Skip to content

Commit 4fe0636

Browse files
fix(aws): implement cleanup on partial creation failures (NVIDIA#612)
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent 5efa2b0 commit 4fe0636

File tree

1 file changed

+55
-13
lines changed

1 file changed

+55
-13
lines changed

pkg/provider/aws/create.go

Lines changed: 55 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,8 @@ const (
3838
defaultWaiterTimeout = 15 * time.Minute
3939
)
4040

41+
// cleanupFunc is a single rollback step that Create registers after a
// resource has been created. If Create fails later, the registered steps run
// in reverse order; a non-nil return is logged as a warning and the remaining
// steps still run.
type cleanupFunc func() error
42+
4143
// Create creates an EC2 instance along with its network configuration:
4244
// VPC, Subnet, Internet Gateway, Route Table, Security Group
4345
// If the environment specifies a cluster configuration, it delegates to CreateCluster()
@@ -49,70 +51,110 @@ func (p *Provider) Create() error {
4951

5052
// Single-node deployment
5153
cache := new(AWS)
54+
var cleanupStack []cleanupFunc
55+
var err error
56+
57+
// Defer cleanup on failure - execute cleanup functions in reverse order
58+
defer func() {
59+
if err != nil {
60+
p.log.Warning("Creation failed, rolling back created resources...")
61+
for i := len(cleanupStack) - 1; i >= 0; i-- {
62+
if cleanupErr := cleanupStack[i](); cleanupErr != nil {
63+
p.log.Warning("Cleanup failed: %v", cleanupErr)
64+
}
65+
}
66+
}
67+
}()
5268

53-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Creating AWS resources"); err != nil {
69+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Creating AWS resources"); err != nil {
5470
p.log.Warning("Failed to update progressing condition: %v", err)
5571
}
5672

57-
if err := p.createVPC(cache); err != nil {
73+
if err = p.createVPC(cache); err != nil {
5874
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating VPC"); updateErr != nil {
5975
p.log.Warning("Failed to update degraded condition: %v", updateErr)
6076
}
6177
return fmt.Errorf("error creating VPC: %w", err)
6278
}
63-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "VPC created"); err != nil {
79+
cleanupStack = append(cleanupStack, func() error {
80+
cleanupCache := &AWS{Vpcid: cache.Vpcid}
81+
return p.deleteVPC(cleanupCache)
82+
})
83+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "VPC created"); err != nil {
6484
p.log.Warning("Failed to update progressing condition: %v", err)
6585
}
6686

67-
if err := p.createSubnet(cache); err != nil {
87+
if err = p.createSubnet(cache); err != nil {
6888
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating subnet"); updateErr != nil {
6989
p.log.Warning("Failed to update degraded condition: %v", updateErr)
7090
}
7191
return fmt.Errorf("error creating subnet: %w", err)
7292
}
73-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Subnet created"); err != nil {
93+
cleanupStack = append(cleanupStack, func() error {
94+
cleanupCache := &AWS{Subnetid: cache.Subnetid}
95+
return p.deleteSubnet(cleanupCache)
96+
})
97+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Subnet created"); err != nil {
7498
p.log.Warning("Failed to update progressing condition: %v", err)
7599
}
76100

77-
if err := p.createInternetGateway(cache); err != nil {
101+
if err = p.createInternetGateway(cache); err != nil {
78102
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating Internet Gateway"); updateErr != nil {
79103
p.log.Warning("Failed to update degraded condition: %v", updateErr)
80104
}
81105
return fmt.Errorf("error creating Internet Gateway: %w", err)
82106
}
83-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Internet Gateway created"); err != nil {
107+
cleanupStack = append(cleanupStack, func() error {
108+
cleanupCache := &AWS{
109+
InternetGwid: cache.InternetGwid,
110+
Vpcid: cache.Vpcid,
111+
}
112+
return p.deleteInternetGateway(cleanupCache)
113+
})
114+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Internet Gateway created"); err != nil {
84115
p.log.Warning("Failed to update progressing condition: %v", err)
85116
}
86117

87-
if err := p.createRouteTable(cache); err != nil {
118+
if err = p.createRouteTable(cache); err != nil {
88119
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating route table"); updateErr != nil {
89120
p.log.Warning("Failed to update degraded condition: %v", updateErr)
90121
}
91122
return fmt.Errorf("error creating route table: %w", err)
92123
}
93-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Route Table created"); err != nil {
124+
cleanupStack = append(cleanupStack, func() error {
125+
cleanupCache := &AWS{
126+
RouteTable: cache.RouteTable,
127+
Vpcid: cache.Vpcid,
128+
}
129+
return p.deleteRouteTable(cleanupCache)
130+
})
131+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Route Table created"); err != nil {
94132
p.log.Warning("Failed to update progressing condition: %v", err)
95133
}
96134

97-
if err := p.createSecurityGroup(cache); err != nil {
135+
if err = p.createSecurityGroup(cache); err != nil {
98136
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating security group"); updateErr != nil {
99137
p.log.Warning("Failed to update degraded condition: %v", updateErr)
100138
}
101139
return fmt.Errorf("error creating security group: %w", err)
102140
}
103-
if err := p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Security Group created"); err != nil {
141+
cleanupStack = append(cleanupStack, func() error {
142+
cleanupCache := &AWS{SecurityGroupid: cache.SecurityGroupid}
143+
return p.deleteSecurityGroups(cleanupCache)
144+
})
145+
if err = p.updateProgressingCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Security Group created"); err != nil {
104146
p.log.Warning("Failed to update progressing condition: %v", err)
105147
}
106148

107-
if err := p.createEC2Instance(cache); err != nil {
149+
if err = p.createEC2Instance(cache); err != nil {
108150
if updateErr := p.updateDegradedCondition(*p.DeepCopy(), cache, "v1alpha1.Creating", "Error creating EC2 instance"); updateErr != nil {
109151
p.log.Warning("Failed to update degraded condition: %v", updateErr)
110152
}
111153
return fmt.Errorf("error creating EC2 instance: %w", err)
112154
}
113155

114156
// Save object IDs into a cache file
115-
if err := p.updateAvailableCondition(*p.Environment, cache); err != nil {
157+
if err = p.updateAvailableCondition(*p.Environment, cache); err != nil {
116158
return fmt.Errorf("error creating cache file: %w", err)
117159
}
118160
return nil

0 commit comments

Comments
 (0)