Skip to content

Commit 0968ea6

Browse files
Allow IGM to resume operation after interruption (#3928) (#2446)
Signed-off-by: Modular Magician <[email protected]>
1 parent 3dc9dc7 commit 0968ea6

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

.changelog/3928.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:enhancement
2+
`compute`: Added graceful termination to `google_compute_instance_group_manager` create calls so that a partially created instance group manager will resume its original operation if the Terraform process is killed mid-create.
3+
```

google-beta/compute_operation.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ package google
22

33
import (
44
"bytes"
5+
"context"
56
"encoding/json"
7+
"errors"
68
"fmt"
9+
"log"
710
"time"
811

912
computeBeta "google.golang.org/api/compute/v0.beta"
@@ -12,6 +15,7 @@ import (
1215
type ComputeOperationWaiter struct {
1316
Service *computeBeta.Service
1417
Op *computeBeta.Operation
18+
Context context.Context
1519
Project string
1620
Parent string
1721
}
@@ -55,6 +59,15 @@ func (w *ComputeOperationWaiter) QueryOp() (interface{}, error) {
5559
if w == nil || w.Op == nil {
5660
return nil, fmt.Errorf("Cannot query operation, it's unset or nil.")
5761
}
62+
if w.Context != nil {
63+
select {
64+
case <-w.Context.Done():
65+
log.Println("[WARN] request has been cancelled early")
66+
return w.Op, errors.New("unable to finish polling, context has been cancelled")
67+
default:
68+
// default must be here to keep the previous case from blocking
69+
}
70+
}
5871
if w.Op.Zone != "" {
5972
zone := GetResourceNameFromSelfLink(w.Op.Zone)
6073
return w.Service.ZoneOperations.Get(w.Project, zone, w.Op.Name).Do()
@@ -92,6 +105,7 @@ func computeOperationWaitTime(config *Config, res interface{}, project, activity
92105

93106
w := &ComputeOperationWaiter{
94107
Service: config.clientComputeBeta,
108+
Context: config.context,
95109
Op: op,
96110
Project: project,
97111
}

google-beta/resource_compute_instance_group_manager.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,10 @@ func resourceComputeInstanceGroupManager() *schema.Resource {
299299
},
300300
},
301301
},
302+
"operation": {
303+
Type: schema.TypeString,
304+
Computed: true,
305+
},
302306
},
303307
}
304308
}
@@ -376,6 +380,18 @@ func resourceComputeInstanceGroupManagerCreate(d *schema.ResourceData, meta inte
376380
// Wait for the operation to complete
377381
err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate))
378382
if err != nil {
383+
// Check if the create operation failed because Terraform was prematurely terminated. If it was we can persist the
384+
// operation id to state so that a subsequent refresh of this resource will wait until the operation has terminated
385+
// before attempting to Read the state of the manager. This allows a graceful resumption of a Create that was killed
386+
// by the upstream Terraform process exiting early such as a sigterm.
387+
select {
388+
case <-config.context.Done():
389+
log.Printf("[DEBUG] Persisting %s so this operation can be resumed \n", op.Name)
390+
d.Set("operation", op.Name)
391+
return nil
392+
default:
393+
// leaving default case to ensure this is non blocking
394+
}
379395
return err
380396
}
381397

@@ -453,6 +469,24 @@ func resourceComputeInstanceGroupManagerRead(d *schema.ResourceData, meta interf
453469
return err
454470
}
455471

472+
operation := d.Get("operation").(string)
473+
if operation != "" {
474+
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
475+
zone, _ := getZone(d, config)
476+
op := &computeBeta.Operation{
477+
Name: operation,
478+
Zone: zone,
479+
}
480+
d.Set("operation", "")
481+
err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate))
482+
if err != nil {
483+
// remove from state to allow refresh to finish
484+
log.Printf("[DEBUG] Resumed operation returned an error, removing from state: %s", err)
485+
d.SetId("")
486+
return nil
487+
}
488+
}
489+
456490
manager, err := getManager(d, meta)
457491
if err != nil {
458492
return err

0 commit comments

Comments
 (0)