Skip to content

Commit c7d3a7c

Browse files
committed
clean up orphans on scale-out
If a droplet is created by the autoscaler, but it fails to register with nomad as a client, it should be considered an orphan and deleted. Add a configuration setting to control this behaviour. Only droplets with the "name" tag which were not recently created will be considered for deletion.
1 parent 1f87c95 commit c7d3a7c

File tree

11 files changed

+238
-58
lines changed

11 files changed

+238
-58
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ check "hashistack-allocated-cpu" {
3535
# ...
3636
target "do-droplets" {
3737
create_reserved_addresses = "true"
38+
init_grace_period = "10m"
3839
ipv6 = "true"
3940
name = "hashi-worker"
4041
node_class = "hashistack"
@@ -56,6 +57,8 @@ check "hashistack-allocated-cpu" {
5657

5758
- `name` `(string: <required>)` - A logical name of a Droplet "group". Every managed Droplet will be tagged with this value and its name is this value with a random suffix
5859

60+
- `init_grace_period` `(duration: "0m")` Any droplets tagged with the `name` value which are older than this duration and still not recognised as a nomad client are considered to be orphans, and will be deleted at a subsequent scale-in event. Setting this to zero will disable this feature.
61+
5962
- `region` `(string: <required>)` - The region to start in.
6063

6164
- `vpc_uuid` `(string: <required>)` - The ID of the VPC where the Droplet will be located.

demo/jobs/templates/autoscaler.nomad

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ job "autoscaler" {
1111
driver = "docker"
1212

1313
artifact {
14-
source = "https://github.com/Aiven-Open/nomad-droplets-autoscaler/releases/download/v0.0.25/nomad-droplets-autoscaler_Linux_x86_64.tar.gz"
14+
source = "https://github.com/Aiven-Open/nomad-droplets-autoscaler/releases/download/v0.1.9/nomad-droplets-autoscaler_Linux_x86_64.tar.gz"
1515
destination = "local/plugins/"
1616
}
1717

@@ -87,21 +87,20 @@ scaling "batch" {
8787
}
8888
8989
target "do-droplets" {
90-
name = "hashi-batch"
91-
region = "${region}"
92-
size = "s-1vcpu-1gb"
93-
snapshot_id = ${snapshot_id}
94-
user_data = "local/batch-startup.sh"
95-
tags = "hashi-stack"
96-
97-
datacenter = "batch_workers"
98-
node_drain_deadline = "1h"
99-
node_selector_strategy = "empty_ignore_system"
100-
101-
reserve_ipv4_addresses = "false"
102-
reserve_ipv6_addresses = "false"
103-
ipv6 = "true"
10490
create_reserved_addresses = "false"
91+
datacenter = "batch_workers"
92+
init_grace_period = "10m"
93+
ipv6 = "true"
94+
name = "hashi-batch"
95+
node_drain_deadline = "1h"
96+
node_selector_strategy = "empty_ignore_system"
97+
region = "${region}"
98+
reserve_ipv4_addresses = "false"
99+
reserve_ipv6_addresses = "false"
100+
size = "s-1vcpu-1gb"
101+
snapshot_id = ${snapshot_id}
102+
tags = "hashi-stack"
103+
user_data = "local/batch-startup.sh"
105104
}
106105
}
107106
}
@@ -157,6 +156,8 @@ server:
157156
http_listen_port: {{ env "NOMAD_PORT_promtail" }}
158157
grpc_listen_port: 0
159158
159+
log_level: "DEBUG"
160+
160161
positions:
161162
filename: /tmp/positions.yaml
162163

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ require (
88
github.com/goccy/go-yaml v1.18.0
99
github.com/google/uuid v1.6.0
1010
github.com/hashicorp/go-hclog v1.6.3
11+
github.com/hashicorp/golang-lru/v2 v2.0.7
1112
github.com/hashicorp/nomad-autoscaler v0.4.7
1213
github.com/hashicorp/nomad/api v0.0.0-20250721135329-36b4aa79df33
1314
github.com/hashicorp/vault-client-go v0.4.3

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5O
5858
github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8=
5959
github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 h1:kes8mmyCpxJsI7FTwtzRqEy9CdjCtrXrXGuOpxEA7Ts=
6060
github.com/hashicorp/go-secure-stdlib/strutil v0.1.2/go.mod h1:Gou2R9+il93BqX25LAKCLuM+y9U2T4hlwvT1yprcna4=
61+
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
62+
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
6163
github.com/hashicorp/hcl/v2 v2.23.0 h1:Fphj1/gCylPxHutVSEOf2fBOh1VE4AuLV7+kbJf3qos=
6264
github.com/hashicorp/hcl/v2 v2.23.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA=
6365
github.com/hashicorp/nomad-autoscaler v0.4.7 h1:rjWGNeTdVJBo/iSrf+Oc4ecbFRas7vZrYl5CJ68X2lw=

plugin/digitalocean.go

Lines changed: 130 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/digitalocean/godo"
1515
"github.com/google/uuid"
1616
"github.com/hashicorp/go-hclog"
17+
"github.com/hashicorp/nomad-autoscaler/sdk/helper/nomad"
1718
"github.com/hashicorp/nomad/api"
1819
)
1920

@@ -24,6 +25,7 @@ const (
2425

2526
type dropletTemplate struct {
2627
createReservedAddresses bool
28+
initGracePeriod time.Duration
2729
ipv6 bool
2830
name string
2931
region string
@@ -42,6 +44,8 @@ type dropletTemplate struct {
4244
vpc string
4345
}
4446

47+
type DropletIDs map[int]struct{}
48+
4549
func (t *TargetPlugin) scaleOut(
4650
ctx context.Context,
4751
desired, diff int64,
@@ -52,8 +56,6 @@ func (t *TargetPlugin) scaleOut(
5256

5357
log.Debug("creating DigitalOcean droplets", "template", fmt.Sprintf("%+v", template))
5458

55-
ctx, cancel := context.WithCancelCause(ctx)
56-
defer cancel(nil)
5759
wg := &sync.WaitGroup{}
5860
var prereservedIPV4s []string
5961
var prereservedIPV6s []string
@@ -189,10 +191,22 @@ func (t *TargetPlugin) scaleOut(
189191
wg.Wait()
190192
close(errorChannel)
191193
}()
194+
192195
errorList := make([]error, 0)
193196
for err := range errorChannel {
194197
errorList = append(errorList, err)
195198
}
199+
200+
// Regardless of whether there were failures, schedule an orphaned-droplet check
201+
if template.initGracePeriod > time.Second {
202+
// Wait until enough time has passed so that if the newly-created droplet(s) are not yet
203+
// participating in the nomad cluster, they will be considered orphaned.
204+
// Add a minute to the grace period to allow for minor time disparities between
205+
// the autoscaler clock and the droplets' creation times.
206+
delay := template.initGracePeriod + time.Minute
207+
go deleteOrphanedDroplets(ctx, t.logger, t.client.Droplets(), t.getReadyNomadClients, template, delay)
208+
}
209+
196210
if len(errorList) > 0 {
197211
return errors.Join(errorList...)
198212
}
@@ -208,6 +222,119 @@ func (t *TargetPlugin) scaleOut(
208222
return nil
209223
}
210224

225+
// deleteOrphanedDroplets will destroy any droplets which are not whitelisted,
226+
// but only if they have the tag shared by all droplets managed by the autoscaler,
227+
// and which were not recently created.
228+
// whitelistGenerator: provides a set of droplet IDs which are expected; any other
229+
// droplets will be assessed and potentially deleted.
230+
// delay: the time to wait before looking for orphans
231+
func deleteOrphanedDroplets(ctx context.Context,
232+
logger hclog.Logger,
233+
dropletsService Droplets,
234+
whitelistGenerator func(ctx context.Context) (DropletIDs, error),
235+
template *dropletTemplate,
236+
delay time.Duration,
237+
) {
238+
if err := Sleep(ctx, delay); err != nil {
239+
logger.Info("context was cancelled, so not checking for orphaned droplets")
240+
return
241+
}
242+
243+
whitelist, err := whitelistGenerator(ctx)
244+
if err != nil {
245+
logger.Error("cannot determine which droplets are whitelisted: %w", err)
246+
return
247+
}
248+
logger.Info("checking for orphaned droplets", "whitelist size", len(whitelist))
249+
for droplet, err := range Unpaginate(ctx, Unarg(dropletsService.ListByTag, template.name), godo.ListOptions{}) {
250+
if err != nil {
251+
logger.Error("cannot retrieve droplets", "error", err)
252+
return
253+
}
254+
// This could be done in parallel, but shouldn't be necessary
255+
if _, exists := whitelist[droplet.ID]; exists {
256+
logger.Debug("droplet is a nomad client, so not considering an orphan", "droplet ID", droplet.ID)
257+
} else {
258+
dt, err := time.Parse(time.RFC3339, droplet.Created)
259+
if err != nil {
260+
logger.Error("cannot parse droplet creation time. Not treating as an orphan", "error", err, "droplet ID", droplet.ID)
261+
continue
262+
}
263+
if time.Since(dt) < template.initGracePeriod {
264+
logger.Debug("Droplet was very recently created. Not treating as an orphan", "droplet ID", droplet.ID)
265+
continue
266+
}
267+
268+
if _, err := dropletsService.Delete(ctx, droplet.ID); err == nil {
269+
logger.Info("deleted orphaned droplet", "droplet ID", droplet.ID)
270+
} else {
271+
logger.Error("cannot delete droplet", "error", err, "droplet ID", droplet.ID)
272+
}
273+
}
274+
}
275+
logger.Debug("finished checking for orphaned droplets")
276+
}
277+
278+
// getReadyNomadClients returns a set of droplet IDs
279+
// where the nomad client is running and has "ready" status.
280+
func (t *TargetPlugin) getReadyNomadClients(ctx context.Context) (DropletIDs, error) {
281+
result := make(DropletIDs)
282+
cfg := nomad.ConfigFromNamespacedMap(t.config)
283+
client, err := api.NewClient(cfg)
284+
if err != nil {
285+
return nil, fmt.Errorf("failed to instantiate Nomad client: %v", err)
286+
}
287+
nodes, _, err := client.Nodes().List(&api.QueryOptions{Params: map[string]string{"resources": "true"}})
288+
if err != nil {
289+
return nil, fmt.Errorf("failed to list Nomad nodes from API: %v", err)
290+
}
291+
292+
q := api.QueryOptions{
293+
AllowStale: true,
294+
}
295+
for _, node := range nodes {
296+
// TODO: filter out nodes which are not part of our node pool.
297+
// This is just an optimisation, as it's only droplets which aren't in any node pool
298+
// which are susceptible to being considered orphans.
299+
300+
t.logger.Debug("found node",
301+
"node_id", node.ID, "datacenter", node.Datacenter, "node_class", node.NodeClass, "node_pool", node.NodePool,
302+
"status", node.Status, "eligibility", node.SchedulingEligibility, "draining", node.Drain, "all", fmt.Sprintf("%+v", node),
303+
)
304+
if node.Status != "ready" {
305+
t.logger.Info("node is known as a nomad client but its status is not ready", "node ID", node.ID, "status", node.Status)
306+
continue
307+
}
308+
309+
if dropletID, exists := t.dropletMapping.Get(node.ID); exists {
310+
// this node's droplet ID is already known, so include it
311+
result[dropletID] = struct{}{}
312+
continue
313+
}
314+
315+
// The summary daa returned by client.Nodes() does not contain sufficient metadaa to determine
316+
// the droplet ID; a follow-up call to `Info()` is required.
317+
node, _, err := client.Nodes().Info(node.ID, &q)
318+
if err != nil {
319+
t.logger.Warn("cannot get node info", "node ID", node.ID, "err", err)
320+
continue
321+
}
322+
dropletID, ok := node.Attributes["unique.platform.digitalocean.id"]
323+
if !ok || dropletID == "" {
324+
t.logger.Warn("cannot find droplet ID", "NodeID", node.ID, "attributes", node.Attributes, "node", fmt.Sprintf("%+v", node), "err", err)
325+
continue
326+
}
327+
t.logger.Debug("Found droplet ID for node", "NodeID", node.ID, "droplet ID", dropletID)
328+
numericID, err := strconv.Atoi(dropletID)
329+
if err != nil {
330+
return nil, fmt.Errorf("cannot convert %v to an integer: %w", dropletID, err)
331+
}
332+
result[numericID] = struct{}{}
333+
t.dropletMapping.Add(node.ID, numericID)
334+
}
335+
return result, nil
336+
}
337+
211338
func (t *TargetPlugin) scaleIn(
212339
ctx context.Context,
213340
desired, diff int64,
@@ -319,7 +446,7 @@ func (t *TargetPlugin) ensureDropletsAreStable(
319446
cancel(err)
320447
return err
321448
} else {
322-
return errors.New("waiting for droplets to become stable")
449+
return fmt.Errorf("waiting for droplets to become stable. desired:%v active:%v", desired, active)
323450
}
324451
},
325452
)

plugin/digitalocean_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,54 @@ package plugin
22

33
import (
44
"context"
5+
"fmt"
56
"strings"
67
"testing"
78
"time"
89

10+
"github.com/digitalocean/godo"
911
"github.com/google/uuid"
1012
"github.com/hashicorp/go-hclog"
1113
"github.com/stretchr/testify/require"
1214
)
1315

16+
func TestDeleteDropletsWhenFailedToJoinNomadCluster(t *testing.T) {
17+
ctx, cancel := context.WithTimeout(t.Context(), time.Second*5)
18+
defer cancel()
19+
mock := createMockGodo()
20+
21+
whitelist := make(DropletIDs)
22+
dt := &dropletTemplate{name: "banana", initGracePeriod: 10 * time.Minute}
23+
24+
testCases := []struct {
25+
age time.Duration
26+
whitelisted bool
27+
expectDelete bool
28+
}{
29+
{age: time.Second, whitelisted: false, expectDelete: false},
30+
{age: time.Hour, whitelisted: true, expectDelete: false},
31+
{age: time.Hour, whitelisted: false, expectDelete: true},
32+
}
33+
34+
for _, tc := range testCases {
35+
t.Run(fmt.Sprintf("%+v", tc), func(t *testing.T) {
36+
droplet, _, err := mock.Droplets().Create(ctx, &godo.DropletCreateRequest{Region: "foo", Tags: []string{"banana"}})
37+
require.NoError(t, err)
38+
require.Contains(t, mock.droplets, droplet.ID)
39+
droplet.Created = time.Now().Add(-tc.age).Format(time.RFC3339)
40+
if tc.whitelisted {
41+
whitelist[droplet.ID] = struct{}{}
42+
}
43+
deleteOrphanedDroplets(ctx, hclog.Default(), mock.Droplets(), func(ctx context.Context) (DropletIDs, error) { return whitelist, nil }, dt, 0)
44+
if tc.expectDelete {
45+
require.NotContains(t, mock.droplets, droplet.ID)
46+
} else {
47+
require.Contains(t, mock.droplets, droplet.ID)
48+
}
49+
})
50+
}
51+
}
52+
1453
func TestScaleOut(t *testing.T) {
1554
ctx, cancel := context.WithTimeout(t.Context(), time.Second*5)
1655
defer cancel()

plugin/godo_mock.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ func (m *mockDroplets) Create(
196196
Tags: req.Tags,
197197
Status: "active",
198198
Networks: networks,
199+
Created: time.Now().Format(time.RFC3339),
199200
}
200201
m.mock.dropletUserData[droplet.ID] = req.UserData
201202
m.mock.droplets[droplet.ID] = droplet

0 commit comments

Comments
 (0)