Skip to content

Commit 63d8e85

Browse files
authored
Only allow one k0sctl to run simultaneously per host (#382)
* Only allow one k0sctl to run simultaneously per host Cant use configurer before detect OS Flock Wait a while No flock? Take a risk Seal window * Complete redo * Cleanup * Lint * Fall back to /tmp/k0sctl.lock if /run/lock does not exist
1 parent 88b097a commit 63d8e85

File tree

8 files changed

+219
-2
lines changed

8 files changed

+219
-2
lines changed

cmd/apply.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,12 @@ var applyCommand = &cli.Command{
4848
phase.NoWait = ctx.Bool("no-wait")
4949

5050
manager := phase.Manager{Config: ctx.Context.Value(ctxConfigKey{}).(*v1beta1.Cluster)}
51+
lockPhase := &phase.Lock{}
5152

5253
manager.AddPhase(
5354
&phase.Connect{},
5455
&phase.DetectOS{},
56+
lockPhase,
5557
&phase.PrepareHosts{},
5658
&phase.GatherFacts{},
5759
&phase.DownloadBinaries{},
@@ -75,19 +77,22 @@ var applyCommand = &cli.Command{
7577
NoDrain: ctx.Bool("no-drain"),
7678
},
7779
&phase.RunHooks{Stage: "after", Action: "apply"},
80+
&phase.Unlock{Cancel: lockPhase.Cancel},
7881
&phase.Disconnect{},
7982
)
8083

8184
analytics.Client.Publish("apply-start", map[string]interface{}{})
8285

83-
if err := manager.Run(); err != nil {
86+
var result error
87+
88+
if result = manager.Run(); result != nil {
8489
analytics.Client.Publish("apply-failure", map[string]interface{}{"clusterID": manager.Config.Spec.K0s.Metadata.ClusterID})
8590
if lf, err := LogFile(); err == nil {
8691
if ln, ok := lf.(interface{ Name() string }); ok {
8792
log.Errorf("apply failed - log file saved to %s", ln.Name())
8893
}
8994
}
90-
return err
95+
return result
9196
}
9297

9398
analytics.Client.Publish("apply-success", map[string]interface{}{"duration": time.Since(start), "clusterID": manager.Config.Spec.K0s.Metadata.ClusterID})

cmd/backup.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,18 @@ var backupCommand = &cli.Command{
2828
start := time.Now()
2929

3030
manager := phase.Manager{Config: ctx.Context.Value(ctxConfigKey{}).(*v1beta1.Cluster)}
31+
lockPhase := &phase.Lock{}
32+
3133
manager.AddPhase(
3234
&phase.Connect{},
3335
&phase.DetectOS{},
36+
lockPhase,
3437
&phase.GatherFacts{},
3538
&phase.GatherK0sFacts{},
3639
&phase.RunHooks{Stage: "before", Action: "backup"},
3740
&phase.Backup{},
3841
&phase.RunHooks{Stage: "after", Action: "backup"},
42+
&phase.Unlock{Cancel: lockPhase.Cancel},
3943
&phase.Disconnect{},
4044
)
4145

cmd/reset.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,17 @@ var resetCommand = &cli.Command{
5252

5353
manager := phase.Manager{Config: ctx.Context.Value(ctxConfigKey{}).(*v1beta1.Cluster)}
5454

55+
lockPhase := &phase.Lock{}
5556
manager.AddPhase(
5657
&phase.Connect{},
5758
&phase.DetectOS{},
59+
lockPhase,
5860
&phase.PrepareHosts{},
5961
&phase.GatherK0sFacts{},
6062
&phase.RunHooks{Stage: "before", Action: "reset"},
6163
&phase.Reset{},
6264
&phase.RunHooks{Stage: "after", Action: "reset"},
65+
&phase.Unlock{Cancel: lockPhase.Cancel},
6366
&phase.Disconnect{},
6467
)
6568

configurer/linux.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,15 @@ func (l Linux) K0sJoinTokenPath() string {
8787
return "/etc/k0s/k0stoken"
8888
}
8989

90+
// K0sctlLockFilePath returns a path to a lock file
91+
func (l Linux) K0sctlLockFilePath(h os.Host) string {
92+
if h.Exec("test -d /run/lock", exec.Sudo(h)) == nil {
93+
return "/run/lock/k0sctl"
94+
}
95+
96+
return "/tmp/k0sctl.lock"
97+
}
98+
9099
// TempFile returns a temp file path
91100
func (l Linux) TempFile(h os.Host) (string, error) {
92101
return h.ExecOutput("mktemp")
@@ -206,3 +215,30 @@ func (l Linux) PrivateAddress(h os.Host, iface, publicip string) (string, error)
206215

207216
return "", fmt.Errorf("not found")
208217
}
218+
219+
// UpsertFile creates a file in path with content only if the file does not exist already
220+
func (l Linux) UpsertFile(h os.Host, path, content string) error {
221+
tmpf, err := l.TempFile(h)
222+
if err != nil {
223+
return err
224+
}
225+
if err := h.Execf(`cat > "%s"`, tmpf, exec.Stdin(content), exec.Sudo(h)); err != nil {
226+
return err
227+
}
228+
229+
defer func() {
230+
_ = h.Execf(`rm -f "%s"`, tmpf, exec.Sudo(h))
231+
}()
232+
233+
// mv -n is atomic
234+
if err := h.Execf(`mv -n "%s" "%s"`, tmpf, path, exec.Sudo(h)); err != nil {
235+
return fmt.Errorf("upsert failed: %w", err)
236+
}
237+
238+
// if original tempfile still exists, error out
239+
if h.Execf(`test -f "%s"`, tmpf) == nil {
240+
return fmt.Errorf("upsert failed")
241+
}
242+
243+
return nil
244+
}

phase/lock.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
package phase
2+
3+
import (
4+
"context"
5+
"fmt"
6+
gos "os"
7+
"sync"
8+
"time"
9+
10+
retry "github.com/avast/retry-go"
11+
"github.com/k0sproject/k0sctl/analytics"
12+
"github.com/k0sproject/k0sctl/pkg/apis/k0sctl.k0sproject.io/v1beta1"
13+
"github.com/k0sproject/k0sctl/pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster"
14+
"github.com/k0sproject/rig/exec"
15+
log "github.com/sirupsen/logrus"
16+
)
17+
18+
// Lock acquires an exclusive k0sctl lock on hosts
19+
type Lock struct {
20+
GenericPhase
21+
cfs []func()
22+
instanceID string
23+
m sync.Mutex
24+
wg sync.WaitGroup
25+
}
26+
27+
// Prepare the phase
28+
func (p *Lock) Prepare(c *v1beta1.Cluster) error {
29+
p.Config = c
30+
mid, _ := analytics.MachineID()
31+
p.instanceID = fmt.Sprintf("%s-%d", mid, gos.Getpid())
32+
return nil
33+
}
34+
35+
// Title for the phase
36+
func (p *Lock) Title() string {
37+
return "Acquire exclusive host lock"
38+
}
39+
40+
func (p *Lock) Cancel() {
41+
p.m.Lock()
42+
defer p.m.Unlock()
43+
for _, f := range p.cfs {
44+
f()
45+
}
46+
p.wg.Wait()
47+
}
48+
49+
// Run the phase
50+
func (p *Lock) Run() error {
51+
if err := p.Config.Spec.Hosts.ParallelEach(p.startLock); err != nil {
52+
return err
53+
}
54+
return p.Config.Spec.Hosts.ParallelEach(p.startTicker)
55+
}
56+
57+
func (p *Lock) startTicker(h *cluster.Host) error {
58+
p.wg.Add(1)
59+
lfp := h.Configurer.K0sctlLockFilePath(h)
60+
ticker := time.NewTicker(10 * time.Second)
61+
ctx, cancel := context.WithCancel(context.Background())
62+
p.m.Lock()
63+
p.cfs = append(p.cfs, cancel)
64+
p.m.Unlock()
65+
66+
go func() {
67+
log.Debugf("%s: started periodic update of lock file %s timestamp", h, lfp)
68+
for {
69+
select {
70+
case <-ticker.C:
71+
if err := h.Configurer.Touch(h, lfp, time.Now(), exec.Sudo(h)); err != nil {
72+
log.Warnf("%s: failed to touch lock file: %s", h, err)
73+
}
74+
case <-ctx.Done():
75+
log.Debugf("%s: stopped lock cycle, removing file", h)
76+
if err := h.Configurer.DeleteFile(h, lfp); err != nil {
77+
log.Warnf("%s: failed to remove host lock file: %s", h, err)
78+
}
79+
p.wg.Done()
80+
return
81+
}
82+
}
83+
}()
84+
85+
return nil
86+
}
87+
88+
func (p *Lock) startLock(h *cluster.Host) error {
89+
return retry.Do(
90+
func() error {
91+
return p.tryLock(h)
92+
},
93+
retry.OnRetry(
94+
func(n uint, err error) {
95+
log.Errorf("%s: attempt %d of %d.. trying to obtain a lock on host: %s", h, n+1, retries, err.Error())
96+
},
97+
),
98+
retry.DelayType(retry.CombineDelay(retry.FixedDelay, retry.RandomDelay)),
99+
retry.MaxJitter(time.Second*2),
100+
retry.Delay(time.Second*3),
101+
retry.Attempts(5),
102+
retry.LastErrorOnly(true),
103+
)
104+
}
105+
106+
func (p *Lock) tryLock(h *cluster.Host) error {
107+
lfp := h.Configurer.K0sctlLockFilePath(h)
108+
109+
if err := h.Configurer.UpsertFile(h, lfp, p.instanceID); err != nil {
110+
stat, err := h.Configurer.Stat(h, lfp, exec.Sudo(h))
111+
if err != nil {
112+
return fmt.Errorf("lock file disappeared: %w", err)
113+
}
114+
content, err := h.Configurer.ReadFile(h, lfp)
115+
if err != nil {
116+
return fmt.Errorf("failed to read lock file: %w", err)
117+
}
118+
if content != p.instanceID {
119+
if time.Since(stat.ModTime()) < 30*time.Second {
120+
return fmt.Errorf("another instance of k0sctl is currently operating on the host")
121+
}
122+
_ = h.Configurer.DeleteFile(h, lfp)
123+
return fmt.Errorf("removed existing expired lock file")
124+
}
125+
}
126+
127+
return nil
128+
}

phase/prepare_hosts.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
// PrepareHosts installs required packages and so on on the hosts.
1212
type PrepareHosts struct {
1313
GenericPhase
14+
cancel func()
1415
}
1516

1617
// Title for the phase
@@ -27,6 +28,10 @@ type prepare interface {
2728
Prepare(os.Host) error
2829
}
2930

31+
func (p *PrepareHosts) CleanUp() {
32+
p.cancel()
33+
}
34+
3035
func (p *PrepareHosts) prepareHost(h *cluster.Host) error {
3136
if c, ok := h.Configurer.(prepare); ok {
3237
if err := c.Prepare(h); err != nil {

phase/unlock.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package phase
2+
3+
import (
4+
"github.com/k0sproject/k0sctl/pkg/apis/k0sctl.k0sproject.io/v1beta1"
5+
log "github.com/sirupsen/logrus"
6+
)
7+
8+
// Unlock acquires an exclusive k0sctl lock on hosts
9+
type Unlock struct {
10+
GenericPhase
11+
Cancel func()
12+
}
13+
14+
// Prepare the phase
15+
func (p *Unlock) Prepare(c *v1beta1.Cluster) error {
16+
p.Config = c
17+
if p.Cancel == nil {
18+
p.Cancel = func() {
19+
log.Fatalf("cancel function not defined")
20+
}
21+
}
22+
return nil
23+
}
24+
25+
// Title for the phase
26+
func (p *Unlock) Title() string {
27+
return "Release exclusive host lock"
28+
}
29+
30+
// Run the phase
31+
func (p *Unlock) Run() error {
32+
p.Cancel()
33+
return nil
34+
}

pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster/host.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ type configurer interface {
123123
CleanupServiceEnvironment(os.Host, string) error
124124
Stat(os.Host, string, ...exec.Option) (*os.FileInfo, error)
125125
Touch(os.Host, string, time.Time, ...exec.Option) error
126+
K0sctlLockFilePath(os.Host) string
127+
UpsertFile(os.Host, string, string) error
126128
}
127129

128130
// HostMetadata resolved metadata for host

0 commit comments

Comments
 (0)