@@ -5,12 +5,10 @@ import (
5
5
"encoding/binary"
6
6
"errors"
7
7
"fmt"
8
- "io/ioutil"
9
8
"math/rand"
10
9
"net"
11
10
"os"
12
11
"os/exec"
13
- "os/user"
14
12
"strconv"
15
13
"time"
16
14
@@ -119,7 +117,7 @@ func (n *Node) Init(ctx context.Context) error {
119
117
return err
120
118
}
121
119
122
- // Initiate a restore
120
+ // Check to see if we were just restored
123
121
if os .Getenv ("FLY_RESTORED_FROM" ) != "" {
124
122
// Check to see if there's an active restore.
125
123
active , err := isRestoreActive ()
@@ -134,64 +132,10 @@ func (n *Node) Init(ctx context.Context) error {
134
132
}
135
133
}
136
134
135
+ // Verify whether we are a booting zombie.
137
136
if ZombieLockExists () {
138
- fmt .Println ("Zombie lock detected!" )
139
- primaryStr , err := ReadZombieLock ()
140
- if err != nil {
141
- return fmt .Errorf ("failed to read zombie lock: %s" , primaryStr )
142
- }
143
-
144
- // If the zombie lock contains a hostname, it means we were able to resolve the real primary and
145
- // will attempt to rejoin it.
146
- if primaryStr != "" {
147
- ip := net .ParseIP (primaryStr )
148
- if ip == nil {
149
- return fmt .Errorf ("zombie.lock file contains an invalid ipv6 address" )
150
- }
151
-
152
- conn , err := n .RepMgr .NewRemoteConnection (ctx , ip .String ())
153
- if err != nil {
154
- return fmt .Errorf ("failed to establish a connection to our rejoin target %s: %s" , ip .String (), err )
155
- }
156
- defer conn .Close (ctx )
157
-
158
- primary , err := n .RepMgr .PrimaryMember (ctx , conn )
159
- if err != nil {
160
- return fmt .Errorf ("failed to confirm primary on recover target %s: %s" , ip .String (), err )
161
- }
162
-
163
- // Confirm that our rejoin target still identifies itself as the primary.
164
- if primary .Hostname != ip .String () {
165
- // Clear the zombie.lock file so we can attempt to re-resolve the correct primary.
166
- if err := RemoveZombieLock (); err != nil {
167
- return fmt .Errorf ("failed to remove zombie lock: %s" , err )
168
- }
169
-
170
- return ErrZombieLockPrimaryMismatch
171
- }
172
-
173
- // If the primary does not reside within our primary region, we cannot rejoin until it is.
174
- if primary .Region != n .PrimaryRegion {
175
- fmt .Printf ("Primary region mismatch detected. The primary lives in '%s', while PRIMARY_REGION is set to '%s'\n " , primary .Region , n .PrimaryRegion )
176
- return ErrZombieLockRegionMismatch
177
- }
178
-
179
- if err := n .RepMgr .rejoinCluster (primary .Hostname ); err != nil {
180
- return fmt .Errorf ("failed to rejoin cluster: %s" , err )
181
- }
182
-
183
- // TODO - Wait for target cluster to register self as a standby.
184
-
185
- if err := RemoveZombieLock (); err != nil {
186
- return fmt .Errorf ("failed to remove zombie lock: %s" , err )
187
- }
188
-
189
- // Ensure the single instance created with the --force-rewind process is cleaned up properly.
190
- utils .RunCommand ("pg_ctl -D /data/postgresql/ stop" , "postgres" )
191
- } else {
192
- // TODO - Provide link to documention on how to address this
193
- fmt .Println ("Zombie lock file does not contain a hostname." )
194
- fmt .Println ("This likely means that we were unable to determine who the real primary is." )
137
+ if err := handleZombieLock (ctx , n ); err != nil {
138
+ return err
195
139
}
196
140
}
197
141
@@ -205,18 +149,24 @@ func (n *Node) Init(ctx context.Context) error {
205
149
return fmt .Errorf ("failed initialize cluster state store: %s" , err )
206
150
}
207
151
208
- if err := n .configure (ctx , store ); err != nil {
209
- return fmt .Errorf ("failed to configure node: %s" , err )
152
+ if err := n .configureInternal (store ); err != nil {
153
+ return fmt .Errorf ("failed to set internal config: %s" , err )
154
+ }
155
+
156
+ if err := n .configureRepmgr (store ); err != nil {
157
+ return fmt .Errorf ("failed to configure repmgr config: %s" , err )
210
158
}
211
159
212
160
if ! n .isPGInitialized () {
213
- // Check to see if repmgr cluster has been initialized.
161
+ // Check to see if cluster has already been initialized.
214
162
clusterInitialized , err := store .IsInitializationFlagSet ()
215
163
if err != nil {
216
164
return fmt .Errorf ("failed to verify cluster state %s" , err )
217
165
}
218
166
219
167
if ! clusterInitialized {
168
+ fmt .Println ("Provisioning primary" )
169
+
220
170
// Initialize ourselves as the primary.
221
171
if err := n .initializePG (); err != nil {
222
172
return fmt .Errorf ("failed to initialize postgres %s" , err )
@@ -227,6 +177,8 @@ func (n *Node) Init(ctx context.Context) error {
227
177
}
228
178
229
179
} else {
180
+ fmt .Println ("Provisioning standby" )
181
+ // Initialize ourselves as a standby
230
182
cloneTarget , err := n .RepMgr .ResolveMemberOverDNS (ctx )
231
183
if err != nil {
232
184
return err
@@ -300,7 +252,8 @@ func (n *Node) PostInit(ctx context.Context) error {
300
252
return fmt .Errorf ("failed to register repmgr primary: %s" , err )
301
253
}
302
254
303
- // Set flag within consul to let future new members that the cluster exists
255
+ // Set initialization flag within consul so future members know they are joining
256
+ // an existing cluster.
304
257
if err := store .SetInitializationFlag (); err != nil {
305
258
return fmt .Errorf ("failed to register cluster with consul" )
306
259
}
@@ -326,7 +279,7 @@ func (n *Node) PostInit(ctx context.Context) error {
326
279
327
280
switch role {
328
281
case PrimaryRoleName :
329
- primary , err := n . EvaluateClusterState (ctx , conn )
282
+ primary , err := PerformScreening (ctx , conn , n )
330
283
if errors .Is (err , ErrZombieDiagnosisUndecided ) {
331
284
fmt .Println ("Unable to confirm that we are the true primary!" )
332
285
if err := Quarantine (ctx , conn , n , primary ); err != nil {
@@ -383,13 +336,15 @@ func (n *Node) initializePG() error {
383
336
return nil
384
337
}
385
338
386
- if err := ioutil .WriteFile ("/data/.default_password" , []byte (n .OperatorCredentials .Password ), 0644 ); err != nil {
339
+ if err := os .WriteFile ("/data/.default_password" , []byte (n .OperatorCredentials .Password ), 0644 ); err != nil {
387
340
return err
388
341
}
389
342
cmd := exec .Command ("gosu" , "postgres" , "initdb" , "--pgdata" , n .DataDir , "--pwfile=/data/.default_password" )
390
- _ , err := cmd .CombinedOutput ()
343
+ if _ , err := cmd .CombinedOutput (); err != nil {
344
+ return err
345
+ }
391
346
392
- return err
347
+ return nil
393
348
}
394
349
395
350
func (n * Node ) isPGInitialized () bool {
@@ -400,56 +355,6 @@ func (n *Node) isPGInitialized() bool {
400
355
return true
401
356
}
402
357
403
- func (n * Node ) configure (ctx context.Context , store * state.Store ) error {
404
- if err := n .configureInternal (store ); err != nil {
405
- return fmt .Errorf ("failed to set internal config: %s" , err )
406
- }
407
-
408
- if err := n .configureRepmgr (store ); err != nil {
409
- return fmt .Errorf ("failed to configure repmgr config: %s" , err )
410
- }
411
-
412
- return nil
413
- }
414
-
415
- func writeSSHKey () error {
416
- err := os .Mkdir ("/data/.ssh" , 0700 )
417
- if err != nil && ! os .IsExist (err ) {
418
- return err
419
- }
420
-
421
- key := os .Getenv ("SSH_KEY" )
422
-
423
- keyFile , err := os .Create ("/data/.ssh/id_rsa" )
424
- if err != nil {
425
- return err
426
- }
427
- defer keyFile .Close ()
428
- _ , err = keyFile .Write ([]byte (key ))
429
- if err != nil {
430
- return err
431
- }
432
-
433
- cert := os .Getenv ("SSH_CERT" )
434
-
435
- certFile , err := os .Create ("/data/.ssh/id_rsa-cert.pub" )
436
- if err != nil {
437
- return err
438
- }
439
- defer certFile .Close ()
440
- _ , err = certFile .Write ([]byte (cert ))
441
- if err != nil {
442
- return err
443
- }
444
-
445
- err = setSSHOwnership ()
446
- if err != nil {
447
- return err
448
- }
449
-
450
- return nil
451
- }
452
-
453
358
func (n * Node ) configureInternal (store * state.Store ) error {
454
359
if err := n .InternalConfig .initialize (); err != nil {
455
360
return fmt .Errorf ("failed to initialize internal config: %s" , err )
@@ -605,6 +510,7 @@ func (n *Node) setDefaultHBA() error {
605
510
if err != nil {
606
511
return err
607
512
}
513
+ defer file .Close ()
608
514
609
515
for _ , entry := range entries {
610
516
str := fmt .Sprintf ("%s %s %s %s %s\n " , entry .Type , entry .Database , entry .User , entry .Address , entry .Method )
@@ -634,47 +540,17 @@ func openConnection(parentCtx context.Context, host string, database string, cre
634
540
return pgx .ConnectConfig (ctx , conf )
635
541
}
636
542
637
- func setSSHOwnership () error {
638
- cmdStr := fmt .Sprintf ("chmod 600 %s %s" , "/data/.ssh/id_rsa" , "/data/.ssh/id_rsa-cert.pub" )
639
- cmd := exec .Command ("sh" , "-c" , cmdStr )
640
- _ , err := cmd .Output ()
641
- return err
642
- }
643
-
644
543
func setDirOwnership () error {
645
- pgUser , err := user .Lookup ("postgres" )
646
- if err != nil {
647
- return err
648
- }
649
- pgUID , err := strconv .Atoi (pgUser .Uid )
650
- if err != nil {
651
- return err
652
- }
653
- pgGID , err := strconv .Atoi (pgUser .Gid )
544
+ pgUID , pgGID , err := utils .SystemUserIDs ("postgres" )
654
545
if err != nil {
655
- return err
546
+ return fmt . Errorf ( "failed to find postgres user ids: %s" , err )
656
547
}
657
548
658
549
cmdStr := fmt .Sprintf ("chown -R %d:%d %s" , pgUID , pgGID , "/data" )
659
550
cmd := exec .Command ("sh" , "-c" , cmdStr )
660
- _ , err = cmd .Output ()
661
- return err
662
- }
663
-
664
- func (n * Node ) EvaluateClusterState (ctx context.Context , conn * pgx.Conn ) (string , error ) {
665
- standbys , err := n .RepMgr .StandbyMembers (ctx , conn )
666
- if err != nil {
667
- if ! errors .Is (err , pgx .ErrNoRows ) {
668
- return "" , fmt .Errorf ("failed to query standbys" )
669
- }
670
- }
671
-
672
- sample , err := TakeDNASample (ctx , n , standbys )
673
- if err != nil {
674
- return "" , fmt .Errorf ("failed to evaluate cluster data: %s" , err )
551
+ if _ , err = cmd .Output (); err != nil {
552
+ return err
675
553
}
676
554
677
- fmt .Println (DNASampleString (sample ))
678
-
679
- return ZombieDiagnosis (sample )
555
+ return nil
680
556
}
0 commit comments