Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changes/unreleased/Fixed-20250531-143632.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Fixed
body: Database operation errors when instance IPs change after restarting.
time: 2025-05-31T14:36:32.503043-04:00
112 changes: 45 additions & 67 deletions server/internal/database/instance_resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/samber/do"

"github.com/pgEdge/control-plane/server/internal/certificates"
"github.com/pgEdge/control-plane/server/internal/patroni"
"github.com/pgEdge/control-plane/server/internal/postgres"
"github.com/pgEdge/control-plane/server/internal/resource"
"github.com/pgEdge/control-plane/server/internal/utils"
Expand Down Expand Up @@ -74,36 +75,37 @@ func (r *InstanceResource) Dependencies() []resource.Identifier {
}

func (r *InstanceResource) Refresh(ctx context.Context, rc *resource.Context) error {
orch, err := do.Invoke[Orchestrator](rc.Injector)
if err != nil {
return err
if err := r.updateConnectionInfo(ctx, rc); err != nil {
return resource.ErrNotFound
}

primaryInstanceID, err := GetPrimaryInstanceID(ctx, orch, r.Spec.DatabaseID, r.Spec.InstanceID, 30*time.Second)
primaryInstanceID, err := GetPrimaryInstanceID(ctx, r.patroniClient(), 30*time.Second)
if err != nil {
return resource.ErrNotFound // TODO: Is this always the right choice?
return resource.ErrNotFound
}
r.PrimaryInstanceID = primaryInstanceID

return nil
}

func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) error {
orch, err := do.Invoke[Orchestrator](rc.Injector)
certs, err := do.Invoke[*certificates.Service](rc.Injector)
if err != nil {
return err
}
certs, err := do.Invoke[*certificates.Service](rc.Injector)
if err != nil {

if err := r.updateConnectionInfo(ctx, rc); err != nil {
return err
}

err = WaitForPatroniRunning(ctx, orch, r.Spec.DatabaseID, r.Spec.InstanceID, 12*time.Hour)
patroniClient := r.patroniClient()

err = WaitForPatroniRunning(ctx, patroniClient, 12*time.Hour)
if err != nil {
return fmt.Errorf("failed to wait for patroni to enter running state: %w", err)
}

primaryInstanceID, err := GetPrimaryInstanceID(ctx, orch, r.Spec.DatabaseID, r.Spec.InstanceID, time.Minute)
primaryInstanceID, err := GetPrimaryInstanceID(ctx, patroniClient, time.Minute)
if err != nil {
return err
}
Expand All @@ -119,40 +121,29 @@ func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) err
return fmt.Errorf("failed to get TLS config: %w", err)
}

connInfo, err := orch.GetInstanceConnectionInfo(ctx, r.Spec.DatabaseID, r.Spec.InstanceID)
if err != nil {
return fmt.Errorf("failed to get instance DSN: %w", err)
}
r.ConnectionInfo = connInfo

firstTimeSetup, err := r.isFirstTimeSetup(rc)
if err != nil {
return err
}

if r.Spec.RestoreConfig != nil && firstTimeSetup {
err = r.renameDB(ctx, connInfo, tlsCfg)
err = r.renameDB(ctx, r.ConnectionInfo, tlsCfg)
if err != nil {
return fmt.Errorf("failed to rename database %q: %w", r.Spec.DatabaseName, err)
}
err = r.dropSpock(ctx, connInfo, tlsCfg)
err = r.dropSpock(ctx, r.ConnectionInfo, tlsCfg)
if err != nil {
return fmt.Errorf("failed to drop spock: %w", err)
}
}

err = r.createDB(ctx, connInfo, tlsCfg)
err = r.createDB(ctx, tlsCfg)
if err != nil {
return fmt.Errorf("failed to create database %q: %w", r.Spec.DatabaseName, err)
}

conn, err := ConnectToInstance(ctx, &ConnectionOptions{
DSN: &postgres.DSN{
Host: connInfo.AdminHost,
Port: connInfo.AdminPort,
DBName: r.Spec.DatabaseName,
User: "pgedge",
},
DSN: r.ConnectionInfo.AdminDSN(r.Spec.DatabaseName),
TLS: tlsCfg,
})
if err != nil {
Expand All @@ -178,15 +169,10 @@ func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) err
}
}

err = postgres.InitializePgEdgeExtensions(r.Spec.NodeName, &postgres.DSN{
Host: connInfo.PeerHost,
Port: connInfo.PeerPort,
DBName: r.Spec.DatabaseName,
User: "pgedge",
SSLCert: connInfo.PeerSSLCert,
SSLKey: connInfo.PeerSSLKey,
SSLRootCert: connInfo.PeerSSLRootCert,
}).Exec(ctx, conn)
err = postgres.InitializePgEdgeExtensions(
r.Spec.NodeName,
r.ConnectionInfo.PeerDSN(r.Spec.DatabaseName),
).Exec(ctx, conn)
if err != nil {
return fmt.Errorf("failed to initialize pgedge extensions: %w", err)
}
Expand Down Expand Up @@ -226,17 +212,11 @@ func (r *InstanceResource) Create(ctx context.Context, rc *resource.Context) err
}

func (r *InstanceResource) Update(ctx context.Context, rc *resource.Context) error {
orch, err := do.Invoke[Orchestrator](rc.Injector)
if err != nil {
if err := r.updateConnectionInfo(ctx, rc); err != nil {
return err
}

client, err := GetPatroniClient(ctx, orch, r.Spec.DatabaseID, r.Spec.InstanceID)
if err != nil {
return fmt.Errorf("failed to get patroni client: %w", err)
}

if err := client.Reload(ctx); err != nil {
if err := r.patroniClient().Reload(ctx); err != nil {
return fmt.Errorf("failed to reload patroni conf: %w", err)
}

Expand All @@ -259,12 +239,7 @@ func (r *InstanceResource) Connection(ctx context.Context, rc *resource.Context,
}

conn, err := ConnectToInstance(ctx, &ConnectionOptions{
DSN: &postgres.DSN{
Host: r.ConnectionInfo.AdminHost,
Port: r.ConnectionInfo.AdminPort,
DBName: dbName,
User: "pgedge",
},
DSN: r.ConnectionInfo.AdminDSN(dbName),
TLS: tlsCfg,
})
if err != nil {
Expand All @@ -273,14 +248,27 @@ func (r *InstanceResource) Connection(ctx context.Context, rc *resource.Context,
return conn, nil
}

func (r *InstanceResource) createDB(ctx context.Context, connInfo *ConnectionInfo, tlsCfg *tls.Config) error {
// updateConnectionInfo resolves the Orchestrator from rc.Injector, asks it for
// the connection info of the instance identified by r.Spec.DatabaseID and
// r.Spec.InstanceID, and stores the result in r.ConnectionInfo.
//
// An error from resolving the Orchestrator is returned unchanged; an error from
// the lookup is wrapped. r.ConnectionInfo is left untouched on failure.
func (r *InstanceResource) updateConnectionInfo(ctx context.Context, rc *resource.Context) error {
	orchestrator, invokeErr := do.Invoke[Orchestrator](rc.Injector)
	if invokeErr != nil {
		return invokeErr
	}

	info, lookupErr := orchestrator.GetInstanceConnectionInfo(ctx, r.Spec.DatabaseID, r.Spec.InstanceID)
	if lookupErr != nil {
		return fmt.Errorf("failed to get instance connection info: %w", lookupErr)
	}

	// Only overwrite the stored info once the lookup has succeeded.
	r.ConnectionInfo = info
	return nil
}

// patroniClient builds a Patroni client pointed at the URL derived from
// r.ConnectionInfo.
//
// NOTE(review): r.ConnectionInfo is dereferenced without a nil check; the
// callers visible in this file call updateConnectionInfo first — confirm that
// holds for every caller.
func (r *InstanceResource) patroniClient() *patroni.Client {
	endpoint := r.ConnectionInfo.PatroniURL()
	return patroni.NewClient(endpoint, nil)
}

func (r *InstanceResource) createDB(ctx context.Context, tlsCfg *tls.Config) error {
createDBConn, err := ConnectToInstance(ctx, &ConnectionOptions{
DSN: &postgres.DSN{
Host: connInfo.AdminHost,
Port: connInfo.AdminPort,
DBName: "postgres",
User: "pgedge",
},
DSN: r.ConnectionInfo.AdminDSN("postgres"),
TLS: tlsCfg,
})
if err != nil {
Expand Down Expand Up @@ -308,12 +296,7 @@ func (r *InstanceResource) renameDB(ctx context.Context, connInfo *ConnectionInf
// operation.
err := utils.Retry(3, 500*time.Millisecond, func() error {
createDBConn, err := ConnectToInstance(ctx, &ConnectionOptions{
DSN: &postgres.DSN{
Host: connInfo.AdminHost,
Port: connInfo.AdminPort,
DBName: "postgres",
User: "pgedge",
},
DSN: r.ConnectionInfo.AdminDSN("postgres"),
TLS: tlsCfg,
})
if err != nil {
Expand All @@ -334,12 +317,7 @@ func (r *InstanceResource) renameDB(ctx context.Context, connInfo *ConnectionInf

func (r *InstanceResource) dropSpock(ctx context.Context, connInfo *ConnectionInfo, tlsCfg *tls.Config) error {
conn, err := ConnectToInstance(ctx, &ConnectionOptions{
DSN: &postgres.DSN{
Host: connInfo.AdminHost,
Port: connInfo.AdminPort,
DBName: r.Spec.DatabaseName,
User: "pgedge",
},
DSN: r.ConnectionInfo.AdminDSN(r.Spec.DatabaseName),
TLS: tlsCfg,
})
if err != nil {
Expand Down
32 changes: 32 additions & 0 deletions server/internal/database/orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@ import (
	"context"
	"fmt"
	"io"
	"net"
	"net/url"
	"strconv"

	"github.com/google/uuid"
	"github.com/pgEdge/control-plane/server/internal/pgbackrest"
	"github.com/pgEdge/control-plane/server/internal/postgres"
	"github.com/pgEdge/control-plane/server/internal/resource"
)

// pgEdgeUser is the Postgres user name placed in the DSNs built by
// ConnectionInfo.AdminDSN and ConnectionInfo.PeerDSN.
const pgEdgeUser = "pgedge"

type InstanceResources struct {
Instance *InstanceResource
Resources []*resource.ResourceData
Expand Down Expand Up @@ -42,6 +46,34 @@ type ConnectionInfo struct {
PatroniPort int
}

// PatroniURL returns the URL used to reach Patroni on this instance: an "http"
// URL whose authority is AdminHost combined with PatroniPort.
//
// The host and port are combined with net.JoinHostPort rather than a "%s:%d"
// format so that an IPv6 literal in AdminHost is wrapped in brackets
// ("[::1]:8008"); without the brackets the result is not a valid URL
// authority. Host names and IPv4 addresses produce the same "host:port" string
// as before.
//
// NOTE(review): assumes AdminHost is a bare host (no brackets, no port) —
// confirm against the Orchestrator implementations.
func (c *ConnectionInfo) PatroniURL() *url.URL {
	return &url.URL{
		Scheme: "http",
		Host:   net.JoinHostPort(c.AdminHost, strconv.Itoa(c.PatroniPort)),
	}
}

// AdminDSN returns a DSN for dbName that targets AdminHost and AdminPort and
// authenticates as pgEdgeUser. No SSL fields are set on the returned DSN.
func (c *ConnectionInfo) AdminDSN(dbName string) *postgres.DSN {
	dsn := new(postgres.DSN)
	dsn.Host = c.AdminHost
	dsn.Port = c.AdminPort
	dsn.DBName = dbName
	dsn.User = pgEdgeUser
	return dsn
}

// PeerDSN returns a DSN for dbName that targets PeerHost and PeerPort,
// authenticates as pgEdgeUser, and carries the peer SSL certificate, key and
// root certificate values from this ConnectionInfo.
func (c *ConnectionInfo) PeerDSN(dbName string) *postgres.DSN {
	dsn := new(postgres.DSN)

	// Where to connect and as whom.
	dsn.Host = c.PeerHost
	dsn.Port = c.PeerPort
	dsn.DBName = dbName
	dsn.User = pgEdgeUser

	// SSL material copied through unchanged.
	dsn.SSLCert = c.PeerSSLCert
	dsn.SSLKey = c.PeerSSLKey
	dsn.SSLRootCert = c.PeerSSLRootCert

	return dsn
}

type Orchestrator interface {
GenerateInstanceResources(spec *InstanceSpec) (*InstanceResources, error)
GenerateInstanceRestoreResources(spec *InstanceSpec, taskID uuid.UUID) (*InstanceResources, error)
Expand Down
37 changes: 11 additions & 26 deletions server/internal/database/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,36 @@ package database
import (
"context"
"fmt"
"net/url"
"time"

"github.com/google/uuid"
"github.com/pgEdge/control-plane/server/internal/patroni"
"github.com/pgEdge/control-plane/server/internal/utils"
)

// GetPatroniClient looks up the connection info for the given database
// instance through orch and returns a Patroni client for an "http" URL built
// from that info's AdminHost and PatroniPort.
//
// A lookup failure is returned wrapped, with a nil client.
//
// NOTE(review): the "%s:%d" host format does not bracket IPv6 literals; if
// AdminHost can be an IPv6 address, net.JoinHostPort should be used instead.
func GetPatroniClient(ctx context.Context, orch Orchestrator, databaseID, instanceID uuid.UUID) (*patroni.Client, error) {
	info, err := orch.GetInstanceConnectionInfo(ctx, databaseID, instanceID)
	if err != nil {
		return nil, fmt.Errorf("failed to get instance DSN: %w", err)
	}

	endpoint := url.URL{Scheme: "http"}
	endpoint.Host = fmt.Sprintf("%s:%d", info.AdminHost, info.PatroniPort)

	return patroni.NewClient(&endpoint, nil), nil
}

func WaitForPatroniRunning(ctx context.Context, orch Orchestrator, databaseID, instanceID uuid.UUID, timeout time.Duration) error {
func WaitForPatroniRunning(ctx context.Context, patroniClient *patroni.Client, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

patroniClient, err := GetPatroniClient(ctx, orch, databaseID, instanceID)
if err != nil {
return fmt.Errorf("failed to get patroni client: %w", err)
}

ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()

// We want some tolerance to transient connection errors.
const maxConnectionErrors = 3
var errCount int

for {
select {
case <-ctx.Done():
return ctx.Err()
case <-ticker.C:
status, err := patroniClient.GetInstanceStatus(ctx)
if err != nil {
return fmt.Errorf("failed to get cluster status: %w", err)
errCount++
if errCount >= maxConnectionErrors {
return fmt.Errorf("failed to get cluster status: %w", err)
}
continue
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we reset errCount to 0 after a successful connection attempt?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. The danger of doing that is that if the service is "flapping", we could end up stuck in an infinite loop. This is meant to be a very small tolerance to handle cases where Patroni becomes unavailable after we call its reload endpoint.

Does that make sense to you too?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay

if status.InRunningState() {
return nil
Expand All @@ -53,15 +43,10 @@ func WaitForPatroniRunning(ctx context.Context, orch Orchestrator, databaseID, i
}
}

func GetPrimaryInstanceID(ctx context.Context, orch Orchestrator, databaseID, instanceID uuid.UUID, timeout time.Duration) (uuid.UUID, error) {
func GetPrimaryInstanceID(ctx context.Context, patroniClient *patroni.Client, timeout time.Duration) (uuid.UUID, error) {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()

patroniClient, err := GetPatroniClient(ctx, orch, databaseID, instanceID)
if err != nil {
return uuid.Nil, fmt.Errorf("failed to get patroni client: %w", err)
}

ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()

Expand Down
Loading