From 8c9ac1cca057fd72c6fc248fa334e6957f7f8f66 Mon Sep 17 00:00:00 2001 From: uriwang Date: Mon, 2 Mar 2026 15:52:26 +0800 Subject: [PATCH] feat(dbha-v2): add daemon-start guard for auto-restart on crash. issue: #16394 - Add RunWithGuard and GuardOptions in process/daemon.go to watch child process and restart on abnormal exit - Skip SavePid in pid.go when DBHA_UNDER_GUARD is set - Add DaemonStartCmdRunE in process/cmds.go - Add daemon-start subcommand to probe, receiver, admin, analysis - Add config.Load in admin, receiver, analysis config packages --- dbm-services/common/dbha-v2/Makefile | 14 +- dbm-services/common/dbha-v2/cmd/admin/main.go | 1 + .../common/dbha-v2/cmd/analysis/main.go | 1 + dbm-services/common/dbha-v2/cmd/probe/main.go | 1 + .../common/dbha-v2/cmd/receiver/main.go | 1 + .../dbha-v2/internal/admin/cmds/cmds.go | 8 ++ .../common/dbha-v2/internal/admin/command.go | 7 + .../dbha-v2/internal/admin/config/config.go | 19 +++ .../dbha-v2/internal/analysis/cmds/cmds.go | 8 ++ .../dbha-v2/internal/analysis/command.go | 7 + .../internal/analysis/config/config.go | 19 +++ .../internal/analysis/detector/detector.go | 1 + .../internal/analysis/storage/dbha_data.go | 3 + .../dbha-v2/internal/probe/cmds/cmds.go | 8 ++ .../common/dbha-v2/internal/probe/command.go | 7 + .../dbha-v2/internal/receiver/cmds/cmds.go | 8 ++ .../dbha-v2/internal/receiver/command.go | 7 + .../internal/receiver/config/config.go | 19 +++ .../common/dbha-v2/pkg/process/cmds.go | 136 ++++++++++++++---- .../common/dbha-v2/pkg/process/daemon.go | 123 +++++++++++++++- .../common/dbha-v2/pkg/process/pid.go | 4 + 21 files changed, 363 insertions(+), 39 deletions(-) diff --git a/dbm-services/common/dbha-v2/Makefile b/dbm-services/common/dbha-v2/Makefile index 7627f443e9..56f7029d2d 100644 --- a/dbm-services/common/dbha-v2/Makefile +++ b/dbm-services/common/dbha-v2/Makefile @@ -50,15 +50,17 @@ PROTO_FILES = $(wildcard $(PROTO_DIR)/*.proto) # generate go code files 
GO_GEN_FILES=$(PROTO_FILES:$(PROTO_DIR)/%.proto=$(GEN_DIR)/%.pb.go) -# services list -SERVICES := admin analysis receiver probe +# services list and binary names with dbha- prefix +SERVICES := admin analysis receiver probe +BIN_PREFIX := dbha- +BINARIES := $(addprefix $(BIN_PREFIX),$(SERVICES)) # define common go build command define GO_BUILD @mkdir -p $(BUILD_DIR) CGO_ENABLED=0 GOOS=$(GO_OS) GOARCH=amd64 go build -ldflags=$(BUILD_FLAG) \ -gcflags="all=-trimpath=$(PWD)" -asmflags="all=-trimpath=$(PWD)" \ - -o $(BUILD_DIR)/$(1) cmd/$(1)/main.go + -o $(BUILD_DIR)/$(BIN_PREFIX)$(1) cmd/$(1)/main.go endef .PHONY: all proto $(SERVICES) clean format test package help @@ -100,7 +102,7 @@ toolkits: cluster cluster: @mkdir -p $(BUILD_DIR) CGO_ENABLED=0 GOOS=${GO_OS} GOARCH=amd64 go build -ldflags=$(BUILD_FLAG) -gcflags="all=-trimpath=$(PWD)" \ - -asmflags="all=-trimpath=$(PWD)" -o $(BUILD_DIR)/$@ tools/cmd/cluster.go + -asmflags="all=-trimpath=$(PWD)" -o $(BUILD_DIR)/$(BIN_PREFIX)$@ tools/cmd/cluster.go # build protobuf to go $(GEN_DIR)/%.pb.go: $(PROTO_DIR)/%.proto @@ -119,8 +121,8 @@ package: $(SERVICES) toolkits @echo "Packaging $(PKG_NAME)..." 
@rm -rf $(PKG_DIR) @mkdir -p $(PKG_DIR)/etc $(PKG_DIR)/logs $(PKG_DIR)/pids $(PKG_DIR)/toolkits - @cp $(addprefix $(BUILD_DIR)/,$(SERVICES)) $(PKG_DIR)/ - @cp $(BUILD_DIR)/cluster $(PKG_DIR)/toolkits/ + @cp $(addprefix $(BUILD_DIR)/,$(BINARIES)) $(PKG_DIR)/ + @cp $(BUILD_DIR)/$(BIN_PREFIX)cluster $(PKG_DIR)/toolkits/ @-cp etc/*.yaml $(PKG_DIR)/etc/ 2>/dev/null || true @cd $(BUILD_DIR) && tar -czvf $(PKG_NAME) $(PROJECT) @rm -rf $(PKG_DIR) diff --git a/dbm-services/common/dbha-v2/cmd/admin/main.go b/dbm-services/common/dbha-v2/cmd/admin/main.go index c748c23668..7722b8d098 100644 --- a/dbm-services/common/dbha-v2/cmd/admin/main.go +++ b/dbm-services/common/dbha-v2/cmd/admin/main.go @@ -46,6 +46,7 @@ func main() { rootCmd.AddCommand(admin.MigrateCmd) rootCmd.AddCommand(admin.HealthCmd) rootCmd.AddCommand(admin.StartCmd) + rootCmd.AddCommand(admin.DaemonStartCmd) rootCmd.AddCommand(admin.StopCmd) rootCmd.AddCommand(admin.RestartCmd) rootCmd.AddCommand(admin.ReloadCmd) diff --git a/dbm-services/common/dbha-v2/cmd/analysis/main.go b/dbm-services/common/dbha-v2/cmd/analysis/main.go index fa08dd0105..3f4bd4d772 100644 --- a/dbm-services/common/dbha-v2/cmd/analysis/main.go +++ b/dbm-services/common/dbha-v2/cmd/analysis/main.go @@ -45,6 +45,7 @@ func main() { rootCmd.AddCommand(analysis.VersionCmd) rootCmd.AddCommand(analysis.HealthCmd) rootCmd.AddCommand(analysis.StartCmd) + rootCmd.AddCommand(analysis.DaemonStartCmd) rootCmd.AddCommand(analysis.StopCmd) rootCmd.AddCommand(analysis.RestartCmd) rootCmd.AddCommand(analysis.ReloadCmd) diff --git a/dbm-services/common/dbha-v2/cmd/probe/main.go b/dbm-services/common/dbha-v2/cmd/probe/main.go index 04a96a9ac6..75a4164cba 100644 --- a/dbm-services/common/dbha-v2/cmd/probe/main.go +++ b/dbm-services/common/dbha-v2/cmd/probe/main.go @@ -46,6 +46,7 @@ func main() { rootCmd.AddCommand(probe.VersionCmd) rootCmd.AddCommand(probe.HealthCmd) rootCmd.AddCommand(probe.StartCmd) + rootCmd.AddCommand(probe.DaemonStartCmd) 
rootCmd.AddCommand(probe.StopCmd) rootCmd.AddCommand(probe.RestartCmd) rootCmd.AddCommand(probe.ReloadCmd) diff --git a/dbm-services/common/dbha-v2/cmd/receiver/main.go b/dbm-services/common/dbha-v2/cmd/receiver/main.go index c13f2b6246..6c5c868512 100644 --- a/dbm-services/common/dbha-v2/cmd/receiver/main.go +++ b/dbm-services/common/dbha-v2/cmd/receiver/main.go @@ -44,6 +44,7 @@ func main() { rootCmd.AddCommand(receiver.VersionCmd) rootCmd.AddCommand(receiver.HealthCmd) rootCmd.AddCommand(receiver.StartCmd) + rootCmd.AddCommand(receiver.DaemonStartCmd) rootCmd.AddCommand(receiver.StopCmd) rootCmd.AddCommand(receiver.RestartCmd) rootCmd.AddCommand(receiver.ReloadCmd) diff --git a/dbm-services/common/dbha-v2/internal/admin/cmds/cmds.go b/dbm-services/common/dbha-v2/internal/admin/cmds/cmds.go index 1bb0311164..8c6b4b63ee 100644 --- a/dbm-services/common/dbha-v2/internal/admin/cmds/cmds.go +++ b/dbm-services/common/dbha-v2/internal/admin/cmds/cmds.go @@ -56,6 +56,14 @@ func ReloadCmdRunE(cmd *cobra.Command, args []string) error { return process.ReloadCmdRunE(cmd, args, config.Cfg.PidFile, process.NameAdmin, StopTimeout, ForceStop) } +func DaemonStartCmdRunE(cmd *cobra.Command, args []string) error { + configPath, _ := cmd.Root().PersistentFlags().GetString("config") + if err := config.Load(configPath); err != nil { + return err + } + return process.DaemonStartCmdRunE(cmd, args, config.Cfg.PidFile, process.NameAdmin, process.DefaultGuardRestartDelay) +} + func HealthCmdRunE(cmd *cobra.Command, _ []string) error { baseHealth := process.GetBaseHealthInfo(config.Cfg.PidFile, process.NameAdmin) diff --git a/dbm-services/common/dbha-v2/internal/admin/command.go b/dbm-services/common/dbha-v2/internal/admin/command.go index 7fc900a85f..84a7f18033 100644 --- a/dbm-services/common/dbha-v2/internal/admin/command.go +++ b/dbm-services/common/dbha-v2/internal/admin/command.go @@ -89,6 +89,13 @@ var StartCmd = &cobra.Command{ RunE: cmds.StartCmdRunE, } +// DaemonStartCmd is used 
to start this process with a guard that restarts it on abnormal exit. +var DaemonStartCmd = &cobra.Command{ + Use: "daemon-start", + Short: "Start this process with guard (auto-restart on crash).", + RunE: cmds.DaemonStartCmdRunE, +} + // StopCmd is used to stop this process. var StopCmd = &cobra.Command{ Use: "stop", diff --git a/dbm-services/common/dbha-v2/internal/admin/config/config.go b/dbm-services/common/dbha-v2/internal/admin/config/config.go index 90e1b545e4..092ba86f0a 100644 --- a/dbm-services/common/dbha-v2/internal/admin/config/config.go +++ b/dbm-services/common/dbha-v2/internal/admin/config/config.go @@ -29,6 +29,8 @@ import ( "time" "dbm-services/common/dbha-v2/pkg/logger" + + "github.com/spf13/viper" ) var Cfg = Configuration{ @@ -107,3 +109,20 @@ type Configuration struct { Storage StorageConfig `yaml:"storage" mapstructure:"storage"` Log LogConfig `yaml:"log" mapstructure:"log"` } + +// Load loads admin configuration from file +func Load(configFilePath string) error { + viper.SetConfigName("admin") + viper.SetConfigType("yaml") + viper.AddConfigPath("./etc") + + if configFilePath != "" { + viper.SetConfigFile(configFilePath) + } + + if err := viper.ReadInConfig(); err != nil { + return err + } + + return viper.Unmarshal(&Cfg) +} diff --git a/dbm-services/common/dbha-v2/internal/analysis/cmds/cmds.go b/dbm-services/common/dbha-v2/internal/analysis/cmds/cmds.go index 655a47393e..3d181eb8c8 100644 --- a/dbm-services/common/dbha-v2/internal/analysis/cmds/cmds.go +++ b/dbm-services/common/dbha-v2/internal/analysis/cmds/cmds.go @@ -56,6 +56,14 @@ func ReloadCmdRunE(cmd *cobra.Command, args []string) error { return process.ReloadCmdRunE(cmd, args, config.Cfg.PidFile, process.NameAnalysis, StopTimeout, ForceStop) } +func DaemonStartCmdRunE(cmd *cobra.Command, args []string) error { + configPath, _ := cmd.Root().PersistentFlags().GetString("config") + if err := config.Load(configPath); err != nil { + return err + } + return process.DaemonStartCmdRunE(cmd, 
args, config.Cfg.PidFile, process.NameAnalysis, process.DefaultGuardRestartDelay) +} + func HealthCmdRunE(cmd *cobra.Command, _ []string) error { baseHealth := process.GetBaseHealthInfo(config.Cfg.PidFile, process.NameAnalysis) diff --git a/dbm-services/common/dbha-v2/internal/analysis/command.go b/dbm-services/common/dbha-v2/internal/analysis/command.go index 3e4dfaf830..83a174896f 100644 --- a/dbm-services/common/dbha-v2/internal/analysis/command.go +++ b/dbm-services/common/dbha-v2/internal/analysis/command.go @@ -53,6 +53,13 @@ var StartCmd = &cobra.Command{ RunE: cmds.StartCmdRunE, } +// DaemonStartCmd is used to start this process with a guard that restarts it on abnormal exit. +var DaemonStartCmd = &cobra.Command{ + Use: "daemon-start", + Short: "Start this process with guard (auto-restart on crash).", + RunE: cmds.DaemonStartCmdRunE, +} + // StopCmd is used to stop this process. var StopCmd = &cobra.Command{ Use: "stop", diff --git a/dbm-services/common/dbha-v2/internal/analysis/config/config.go b/dbm-services/common/dbha-v2/internal/analysis/config/config.go index afc8b39089..86bcb80405 100644 --- a/dbm-services/common/dbha-v2/internal/analysis/config/config.go +++ b/dbm-services/common/dbha-v2/internal/analysis/config/config.go @@ -29,6 +29,8 @@ import ( "time" "dbm-services/common/dbha-v2/pkg/logger" + + "github.com/spf13/viper" ) var Cfg = Configuration{ @@ -162,6 +164,23 @@ type Configuration struct { Log LogConfig `yaml:"log" mapstructure:"log"` } +// Load loads analysis configuration from file +func Load(configFilePath string) error { + viper.SetConfigName("analysis") + viper.SetConfigType("yaml") + viper.AddConfigPath("./etc") + + if configFilePath != "" { + viper.SetConfigFile(configFilePath) + } + + if err := viper.ReadInConfig(); err != nil { + return err + } + + return viper.Unmarshal(&Cfg) +} + func init() { Cfg.Detector.Ssh.Port = 22 Cfg.Detector.Ssh.User = "root" diff --git a/dbm-services/common/dbha-v2/internal/analysis/detector/detector.go 
b/dbm-services/common/dbha-v2/internal/analysis/detector/detector.go index 085c439ff7..0589d44225 100644 --- a/dbm-services/common/dbha-v2/internal/analysis/detector/detector.go +++ b/dbm-services/common/dbha-v2/internal/analysis/detector/detector.go @@ -22,6 +22,7 @@ * SOFTWARE. */ +// Package detector provides probe health detection and target selection for DBHA analysis. package detector import ( diff --git a/dbm-services/common/dbha-v2/internal/analysis/storage/dbha_data.go b/dbm-services/common/dbha-v2/internal/analysis/storage/dbha_data.go index d5f4acded0..6f360e56bf 100644 --- a/dbm-services/common/dbha-v2/internal/analysis/storage/dbha_data.go +++ b/dbm-services/common/dbha-v2/internal/analysis/storage/dbha_data.go @@ -44,6 +44,7 @@ type DbhaData struct { DB *hamysql.GormDB } +// GetBizIDs returns all distinct business IDs from DBM metadata. func (ha *DbhaData) GetBizIDs() ([]int, error) { bkBizIDs := []int{} @@ -58,6 +59,7 @@ func (ha *DbhaData) GetBizIDs() ([]int, error) { return bkBizIDs, nil } +// ReadMetadataCacheWithBizID reads metadata cache in batches for the given bizID. func (ha *DbhaData) ReadMetadataCacheWithBizID(bizID int, batchCnt int, offsetDuration time.Duration) (metaData []*hamodel.DbmMetadata, err error) { @@ -164,6 +166,7 @@ func (ha *DbhaData) SaveSwitchingLog(ctx context.Context, records ...*hamodel.Db return err } +// ReadSwitchingStrategyWithBkBizId returns switching strategies for the given business ID. 
func (ha *DbhaData) ReadSwitchingStrategyWithBkBizId(bkBizId int) ([]*hamodel.DbSwitchingStrategy, error) { var strategies []*hamodel.DbSwitchingStrategy diff --git a/dbm-services/common/dbha-v2/internal/probe/cmds/cmds.go b/dbm-services/common/dbha-v2/internal/probe/cmds/cmds.go index f17a853bee..8890423e57 100644 --- a/dbm-services/common/dbha-v2/internal/probe/cmds/cmds.go +++ b/dbm-services/common/dbha-v2/internal/probe/cmds/cmds.go @@ -74,6 +74,14 @@ func ReloadCmdRunE(cmd *cobra.Command, args []string) error { return process.ReloadCmdRunE(cmd, args, config.Cfg.PidFile, process.NameProbe, StopTimeout, ForceStop) } +func DaemonStartCmdRunE(cmd *cobra.Command, args []string) error { + configPath, _ := cmd.Root().PersistentFlags().GetString("config") + if err := config.Load(configPath); err != nil { + return err + } + return process.DaemonStartCmdRunE(cmd, args, config.Cfg.PidFile, process.NameProbe, process.DefaultGuardRestartDelay) +} + func HealthCmdRunE(cmd *cobra.Command, _ []string) error { if err := config.Load(ConfigFilePath); err != nil { baseHealth := process.GetBaseHealthInfo(config.Cfg.PidFile, process.NameProbe) diff --git a/dbm-services/common/dbha-v2/internal/probe/command.go b/dbm-services/common/dbha-v2/internal/probe/command.go index 523d05e7be..1653cc318c 100644 --- a/dbm-services/common/dbha-v2/internal/probe/command.go +++ b/dbm-services/common/dbha-v2/internal/probe/command.go @@ -54,6 +54,13 @@ var StartCmd = &cobra.Command{ RunE: cmds.StartCmdRunE, } +// DaemonStartCmd is used to start this process with a guard that restarts it on abnormal exit. +var DaemonStartCmd = &cobra.Command{ + Use: "daemon-start", + Short: "Start this process with guard (auto-restart on crash).", + RunE: cmds.DaemonStartCmdRunE, +} + // StopCmd is used to stop this process. 
var StopCmd = &cobra.Command{ Use: "stop", diff --git a/dbm-services/common/dbha-v2/internal/receiver/cmds/cmds.go b/dbm-services/common/dbha-v2/internal/receiver/cmds/cmds.go index 51d29a91f3..dbe7ad141b 100644 --- a/dbm-services/common/dbha-v2/internal/receiver/cmds/cmds.go +++ b/dbm-services/common/dbha-v2/internal/receiver/cmds/cmds.go @@ -56,6 +56,14 @@ func ReloadCmdRunE(cmd *cobra.Command, args []string) error { return process.ReloadCmdRunE(cmd, args, config.Cfg.PidFile, process.NameReceiver, StopTimeout, ForceStop) } +func DaemonStartCmdRunE(cmd *cobra.Command, args []string) error { + configPath, _ := cmd.Root().PersistentFlags().GetString("config") + if err := config.Load(configPath); err != nil { + return err + } + return process.DaemonStartCmdRunE(cmd, args, config.Cfg.PidFile, process.NameReceiver, process.DefaultGuardRestartDelay) +} + func HealthCmdRunE(cmd *cobra.Command, _ []string) error { baseHealth := process.GetBaseHealthInfo(config.Cfg.PidFile, process.NameReceiver) diff --git a/dbm-services/common/dbha-v2/internal/receiver/command.go b/dbm-services/common/dbha-v2/internal/receiver/command.go index 9b4cdda809..358930b391 100644 --- a/dbm-services/common/dbha-v2/internal/receiver/command.go +++ b/dbm-services/common/dbha-v2/internal/receiver/command.go @@ -53,6 +53,13 @@ var StartCmd = &cobra.Command{ RunE: cmds.StartCmdRunE, } +// DaemonStartCmd is used to start this process with a guard that restarts it on abnormal exit. +var DaemonStartCmd = &cobra.Command{ + Use: "daemon-start", + Short: "Start this process with guard (auto-restart on crash).", + RunE: cmds.DaemonStartCmdRunE, +} + // StopCmd is used to stop this process. 
var StopCmd = &cobra.Command{ Use: "stop", diff --git a/dbm-services/common/dbha-v2/internal/receiver/config/config.go b/dbm-services/common/dbha-v2/internal/receiver/config/config.go index 718a41a57e..b3b1ec1582 100644 --- a/dbm-services/common/dbha-v2/internal/receiver/config/config.go +++ b/dbm-services/common/dbha-v2/internal/receiver/config/config.go @@ -28,6 +28,8 @@ import ( "time" "dbm-services/common/dbha-v2/pkg/logger" + + "github.com/spf13/viper" ) var Cfg = Configuration{ @@ -102,3 +104,20 @@ type Configuration struct { Service ServiceConfig `yaml:"service" mapstructure:"service"` Log LogConfig `yaml:"log" mapstructure:"log"` } + +// Load loads receiver configuration from file +func Load(configFilePath string) error { + viper.SetConfigName("receiver") + viper.SetConfigType("yaml") + viper.AddConfigPath("./etc") + + if configFilePath != "" { + viper.SetConfigFile(configFilePath) + } + + if err := viper.ReadInConfig(); err != nil { + return err + } + + return viper.Unmarshal(&Cfg) +} diff --git a/dbm-services/common/dbha-v2/pkg/process/cmds.go b/dbm-services/common/dbha-v2/pkg/process/cmds.go index 646d5e5ae8..97fd2441bf 100644 --- a/dbm-services/common/dbha-v2/pkg/process/cmds.go +++ b/dbm-services/common/dbha-v2/pkg/process/cmds.go @@ -37,6 +37,9 @@ import ( "github.com/spf13/cobra" ) +// DefaultGuardRestartDelay is the default delay before restarting a crashed child. +const DefaultGuardRestartDelay = 3 * time.Second + // StartCmdRunE handles the start command. func StartCmdRunE(cmd *cobra.Command, _ []string, pidFile, procName string) error { pid, err := ReadPid(pidFile) @@ -78,6 +81,79 @@ func StartCmdRunE(cmd *cobra.Command, _ []string, pidFile, procName string) erro return err } +// DaemonStartCmdRunE handles the daemon-start command. It forks a guard process that launches the target +// via StartDaemon, monitors it, and restarts on abnormal exit. The launcher returns immediately; the guard runs in background. 
+func DaemonStartCmdRunE(cmd *cobra.Command, _ []string, pidFile, procName string, restartDelay time.Duration) error { + pid, err := ReadPid(pidFile) + if err != nil { + if !errors.Is(err, ErrPidFileNotExist) && !errors.Is(err, ErrInvalidFile) { + return err + } + } else { + alive, aliveErr := IsAliveWithProcessName(pid, procName) + if aliveErr != nil { + return aliveErr + } + if alive { + fmt.Fprintf(cmd.OutOrStdout(), "%s is already running (guard mode), pid:%d\n", procName, pid) + return nil + } + } + + exePath, err := os.Executable() + if err != nil { + return err + } + + rootCmd := cmd.Root() + configPath, err := rootCmd.PersistentFlags().GetString("config") + if err != nil { + return err + } + + var serviceArgs []string + if configPath != "" { + serviceArgs = append(serviceArgs, "-c", configPath) + } + + // The guard re-invokes this same subcommand; cmd was already dereferenced by cmd.Root() above, so no nil check is needed here. + subcmd := cmd.Name() + if subcmd == "" { + subcmd = "daemon-start" + } + guardArgs := []string{subcmd} + if configPath != "" { + guardArgs = append(guardArgs, "-c", configPath) + } + + if restartDelay <= 0 { + restartDelay = DefaultGuardRestartDelay + } + + guardOpt := GuardOptions{ + DaemonOptions: DaemonOptions{ + Executable: exePath, + Args: serviceArgs, // child of guard runs the service (e.g. ./probe -c config) + }, + PidFile: pidFile, + ProcName: procName, + RestartDelay: restartDelay, + } + + // If we're the forked guard process, run directly without forking again + if os.Getenv(EnvGuardProcess) == "1" { + return RunWithGuard(guardOpt) + } + + // Fork guard process and return immediately (parent exits) + _, err = StartDaemon(DaemonOptions{ + Executable: exePath, + Args: guardArgs, // guard runs daemon-start (e.g. ./probe daemon-start -c config) + Env: []string{EnvGuardProcess + "=1"}, + }) + return err +} + // StopCmdRunE handles the stop command.
func StopCmdRunE(cmd *cobra.Command, _ []string, pidFile, procName string, timeout int, force bool) error { pid, err := ReadPid(pidFile) @@ -131,36 +207,6 @@ func RestartCmdRunE(cmd *cobra.Command, args []string, pidFile, procName string, return StartCmdRunE(cmd, args, pidFile, procName) } -func waitForProcessExit(pidFile, procName string, timeout time.Duration) error { - ticker := time.NewTicker(100 * time.Millisecond) - defer ticker.Stop() - - timeoutCh := time.After(timeout) - for { - select { - case <-timeoutCh: - return gerrors.Newf(gerrors.Failure, "timeout waiting for %s to exit", procName) - case <-ticker.C: - } - - pid, err := ReadPid(pidFile) - if errors.Is(err, ErrPidFileNotExist) || errors.Is(err, ErrInvalidFile) { - return nil - } - if err != nil { - return err - } - - alive, aliveErr := IsAliveWithProcessName(pid, procName) - if aliveErr != nil { - return aliveErr - } - if !alive { - return nil - } - } -} - // ReloadCmdRunE handles the reload command. // Sends SIGHUP to the process to trigger configuration reload. 
func ReloadCmdRunE(cmd *cobra.Command, _ []string, pidFile, procName string, _ int, _ bool) error { @@ -237,3 +283,33 @@ func PrintBaseHealth(w io.Writer, health *HealthInfo) { fmt.Fprintln(w, "Status:", health.Status) fmt.Fprintln(w, "ErrMsg:", health.ErrMsg) } + +func waitForProcessExit(pidFile, procName string, timeout time.Duration) error { + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() + + timeoutCh := time.After(timeout) + for { + select { + case <-timeoutCh: + return gerrors.Newf(gerrors.Failure, "timeout waiting for %s to exit", procName) + case <-ticker.C: + } + + pid, err := ReadPid(pidFile) + if errors.Is(err, ErrPidFileNotExist) || errors.Is(err, ErrInvalidFile) { + return nil + } + if err != nil { + return err + } + + alive, aliveErr := IsAliveWithProcessName(pid, procName) + if aliveErr != nil { + return aliveErr + } + if !alive { + return nil + } + } +} diff --git a/dbm-services/common/dbha-v2/pkg/process/daemon.go b/dbm-services/common/dbha-v2/pkg/process/daemon.go index e2e9ee203b..0597b23729 100644 --- a/dbm-services/common/dbha-v2/pkg/process/daemon.go +++ b/dbm-services/common/dbha-v2/pkg/process/daemon.go @@ -3,20 +3,40 @@ package process import ( "os" "os/exec" + "os/signal" "syscall" + "time" "dbm-services/common/dbha-v2/pkg/gerrors" ) +const ( + // EnvUnderGuard is set when the child process is started by the guard. + // When set, SavePid skips writing to avoid overwriting the guard's pid file. + EnvUnderGuard = "DBHA_UNDER_GUARD" + // EnvGuardProcess is set when the process is the forked guard (not the launcher). + // When set, DaemonStartCmdRunE runs RunWithGuard directly instead of forking again. 
+ EnvGuardProcess = "DBHA_GUARD_PROCESS" +) + +var ( + ErrExecutableEmpty = gerrors.Newf(gerrors.Failure, "Executable is empty") +) + type DaemonOptions struct { Executable string Args []string Env []string } -var ( - ErrExecutableEmpty = gerrors.Newf(gerrors.Failure, "Executable is empty") -) +// GuardOptions extends DaemonOptions with guard-specific settings. +type GuardOptions struct { + DaemonOptions + PidFile string + ProcName string + RestartDelay time.Duration + OnRestart func(exitCode int, restartCount int) +} // StartDaemon starts a new background process using the given executable func StartDaemon(opt DaemonOptions) (*os.Process, error) { @@ -38,3 +58,100 @@ func StartDaemon(opt DaemonOptions) (*os.Process, error) { return cmd.Process, nil } + +// RunWithGuard runs a guard process that starts the target, monitors it, and restarts on abnormal exit. +// It blocks until the guard receives SIGTERM or SIGINT. +func RunWithGuard(opt GuardOptions) error { + if opt.Executable == "" { + return ErrExecutableEmpty + } + if opt.PidFile == "" { + return gerrors.Newf(gerrors.InvalidParameter, "PidFile is required for guard mode") + } + if opt.RestartDelay <= 0 { + opt.RestartDelay = 3 * time.Second + } + + // Inject DBHA_UNDER_GUARD so child skips SavePid; the capped slice keeps append from writing into the caller's backing array + env := append(opt.Env[:len(opt.Env):len(opt.Env)], EnvUnderGuard+"=1") + daemonOpt := DaemonOptions{ + Executable: opt.Executable, + Args: opt.Args, + Env: env, + } + + if err := SavePid(opt.PidFile); err != nil { + return err + } + defer func() { _ = os.Remove(opt.PidFile) }() + + sigC := make(chan os.Signal, 1) + signal.Notify(sigC, syscall.SIGTERM, syscall.SIGINT) + + var childProc *os.Process + restartCount := 0 + + for { + proc, err := StartDaemon(daemonOpt) + if err != nil { + return err + } + childProc = proc + + // Wait for either child exit or stop signal + waitDone := make(chan struct { + state *os.ProcessState + err
error + }{state, waitErr} + }() + + var state *os.ProcessState + var waitErr error + select { + case result := <-waitDone: + state = result.state + waitErr = result.err + case <-sigC: + // Forward the stop request to the child. + _ = childProc.Signal(syscall.SIGTERM) + + // Bound the graceful shutdown: a child that ignores SIGTERM must + // not block the guard forever. Liveness is observed only through + // waitDone, because once Wait has reaped the child its pid may be + // reused by an unrelated process. + select { + case <-waitDone: + // Child exited within the grace period and was reaped. + case <-time.After(3 * time.Second): + // Grace period expired: force-kill the child (SIGKILL), + // then reap it so it does not linger as a zombie. + _ = childProc.Kill() + <-waitDone + } + + return nil + } + + exitCode := -1 + if state != nil { + exitCode = state.ExitCode() + } + + if waitErr != nil { + exitCode = -1 + } + + restartCount++ + if opt.OnRestart != nil { + opt.OnRestart(exitCode, restartCount) + } + + time.Sleep(opt.RestartDelay) + } +} diff --git a/dbm-services/common/dbha-v2/pkg/process/pid.go b/dbm-services/common/dbha-v2/pkg/process/pid.go index d30407653c..1a738b272c 100644 --- a/dbm-services/common/dbha-v2/pkg/process/pid.go +++ b/dbm-services/common/dbha-v2/pkg/process/pid.go @@ -65,7 +65,11 @@ func Name(pid int32) (string, error) { } // SavePid is used to save the process pid into a file. +// When DBHA_UNDER_GUARD is set (child running under guard), skip writing to avoid overwriting guard's pid file. func SavePid(filename string) error { + if os.Getenv(EnvUnderGuard) != "" { + return nil + } if filename == "" { return ErrInvalidFile }