-
Notifications
You must be signed in to change notification settings - Fork 32
xrootd: fix new daemon being terminated immediately after restart on log-level change #3160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
9df7015
31ff855
8b4003f
0a28b9c
7086929
dc91a0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,13 +38,16 @@ import ( | |
| ) | ||
|
|
||
| type restartInfo struct { | ||
| launchers []daemon.Launcher | ||
| egrp *errgroup.Group | ||
| callback func(int) | ||
| isCache bool | ||
| useCMSD bool | ||
| privileged bool | ||
| pids []int | ||
| ctx context.Context | ||
| launchers []daemon.Launcher | ||
| egrp *errgroup.Group | ||
| callback func(int) | ||
| preRestartHook func(ctx context.Context) | ||
| postRestartHook func(ctx context.Context) | ||
| isCache bool | ||
| useCMSD bool | ||
| privileged bool | ||
| pids []int | ||
| } | ||
|
|
||
| var ( | ||
|
|
@@ -68,15 +71,18 @@ func ResetRestartState() { | |
|
|
||
| // StoreRestartInfo stores the information needed for restarting XRootD | ||
| // This should be called during initial launch after PIDs are known. | ||
| func StoreRestartInfo(launchers []daemon.Launcher, pids []int, egrp *errgroup.Group, callback func(int), cache bool, cmsd bool, priv bool) { | ||
| func StoreRestartInfo(ctx context.Context, launchers []daemon.Launcher, pids []int, egrp *errgroup.Group, callback func(int), cache bool, cmsd bool, priv bool, preRestartHook func(ctx context.Context), postRestartHook func(ctx context.Context)) { | ||
| info := restartInfo{ | ||
| launchers: launchers, | ||
| egrp: egrp, | ||
| callback: callback, | ||
| isCache: cache, | ||
| useCMSD: cmsd, | ||
| privileged: priv, | ||
| pids: append([]int(nil), pids...), | ||
| ctx: ctx, | ||
| launchers: launchers, | ||
| egrp: egrp, | ||
| callback: callback, | ||
| preRestartHook: preRestartHook, | ||
| postRestartHook: postRestartHook, | ||
| isCache: cache, | ||
| useCMSD: cmsd, | ||
| privileged: priv, | ||
| pids: append([]int(nil), pids...), | ||
| } | ||
|
|
||
| // Replace any existing entry for the same server role; otherwise append. | ||
|
|
@@ -138,6 +144,14 @@ func RestartXrootd(ctx context.Context, oldPids []int) (newPids []int, err error | |
| return nil, errors.New("restart requested but no tracked PIDs are available") | ||
| } | ||
|
|
||
| // Run any pre-restart hooks (e.g., advertise shutdown to the Director and | ||
| // wait for in-flight transfers to drain) before sending signals. | ||
| for _, info := range storedInfos { | ||
| if info.preRestartHook != nil { | ||
| info.preRestartHook(info.ctx) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want the long-lived context from |
||
| } | ||
| } | ||
|
|
||
| // Step 1: Gracefully shutdown existing XRootD processes | ||
| log.Debug("Sending SIGTERM to existing XRootD processes") | ||
| for _, pid := range oldPids { | ||
|
|
@@ -210,7 +224,7 @@ func RestartXrootd(ctx context.Context, oldPids []int) (newPids []int, err error | |
| } | ||
|
|
||
| log.Info("Launching new XRootD daemons") | ||
| pids, launchErr := LaunchDaemons(ctx, newLaunchers, info.egrp, info.callback) | ||
| pids, launchErr := LaunchDaemons(info.ctx, newLaunchers, info.egrp, info.callback) | ||
| if launchErr != nil { | ||
| return nil, errors.Wrap(launchErr, "Failed to launch XRootD daemons") | ||
| } | ||
|
|
@@ -231,6 +245,14 @@ func RestartXrootd(ctx context.Context, oldPids []int) (newPids []int, err error | |
|
|
||
| metrics.SetComponentHealthStatus(metrics.OriginCache_XRootD, metrics.StatusOK, "XRootD restart complete") | ||
|
|
||
| // Run any post-restart hooks (e.g., re-advertise the server to the Director so | ||
| // clients can resume routing requests to this server immediately). | ||
| for _, info := range updatedInfos { | ||
| if info.postRestartHook != nil { | ||
| info.postRestartHook(info.ctx) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want the long-lived context from info, or the short-lived one that was passed to RestartXrootd? |
||
| } | ||
| } | ||
|
|
||
| log.Infof("XRootD restart complete with new PIDs: %v", newPids) | ||
| return newPids, nil | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This sounds plausible to me, but will this interfere with tests outside of
launchersandxrootd?