Skip to content

Commit c65199b

Browse files
jmarrerojlebon
authored andcommitted
kola: Add soft-reboot support for external tests
Implements soft-reboot capabilities for Kola, enabling tests to use systemd's soft-reboot functionality. The implementation follows the same pattern as regular reboots, but uses `systemctl soft-reboot` and tracks systemd boot timestamps rather than kernel boot IDs for state detection. Co-Authored-By: Colin Walters <[email protected]> Co-Authored-By: Claude <[email protected]> Signed-off-by: Colin Walters <[email protected]> Signed-off-by: Joseph Marrero Corchado <[email protected]>
1 parent 40fa01d commit c65199b

File tree

15 files changed

+358
-18
lines changed

15 files changed

+358
-18
lines changed

docs/kola/external-tests.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,37 @@ it out.
110110
(Previously the API for this was to send `SIGTERM` to the current process; that
111111
method is deprecated and will be removed at some point)
112112

113+
## Support for soft-rebooting
114+
115+
Kola also supports soft-rebooting using systemd's `systemctl soft-reboot` command.
116+
Soft-reboot restarts the userspace while keeping the kernel and hardware state intact.
117+
This is useful for testing userspace updates without a full system reboot.
118+
119+
The soft-reboot API is similar to the regular reboot API:
120+
121+
```
122+
#!/bin/bash
123+
# Example of soft-reboot test
124+
set -xeuo pipefail
125+
case "${AUTOPKGTEST_REBOOT_MARK:-}" in
126+
"") echo "test beginning"; /tmp/autopkgtest-soft-reboot mark1 ;;
127+
mark1) echo "test in mark1"; /tmp/autopkgtest-soft-reboot mark2 ;;
128+
mark2) echo "test in mark2" ;;
129+
*) echo "unexpected mark: ${AUTOPKGTEST_REBOOT_MARK}"; exit 1;;
130+
esac
131+
echo "ok autopkgtest soft-rebooting"
132+
```
133+
134+
Key differences with soft-reboot:
135+
- The kernel boot ID (`/proc/sys/kernel/random/boot_id`) remains the same
136+
- Hardware state and kernel memory are preserved
137+
- `/run` is not cycled; its contents survive the soft-reboot
138+
- Only userspace is restarted
139+
- Uses `systemctl soft-reboot` instead of `reboot`
140+
141+
Both `/tmp/autopkgtest-soft-reboot` and `/tmp/autopkgtest-soft-reboot-prepare` scripts are available,
142+
analogous to their regular reboot counterparts.
143+
113144
## HTTP Server
114145

115146
The `kolet` binary is copied into the `/usr/local/bin/` directory on the CoreOS

mantle/cmd/kola/devshell.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,8 @@ func runDevShellSSH(ctx context.Context, builder *platform.QemuBuilder, conf *co
239239
_ = inst.Kill()
240240
case guestStateInReboot:
241241
statusMsg = "QEMU guest initiated reboot"
242+
case guestStateInSoftReboot:
243+
statusMsg = "QEMU guest initiated soft-reboot"
242244
case guestStateOpenSshStopped:
243245
statusMsg = "QEMU openssh is not listening"
244246
case guestStateSshDisconnected:
@@ -285,6 +287,8 @@ const (
285287
guestStateInShutdown
286288
// guestStateInReboot indicates that the guest has started a reboot
287289
guestStateInReboot
290+
// guestStateInSoftReboot indicates that the guest has started a soft-reboot
291+
guestStateInSoftReboot
288292
// guestStateHalted indicates that the guest has halted or shutdown
289293
guestStateHalted
290294
// guestStateBooting indicates that the instance is in early boot
@@ -325,6 +329,9 @@ func checkWriteState(msg string, c chan<- guestState) {
325329
if strings.Contains(msg, "Starting Reboot...") {
326330
c <- guestStateInReboot
327331
}
332+
if strings.Contains(msg, "Reached target soft-reboot") {
333+
c <- guestStateInSoftReboot
334+
}
328335
}
329336

330337
type systemdEventMessage struct {
@@ -428,6 +435,11 @@ func watchJournal(builder *platform.QemuBuilder, conf *conf.Conf, stateChan chan
428435
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
429436
guestState: guestStateInShutdown,
430437
},
438+
{
439+
unit: "systemd-soft-reboot.service",
440+
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
441+
guestState: guestStateInSoftReboot,
442+
},
431443
}
432444

433445
r, err := builder.VirtioJournal(conf, "-o json --system")

mantle/cmd/kolet/kolet.go

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,25 @@ reboot
105105
autopkgtestRebootPrepareScript = `#!/bin/bash
106106
set -euo pipefail
107107
exec /usr/local/bin/kolet reboot-request "$1"
108+
`
109+
110+
// Soft-reboot support
111+
autopkgTestSoftRebootPath = "/tmp/autopkgtest-soft-reboot"
112+
autopkgtestSoftRebootScript = `#!/bin/bash
113+
set -xeuo pipefail
114+
/usr/local/bin/kolet soft-reboot-request "$1"
115+
systemctl soft-reboot
116+
`
117+
autopkgTestSoftRebootPreparePath = "/tmp/autopkgtest-soft-reboot-prepare"
118+
119+
autopkgtestSoftRebootPrepareScript = `#!/bin/bash
120+
set -euo pipefail
121+
exec /usr/local/bin/kolet soft-reboot-request "$1"
108122
`
109123

110124
// File used to communicate between the script and the kolet runner internally
111-
rebootRequestFifo = "/run/kolet-reboot"
125+
rebootRequestFifo = "/run/kolet-reboot"
126+
softRebootRequestFifo = "/run/kolet-soft-reboot"
112127
)
113128

114129
var (
@@ -140,6 +155,13 @@ var (
140155
SilenceUsage: true,
141156
}
142157

158+
cmdSoftReboot = &cobra.Command{
159+
Use: "soft-reboot-request MARK",
160+
Short: "Request a soft reboot",
161+
RunE: runSoftReboot,
162+
SilenceUsage: true,
163+
}
164+
143165
cmdHttpd = &cobra.Command{
144166
Use: "httpd",
145167
Short: "Start an HTTP server to serve the contents of the file system",
@@ -260,6 +282,11 @@ func initiateReboot(mark string) error {
260282
}
261283

262284
func mkfifo(path string) error {
285+
// Create a FIFO in an idempotent fashion
286+
// as /run survives soft-reboots.
287+
if _, err := os.Stat(path); err == nil {
288+
return nil
289+
}
263290
c := exec.Command("mkfifo", path)
264291
c.Stderr = os.Stderr
265292
err := c.Run()
@@ -269,6 +296,20 @@ func mkfifo(path string) error {
269296
return nil
270297
}
271298

299+
func initiateSoftReboot(mark string) error {
300+
systemdjournal.Print(systemdjournal.PriInfo, "Processing soft-reboot request")
301+
res := kola.KoletResult{
302+
SoftReboot: string(mark),
303+
}
304+
buf, err := json.Marshal(&res)
305+
if err != nil {
306+
return errors.Wrapf(err, "serializing KoletResult")
307+
}
308+
fmt.Println(string(buf))
309+
systemdjournal.Print(systemdjournal.PriInfo, "Acknowledged soft-reboot request with mark: %s", buf)
310+
return nil
311+
}
312+
272313
func runExtUnit(cmd *cobra.Command, args []string) error {
273314
rebootOff, _ := cmd.Flags().GetBool("deny-reboots")
274315
// Write the autopkgtest wrappers
@@ -278,10 +319,18 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
278319
if err := os.WriteFile(autopkgTestRebootPreparePath, []byte(autopkgtestRebootPrepareScript), 0755); err != nil {
279320
return err
280321
}
322+
// Write the soft-reboot autopkgtest wrappers
323+
if err := os.WriteFile(autopkgTestSoftRebootPath, []byte(autopkgtestSoftRebootScript), 0755); err != nil {
324+
return err
325+
}
326+
if err := os.WriteFile(autopkgTestSoftRebootPreparePath, []byte(autopkgtestSoftRebootPrepareScript), 0755); err != nil {
327+
return err
328+
}
281329

282330
// Create the reboot cmdline -> login FIFO for the reboot mark and
283331
// proxy it into a channel
284332
rebootChan := make(chan string)
333+
softRebootChan := make(chan string)
285334
errChan := make(chan error)
286335

287336
// We want to prevent certain tests (like non-exclusive tests) from rebooting
@@ -303,6 +352,25 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
303352
}
304353
rebootChan <- string(buf)
305354
}()
355+
356+
// Create soft-reboot FIFO and channel
357+
err = mkfifo(softRebootRequestFifo)
358+
if err != nil {
359+
return err
360+
}
361+
go func() {
362+
softRebootReader, err := os.Open(softRebootRequestFifo)
363+
if err != nil {
364+
errChan <- err
365+
return
366+
}
367+
defer softRebootReader.Close()
368+
buf, err := io.ReadAll(softRebootReader)
369+
if err != nil {
370+
errChan <- err
371+
}
372+
softRebootChan <- string(buf)
373+
}()
306374
}
307375

308376
ctx := context.Background()
@@ -344,6 +412,8 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
344412
return err
345413
case reboot := <-rebootChan:
346414
return initiateReboot(reboot)
415+
case softReboot := <-softRebootChan:
416+
return initiateSoftReboot(softReboot)
347417
case m := <-unitevents:
348418
for n := range m {
349419
if n == unitname {
@@ -397,6 +467,35 @@ func runReboot(cmd *cobra.Command, args []string) error {
397467
return nil
398468
}
399469

470+
// runSoftReboot handles soft-reboot requests similar to runReboot but for systemctl soft-reboot
471+
func runSoftReboot(cmd *cobra.Command, args []string) error {
472+
if _, err := os.Stat(softRebootRequestFifo); os.IsNotExist(err) {
473+
return errors.New("Soft-reboots are not supported for this test, softRebootRequestFifo does not exist.")
474+
}
475+
476+
mark := args[0]
477+
systemdjournal.Print(systemdjournal.PriInfo, "Requesting soft-reboot with mark: %s", mark)
478+
err := mkfifo(kola.KoletRebootAckFifo)
479+
if err != nil {
480+
return err
481+
}
482+
err = os.WriteFile(softRebootRequestFifo, []byte(mark), 0644)
483+
if err != nil {
484+
return err
485+
}
486+
f, err := os.Open(kola.KoletRebootAckFifo)
487+
if err != nil {
488+
return err
489+
}
490+
buf := make([]byte, 1)
491+
_, err = f.Read(buf)
492+
if err != nil {
493+
return err
494+
}
495+
systemdjournal.Print(systemdjournal.PriInfo, "Soft-reboot request acknowledged")
496+
return nil
497+
}
498+
400499
func runHttpd(cmd *cobra.Command, args []string) error {
401500
port, _ := cmd.Flags().GetString("port")
402501
path, _ := cmd.Flags().GetString("path")
@@ -413,6 +512,8 @@ func main() {
413512
root.AddCommand(cmdRunExtUnit)
414513
cmdReboot.Args = cobra.ExactArgs(1)
415514
root.AddCommand(cmdReboot)
515+
cmdSoftReboot.Args = cobra.ExactArgs(1)
516+
root.AddCommand(cmdSoftReboot)
416517
cmdHttpd.Flags().StringP("port", "", "80", "port")
417518
cmdHttpd.Flags().StringP("path", "", "./", "path to filesystem contents to serve")
418519
cmdHttpd.Args = cobra.ExactArgs(0)

mantle/kola/harness.go

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ const (
255255

256256
// KoletResult is serialized JSON passed from kolet to the harness
257257
type KoletResult struct {
258-
Reboot string
258+
Reboot string
259+
SoftReboot string
259260
}
260261

261262
const KoletExtTestUnit = "kola-runext"
@@ -1105,6 +1106,10 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11051106
if err != nil {
11061107
return errors.Wrapf(err, "getting boot id")
11071108
}
1109+
softrebootCount, err := platform.GetMachineSoftRebootCount(mach)
1110+
if err != nil {
1111+
return errors.Wrapf(err, "getting soft reboot count")
1112+
}
11081113
plog.Debug("Starting kolet run-test-unit")
11091114
if previousRebootState != "" {
11101115
// quote around the value for systemd
@@ -1137,27 +1142,47 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11371142
return errors.Wrapf(err, "parsing kolet json %s", string(stdout))
11381143
}
11391144
}
1140-
// If no reboot is requested, we're done
1141-
if koletRes.Reboot == "" {
1145+
// If no reboot or soft-reboot is requested, we're done
1146+
if koletRes.Reboot == "" && koletRes.SoftReboot == "" {
11421147
return nil
11431148
}
11441149

1145-
// A reboot is requested
1146-
previousRebootState = koletRes.Reboot
1147-
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1148-
// This signals to the subject that we have saved the mark, and the subject
1149-
// can proceed with rebooting. We stop sshd to ensure that the wait below
1150-
// doesn't log in while ssh is shutting down.
1151-
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1152-
if err != nil {
1153-
return errors.Wrapf(err, "failed to acknowledge reboot")
1150+
// Handle regular reboot
1151+
if koletRes.Reboot != "" {
1152+
previousRebootState = koletRes.Reboot
1153+
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1154+
// This signals to the subject that we have saved the mark, and the subject
1155+
// can proceed with rebooting. We stop sshd to ensure that the wait below
1156+
// doesn't log in while ssh is shutting down.
1157+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1158+
if err != nil {
1159+
return errors.Wrapf(err, "failed to acknowledge reboot")
1160+
}
1161+
plog.Debug("Waiting for reboot")
1162+
err = mach.WaitForReboot(120*time.Second, bootID)
1163+
if err != nil {
1164+
return errors.Wrapf(err, "Waiting for reboot")
1165+
}
1166+
plog.Debug("Reboot complete")
11541167
}
1155-
plog.Debug("Waiting for reboot")
1156-
err = mach.WaitForReboot(120*time.Second, bootID)
1157-
if err != nil {
1158-
return errors.Wrapf(err, "Waiting for reboot")
1168+
1169+
// Handle soft-reboot
1170+
if koletRes.SoftReboot != "" {
1171+
previousRebootState = koletRes.SoftReboot
1172+
plog.Debugf("Soft-reboot request with mark='%s'", previousRebootState)
1173+
// Use the soft reboot count we collected at the beginning of this loop iteration
1174+
// Acknowledge the soft-reboot request
1175+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'echo > %s'", KoletRebootAckFifo))
1176+
if err != nil {
1177+
return errors.Wrapf(err, "failed to acknowledge soft-reboot")
1178+
}
1179+
plog.Debug("Waiting for soft-reboot")
1180+
err = mach.WaitForSoftReboot(120*time.Second, softrebootCount)
1181+
if err != nil {
1182+
return errors.Wrapf(err, "Waiting for soft-reboot")
1183+
}
1184+
plog.Debug("Soft-reboot complete")
11591185
}
1160-
plog.Debug("Reboot complete")
11611186
}
11621187
}
11631188

mantle/platform/machine/aws/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
8080
return platform.WaitForMachineReboot(am, am.journal, timeout, oldBootId)
8181
}
8282

83+
// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and the soft-reboot count observed
// before the soft-reboot was requested.
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
84+
return platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldSoftRebootsCount)
85+
}
86+
8387
func (am *machine) Destroy() {
8488
origConsole, err := am.cluster.flight.api.GetConsoleOutput(am.ID())
8589
if err != nil {

mantle/platform/machine/azure/machine.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,15 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
108108
return am.refetchIPs()
109109
}
110110

111+
// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot with this
// machine, its journal, the timeout and the previously observed soft-reboot
// count. On success it then re-fetches the machine's IP addresses and returns
// the result of that refetch.
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
112+
err := platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldSoftRebootsCount)
113+
if err != nil {
114+
return err
115+
}
116+
// For soft-reboot, IP addresses should not change, but let's refetch to be safe
117+
return am.refetchIPs()
118+
}
119+
111120
func (am *machine) Destroy() {
112121
if err := am.saveConsole(); err != nil {
113122
// log error, but do not fail to terminate instance

mantle/platform/machine/do/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ func (dm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7777
return platform.WaitForMachineReboot(dm, dm.journal, timeout, oldBootId)
7878
}
7979

80+
// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and the soft-reboot count observed
// before the soft-reboot was requested.
func (dm *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
81+
return platform.WaitForMachineSoftReboot(dm, dm.journal, timeout, oldSoftRebootsCount)
82+
}
83+
8084
func (dm *machine) Destroy() {
8185
if err := dm.cluster.flight.api.DeleteDroplet(context.TODO(), dm.droplet.ID); err != nil {
8286
plog.Errorf("Error deleting droplet %v: %v", dm.droplet.ID, err)

mantle/platform/machine/esx/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ func (em *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(em, em.journal, timeout, oldBootId)
7979
}
8080

81+
// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and the soft-reboot count observed
// before the soft-reboot was requested.
func (em *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
82+
return platform.WaitForMachineSoftReboot(em, em.journal, timeout, oldSoftRebootsCount)
83+
}
84+
8185
func (em *machine) Destroy() {
8286
if err := em.cluster.flight.api.TerminateDevice(em.ID()); err != nil {
8387
plog.Errorf("Error terminating device %v: %v", em.ID(), err)

mantle/platform/machine/gcloud/machine.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ func (gm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(gm, gm.journal, timeout, oldBootId)
7979
}
8080

81+
// WaitForSoftReboot delegates to platform.WaitForMachineSoftReboot, passing
// this machine, its journal, the timeout and the soft-reboot count observed
// before the soft-reboot was requested.
func (gm *machine) WaitForSoftReboot(timeout time.Duration, oldSoftRebootsCount string) error {
82+
return platform.WaitForMachineSoftReboot(gm, gm.journal, timeout, oldSoftRebootsCount)
83+
}
84+
8185
func (gm *machine) Destroy() {
8286
if err := gm.saveConsole(); err != nil {
8387
plog.Errorf("Error saving console for instance %v: %v", gm.ID(), err)

0 commit comments

Comments
 (0)