Skip to content

Commit 23adfa5

Browse files
committed
mantle/kola: Add function to enhance upgrade stability
This commit introduces the `waitForUpgradeToBeStaged` function to improve the stability of the kola upgrade tests by reducing timeout-related failures. The new function sets up a systemd path unit that watches the `/ostree/repo/refs/heads/ostree/1/1` directory for changes and stops `wait.service` once a change is detected. By blocking until this later point in the upgrade process, we minimize the time spent waiting inside `runFnAndWaitForRebootIntoVersion`, so that function waits only for the actual reboot. Author: Dusty Mabe <[email protected]> Ref: coreos/fedora-coreos-tracker#1805
1 parent de10fe1 commit 23adfa5

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

mantle/kola/tests/upgrade/basic.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,10 +313,31 @@ func runFnAndWaitForRebootIntoVersion(c cluster.TestCluster, m platform.Machine,
313313
}
314314
}
315315

316+
func waitForUpgradeToBeStaged(c cluster.TestCluster, m platform.Machine) {
317+
// Here we set up a systemd path unit to watch for when ostree
318+
// behind the scenes updates the refs in the repo under the
319+
// /ostree/repo/refs/heads/ostree/1/1 directory. refchanged.path
320+
// will trigger when it gets updated and will then stop wait.service.
321+
// The systemd-run --wait causes it to not return here (and thus
322+
// continue execution of code here) until wait.service has been
323+
// stopped by refchanged.service. This is an effort to make us
324+
// start waiting inside runFnAndWaitForRebootIntoVersion until
325+
// later in the upgrade process because we are seeing failures due
326+
// to timeouts and we're trying to reduce the variability by
327+
// minimizing the wait inside that function to just the actual reboot.
328+
// https://github.com/coreos/fedora-coreos-tracker/issues/1805
329+
//
330+
// Note: if systemd-run ever gains the ability to --wait when
331+
// generating a path unit then the below can be simplified.
332+
c.RunCmdSync(m, "sudo systemd-run -u refchanged --path-property=PathChanged=/ostree/repo/refs/heads/ostree/1/1 systemctl stop wait.service")
333+
c.RunCmdSync(m, "sudo systemd-run --wait -u wait sleep infinity")
334+
}
335+
316336
// waitForUpgradeToVersion starts Zincati on machine m so that it applies
// the pending update, waits for the upgrade to be staged, and hands that
// sequence to runFnAndWaitForRebootIntoVersion together with the expected
// target version.
func waitForUpgradeToVersion(c cluster.TestCluster, m platform.Machine, version string) {
	applyUpdate := func() {
		// Start Zincati so it will apply the update.
		c.RunCmdSync(m, "sudo systemctl start zincati.service")
		// Hold here until the new deployment's ref shows up, so the
		// remaining wait covers only the reboot itself.
		waitForUpgradeToBeStaged(c, m)
	}
	runFnAndWaitForRebootIntoVersion(c, m, version, applyUpdate)
}
322343

@@ -328,6 +349,7 @@ func rpmostreeRebase(c cluster.TestCluster, m platform.Machine, ref, version str
328349
// we use systemd-run here so that we can test the --reboot path
329350
// without having SSH not exit cleanly, which would cause an error
330351
c.RunCmdSyncf(m, "sudo systemd-run rpm-ostree rebase --reboot %s", ref)
352+
waitForUpgradeToBeStaged(c, m)
331353
})
332354
}
333355

0 commit comments

Comments
 (0)