Skip to content

Commit 4035385

Browse files
committed
SetupRootless: handle the case where the conmon PIDs are not valid
When trying to join the namespace of a conmon PID in order to recreate the pause process, the PID may no longer be valid, e.g. when conmon crashed or was killed. Currently this causes a serious issue that can be reproduced using: $ podman run -d quay.io/libpod/testimage:20241011 sleep 100 $ killall -9 conmon $ killall catatonit After that, all commands fail because we keep trying to rejoin the namespace of the non-existent conmon process. To address that, fall back to creating a new namespace if we fail to join the conmon PIDs. Signed-off-by: Paul Holzinger <[email protected]>
1 parent 677c999 commit 4035385

File tree

2 files changed

+41
-4
lines changed

2 files changed

+41
-4
lines changed

pkg/domain/infra/abi/system_linux.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package abi
44

55
import (
66
"context"
7+
"errors"
78
"fmt"
89
"os"
910

@@ -14,6 +15,7 @@ import (
1415
"go.podman.io/common/pkg/config"
1516
"go.podman.io/common/pkg/systemd"
1617
"go.podman.io/storage/pkg/unshare"
18+
"golang.org/x/sys/unix"
1719
)
1820

1921
// Default path for system runtime state
@@ -90,14 +92,22 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool,
9092

9193
if len(paths) > 0 {
9294
became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths)
95+
// TryJoinFromFilePaths fails with ESRCH when the PID are all not valid anymore
96+
// In this case create a new userns.
97+
if errors.Is(err, unix.ESRCH) {
98+
logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate")
99+
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
100+
}
93101
} else {
102+
logrus.Info("Creating a new rootless user namespace")
94103
became, ret, err = rootless.BecomeRootInUserNS(pausePidPath)
95-
if err == nil && !noMoveProcess {
96-
systemd.MovePauseProcessToScope(pausePidPath)
97-
}
98104
}
105+
99106
if err != nil {
100-
return fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err)
107+
return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate")
108+
}
109+
if !noMoveProcess {
110+
systemd.MovePauseProcessToScope(pausePidPath)
101111
}
102112
if became {
103113
os.Exit(ret)

test/system/550-pause-process.bats

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,30 @@ function _check_pause_process() {
149149
# This used to hang trying to unmount the netns.
150150
run_podman rm -f -t0 $cname
151151
}
152+
153+
# regression test for https://issues.redhat.com/browse/RHEL-130252
154+
@test "podman system migrate works with conmon being killed" {
155+
skip_if_not_rootless "pause process is only used as rootless"
156+
skip_if_remote "system migrate not supported via remote"
157+
158+
local cname=c-$(safename)
159+
run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100
160+
161+
run_podman inspect --format '{{.State.ConmonPid}}' $cname
162+
conmon_pid="$output"
163+
164+
# check for pause pid and then kill it
165+
_check_pause_process
166+
kill -9 $pause_pid
167+
168+
# kill conmon
169+
kill -9 $conmon_pid
170+
171+
# Use podman system migrate to stop the currently running pause process
172+
run_podman 125 system migrate
173+
assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating"
174+
assert "$output" =~ "conmon process killed"
175+
176+
# Now the removal command should work fine without errors.
177+
run_podman rm $cname
178+
}

0 commit comments

Comments
 (0)