diff --git a/pkg/domain/infra/abi/system_linux.go b/pkg/domain/infra/abi/system_linux.go index 7acaa9bd19f..7c345aeed3b 100644 --- a/pkg/domain/infra/abi/system_linux.go +++ b/pkg/domain/infra/abi/system_linux.go @@ -4,6 +4,7 @@ package abi import ( "context" + "errors" "fmt" "os" @@ -14,6 +15,7 @@ import ( "go.podman.io/common/pkg/config" "go.podman.io/common/pkg/systemd" "go.podman.io/storage/pkg/unshare" + "golang.org/x/sys/unix" ) // Default path for system runtime state @@ -59,6 +61,8 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool, } } } + + // return early as we are already re-exec or root here so no need to join the rootless userns. return nil } @@ -74,36 +78,41 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool, if became { os.Exit(ret) } - if noMoveProcess { - return nil - } // if there is no pid file, try to join existing containers, and create a pause process. ctrs, err := ic.Libpod.GetRunningContainers() if err != nil { - logrus.Error(err.Error()) - os.Exit(1) + return err } - paths := []string{} + paths := make([]string, 0, len(ctrs)) for _, ctr := range ctrs { paths = append(paths, ctr.ConfigNoCopy().ConmonPidFile) } if len(paths) > 0 { became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths) + // TryJoinFromFilePaths fails with ESRCH when the PID are all not valid anymore + // In this case create a new userns. + if errors.Is(err, unix.ESRCH) { + logrus.Warnf("Failed to join existing conmon namespace, creating a new rootless podman user namespace. If there are existing container running please stop them with %q to reset the namespace", os.Args[0]+" system migrate") + became, ret, err = rootless.BecomeRootInUserNS(pausePidPath) + } } else { + logrus.Info("Creating a new rootless user namespace") became, ret, err = rootless.BecomeRootInUserNS(pausePidPath) - if err == nil { - systemd.MovePauseProcessToScope(pausePidPath) - } } + if err != nil { - logrus.Error(fmt.Errorf("invalid internal status, try resetting the pause process with %q: %w", os.Args[0]+" system migrate", err)) - os.Exit(1) + return fmt.Errorf("fatal error, invalid internal status, unable to create a new pause process: %w. Try running %q and if that doesn't work reboot to recover", err, os.Args[0]+" system migrate") + } + if !noMoveProcess { + systemd.MovePauseProcessToScope(pausePidPath) } if became { os.Exit(ret) } + + logrus.Error("Internal error, failed to re-exec podman into user namespace without error. This should never happen, if you see this please report a bug") return nil } diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c index 3d74af6a6ca..644c8ef9a20 100644 --- a/pkg/rootless/rootless_linux.c +++ b/pkg/rootless/rootless_linux.c @@ -384,8 +384,7 @@ can_use_shortcut (char **argv) || strcmp (argv[argc], "version") == 0 || strcmp (argv[argc], "context") == 0 || strcmp (argv[argc], "search") == 0 - || strcmp (argv[argc], "compose") == 0 - || (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0)) + || strcmp (argv[argc], "compose") == 0) { ret = false; break; diff --git a/test/system/550-pause-process.bats b/test/system/550-pause-process.bats index da657e77e1c..7818f36bb9f 100644 --- a/test/system/550-pause-process.bats +++ b/test/system/550-pause-process.bats @@ -149,3 +149,30 @@ function _check_pause_process() { # This used to hang trying to unmount the netns. run_podman rm -f -t0 $cname } + +# regression test for https://issues.redhat.com/browse/RHEL-130252 +@test "podman system migrate works with conmon being killed" { + skip_if_not_rootless "pause process is only used as rootless" + skip_if_remote "system migrate not supported via remote" + + local cname=c-$(safename) + run_podman run --name $cname --stop-signal SIGKILL -d $IMAGE sleep 100 + + run_podman inspect --format '{{.State.ConmonPid}}' $cname + conmon_pid="$output" + + # check for pause pid and then kill it + _check_pause_process + kill -9 $pause_pid + + # kill conmon + kill -9 $conmon_pid + + # Use podman system migrate to stop the currently running pause process + run_podman 125 system migrate + assert "$output" =~ "Failed to join existing conmon namespace" "fallback to userns creating" + assert "$output" =~ "conmon process killed" + + # Now the removal command should work fine without errors. + run_podman rm $cname +}