diff --git a/reposerver/repository/repository.go b/reposerver/repository/repository.go index 1abccf6bcd2dd..950a307789bb0 100644 --- a/reposerver/repository/repository.go +++ b/reposerver/repository/repository.go @@ -2533,7 +2533,23 @@ func fetch(gitClient git.Client, targetRevisions []string) error { return nil } +// removeStaleGitLocks best-effort removes common stale git lock files in the repository. +// It is safe to call while holding the per-repo lock and before running any git commands. + +// func removeStaleGitLock(repoRoot string) { +// path := filepath.Join(repoRoot, ".git", "HEAD.lock") +// log.Infof("Checking whether HEAD.lock exists %s", path) +// if _, err := os.Stat(path); err == nil { +// log.Warnf("HEAD.lock present in git repository %s, removing it", repoRoot) +// if rmErr := os.Remove(path); rmErr != nil { +// log.Errorf("Failed to remove git lock %s: %v", path, rmErr) +// } +// } +// } + func checkoutRevision(gitClient git.Client, revision string, submoduleEnabled bool) error { + // removeStaleGitLock(gitClient.Root()) + err := gitClient.Init() if err != nil { return status.Errorf(codes.Internal, "Failed to initialize git repo: %v", err) diff --git a/reposerver/repository/repository_test.go b/reposerver/repository/repository_test.go index e96fc3540a95e..006d2cef3f508 100644 --- a/reposerver/repository/repository_test.go +++ b/reposerver/repository/repository_test.go @@ -3016,6 +3016,29 @@ func TestInit(t *testing.T) { initGitRepo(t, newGitRepoOptions{path: path.Join(dir, "repo2"), remote: "https://github.com/argo-cd/test-repo2", createPath: true, addEmptyCommit: false}) } +// func TestRemoveStaleGitHeadLock(t *testing.T) { +// dir := t.TempDir() +// // create a fake repo structure with .git/HEAD.lock +// gitDir := path.Join(dir, ".git") +// require.NoError(t, os.MkdirAll(gitDir, 0o755)) +// headLock := path.Join(gitDir, "HEAD.lock") +// require.NoError(t, os.WriteFile(headLock, []byte("test"), 0o644)) + +// // sanity: lock exists before +// _, err := os.Stat(headLock) +// require.NoError(t, err) + +// removeStaleGitLock(dir) + +// // lock should be gone after cleanup +// _, err = os.Stat(headLock) +// require.Error(t, err) +// require.True(t, os.IsNotExist(err)) + +// // calling again should be a no-op (no error) +// removeStaleGitLock(dir) +// } + // TestCheckoutRevisionCanGetNonstandardRefs shows that we can fetch a revision that points to a non-standard ref. In // other words, we haven't regressed and caused this issue again: https://github.com/argoproj/argo-cd/issues/4935 func TestCheckoutRevisionCanGetNonstandardRefs(t *testing.T) { @@ -3057,6 +3080,7 @@ func TestCheckoutRevisionPresentSkipFetch(t *testing.T) { gitClient := &gitmocks.Client{} gitClient.On("Init").Return(nil) + // gitClient.On("Root").Return("") gitClient.On("IsRevisionPresent", revision).Return(true) gitClient.On("Checkout", revision, mock.Anything).Return("", nil) @@ -3069,6 +3093,7 @@ func TestCheckoutRevisionNotPresentCallFetch(t *testing.T) { gitClient := &gitmocks.Client{} gitClient.On("Init").Return(nil) + // gitClient.On("Root").Return("") gitClient.On("IsRevisionPresent", revision).Return(false) gitClient.On("Fetch", "").Return(nil) gitClient.On("Checkout", revision, mock.Anything).Return("", nil) @@ -3083,6 +3108,7 @@ func TestFetch(t *testing.T) { gitClient := &gitmocks.Client{} gitClient.On("Init").Return(nil) + // gitClient.On("Root").Return("") gitClient.On("IsRevisionPresent", revision1).Once().Return(true) gitClient.On("IsRevisionPresent", revision2).Once().Return(false) gitClient.On("Fetch", "").Return(nil) diff --git a/util/exec/exec.go b/util/exec/exec.go index 0df69aeba7f9b..024191aaae2b0 100644 --- a/util/exec/exec.go +++ b/util/exec/exec.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strconv" "strings" "syscall" @@ -183,17 +184,97 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { args := strings.Join(cmd.Args, " ") logCtx.WithFields(logrus.Fields{"dir": cmd.Dir}).Info(redactor(args)) + // Best-effort cleanup of a stale HEAD.lock after the command finishes. + defer func() { + if cmd.Dir == "" { + return + } + lockPath := filepath.Join(cmd.Dir, ".git", "HEAD.lock") + logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Info("Checking HEAD.lock presence post-exec") + if _, err := os.Stat(lockPath); err == nil { + // Log and attempt removal; ignore ENOENT races + logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warn("HEAD.lock present post-exec, removing it") + if rmErr := os.Remove(lockPath); rmErr != nil && !os.IsNotExist(rmErr) { + logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warnf("Failed to remove HEAD.lock: %v", rmErr) + } + } else if !os.IsNotExist(err) { + logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warnf("Failed to stat HEAD.lock: %v", err) + } else { + logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Info("HEAD.lock not present post-exec") + } + }() + + // Helper: debug whether HEAD.lock exists under the current working directory + logHeadLockStatus := func(where string) { + if cmd.Dir == "" { + return + } + lockPath := filepath.Join(cmd.Dir, ".git", "HEAD.lock") + fileInfo, statErr := os.Stat(lockPath) + exists := statErr == nil + fields := logrus.Fields{ + "headLockPath": lockPath, + "headLockExists": exists, + "where": where, + } + + pgid, pgErr := syscall.Getpgid(cmd.Process.Pid) + if pgErr == nil && pgid > 0 { + // Portable ps: list all processes, print needed columns without headers, then filter by PGID in Go. + out, _ := exec.Command( + "ps", + "-ax", + "-o", "pid=,ppid=,pgid=,etime=,comm=,args=", + ).Output() + if len(out) > 0 { + var b strings.Builder + want := strconv.Itoa(pgid) + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + fieldsSlice := strings.Fields(line) + if len(fieldsSlice) < 3 { + continue + } + if fieldsSlice[2] == want { + b.WriteString(line) + b.WriteByte('\n') + } + } + fields["gitProcsInGroup"] = strings.TrimSpace(b.String()) + } + } + + if exists { + fields["headLockSize"] = fileInfo.Size() + fields["headLockMode"] = fileInfo.Mode().String() + fields["headLockModTime"] = fileInfo.ModTime() + fields["headLockIsDir"] = fileInfo.IsDir() + + } + logCtx.WithFields(fields).Info("HEAD.lock status") + } var stdout bytes.Buffer var stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr + // Configure the child to run in its own process group so we can signal the whole group on timeout/cancel. + // On Unix this sets Setpgid; on Windows this is a no-op. + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = newSysProcAttr(true) + } + start := time.Now() err = cmd.Start() if err != nil { return "", err } + logHeadLockStatus("start-exec") + done := make(chan error) go func() { done <- cmd.Wait() }() @@ -229,14 +310,20 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { // noinspection ALL case <-timoutCh: // send timeout signal - _ = cmd.Process.Signal(timeoutBehavior.Signal) + // signal the process group (negative PID) so children are terminated as well + if cmd.Process != nil { + _ = sysCallSignal(-cmd.Process.Pid, timeoutBehavior.Signal) + } // wait on timeout signal and fallback to fatal timeout signal if timeoutBehavior.ShouldWait { select { case <-done: + logHeadLockStatus("timeout-waited-done") case <-fatalTimeoutCh: - // upgrades to SIGKILL if cmd does not respect SIGTERM - _ = cmd.Process.Signal(fatalTimeoutBehaviour) + // upgrades to fatal signal (default SIGKILL) if cmd does not respect the initial signal + if cmd.Process != nil { + _ = sysCallSignal(-cmd.Process.Pid, fatalTimeoutBehaviour) + } // now original cmd should exit immediately after SIGKILL <-done // return error with a marker indicating that cmd exited only after fatal SIGKILL @@ -245,6 +332,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("fatal-timeout") err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "") logCtx.Error(err.Error()) return strings.TrimSuffix(output, "\n"), err @@ -256,6 +344,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("timeout") err = newCmdError(redactor(args), fmt.Errorf("timeout after %v", timeout), "") logCtx.Error(err.Error()) return strings.TrimSuffix(output, "\n"), err @@ -270,6 +359,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { if !opts.SkipErrorLogging { logCtx.Error(err.Error()) } + logHeadLockStatus("done-error") return strings.TrimSuffix(output, "\n"), err } } @@ -278,6 +368,7 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { output += stderr.String() } logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + logHeadLockStatus("done-success") return strings.TrimSuffix(output, "\n"), nil } diff --git a/util/exec/exec_test.go b/util/exec/exec_test.go index 6fc284aaa129f..5eb8159b77f14 100644 --- a/util/exec/exec_test.go +++ b/util/exec/exec_test.go @@ -1,7 +1,9 @@ package exec import ( + "os" "os/exec" + "path/filepath" "regexp" "syscall" "testing" @@ -215,3 +217,51 @@ func TestRunCaptureStderr(t *testing.T) { assert.Equal(t, "hello world\nmy-error", output) assert.NoError(t, err) } + +// This test demonstrates that when a process group is signaled, all child processes are also terminated and file locks released. +func TestProcessGroupSignalRemovesChildLock(t *testing.T) { + hook := test.NewGlobal() + log.SetLevel(log.DebugLevel) + defer log.SetLevel(log.InfoLevel) + + dir := t.TempDir() + lockFile := filepath.Join(dir, "lockfile") + childScript := filepath.Join(dir, "child.sh") + parentScript := filepath.Join(dir, "parent.sh") + + // Child: create lock file; on SIGTERM remove it and exit + child := "#!/bin/sh\n" + + "trap 'rm -f lockfile; exit 0' TERM\n" + + "touch lockfile\n" + + "sleep 100\n" + require.NoError(t, os.WriteFile(childScript, []byte(child), 0o755)) + + // Parent: start child in background and sleep + parent := "#!/bin/sh\n" + + "./child.sh &\n" + + "sleep 100\n" + require.NoError(t, os.WriteFile(parentScript, []byte(parent), 0o755)) + + // Run parent with a short timeout; our implementation signals the process group + opts := CmdOpts{ + Timeout: 500 * time.Millisecond, + FatalTimeout: 500 * time.Millisecond, + TimeoutBehavior: TimeoutBehavior{ + Signal: syscall.SIGTERM, + ShouldWait: true, + }, + } + _, err := RunCommand("sh", opts, "-c", "cd "+dir+" && ./parent.sh") + require.Error(t, err) + + // Give a bit of time for traps to run and for the process tree to settle + time.Sleep(200 * time.Millisecond) + + // Because the process group was signaled, the child should have removed the lock + _, statErr := os.Stat(lockFile) + require.Error(t, statErr, "expected lock file to be removed when process group is signaled") + assert.True(t, os.IsNotExist(statErr)) + + // basic sanity: logs produced + require.GreaterOrEqual(t, len(hook.Entries), 1) +} diff --git a/util/exec/exec_unix.go b/util/exec/exec_unix.go new file mode 100644 index 0000000000000..dd6614ed7c375 --- /dev/null +++ b/util/exec/exec_unix.go @@ -0,0 +1,14 @@ +//go:build !windows +// +build !windows + +package exec + +import "syscall" + +func newSysProcAttr(setpgid bool) *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setpgid: setpgid} +} + +func sysCallSignal(pid int, sig syscall.Signal) error { + return syscall.Kill(pid, sig) +} diff --git a/util/exec/exec_windows.go b/util/exec/exec_windows.go new file mode 100644 index 0000000000000..229ca37f31a5a --- /dev/null +++ b/util/exec/exec_windows.go @@ -0,0 +1,10 @@ +//go:build windows +// +build windows + +package exec + +import "syscall" + +func newSysProcAttr(_ bool) *syscall.SysProcAttr { return &syscall.SysProcAttr{} } + +func sysCallSignal(_ int, _ syscall.Signal) error { return nil }