Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions reposerver/repository/repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -2533,7 +2533,23 @@ func fetch(gitClient git.Client, targetRevisions []string) error {
return nil
}

// removeStaleGitLocks best-effort removes common stale git lock files in the repository.
// It is safe to call while holding the per-repo lock and before running any git commands.

// func removeStaleGitLock(repoRoot string) {
// path := filepath.Join(repoRoot, ".git", "HEAD.lock")
// log.Infof("Checking whether HEAD.lock exists %s", path)
// if _, err := os.Stat(path); err == nil {
// log.Warnf("HEAD.lock present in git repository %s, removing it", repoRoot)
// if rmErr := os.Remove(path); rmErr != nil {
// log.Errorf("Failed to remove git lock %s: %v", path, rmErr)
// }
// }
// }

func checkoutRevision(gitClient git.Client, revision string, submoduleEnabled bool) error {
// removeStaleGitLock(gitClient.Root())

err := gitClient.Init()
if err != nil {
return status.Errorf(codes.Internal, "Failed to initialize git repo: %v", err)
Expand Down
26 changes: 26 additions & 0 deletions reposerver/repository/repository_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3016,6 +3016,29 @@ func TestInit(t *testing.T) {
initGitRepo(t, newGitRepoOptions{path: path.Join(dir, "repo2"), remote: "https://github.com/argo-cd/test-repo2", createPath: true, addEmptyCommit: false})
}

// func TestRemoveStaleGitHeadLock(t *testing.T) {
// dir := t.TempDir()
// // create a fake repo structure with .git/HEAD.lock
// gitDir := path.Join(dir, ".git")
// require.NoError(t, os.MkdirAll(gitDir, 0o755))
// headLock := path.Join(gitDir, "HEAD.lock")
// require.NoError(t, os.WriteFile(headLock, []byte("test"), 0o644))

// // sanity: lock exists before
// _, err := os.Stat(headLock)
// require.NoError(t, err)

// removeStaleGitLock(dir)

// // lock should be gone after cleanup
// _, err = os.Stat(headLock)
// require.Error(t, err)
// require.True(t, os.IsNotExist(err))

// // calling again should be a no-op (no error)
// removeStaleGitLock(dir)
// }

// TestCheckoutRevisionCanGetNonstandardRefs shows that we can fetch a revision that points to a non-standard ref. In
// other words, we haven't regressed and caused this issue again: https://github.com/argoproj/argo-cd/issues/4935
func TestCheckoutRevisionCanGetNonstandardRefs(t *testing.T) {
Expand Down Expand Up @@ -3057,6 +3080,7 @@ func TestCheckoutRevisionPresentSkipFetch(t *testing.T) {

gitClient := &gitmocks.Client{}
gitClient.On("Init").Return(nil)
// gitClient.On("Root").Return("<repo-root>")
gitClient.On("IsRevisionPresent", revision).Return(true)
gitClient.On("Checkout", revision, mock.Anything).Return("", nil)

Expand All @@ -3069,6 +3093,7 @@ func TestCheckoutRevisionNotPresentCallFetch(t *testing.T) {

gitClient := &gitmocks.Client{}
gitClient.On("Init").Return(nil)
// gitClient.On("Root").Return("<repo-root>")
gitClient.On("IsRevisionPresent", revision).Return(false)
gitClient.On("Fetch", "").Return(nil)
gitClient.On("Checkout", revision, mock.Anything).Return("", nil)
Expand All @@ -3083,6 +3108,7 @@ func TestFetch(t *testing.T) {

gitClient := &gitmocks.Client{}
gitClient.On("Init").Return(nil)
// gitClient.On("Root").Return("<repo-root>")
gitClient.On("IsRevisionPresent", revision1).Once().Return(true)
gitClient.On("IsRevisionPresent", revision2).Once().Return(false)
gitClient.On("Fetch", "").Return(nil)
Expand Down
97 changes: 94 additions & 3 deletions util/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
Expand Down Expand Up @@ -183,17 +184,97 @@
args := strings.Join(cmd.Args, " ")
logCtx.WithFields(logrus.Fields{"dir": cmd.Dir}).Info(redactor(args))

// Best-effort cleanup of a stale HEAD.lock after the command finishes.
defer func() {
if cmd.Dir == "" {
return
}
lockPath := filepath.Join(cmd.Dir, ".git", "HEAD.lock")
logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Info("Checking HEAD.lock presence post-exec")
if _, err := os.Stat(lockPath); err == nil {
// Log and attempt removal; ignore ENOENT races
logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warn("HEAD.lock present post-exec, removing it")
if rmErr := os.Remove(lockPath); rmErr != nil && !os.IsNotExist(rmErr) {
logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warnf("Failed to remove HEAD.lock: %v", rmErr)
}
} else if !os.IsNotExist(err) {
logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Warnf("Failed to stat HEAD.lock: %v", err)
} else {
logCtx.WithFields(logrus.Fields{"headLockPath": lockPath}).Info("HEAD.lock not present post-exec")
}
}()

// Helper: debug whether HEAD.lock exists under the current working directory
logHeadLockStatus := func(where string) {
if cmd.Dir == "" {
return
}
lockPath := filepath.Join(cmd.Dir, ".git", "HEAD.lock")
fileInfo, statErr := os.Stat(lockPath)
exists := statErr == nil
fields := logrus.Fields{
"headLockPath": lockPath,
"headLockExists": exists,
"where": where,
}

pgid, pgErr := syscall.Getpgid(cmd.Process.Pid)
if pgErr == nil && pgid > 0 {
// Portable ps: list all processes, print needed columns without headers, then filter by PGID in Go.
out, _ := exec.Command(
"ps",
"-ax",
"-o", "pid=,ppid=,pgid=,etime=,comm=,args=",
).Output()
if len(out) > 0 {
var b strings.Builder
want := strconv.Itoa(pgid)
for _, line := range strings.Split(string(out), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
fieldsSlice := strings.Fields(line)
if len(fieldsSlice) < 3 {
continue
}
if fieldsSlice[2] == want {
b.WriteString(line)
b.WriteByte('\n')
}
}
fields["gitProcsInGroup"] = strings.TrimSpace(b.String())
}
}

if exists {
fields["headLockSize"] = fileInfo.Size()
fields["headLockMode"] = fileInfo.Mode().String()
fields["headLockModTime"] = fileInfo.ModTime()
fields["headLockIsDir"] = fileInfo.IsDir()

}

Check failure on line 256 in util/exec/exec.go

View workflow job for this annotation

GitHub Actions / Lint Go code

unnecessary trailing newline (whitespace)
logCtx.WithFields(fields).Info("HEAD.lock status")
}
var stdout bytes.Buffer
var stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr

// Configure the child to run in its own process group so we can signal the whole group on timeout/cancel.
// On Unix this sets Setpgid; on Windows this is a no-op.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = newSysProcAttr(true)
}

start := time.Now()
err = cmd.Start()
if err != nil {
return "", err
}

logHeadLockStatus("start-exec")

done := make(chan error)
go func() { done <- cmd.Wait() }()

Expand Down Expand Up @@ -229,14 +310,20 @@
// noinspection ALL
case <-timoutCh:
// send timeout signal
_ = cmd.Process.Signal(timeoutBehavior.Signal)
// signal the process group (negative PID) so children are terminated as well
if cmd.Process != nil {
_ = sysCallSignal(-cmd.Process.Pid, timeoutBehavior.Signal)
}
// wait on timeout signal and fallback to fatal timeout signal
if timeoutBehavior.ShouldWait {
select {
case <-done:
logHeadLockStatus("timeout-waited-done")
case <-fatalTimeoutCh:
// upgrades to SIGKILL if cmd does not respect SIGTERM
_ = cmd.Process.Signal(fatalTimeoutBehaviour)
// upgrades to fatal signal (default SIGKILL) if cmd does not respect the initial signal
if cmd.Process != nil {
_ = sysCallSignal(-cmd.Process.Pid, fatalTimeoutBehaviour)
}
// now original cmd should exit immediately after SIGKILL
<-done
// return error with a marker indicating that cmd exited only after fatal SIGKILL
Expand All @@ -245,6 +332,7 @@
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
logHeadLockStatus("fatal-timeout")
err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "")
logCtx.Error(err.Error())
return strings.TrimSuffix(output, "\n"), err
Expand All @@ -256,6 +344,7 @@
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
logHeadLockStatus("timeout")
err = newCmdError(redactor(args), fmt.Errorf("timeout after %v", timeout), "")
logCtx.Error(err.Error())
return strings.TrimSuffix(output, "\n"), err
Expand All @@ -270,6 +359,7 @@
if !opts.SkipErrorLogging {
logCtx.Error(err.Error())
}
logHeadLockStatus("done-error")
return strings.TrimSuffix(output, "\n"), err
}
}
Expand All @@ -278,6 +368,7 @@
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
logHeadLockStatus("done-success")

return strings.TrimSuffix(output, "\n"), nil
}
Expand Down
50 changes: 50 additions & 0 deletions util/exec/exec_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package exec

import (
"os"
"os/exec"
"path/filepath"
"regexp"
"syscall"
"testing"
Expand Down Expand Up @@ -215,3 +217,51 @@ func TestRunCaptureStderr(t *testing.T) {
assert.Equal(t, "hello world\nmy-error", output)
assert.NoError(t, err)
}

// This test demonstrates that when a process group is signaled, all child processes are also terminated and file locks released.
func TestProcessGroupSignalRemovesChildLock(t *testing.T) {
hook := test.NewGlobal()
log.SetLevel(log.DebugLevel)
defer log.SetLevel(log.InfoLevel)

dir := t.TempDir()
lockFile := filepath.Join(dir, "lockfile")
childScript := filepath.Join(dir, "child.sh")
parentScript := filepath.Join(dir, "parent.sh")

// Child: create lock file; on SIGTERM remove it and exit
child := "#!/bin/sh\n" +
"trap 'rm -f lockfile; exit 0' TERM\n" +
"touch lockfile\n" +
"sleep 100\n"
require.NoError(t, os.WriteFile(childScript, []byte(child), 0o755))

// Parent: start child in background and sleep
parent := "#!/bin/sh\n" +
"./child.sh &\n" +
"sleep 100\n"
require.NoError(t, os.WriteFile(parentScript, []byte(parent), 0o755))

// Run parent with a short timeout; our implementation signals the process group
opts := CmdOpts{
Timeout: 500 * time.Millisecond,
FatalTimeout: 500 * time.Millisecond,
TimeoutBehavior: TimeoutBehavior{
Signal: syscall.SIGTERM,
ShouldWait: true,
},
}
_, err := RunCommand("sh", opts, "-c", "cd "+dir+" && ./parent.sh")
require.Error(t, err)

// Give a bit of time for traps to run and for the process tree to settle
time.Sleep(200 * time.Millisecond)

// Because the process group was signaled, the child should have removed the lock
_, statErr := os.Stat(lockFile)
require.Error(t, statErr, "expected lock file to be removed when process group is signaled")
assert.True(t, os.IsNotExist(statErr))

// basic sanity: logs produced
require.GreaterOrEqual(t, len(hook.Entries), 1)
}
14 changes: 14 additions & 0 deletions util/exec/exec_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
//go:build !windows
// +build !windows

package exec

import "syscall"

func newSysProcAttr(setpgid bool) *syscall.SysProcAttr {
return &syscall.SysProcAttr{Setpgid: setpgid}
}

func sysCallSignal(pid int, sig syscall.Signal) error {
return syscall.Kill(pid, sig)
}
10 changes: 10 additions & 0 deletions util/exec/exec_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
//go:build windows
// +build windows

package exec

import "syscall"

func newSysProcAttr(_ bool) *syscall.SysProcAttr { return &syscall.SysProcAttr{} }

func sysCallSignal(_ int, _ syscall.Signal) error { return nil }
Loading