Skip to content

Commit f3aa252

Browse files
committed
feat(collect): Add timeout per object (-u) to the collect command
1 parent 909933b commit f3aa252

File tree

15 files changed

+239
-80
lines changed

15 files changed

+239
-80
lines changed

.github/workflows/test.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@ jobs:
99
name: Lint
1010
runs-on: ubuntu-latest
1111
steps:
12-
- uses: actions/checkout@v2
12+
- uses: actions/setup-go@v3
13+
with:
14+
go-version: 1.16
15+
- uses: actions/checkout@v3
1316
- name: golangci-lint
14-
uses: golangci/golangci-lint-action@v2
17+
uses: golangci/golangci-lint-action@v3
1518
with:
1619
# Optional: version of golangci-lint to use in form of v1.2 or v1.2.3 or `latest` to use the latest version
17-
version: v1.43
20+
version: v1.45
1821
build:
1922
name: Build & Test
2023
runs-on: ubuntu-latest
@@ -77,6 +80,7 @@ jobs:
7780
GH_INTEGRATION_TESTS_TOKEN: ${{ secrets.GH_INTEGRATION_TESTS_TOKEN }}
7881
BITBUCKET_INTEGRATION_TESTS_TOKEN: ${{ secrets.BITBUCKET_INTEGRATION_TESTS_TOKEN }}
7982
BITBUCKET_INTEGRATION_TESTS_URL: ${{ secrets.BITBUCKET_INTEGRATION_TESTS_URL }}
83+
REPOSITORY_TRIGGERING_TIMEOUT: ${{ secrets.REPOSITORY_TRIGGERING_TIMEOUT }}
8084
run: |
8185
python3 -m pip install pytest
8286
pytest tests

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ The default format is `gzip-jsonl` to minimize the size of the output file.
103103
The default output filepath is `./fingerprints.jsonl.gz`. Use `--output` to override this behavior.
104104
Also, note that if you were to download fingerprints for repositories of a big organization, `src-fingerprint` has a limit to process no more than 100
105105
repositories. You can override this limit with the option `--limit`, a limit of 0 will process all repos of the organization.
106-
Note that if multiple organizations are passed, the limit is applied to each one independently.
106+
Note that if multiple organizations are passed, the limit is applied to each one independently.
107+
There is no default timeout; one can be set with the option `--timeout`. Like the limit, it is applied to each source independently.
107108

108109
### Sample output
109110

cloner/clone_repository.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package cloner
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"errors"
7+
"os"
8+
"os/exec"
9+
"strings"
10+
11+
"github.com/shirou/gopsutil/v3/process"
12+
13+
log "github.com/sirupsen/logrus"
14+
)
15+
16+
func terminateProcessAndChildren(pid int32) error {
17+
var (
18+
proc *process.Process
19+
children []*process.Process
20+
err error
21+
)
22+
23+
proc, err = process.NewProcess(pid)
24+
if err != nil {
25+
return err
26+
}
27+
28+
children, err = proc.Children()
29+
if err != nil {
30+
log.WithError(err).Warnf("could not get children of pid#%v", pid)
31+
32+
children = []*process.Process{}
33+
}
34+
35+
err = proc.Terminate()
36+
if err != nil {
37+
return err
38+
}
39+
40+
for _, child := range children {
41+
err = terminateProcessAndChildren(child.Pid)
42+
if err != nil {
43+
log.WithError(err).Warnf("could not terminate pid#%v", pid)
44+
}
45+
}
46+
47+
return nil
48+
}
49+
50+
func cloneGitRepository(ctx context.Context, destDir, gitRepoURL string) error {
51+
var outbuf, errbuf bytes.Buffer
52+
// git clone github.com/author/name.git /tmp/workdir/author-name/clone
53+
cmd := exec.Command("git", "clone", gitRepoURL, destDir)
54+
cmd.Stdout = &outbuf
55+
cmd.Stderr = &errbuf
56+
57+
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
58+
59+
go func() {
60+
<-ctx.Done()
61+
62+
if cmd.Process != nil && (cmd.ProcessState == nil || !cmd.ProcessState.Exited()) {
63+
if err := terminateProcessAndChildren(int32(cmd.Process.Pid)); err != nil {
64+
log.WithError(err).Error("Could not terminate git")
65+
}
66+
}
67+
}()
68+
69+
err := cmd.Run()
70+
if exitError, ok := err.(*exec.ExitError); ok {
71+
stderr := strings.TrimSpace(errbuf.String())
72+
73+
if exitError.ExitCode() == gitExitUnclean {
74+
log.WithError(err).WithFields(log.Fields{
75+
"op": "gitError",
76+
"stderr": stderr,
77+
}).Warnf("missing repo")
78+
} else if ctx.Err() != nil {
79+
log.Errorf("timeout reached while cloning the repository")
80+
} else {
81+
log.WithError(err).WithFields(log.Fields{
82+
"op": "gitError",
83+
"stderr": stderr,
84+
}).Errorf("unhandled git error")
85+
}
86+
87+
return errors.New("")
88+
}
89+
90+
return err
91+
}

cloner/cloner.go

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
package cloner
22

33
import (
4-
"bytes"
5-
"errors"
4+
"context"
65
"os"
7-
"os/exec"
86
"path/filepath"
9-
"strings"
107

118
log "github.com/sirupsen/logrus"
129
)
@@ -15,7 +12,7 @@ const gitExitUnclean = 128
1512

1613
// Cloner represents a cloner of git repository.
1714
type Cloner interface {
18-
CloneRepository(url string) (string, error)
15+
CloneRepository(ctx context.Context, url string) (string, error)
1916
}
2017

2118
// DiskCloner clones a git repository on disk in a temporary directory.
@@ -52,48 +49,17 @@ func NewDiskCloner(baseDir string) *DiskCloner {
5249
}
5350

5451
// CloneRepository clones a git repository given its information.
55-
func (d *DiskCloner) CloneRepository(url string) (string, error) {
52+
func (d *DiskCloner) CloneRepository(ctx context.Context, url string) (string, error) {
5653
tmpDir, err := os.MkdirTemp(d.BaseDir, "srcfingerprint-")
5754
if err != nil {
5855
return "", err
5956
}
6057

61-
if err := cloneGitRepository(tmpDir, url); err != nil {
58+
if err := cloneGitRepository(ctx, tmpDir, url); err != nil {
6259
os.RemoveAll(tmpDir)
6360

6461
return "", err
6562
}
6663

6764
return tmpDir, nil
6865
}
69-
70-
func cloneGitRepository(destDir, gitRepoURL string) error {
71-
var outbuf, errbuf bytes.Buffer
72-
// git clone github.com/author/name.git /tmp/workdir/author-name/clone
73-
cmd := exec.Command("git", "clone", gitRepoURL, destDir)
74-
cmd.Stdout = &outbuf
75-
cmd.Stderr = &errbuf
76-
77-
cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0")
78-
79-
err := cmd.Run()
80-
if exitError, ok := err.(*exec.ExitError); ok {
81-
stderr := strings.TrimSpace(errbuf.String())
82-
83-
if exitError.ExitCode() == gitExitUnclean {
84-
log.WithError(err).WithFields(log.Fields{
85-
"op": "gitError",
86-
"stderr": stderr,
87-
}).Warnf("missing repo")
88-
} else {
89-
log.WithError(err).WithFields(log.Fields{
90-
"op": "gitError",
91-
"stderr": stderr,
92-
}).Errorf("unhandled git error")
93-
}
94-
95-
return errors.New("")
96-
}
97-
98-
return err
99-
}

cmd/src-fingerprint/main.go

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ func runExtract(
2525
pipeline *srcfingerprint.Pipeline,
2626
objects []string,
2727
after string,
28-
limit int) chan srcfingerprint.PipelineEvent {
28+
limit int,
29+
timeout time.Duration) chan srcfingerprint.PipelineEvent {
2930
// If there is no object, default to an empty object
3031
if len(objects) == 0 {
3132
objects = []string{""}
@@ -38,7 +39,7 @@ func runExtract(
3839
defer close(eventChannel)
3940

4041
for _, object := range objects {
41-
pipeline.ExtractRepositories(object, after, eventChannel, limit)
42+
pipeline.ExtractRepositories(object, after, eventChannel, limit, timeout)
4243
}
4344
}(ch)
4445

@@ -84,6 +85,7 @@ type authorInfo struct {
8485

8586
const DefaultClonerN = 8
8687
const DefaultLimit = 100
88+
const DefaultTimeout = 0
8789

8890
func main() {
8991
cli.VersionFlag = &cli.BoolFlag{
@@ -203,6 +205,11 @@ func main() {
203205
Usage: "Maximum number of repositories to analyze (0 for unlimited). " +
204206
"The limit is applied for independently to each object.",
205207
},
208+
&cli.DurationFlag{
209+
Name: "timeout",
210+
Value: DefaultTimeout,
211+
Usage: "Maximum time to process each object (0 for unlimited, min. 1s).",
212+
},
206213
},
207214
},
208215
},
@@ -267,6 +274,12 @@ func collectAction(c *cli.Context) error {
267274
}
268275
}()
269276

277+
timeout := c.Duration("timeout")
278+
if timeout != 0 && timeout < time.Second {
279+
log.Error("timeout must be 0 or >= 1")
280+
cli.ShowAppHelpAndExit(c, 1)
281+
}
282+
270283
srcProvider, err := getProvider(c.String("provider"), c.String("token"), providerOptions)
271284
if err != nil {
272285
log.Errorln(err)
@@ -288,7 +301,13 @@ func collectAction(c *cli.Context) error {
288301

289302
ticker := time.Tick(1 * time.Second)
290303

291-
eventChannel := runExtract(&pipeline, c.StringSlice("object"), c.String("after"), c.Int("limit"))
304+
eventChannel := runExtract(
305+
&pipeline,
306+
c.StringSlice("object"),
307+
c.String("after"),
308+
c.Int("limit"),
309+
timeout,
310+
)
292311

293312
// runtime stats
294313
var (

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module srcfingerprint
22

3-
go 1.15
3+
go 1.16
44

55
require (
66
github.com/Microsoft/go-winio v0.5.0 // indirect
@@ -13,16 +13,16 @@ require (
1313
github.com/kevinburke/ssh_config v1.1.0 // indirect
1414
github.com/russross/blackfriday/v2 v2.1.0 // indirect
1515
github.com/sergi/go-diff v1.2.0 // indirect
16+
github.com/shirou/gopsutil/v3 v3.22.4 // indirect
1617
github.com/sirupsen/logrus v1.8.1
17-
github.com/stretchr/testify v1.7.0
18+
github.com/stretchr/testify v1.7.1
1819
github.com/suhaibmujahid/go-bitbucket-server v0.1.0
1920
github.com/urfave/cli/v2 v2.3.0
2021
github.com/xanzy/go-gitlab v0.50.1
2122
github.com/xanzy/ssh-agent v0.3.0 // indirect
2223
golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e // indirect
2324
golang.org/x/net v0.0.0-20210614182718-04defd469f4e // indirect
2425
golang.org/x/oauth2 v0.0.0-20210628180205-a41e5a781914
25-
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect
2626
golang.org/x/time v0.0.0-20210611083556-38a9dc6acbc6 // indirect
2727
google.golang.org/appengine v1.6.7 // indirect
2828
google.golang.org/protobuf v1.27.1 // indirect

0 commit comments

Comments
 (0)