Skip to content

Commit fdda390

Browse files
committed
feat: add git files shas collection capabilities
1 parent 18e92d2 commit fdda390

File tree

7 files changed

+404
-29
lines changed

7 files changed

+404
-29
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ It supports 3 main on premise version control service:
88

99
* GitHub Enterprise
1010
* Gitlab CE and EE
11-
* BitBucket
11+
* BitBucket (not supported yet)
1212

1313
## Use the package
1414

@@ -31,6 +31,12 @@ It supports 3 main on premise version control service:
3131
./dna-collector -provider-url http://gitlab.example.com gitlab Groupe
3232
```
3333

34+
## Some examples
35+
36+
* Don't forget to build the package: `go build ./cmd/dna-collector`
37+
38+
1. Export the SHAs of all files from a GitHub organization to a file, with verbose logging: `./dna-collector -verbose -output file_shas_collected_dna.json github GitGuardian`
39+
2. Named flags must be passed before positional arguments.
3440
## Architecture
3541
3642
### Main overview

cmd/dna-collector/main.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ type authorInfo struct {
4141
LastCommitDate time.Time
4242
}
4343

44+
type AugmentedGitFile struct {
45+
46+
RepositoryName string `json:"repository_name"`
47+
dnacollector.GitFile
48+
}
49+
4450
func main() {
4551
var (
4652
verbose = flag.Bool("verbose", false, "set to add verbose logging")
@@ -123,9 +129,11 @@ func main() {
123129
doneRepo int
124130
authors map[string]*authorInfo
125131
commitsCount int
132+
gitFilesCount int
126133
)
127134

128135
authors = make(map[string]*authorInfo)
136+
gitFilesArray := make([]*AugmentedGitFile, 0)
129137
loop:
130138
for {
131139
select {
@@ -141,7 +149,7 @@ loop:
141149
if typedEvent.Finished {
142150
doneRepo++
143151
}
144-
case dnacollector.ResultPipelineEvent:
152+
case dnacollector.ResultCommitPipelineEvent:
145153
commitsCount++
146154

147155
identity := typedEvent.Author.Name + typedEvent.Author.Email
@@ -155,6 +163,11 @@ loop:
155163
if commit.Author.When.UTC().After(authors[identity].LastCommitDate) {
156164
authors[identity].LastCommitDate = commit.Author.When.UTC()
157165
}
166+
// Collecting gitFiles
167+
case dnacollector.ResultGitFilePipelineEvent:
168+
gitFilesCount++
169+
gitFilesArray = append(gitFilesArray, &AugmentedGitFile{typedEvent.Repository.GetName(), *typedEvent.GitFile})
170+
158171
}
159172
case <-ticker:
160173
if totalRepo == 0 {
@@ -163,28 +176,29 @@ loop:
163176

164177
log.Infof("%v/%v repos: ", doneRepo, totalRepo)
165178
log.Infof("%v distinct authors, %v commit analyzed\n", len(authors), commitsCount)
179+
log.Infof("%v files analyzed\n", gitFilesCount)
166180
}
167181
}
168182

169183
log.Infoln("Final stats:")
170184
log.Infof("%v/%v repos: ", doneRepo, totalRepo)
171185
log.Infof("%v distinct authors, %v commit analyzed\n", len(authors), commitsCount)
172-
186+
log.Infof("%v git files analyzed\n", gitFilesCount)
173187
log.Infof("Dumping to output %v", *outputFilename)
174188

175-
authorsList := make([]*authorInfo, 0, len(authors))
176-
for _, author := range authors {
177-
authorsList = append(authorsList, author)
178-
}
189+
//authorsList := make([]*authorInfo, 0, len(authors))
190+
//for _, author := range authors {
191+
// authorsList = append(authorsList, author)
192+
//}
179193

180194
var (
181195
jsonBytes []byte
182196
err error
183197
)
184198
if *prettyPrint {
185-
jsonBytes, err = json.MarshalIndent(authorsList, "", "\t")
199+
jsonBytes, err = json.MarshalIndent(gitFilesArray, "", "\t")
186200
} else {
187-
jsonBytes, err = json.Marshal(authorsList)
201+
jsonBytes, err = json.Marshal(gitFilesArray)
188202
}
189203

190204
if err != nil {

cmd/test2/main.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package main
2+
3+
import (
4+
"dnacollector"
5+
"github.com/caarlos0/env"
6+
log "github.com/sirupsen/logrus"
7+
git2 "gopkg.in/src-d/go-git.v4"
8+
)
9+
10+
type config struct {
11+
GithubToken string `env:"GITHUB_TOKEN"`
12+
GitlabToken string `env:"GITLAB_TOKEN"`
13+
}
14+
15+
func main() {
16+
conf := config{}
17+
if err := env.Parse(&conf); err != nil {
18+
log.Fatalf("Could not parse env: %v\n", err)
19+
}
20+
21+
// Config log
22+
log.SetFormatter(&log.TextFormatter{
23+
DisableColors: true,
24+
FullTimestamp: true,
25+
TimestampFormat: "Jan _2 15:04:05.000000000",
26+
})
27+
log.SetReportCaller(true)
28+
log.SetLevel(log.InfoLevel)
29+
//var cloner dnacollector.Cloner = &dnacollector.DiskCloner{}
30+
//auth := &http.BasicAuth{
31+
// Username: "ericfourrier",
32+
// Password: conf.GithubToken,
33+
//}
34+
35+
//repository, err := cloner.CloneRepository("https://github.com/uber/cadence.git", auth)
36+
//if err != nil {
37+
// log.Panic(err)
38+
//
39+
//}
40+
repository, err := git2.PlainOpen("/Users/ericfourrier/Documents/GGCode/dna-collector/testdata/cadence")
41+
if err != nil {
42+
log.Panic(err)
43+
}
44+
extractor := dnacollector.NewFastExtractor()
45+
extractor.Run(repository)
46+
arrGitFiles := make([]*dnacollector.GitFile, 0)
47+
for gitFile := range extractor.ChanGitFiles {
48+
arrGitFiles = append(arrGitFiles, gitFile)
49+
log.Debug(gitFile)
50+
51+
}
52+
log.Infof("length of files collected %d", len(arrGitFiles))
53+
}

extractor.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
package dnacollector
22

33
import (
4+
"bufio"
5+
"encoding/json"
6+
"errors"
7+
log "github.com/sirupsen/logrus"
8+
"gopkg.in/src-d/go-billy.v4/helper/chroot"
49
git "gopkg.in/src-d/go-git.v4"
510
"gopkg.in/src-d/go-git.v4/plumbing/object"
11+
"gopkg.in/src-d/go-git.v4/storage/filesystem"
12+
"os"
13+
"os/exec"
614
)
715

816
// Extractor extracts commits for a given repository
@@ -26,3 +34,115 @@ func NewExtractor(repository *git.Repository) (*Extractor, error) {
2634
func (e *Extractor) ExtractNextCommit() (*object.Commit, error) {
2735
return e.iter.Next()
2836
}
37+
38+
// FastExtractor will directly extract the information without using an Analyzer
39+
// It is designed to use raw git commands to get what is needed
40+
41+
func GetBasePathGoGitRepo(r *git.Repository) (string, error) {
42+
//
43+
// Try to grab the repository Storer
44+
s, ok := r.Storer.(*filesystem.Storage)
45+
if !ok {
46+
return "", errors.New("Repository storage is not filesystem.Storage")
47+
}
48+
49+
// Try to get the underlying billy.Filesystem
50+
fs, ok := s.Filesystem().(*chroot.ChrootHelper)
51+
if !ok {
52+
return "", errors.New("Filesystem is not chroot.ChrootHelper")
53+
}
54+
55+
return fs.Root(), nil
56+
}
57+
58+
//type BaseGitFileIterator struct {
59+
// repository *git.Repository
60+
// filesChan chan *GitFile
61+
//}
62+
//
63+
//func (i *BaseGitFileIterator) Compute(repository *git.Repositor) error {
64+
//
65+
//}
66+
//
67+
//func (i *BaseGitFileIterator) Close() error {
68+
//
69+
//}
70+
71+
// BaseExtractor is the minimal iterator contract for extractors: each call to
// Next yields one extracted item, or an error.
type BaseExtractor interface {
	// Next returns the next extracted item together with any error met while
	// producing it.
	Next() (interface{}, error)
}
74+
75+
76+
77+
// GitFile describes one git object as reported by the extraction command.
// Every field is kept as the raw string decoded from that command's JSON
// output.
type GitFile struct {
	// Sha is the object name (hash) of the object.
	Sha string `json:"sha"`
	// Type is the git object type, e.g. "blob".
	Type string `json:"type"`
	// Filepath is the path git reported for the object.
	Filepath string `json:"filepath"`
	// Size is the object size, carried as a string.
	Size string `json:"size"`
}
83+
84+
85+
func NewFastExtractor() *FastExtractor {
86+
return &FastExtractor{make(chan *GitFile)}
87+
}
88+
type FastExtractor struct{
89+
ChanGitFiles chan *GitFile
90+
}
91+
92+
func (fe *FastExtractor) Run(repository *git.Repository) chan *GitFile{
93+
// https://gist.github.com/ochinchina/9e409a88e77c3cfd94c3
94+
path, err := GetBasePathGoGitRepo(repository)
95+
if err != nil {
96+
log.Fatal(err)
97+
}
98+
err = os.Chdir(path)
99+
if err != nil {
100+
log.Fatal(err)
101+
}
102+
log.Infof("Extracting commits from path %s", path)
103+
cmdBase := "git rev-list --objects --all | git cat-file --batch-check='{\"sha\": \"%(objectname)\", \"type\": \"%(objecttype)\", \"filepath\": \"%(rest)\", \"size\": \"%(objectsize:disk)\"}' | grep '\"type\": \"blob\"'"
104+
cmd := exec.Command("bash", "-c", cmdBase)
105+
stdout, err := cmd.StdoutPipe()
106+
if err != nil {
107+
log.Fatal(err)
108+
}
109+
err = cmd.Start()
110+
if err != nil {
111+
log.Fatal(err)
112+
}
113+
buf := bufio.NewReader(stdout) // Notice that this is not in a loop
114+
num := 0
115+
go func() {
116+
for {
117+
line, _, _ := buf.ReadLine()
118+
if len(line) == 0 {
119+
log.Info("finish reading all files from stdout from git")
120+
break
121+
}
122+
num += 1
123+
log.Debugf("parsing line %s", line)
124+
var gitFile GitFile
125+
json.Unmarshal(line, &gitFile)
126+
fe.ChanGitFiles <- &gitFile
127+
128+
}
129+
130+
close(fe.ChanGitFiles)
131+
log.Info("channel is closed")
132+
log.Infof("finishing iterating over files, we have collected %d files", num)
133+
134+
}()
135+
return fe.ChanGitFiles
136+
}
137+
138+
139+
140+
//targetFile := fe.BaseDirStorage + "/" + "sha_files.jsonl"
141+
//log.Infof("Saving to file %s", targetFile)
142+
//dataFile, err := os.Create(targetFile)
143+
//if err != nil {
144+
// log.Error(err)
145+
//}
146+
//defer dataFile.Close()
147+
//_, err = dataFile.WriteString(out.String())
148+
//return err

0 commit comments

Comments
 (0)