Skip to content

Commit 18e92d2

Browse files
committed
feat: first commit
1 parent 0e0b789 commit 18e92d2

File tree

16 files changed

+1772
-0
lines changed

16 files changed

+1772
-0
lines changed

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
GITHUB_TOKEN=------FILL ME-------

.gometalinter.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"Enable": ["goimports","gosec", "staticcheck"]
3+
}

README.md

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# git-users-extractor
2+
3+
## Introduction
4+
5+
The purpose of this package is to extract the git user information of all your developers from your hosted version control system.
6+
7+
It supports three main on-premise version control services:
8+
9+
* GitHub Enterprise
10+
* Gitlab CE and EE
11+
* BitBucket
12+
13+
## Use the package
14+
15+
* Build binary
16+
```sh
17+
go build ./cmd/dna-collector
18+
```
19+
* Set env var `GITHUB_TOKEN` or `GITLAB_TOKEN`
20+
```sh
21+
export GITHUB_TOKEN="<token>"
22+
export GITLAB_TOKEN="<token>"
23+
```
24+
* Run and read doc
25+
```sh
26+
./dna-collector -help
27+
```
28+
* Run on a given user/group
29+
```sh
30+
./dna-collector github Uber
31+
./dna-collector -provider-url http://gitlab.example.com gitlab Groupe
32+
```
33+
34+
## Architecture
35+
36+
### Main overview
37+
All the git information can be found inside commits, which are located inside git repositories.
38+
The main steps are the following:
39+
* Collect all repositories URL from the company.
40+
* Clone them with the appropriate authentication.
41+
* Iterate over the commits of each clone to extract git config information.
42+
* Store this information in a json file.
43+
44+
### Implementation
45+
The root package is the abstract implementation of the extractor. It contains a Cloner, that clones a git repository.
46+
It contains a Pipeline that extracts git information for every commit, of every repository of an organization.
47+
48+
The github package contains the implementation of the Github Provider.
49+
The gitlab package contains the implementation of the Gitlab Provider.
50+
51+
The cmd/dna-collector package contains the binary code. It reads the configuration from the CLI and the environment and runs the Pipeline on an organization.
52+
53+
### Library we use
54+
55+
#### Providers
56+
* GitHub: "github.com/google/go-github/v18/github"
57+
* Gitlab go wrapper: "github.com/xanzy/go-gitlab"
58+
* bitbucket not supported yet
59+
60+
#### Cloning
61+
* go-git: https://github.com/src-d/go-git
62+
63+
64+
### Issues
65+
* Repository size does not seem to work with the go-gitlab wrapper.
66+
* Channels are cheap. Complex design overloading semantics isn't.
67+
68+
69+
### Notes eric
70+
71+
* We want to add more extractors -> We need to extract file shas
72+
* We will add more analyzers -> From the commits we want to get commit shas

analyzer.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package dnacollector
2+
3+
import (
4+
git "gopkg.in/src-d/go-git.v4/plumbing/object"
5+
)
6+
7+
// Analyzer analyzes a commit to extract its author and committer.
8+
type Analyzer struct {
9+
}
10+
11+
// AnalyzeCommit extracts author and commiter from a commit
12+
func (a *Analyzer) AnalyzeCommit(commit *git.Commit) (author git.Signature, commiter git.Signature) {
13+
return commit.Author, commit.Committer
14+
}

analyzer_test.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package dnacollector_test
2+
3+
import (
4+
"dnacollector"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/suite"
9+
"gopkg.in/src-d/go-git.v4/plumbing/object"
10+
)
11+
12+
// AnalyzerTestSuite groups the tests exercising dnacollector.Analyzer.
type AnalyzerTestSuite struct {
13+
suite.Suite
14+
}
15+
16+
// TestAnalyzeCommit checks that AnalyzeCommit returns the author and committer
// signatures stored on the commit it is given.
// NOTE(review): the fixture signatures below set only Name while the expected
// values also carry an Email — confirm the fixture matches the assertions.
func (suite *AnalyzerTestSuite) TestAnalyzeCommit() {
17+
analyzer := dnacollector.Analyzer{}
18+
commit := &object.Commit{
19+
Author: object.Signature{
20+
Name: "Author",
21+
22+
},
23+
Committer: object.Signature{
24+
Name: "Committer",
25+
26+
},
27+
}
28+
29+
author, committer := analyzer.AnalyzeCommit(commit)
30+
31+
assert.Equal(suite.T(), object.Signature{Name: "Author", Email: "[email protected]"}, author)
32+
assert.Equal(suite.T(), object.Signature{Name: "Committer", Email: "[email protected]"}, committer)
33+
}
34+
35+
func TestAnalyzer(t *testing.T) {
36+
suite.Run(t, new(AnalyzerTestSuite))
37+
}

cloner.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package dnacollector
2+
3+
import (
4+
"io/ioutil"
5+
6+
"gopkg.in/src-d/go-billy.v4/osfs"
7+
git "gopkg.in/src-d/go-git.v4"
8+
"gopkg.in/src-d/go-git.v4/plumbing/cache"
9+
"gopkg.in/src-d/go-git.v4/plumbing/transport"
10+
"gopkg.in/src-d/go-git.v4/storage/filesystem"
11+
"gopkg.in/src-d/go-git.v4/storage/memory"
12+
)
13+
14+
// Cloner is implemented by types that can clone a git repository.
15+
type Cloner interface {
16+
// CloneRepository clones the repository found at url, authenticating with auth.
CloneRepository(url string, auth transport.AuthMethod) (*git.Repository, error)
17+
}
18+
19+
// MemoryCloner clones a git repository in memory
20+
type MemoryCloner struct{}
21+
22+
// CloneRepository clones a git repository given its information
23+
func (*MemoryCloner) CloneRepository(url string, auth transport.AuthMethod) (*git.Repository, error) {
24+
return git.Clone(memory.NewStorage(), nil, &git.CloneOptions{
25+
URL: url,
26+
Progress: ioutil.Discard,
27+
Auth: auth,
28+
})
29+
}
30+
31+
// DiskCloner closes a git repository on disk in a temporary file
32+
type DiskCloner struct{}
33+
34+
// CloneRepository clones a git repository given its information
35+
func (*DiskCloner) CloneRepository(url string, auth transport.AuthMethod) (*git.Repository, error) {
36+
tmpDir, err := ioutil.TempDir("", "fs-")
37+
if err != nil {
38+
return nil, err
39+
}
40+
fs := osfs.New(tmpDir)
41+
return git.Clone(filesystem.NewStorage(fs, cache.NewObjectLRUDefault()), nil, &git.CloneOptions{
42+
URL: url,
43+
Progress: ioutil.Discard,
44+
Auth: auth,
45+
})
46+
}

cmd/dna-collector-test/main.go

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
package main
2+
3+
import (
4+
"dnacollector"
5+
"encoding/json"
6+
"errors"
7+
"fmt"
8+
"github.com/caarlos0/env"
9+
log "github.com/sirupsen/logrus"
10+
git2 "gopkg.in/src-d/go-git.v4"
11+
"gopkg.in/src-d/go-git.v4/plumbing/format/diff"
12+
git "gopkg.in/src-d/go-git.v4/plumbing/object"
13+
"io"
14+
)
15+
16+
// config holds the provider tokens, each mapped to an environment variable
// through its `env` tag.
type config struct {
17+
GithubToken string `env:"GITHUB_TOKEN"`
18+
GitlabToken string `env:"GITLAB_TOKEN"`
19+
}
20+
21+
var (
22+
// ErrFileSimplifiedCreation is the error returned when a GitFileSimplified cannot be built from a file patch
23+
ErrFileSimplifiedCreation = errors.New("we could not instantiate GitFileSimplified from GitFile")
24+
)
25+
26+
// GitFileSimplified is the JSON-serializable summary of a file recorded for a commit.
type GitFileSimplified struct {
27+
// Name is the file name or path, depending on the constructor used.
Name string `json:"name"`
28+
// Sha is the string form of the file's hash.
Sha string `json:"sha"`
29+
IsBinary bool `json:"is_binary"`
30+
// Size is left at 0 when the value is built by NewFromFilePatch.
Size int64 `json:"size"`
31+
}
32+
33+
// CommitSimplified is the JSON-serializable summary of a commit and of the
// files recorded for it.
type CommitSimplified struct {
34+
Message string `json:"message"`
35+
// Sha is the string form of the commit hash.
Sha string `json:"sha"`
36+
Author git.Signature `json:"author"`
37+
Committer git.Signature `json:"committer"`
38+
Files []*GitFileSimplified `json:"files"`
39+
}
40+
41+
func NewFromGitFile(file *git.File) *GitFileSimplified {
42+
isBinary, _ := file.IsBinary()
43+
return &GitFileSimplified{Name: file.Name, Sha: file.Hash.String(), IsBinary: isBinary, Size: file.Size}
44+
}
45+
46+
func NewFromCommit(commit *git.Commit, files []*GitFileSimplified) *CommitSimplified {
47+
return &CommitSimplified{Message: commit.Message, Sha: commit.Hash.String(), Author: commit.Author, Committer: commit.Committer, Files: files}
48+
}
49+
50+
func NewFromFilePatch(filePatch diff.FilePatch) (*GitFileSimplified, error) {
51+
isBinary := filePatch.IsBinary()
52+
from, to := filePatch.Files()
53+
// If the patch creates a new file, "from" will be nil.
54+
// If the patch deletes a file, "to" will be nil.
55+
56+
// Rare usecase
57+
if to == nil && from == nil {
58+
return nil, ErrFileSimplifiedCreation
59+
} else if to != nil {
60+
// File creation
61+
return &GitFileSimplified{Name: to.Path(), Sha: to.Hash().String(), IsBinary: isBinary, Size: 0}, nil
62+
} else {
63+
// File deletion
64+
return &GitFileSimplified{Name: from.Path(), Sha: from.Hash().String(), IsBinary: isBinary, Size: 0}, nil
65+
}
66+
}
67+
68+
func NewAnalyzer() *Analyzer {
69+
return &Analyzer{make([]*CommitSimplified, 0)}
70+
}
71+
72+
// Analyzer accumulates a simplified record of every commit passed to AnalyzeCommit.
type Analyzer struct {
73+
// CommitsList holds the recorded commits in the order they were analyzed.
CommitsList []*CommitSimplified
74+
}
75+
76+
func (a *Analyzer) GetFilesFromCommit(commit *git.Commit) ([]*GitFileSimplified, error) {
77+
var files []*GitFileSimplified
78+
79+
parent, err := commit.Parent(0)
80+
81+
// There is no parent, so we take all the files
82+
if err != nil {
83+
filesIter, err := commit.Files()
84+
if err != nil {
85+
return nil, err
86+
}
87+
88+
filesIter.ForEach(func(file *git.File) error {
89+
fileSimplified := NewFromGitFile(file)
90+
log.Debugf("Appending file %s", fileSimplified.Name)
91+
if fileSimplified.Size > 0 {
92+
/* fileSimplifiedJson, _ := json.Marshal(fileSimplified)*/
93+
log.Info(fileSimplified)
94+
}
95+
96+
files = append(files, fileSimplified)
97+
return nil
98+
})
99+
// There is a parent, so we consider only the diff
100+
} else {
101+
patch, _ := commit.Patch(parent)
102+
filePatches := patch.FilePatches()
103+
//log.Info(patch.Stats())
104+
for _, fp := range filePatches {
105+
fileSimplified, err := NewFromFilePatch(fp)
106+
//for _, chunk := range fp.Chunks() {
107+
// log.Debug(chunk)
108+
//}
109+
log.Debugf("Appending file %s", fileSimplified.Name)
110+
if err != nil {
111+
log.Warn(fileSimplified)
112+
files = append(files, fileSimplified)
113+
} else {
114+
continue
115+
//log.Error(ErrFileSimplifiedCreation)
116+
//log.Warn(commit)
117+
//log.Warn(fp)
118+
119+
}
120+
}
121+
}
122+
123+
return files, nil
124+
}
125+
126+
// AnalyzeCommit extracts author and committer from a commit
127+
func (a *Analyzer) AnalyzeCommit(commit *git.Commit) string {
128+
// Store commmit sha
129+
files, _ := a.GetFilesFromCommit(commit)
130+
a.CommitsList = append(a.CommitsList, NewFromCommit(commit, files))
131+
return commit.Hash.String()
132+
}
133+
134+
func (a *Analyzer) GetStats() map[string]int {
135+
res := make(map[string]int)
136+
res["nb_commits"] = len(a.CommitsList)
137+
nb_files_shas := 0
138+
for _, commit := range a.CommitsList {
139+
nb_files_shas += len(commit.Files)
140+
}
141+
res["nb_files_shas"] = nb_files_shas
142+
return res
143+
}
144+
145+
//func (a *Analyzer) GetCommitShasArr() []string {
146+
// var res []string
147+
// for k := range a.SetCommitsSha {
148+
// res = append(res, k)
149+
// }
150+
// return res
151+
//}
152+
153+
func main() {
154+
conf := config{}
155+
156+
// Config log
157+
log.SetFormatter(&log.TextFormatter{
158+
DisableColors: true,
159+
FullTimestamp: true,
160+
})
161+
log.SetReportCaller(true)
162+
log.SetLevel(log.InfoLevel)
163+
164+
if err := env.Parse(&conf); err != nil {
165+
log.Fatalf("Could not parse env: %v\n", err)
166+
}
167+
log.Debug(conf)
168+
//var cloner dnacollector.Cloner = &dnacollector.MemoryCloner{}
169+
//auth := &http.BasicAuth{
170+
// Username: "ericfourrier",
171+
// Password: conf.GithubToken,
172+
//}
173+
174+
repository, err := git2.PlainOpen("/Users/ericfourrier/Documents/GGCode/dna-collector/testdata/react-vis")
175+
if err != nil {
176+
fmt.Print(err)
177+
}
178+
repository.Config()
179+
//log.Infof("Cloned repo %v (size: %v)\n", repository.n, repository.GetStorageSize())
180+
extractor, err := dnacollector.NewExtractor(repository)
181+
analyzer := NewAnalyzer()
182+
for {
183+
commit, err := extractor.ExtractNextCommit()
184+
if err != nil && err != io.EOF {
185+
log.Panic(err)
186+
}
187+
if commit == nil {
188+
break
189+
}
190+
191+
analyzer.AnalyzeCommit(commit)
192+
}
193+
res2, _ := json.Marshal(analyzer.CommitsList)
194+
log.Debug(string(res2))
195+
//fmt.Print(analyzer.SetCommitsSha)
196+
//for _, files := range analyzer.CommitTable {
197+
// for _, file := range files {
198+
// log.Info(file.Sha)
199+
// }
200+
//}
201+
log.Info(analyzer.GetStats())
202+
log.Infof("Done extracting %v\n", repository)
203+
204+
}

0 commit comments

Comments
 (0)