Skip to content

Commit 36e69f1

Browse files
authored
Merge pull request #7 from vmarkovtsev/master
Test suite
2 parents 2e299eb + 85d4b40 commit 36e69f1

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+4289
-125
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
dist: trusty
2-
sudo: false
2+
sudo: required
33

44
language: go
55

Gopkg.lock

Lines changed: 8 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Gopkg.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@
3737
name = "github.com/jdkato/prose"
3838
version = "1.1.0"
3939

40-
[[constraint]]
40+
[[override]]
4141
name = "github.com/sergi/go-diff"
42-
version = "1.0.0"
42+
revision = "da645544ed44df016359bd4c0e3dc60ee3a0da43"
4343

4444
[[constraint]]
4545
name = "github.com/stretchr/testify"
@@ -65,6 +65,10 @@
6565
name = "gopkg.in/src-d/go-git.v4"
6666
version = "4.1.0"
6767

68+
[[constraint]]
69+
name = "github.com/spf13/pflag"
70+
branch = "master"
71+
6872
[prune]
6973
go-tests = true
7074
unused-packages = true

cmd/license-detector/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ func main() {
3333
wg.Add(pflag.NArg())
3434
for argIndex, arg := range pflag.Args() {
3535
go func(argIndex int, arg string) {
36+
defer wg.Done()
3637
_, err := os.Stat(arg)
3738
var licenses map[string]float32
3839
if err == nil {
@@ -45,7 +46,6 @@ func main() {
4546
os.Exit(1)
4647
}
4748
results[argIndex] = analysisResult{Name: arg, Licenses: licenses}
48-
wg.Done()
4949
}(argIndex, arg)
5050
}
5151
wg.Wait()

dataset.zip

7 MB
Binary file not shown.

dataset_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package ld
2+
3+
import (
4+
"archive/zip"
5+
"fmt"
6+
"io/ioutil"
7+
"strings"
8+
"sync"
9+
"testing"
10+
11+
"github.com/stretchr/testify/assert"
12+
)
13+
14+
func TestDataset(t *testing.T) {
15+
zipfile, err := zip.OpenReader("dataset.zip")
16+
assert.Nil(t, err)
17+
defer zipfile.Close()
18+
projects := map[string][]*zip.File{}
19+
for _, f := range zipfile.File {
20+
path := strings.Split(f.Name, "/")
21+
if path[1] != "" {
22+
files := projects[path[0]]
23+
if files == nil {
24+
files = []*zip.File{}
25+
}
26+
files = append(files, f)
27+
projects[path[0]] = files
28+
}
29+
}
30+
licenses := map[string]map[string]float32{}
31+
mutex := sync.Mutex{}
32+
wg := sync.WaitGroup{}
33+
wg.Add(len(projects))
34+
for project, files := range projects {
35+
go func(project string, files []*zip.File) {
36+
defer wg.Done()
37+
myFilesList := make([]string, 0, len(files))
38+
myFilesMap := map[string]*zip.File{}
39+
for _, f := range files {
40+
name := f.Name[strings.Index(f.Name, "/")+1:]
41+
myFilesList = append(myFilesList, name)
42+
myFilesMap[name] = f
43+
}
44+
myLicenses, _ := InvestigateFilesLicenses(myFilesList, func(name string) (string, error) {
45+
reader, err := myFilesMap[name].Open()
46+
if err != nil {
47+
return "", err
48+
}
49+
defer reader.Close()
50+
bytes, err := ioutil.ReadAll(reader)
51+
if err != nil {
52+
return "", err
53+
}
54+
return string(bytes), nil
55+
})
56+
if len(myLicenses) > 0 {
57+
mutex.Lock()
58+
licenses[project] = myLicenses
59+
mutex.Unlock()
60+
}
61+
}(project, files)
62+
}
63+
wg.Wait()
64+
assert.True(t, len(licenses) >= 766)
65+
fmt.Printf("%d %d %d%%\n", len(licenses), len(projects), (100 * len(licenses)) / len(projects))
66+
}

db.go

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -201,44 +201,12 @@ func (db *LicenseDatabase) QueryLicenseText(text string) map[string]float32 {
201201
tokarr[len(db.tokens)] = "!"
202202
println(dmp.DiffPrettyText(dmp.DiffCharsToLines(diff, tokarr)))
203203
}
204-
205-
// TODO(vmarkovtsev): replace with dmp.DiffLevenshtein when this PR is merged:
206-
// https://github.com/sergi/go-diff/pull/90
207-
distance := diffLevenshtein(diff)
204+
distance := dmp.DiffLevenshtein(diff)
208205
candidates[key] = float32(1) - float32(distance)/float32(len(myRunes))
209206
}
210207
return candidates
211208
}
212209

213-
func diffLevenshtein(diffs []diffmatchpatch.Diff) int {
214-
levenshtein := 0
215-
insertions := 0
216-
deletions := 0
217-
max := func(a, b int) int {
218-
if a < b {
219-
return b
220-
}
221-
return a
222-
}
223-
224-
for _, aDiff := range diffs {
225-
switch aDiff.Type {
226-
case diffmatchpatch.DiffInsert:
227-
insertions += len(aDiff.Text)
228-
case diffmatchpatch.DiffDelete:
229-
deletions += len(aDiff.Text)
230-
case diffmatchpatch.DiffEqual:
231-
// A deletion and an insertion is one substitution.
232-
levenshtein += max(insertions, deletions)
233-
insertions = 0
234-
deletions = 0
235-
}
236-
}
237-
238-
levenshtein += max(insertions, deletions)
239-
return levenshtein
240-
}
241-
242210
// QueryReadmeText tries to detect licenses mentioned in the README.
243211
func (db *LicenseDatabase) QueryReadmeText(text string) map[string]float32 {
244212
return investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)

investigate.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ func InvestigateFilesLicenses(
8282
if len(candidates) == 0 {
8383
return nil, ErrNoLicenseFound
8484
}
85-
licenses := InvestigateReadmeFiles(candidates)
85+
licenses := InvestigateReadmeTexts(candidates)
8686
if len(licenses) == 0 {
8787
return nil, ErrNoLicenseFound
8888
}
8989
return licenses, nil
9090
}
91-
return InvestigateLicenseFiles(candidates), nil
91+
return InvestigateLicenseTexts(candidates), nil
9292
}
9393

9494
// ExtractLicenseFiles returns the list of possible license texts.
@@ -110,12 +110,12 @@ func ExtractLicenseFiles(files []string, reader func(string) (string, error)) []
110110
return candidates
111111
}
112112

113-
// InvestigateLicenseFiles takes the list of candidate license texts and returns the most probable
113+
// InvestigateLicenseTexts takes the list of candidate license texts and returns the most probable
114114
// reference licenses matched. Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
115-
func InvestigateLicenseFiles(texts []string) map[string]float32 {
115+
func InvestigateLicenseTexts(texts []string) map[string]float32 {
116116
maxLicenses := map[string]float32{}
117117
for _, text := range texts {
118-
candidates := InvestigateLicenseFile(text)
118+
candidates := InvestigateLicenseText(text)
119119
for name, sim := range candidates {
120120
maxSim := maxLicenses[name]
121121
if sim > maxSim {
@@ -126,9 +126,9 @@ func InvestigateLicenseFiles(texts []string) map[string]float32 {
126126
return maxLicenses
127127
}
128128

129-
// InvestigateLicenseFile takes the license text and returns the most probable reference licenses matched.
129+
// InvestigateLicenseText takes the license text and returns the most probable reference licenses matched.
130130
// Each match has the confidence assigned, from 0 to 1, 1 means 100% confident.
131-
func InvestigateLicenseFile(text string) map[string]float32 {
131+
func InvestigateLicenseText(text string) map[string]float32 {
132132
return globalLicenseDatabase.QueryLicenseText(text)
133133
}
134134

@@ -150,12 +150,12 @@ func ExtractReadmeFiles(files []string, reader func(string) (string, error)) []s
150150
return candidates
151151
}
152152

153-
// InvestigateReadmeFiles scans README files for licensing information and outputs the
153+
// InvestigateReadmeTexts scans README files for licensing information and outputs the
154154
// probable names using NER.
155-
func InvestigateReadmeFiles(texts []string) map[string]float32 {
155+
func InvestigateReadmeTexts(texts []string) map[string]float32 {
156156
maxLicenses := map[string]float32{}
157157
for _, text := range texts {
158-
candidates := InvestigateReadmeFile(text)
158+
candidates := InvestigateReadmeText(text)
159159
for name, sim := range candidates {
160160
maxSim := maxLicenses[name]
161161
if sim > maxSim {
@@ -166,9 +166,9 @@ func InvestigateReadmeFiles(texts []string) map[string]float32 {
166166
return maxLicenses
167167
}
168168

169-
// InvestigateReadmeFile scans the README file for licensing information and outputs probable
169+
// InvestigateReadmeText scans the README file for licensing information and outputs probable
170170
// names found with Named Entity Recognition from NLP.
171-
func InvestigateReadmeFile(text string) map[string]float32 {
171+
func InvestigateReadmeText(text string) map[string]float32 {
172172
return globalLicenseDatabase.QueryReadmeText(text)
173173
}
174174

nlp.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ var (
1717
digitsRe = regexp.MustCompile("[0-9]+")
1818
)
1919

20-
// investigateReadmeFile is the implementation of InvestigateReadmeFile.
20+
// investigateReadmeFile is the implementation of InvestigateReadmeText.
2121
// It takes two additional arguments: licenseNameParts and licenseNameSizes.
2222
// The idea is to map substrings to real licenses, and the confidence is
2323
// <the number of matches> / <overall number of substrings>.
@@ -41,10 +41,13 @@ func investigateReadmeFile(
4141
text[beginIndex] != '\n' && beginIndex < matches[0][0]; beginIndex++ {
4242
}
4343
}
44-
for ; text[endIndex] != ' ' && text[endIndex] != '\t' &&
45-
text[endIndex] != '\n' && endIndex < len(text); endIndex++ {
44+
for ; endIndex < len(text) && text[endIndex] != ' ' && text[endIndex] != '\t' &&
45+
text[endIndex] != '\n'; endIndex++ {
4646
}
4747
}
48+
if endIndex > len(text) {
49+
endIndex = len(text)
50+
}
4851
suspectedText := text[beginIndex:endIndex]
4952
suspectedWords := tokenize.TextToWords(suspectedText)
5053
tagger := tag.NewPerceptronTagger()

0 commit comments

Comments
 (0)