Skip to content

Commit da552ec

Browse files
authored
Merge pull request #37 from vmarkovtsev/master
Scan for license files in README
2 parents 06aabc2 + 4979d1f commit da552ec

File tree

3 files changed

+33
-13
lines changed

3 files changed

+33
-13
lines changed

licensedb/internal/db.go

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,32 @@ import (
44
"archive/tar"
55
"bytes"
66
"encoding/csv"
7+
"fmt"
78
"index/suffixarray"
89
"io"
910
"log"
1011
"os"
12+
paths "path"
1113
"regexp"
1214
"sort"
1315
"strings"
1416

1517
"github.com/ekzhu/minhash-lsh"
1618
"github.com/sergi/go-diff/diffmatchpatch"
1719

20+
"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
1821
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/assets"
1922
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/fastlog"
2023
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/normalize"
2124
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/wmh"
2225
)
2326

27+
var (
28+
licenseReadmeMentionRe = regexp.MustCompile(
29+
fmt.Sprintf("(?i)[^\\s]+/[^/\\s]*(%s)[^\\s]*",
30+
strings.Join(licenseFileNames, "|")))
31+
)
32+
2433
// database holds the license texts, their hashes and the hashtables to query for nearest
2534
// neighbors.
2635
type database struct {
@@ -411,18 +420,29 @@ func (db *database) scanForURLs(text string) map[string]bool {
411420
}
412421

413422
// QueryReadmeText tries to detect licenses mentioned in the README.
414-
func (db *database) QueryReadmeText(text string) map[string]float32 {
415-
candidates1 := investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)
416-
candidates2 := investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)
423+
func (db *database) QueryReadmeText(text string, fs filer.Filer) map[string]float32 {
417424
candidates := map[string]float32{}
418-
for key, val := range candidates1 {
419-
candidates[key] = val
425+
append := func(others map[string]float32) {
426+
for key, val := range others {
427+
if candidates[key] < val {
428+
candidates[key] = val
429+
}
430+
}
420431
}
421-
for key, val := range candidates2 {
422-
if candidates[key] < val {
423-
candidates[key] = val
432+
for _, match := range licenseReadmeMentionRe.FindAllString(text, -1) {
433+
match = strings.TrimRight(match, ".,:;-")
434+
content, err := fs.ReadFile(match)
435+
if err == nil {
436+
if preprocessor, exists := filePreprocessors[paths.Ext(match)]; exists {
437+
content = preprocessor(content)
438+
}
439+
append(db.QueryLicenseText(string(content)))
424440
}
425441
}
442+
if len(candidates) == 0 {
443+
append(investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes))
444+
append(investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes))
445+
}
426446
if db.debug {
427447
for key, val := range candidates {
428448
println("NLP", key, val)

licensedb/internal/investigation.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,10 @@ func ExtractReadmeFiles(files []string, fs filer.Filer) [][]byte {
123123

124124
// InvestigateReadmeTexts scans README files for licensing information and outputs the
125125
// probable names using NER.
126-
func InvestigateReadmeTexts(texts [][]byte) map[string]float32 {
126+
func InvestigateReadmeTexts(texts [][]byte, fs filer.Filer) map[string]float32 {
127127
maxLicenses := map[string]float32{}
128128
for _, text := range texts {
129-
candidates := InvestigateReadmeText(text)
129+
candidates := InvestigateReadmeText(text, fs)
130130
for name, sim := range candidates {
131131
maxSim := maxLicenses[name]
132132
if sim > maxSim {
@@ -139,8 +139,8 @@ func InvestigateReadmeTexts(texts [][]byte) map[string]float32 {
139139

140140
// InvestigateReadmeText scans the README file for licensing information and outputs probable
141141
// names found with Named Entity Recognition from NLP.
142-
func InvestigateReadmeText(text []byte) map[string]float32 {
143-
return globalLicenseDatabase.QueryReadmeText(string(text))
142+
func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
143+
return globalLicenseDatabase.QueryReadmeText(string(text), fs)
144144
}
145145

146146
// IsLicenseDirectory indicates whether the directory is likely to contain licenses.

licensedb/licensedb.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
4646
if len(candidates) == 0 {
4747
return nil, ErrNoLicenseFound
4848
}
49-
licenses = internal.InvestigateReadmeTexts(candidates)
49+
licenses = internal.InvestigateReadmeTexts(candidates, fs)
5050
if len(licenses) == 0 {
5151
return nil, ErrNoLicenseFound
5252
}

0 commit comments

Comments
 (0)