Skip to content

Commit 06aabc2

Browse files
authored
Merge pull request #36 from vmarkovtsev/master
NLP fixes + comparison
2 parents 85f083f + 53671e7 commit 06aabc2

File tree

6 files changed

+150
-36
lines changed

6 files changed

+150
-36
lines changed

Makefile

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,25 @@
11
GOPATH ?= $(shell go env GOPATH)
22
SPDX_DATA_VERSION ?= 3.0
33

4-
licensedb/internal/assets/bindata.go: licenses.tar urls.csv $(GOPATH)/bin/go-bindata
4+
licensedb/internal/assets/bindata.go: licenses.tar urls.csv names.csv $(GOPATH)/bin/go-bindata
5+
rm -rf license-list-data-$(SPDX_DATA_VERSION)
56
rm -f license-list-data.tar.gz
6-
$(GOPATH)/bin/go-bindata -nometadata -pkg assets -o licensedb/internal/assets/bindata.go licenses.tar urls.csv
7-
rm licenses.tar
8-
rm urls.csv
7+
$(GOPATH)/bin/go-bindata -nometadata -pkg assets -o licensedb/internal/assets/bindata.go licenses.tar urls.csv names.csv
8+
rm licenses.tar urls.csv names.csv
99

1010
licenses.tar: license-list-data.tar.gz
1111
tar -xf license-list-data.tar.gz license-list-data-$(SPDX_DATA_VERSION)/text
1212
tar -cf licenses.tar -C license-list-data-$(SPDX_DATA_VERSION)/text .
1313
rm -rf license-list-data-$(SPDX_DATA_VERSION)
1414

15-
urls.csv: license-list-data.tar.gz
15+
license-list-data-$(SPDX_DATA_VERSION)/json/details: license-list-data.tar.gz
1616
tar -xf license-list-data.tar.gz license-list-data-$(SPDX_DATA_VERSION)/json/details
17+
18+
urls.csv: license-list-data-$(SPDX_DATA_VERSION)/json/details
1719
go run licensedb/internal/assets/extract_urls.go license-list-data-$(SPDX_DATA_VERSION)/json/details > urls.csv
18-
rm -rf license-list-data-$(SPDX_DATA_VERSION)
20+
21+
names.csv: license-list-data-$(SPDX_DATA_VERSION)/json/details
22+
go run licensedb/internal/assets/extract_names.go license-list-data-$(SPDX_DATA_VERSION)/json/details > names.csv
1923

2024
license-list-data.tar.gz:
2125
curl -SLk -o license-list-data.tar.gz https://github.com/spdx/license-list-data/archive/v$(SPDX_DATA_VERSION).tar.gz

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ Comparison to other projects on that dataset:
8888
|[benbalter/licensee](https://github.com/benbalter/licensee)| 75% (673/902) | 111 |
8989
|[google/licenseclassifier](https://github.com/google/licenseclassifier)| 76% (682/902) | 907 |
9090
|[boyter/lc](https://github.com/boyter/lc)| 88% (797/902) | 548 |
91+
|[amzn/askalono](https://github.com/amzn/askalono)| 87% (785/902) | 165 |
92+
|[LiD](https://source.codeaurora.org/external/qostg/lid)| 94% (847/902) | 3660 |
9193

9294
<details><summary>How this was measured</summary>
9395
<pre><code>$ cd $(go env GOPATH)/src/gopkg.in/src-d/go-license-detector.v2/licensedb
@@ -105,6 +107,15 @@ $ time find -type f -print | xargs -n1 -P4 identify_license \
105107
$ # boyter/lc
106108
$ time lc . \
107109
| grep -vE 'NOASSERTION|----|Directory' | cut -d" " -f1 | sort | uniq | wc -l
110+
$ # amzn/askalono
111+
$ echo '#!/bin/sh
112+
result=$(askalono id "$1")
113+
echo "$1
114+
$result"' > ../askalono.wrapper
115+
$ time find -type f -print | xargs -n1 -P4 sh ../askalono.wrapper | grep -Pzo '.*\nLicense: .*\n' askalono.txt | grep -av "License: " | cut -d/ -f 2 | sort | uniq | wc -l
116+
$ # LiD
117+
$ time license-identifier -I dataset -F csv -O lid
118+
$ cat lid_*.csv | cut -d, -f1 | cut -d"'" -f 2 | grep / | cut -d/ -f2 | sort | uniq | wc -l
108119
</code></pre>
109120
</details>
110121

licensedb/internal/assets/bindata.go

Lines changed: 28 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// +build make
2+
3+
package main
4+
5+
import (
6+
"encoding/csv"
7+
"encoding/json"
8+
"io/ioutil"
9+
"log"
10+
"os"
11+
"path"
12+
)
13+
14+
// main reads every file in the directory named by the first command-line
// argument, decodes each one as a JSON object and writes one CSV record
// "<licenseId>,<name>" per file to standard output.
//
// Any failure (missing argument, unreadable directory or file, malformed
// JSON, missing or non-string "name"/"licenseId" field, CSV write error)
// terminates the program through log.Fatalf instead of panicking or being
// silently ignored.
func main() {
	if len(os.Args) < 2 {
		log.Fatalf("usage: extract_names <directory with license details>\n")
	}
	dir := os.Args[1]
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		log.Fatalf("Listing %s: %v\n", dir, err)
	}
	writer := csv.NewWriter(os.Stdout)
	for _, file := range files {
		content, err := ioutil.ReadFile(path.Join(dir, file.Name()))
		if err != nil {
			log.Fatalf("Reading %s: %v\n", file.Name(), err)
		}
		var data map[string]interface{}
		if err := json.Unmarshal(content, &data); err != nil {
			log.Fatalf("Parsing %s: %v\n", file.Name(), err)
		}
		// Checked assertions: a missing or non-string field is reported with
		// the offending file name rather than crashing with a runtime panic.
		name, ok := data["name"].(string)
		if !ok {
			log.Fatalf("Parsing %s: \"name\" is missing or not a string\n", file.Name())
		}
		id, ok := data["licenseId"].(string)
		if !ok {
			log.Fatalf("Parsing %s: \"licenseId\" is missing or not a string\n", file.Name())
		}
		if err := writer.Write([]string{id, name}); err != nil {
			log.Fatalf("Writing record for %s: %v\n", file.Name(), err)
		}
	}
	// Flush explicitly rather than with defer: log.Fatalf exits the process
	// without running deferred calls, and the flush error must be checked.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Fatalf("Writing CSV: %v\n", err)
	}
}

licensedb/internal/db.go

Lines changed: 67 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ type database struct {
4444
lsh *minhashlsh.MinhashLSH
4545
// turns a license text into a hash
4646
hasher *wmh.WeightedMinHasher
47-
// part of license name -> list of containing license names
47+
// part of license short name (e.g. BSL-1.0) -> list of containing license names
48+
nameShortSubstrings map[string][]substring
49+
// number of substrings per short license name
50+
nameShortSubstringSizes map[string]int
51+
// part of license name (e.g. Boost Software License 1.0) -> list of containing license names
4852
nameSubstrings map[string][]substring
49-
// number of substrings per license
53+
// number of substrings per license name
5054
nameSubstringSizes map[string]int
5155
}
5256

@@ -70,13 +74,7 @@ func (db database) VocabularySize() int {
7074
return len(db.tokens)
7175
}
7276

73-
// Load takes the licenses from the embedded storage, normalizes, hashes them and builds the
74-
// LSH hashtables.
75-
func loadLicenses() *database {
76-
db := &database{}
77-
if os.Getenv("LICENSE_DEBUG") != "" {
78-
db.debug = true
79-
}
77+
func loadUrls(db *database) {
8078
urlCSVBytes, err := assets.Asset("urls.csv")
8179
if err != nil {
8280
log.Fatalf("failed to load urls.csv from the assets: %v", err)
@@ -96,6 +94,52 @@ func loadLicenses() *database {
9694
}
9795
}
9896
db.urlRe = regexp.MustCompile(urlReWriter.String())
97+
}
98+
99+
func loadNames(db *database) {
100+
namesBytes, err := assets.Asset("names.csv")
101+
if err != nil {
102+
log.Fatalf("failed to load banes.csv from the assets: %v", err)
103+
}
104+
namesReader := csv.NewReader(bytes.NewReader(namesBytes))
105+
records, err := namesReader.ReadAll()
106+
if err != nil || len(records) == 0 {
107+
log.Fatalf("failed to parse names.csv from the assets: %v", err)
108+
}
109+
db.nameSubstringSizes = map[string]int{}
110+
db.nameSubstrings = map[string][]substring{}
111+
for _, record := range records {
112+
registerNameSubstrings(record[1], record[0], db.nameSubstringSizes, db.nameSubstrings)
113+
}
114+
}
115+
116+
func registerNameSubstrings(
117+
name string, key string, sizes map[string]int, substrs map[string][]substring) {
118+
parts := splitLicenseName(name)
119+
sizes[key] = 0
120+
for _, part := range parts {
121+
if licenseReadmeRe.MatchString(part.value) {
122+
continue
123+
}
124+
sizes[key]++
125+
list := substrs[part.value]
126+
if list == nil {
127+
list = []substring{}
128+
}
129+
list = append(list, substring{value: key, count: part.count})
130+
substrs[part.value] = list
131+
}
132+
}
133+
134+
// Load takes the licenses from the embedded storage, normalizes, hashes them and builds the
135+
// LSH hashtables.
136+
func loadLicenses() *database {
137+
db := &database{}
138+
if os.Getenv("LICENSE_DEBUG") != "" {
139+
db.debug = true
140+
}
141+
loadUrls(db)
142+
loadNames(db)
99143
tarBytes, err := assets.Asset("licenses.tar")
100144
if err != nil {
101145
log.Fatalf("failed to load licenses.tar from the assets: %v", err)
@@ -174,8 +218,8 @@ func loadLicenses() *database {
174218
log.Println("LSH:", k, l)
175219
}
176220
db.hasher = wmh.NewWeightedMinHasher(len(uniqueTokens), numHashes, 7)
177-
db.nameSubstrings = map[string][]substring{}
178-
db.nameSubstringSizes = map[string]int{}
221+
db.nameShortSubstrings = map[string][]substring{}
222+
db.nameShortSubstringSizes = map[string]int{}
179223
for key, tokens := range tokenFreqs {
180224
indices := make([]int, len(tokens))
181225
values := make([]float32, len(tokens))
@@ -188,18 +232,7 @@ func loadLicenses() *database {
188232
}
189233
}
190234
db.lsh.Add(key, db.hasher.Hash(values, indices))
191-
192-
// register all substrings
193-
parts := splitLicenseName(key)
194-
db.nameSubstringSizes[key] = len(parts)
195-
for _, part := range parts {
196-
list := db.nameSubstrings[part.value]
197-
if list == nil {
198-
list = []substring{}
199-
}
200-
list = append(list, substring{value: key, count: part.count})
201-
db.nameSubstrings[part.value] = list
202-
}
235+
registerNameSubstrings(key, key, db.nameShortSubstringSizes, db.nameShortSubstrings)
203236
}
204237
db.lsh.Index()
205238
return db
@@ -379,7 +412,17 @@ func (db *database) scanForURLs(text string) map[string]bool {
379412

380413
// QueryReadmeText tries to detect licenses mentioned in the README.
381414
func (db *database) QueryReadmeText(text string) map[string]float32 {
382-
candidates := investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)
415+
candidates1 := investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)
416+
candidates2 := investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)
417+
candidates := map[string]float32{}
418+
for key, val := range candidates1 {
419+
candidates[key] = val
420+
}
421+
for key, val := range candidates2 {
422+
if candidates[key] < val {
423+
candidates[key] = val
424+
}
425+
}
383426
if db.debug {
384427
for key, val := range candidates {
385428
println("NLP", key, val)

licensedb/internal/nlp.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ func investigateReadmeFile(
6969
continue
7070
}
7171
scores := map[string]map[string]int{}
72-
entity = licenseReadmeRe.ReplaceAllString(entity, "")
72+
entity = licenseReadmeRe.ReplaceAllString(entity, " ")
7373
substrs := splitLicenseName(entity)
7474
for _, substr := range substrs {
7575
for _, match := range licenseNameParts[substr.value] {

0 commit comments

Comments
 (0)