@@ -44,9 +44,13 @@ type database struct {
4444 lsh * minhashlsh.MinhashLSH
4545 // turns a license text into a hash
4646 hasher * wmh.WeightedMinHasher
47- // part of license name -> list of containing license names
47+ // part of license short name (e.g., BSL-1.0) -> list of containing license names
48+ nameShortSubstrings map [string ][]substring
49+ // number of substrings per short license name
50+ nameShortSubstringSizes map [string ]int
51+ // part of license name (e.g., Boost Software License 1.0) -> list of containing license names
4852 nameSubstrings map [string ][]substring
49- // number of substrings per license
53+ // number of substrings per license name
5054 nameSubstringSizes map [string ]int
5155}
5256
@@ -70,13 +74,7 @@ func (db database) VocabularySize() int {
7074 return len (db .tokens )
7175}
7276
73- // Load takes the licenses from the embedded storage, normalizes, hashes them and builds the
74- // LSH hashtables.
75- func loadLicenses () * database {
76- db := & database {}
77- if os .Getenv ("LICENSE_DEBUG" ) != "" {
78- db .debug = true
79- }
77+ func loadUrls (db * database ) {
8078 urlCSVBytes , err := assets .Asset ("urls.csv" )
8179 if err != nil {
8280 log .Fatalf ("failed to load urls.csv from the assets: %v" , err )
@@ -96,6 +94,52 @@ func loadLicenses() *database {
9694 }
9795 }
9896 db .urlRe = regexp .MustCompile (urlReWriter .String ())
97+ }
98+
99+ func loadNames (db * database ) {
100+ namesBytes , err := assets .Asset ("names.csv" )
101+ if err != nil {
102+ log .Fatalf ("failed to load banes.csv from the assets: %v" , err )
103+ }
104+ namesReader := csv .NewReader (bytes .NewReader (namesBytes ))
105+ records , err := namesReader .ReadAll ()
106+ if err != nil || len (records ) == 0 {
107+ log .Fatalf ("failed to parse names.csv from the assets: %v" , err )
108+ }
109+ db .nameSubstringSizes = map [string ]int {}
110+ db .nameSubstrings = map [string ][]substring {}
111+ for _ , record := range records {
112+ registerNameSubstrings (record [1 ], record [0 ], db .nameSubstringSizes , db .nameSubstrings )
113+ }
114+ }
115+
116+ func registerNameSubstrings (
117+ name string , key string , sizes map [string ]int , substrs map [string ][]substring ) {
118+ parts := splitLicenseName (name )
119+ sizes [key ] = 0
120+ for _ , part := range parts {
121+ if licenseReadmeRe .MatchString (part .value ) {
122+ continue
123+ }
124+ sizes [key ]++
125+ list := substrs [part .value ]
126+ if list == nil {
127+ list = []substring {}
128+ }
129+ list = append (list , substring {value : key , count : part .count })
130+ substrs [part .value ] = list
131+ }
132+ }
133+
134+ // Load takes the licenses from the embedded storage, normalizes, hashes them and builds the
135+ // LSH hashtables.
136+ func loadLicenses () * database {
137+ db := & database {}
138+ if os .Getenv ("LICENSE_DEBUG" ) != "" {
139+ db .debug = true
140+ }
141+ loadUrls (db )
142+ loadNames (db )
99143 tarBytes , err := assets .Asset ("licenses.tar" )
100144 if err != nil {
101145 log .Fatalf ("failed to load licenses.tar from the assets: %v" , err )
@@ -174,8 +218,8 @@ func loadLicenses() *database {
174218 log .Println ("LSH:" , k , l )
175219 }
176220 db .hasher = wmh .NewWeightedMinHasher (len (uniqueTokens ), numHashes , 7 )
177- db .nameSubstrings = map [string ][]substring {}
178- db .nameSubstringSizes = map [string ]int {}
221+ db .nameShortSubstrings = map [string ][]substring {}
222+ db .nameShortSubstringSizes = map [string ]int {}
179223 for key , tokens := range tokenFreqs {
180224 indices := make ([]int , len (tokens ))
181225 values := make ([]float32 , len (tokens ))
@@ -188,18 +232,7 @@ func loadLicenses() *database {
188232 }
189233 }
190234 db .lsh .Add (key , db .hasher .Hash (values , indices ))
191-
192- // register all substrings
193- parts := splitLicenseName (key )
194- db .nameSubstringSizes [key ] = len (parts )
195- for _ , part := range parts {
196- list := db .nameSubstrings [part .value ]
197- if list == nil {
198- list = []substring {}
199- }
200- list = append (list , substring {value : key , count : part .count })
201- db .nameSubstrings [part .value ] = list
202- }
235+ registerNameSubstrings (key , key , db .nameShortSubstringSizes , db .nameShortSubstrings )
203236 }
204237 db .lsh .Index ()
205238 return db
@@ -379,7 +412,17 @@ func (db *database) scanForURLs(text string) map[string]bool {
379412
380413// QueryReadmeText tries to detect licenses mentioned in the README.
381414func (db * database ) QueryReadmeText (text string ) map [string ]float32 {
382- candidates := investigateReadmeFile (text , db .nameSubstrings , db .nameSubstringSizes )
415+ candidates1 := investigateReadmeFile (text , db .nameSubstrings , db .nameSubstringSizes )
416+ candidates2 := investigateReadmeFile (text , db .nameShortSubstrings , db .nameShortSubstringSizes )
417+ candidates := map [string ]float32 {}
418+ for key , val := range candidates1 {
419+ candidates [key ] = val
420+ }
421+ for key , val := range candidates2 {
422+ if candidates [key ] < val {
423+ candidates [key ] = val
424+ }
425+ }
383426 if db .debug {
384427 for key , val := range candidates {
385428 println ("NLP" , key , val )
0 commit comments