Skip to content

Commit 2d10688

Browse files
committed
Detect "SPDX Short Identifier" tags #4301
Reference: #4301 Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent ff88b06 commit 2d10688

File tree

3 files changed

+553
-583
lines changed

3 files changed

+553
-583
lines changed

src/licensedcode/match_spdx_lid.py

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -392,33 +392,71 @@ def clean_text(text):
392392

393393

394394
_split_spdx_lid = re.compile(
395-
'(spd[xz][\\-\\s]+lin?[cs]en?[sc]es?[\\-\\s]+identifi?er\\s*:?\\s*)',
396-
re.IGNORECASE).split
397-
398-
_nuget_split_spdx_lid = re.compile(
399-
'(licenses(?:\\.|\\s)+nuget(?:\\.|\\s)+org\\s*:?\\s*)',
400-
re.IGNORECASE).split
395+
r'('
396+
r'(?:'
397+
r'spd[xz][_\-\s]+'
398+
r'(?:lin?[cs]en?[sc]es?|short)[_\-\s]+'
399+
'identifi?ers?\s*:?'
400+
r'|'
401+
r'licenses[\.\s]+nuget[\.\s]+org\s*/?'
402+
r')\s*'
403+
r')',
404+
re.IGNORECASE,
405+
).split
401406

402407

403408
def split_spdx_lid(text):
404409
"""
405-
Split text if it contains an "SPDX license identifier". Return a 2-tuple if if there is an SPDX
410+
Split text if it contains an "SPDX license identifier". Return a 2-tuple if there is an SPDX
406411
license identifier where the first item contains the "SPDX license identifier" text proper and
407412
the second item contains the remainder of the line (expected to be a license expression).
408413
Otherwise return a 2-tuple where the first item is None and the second item contains the
409414
original text.
410415
411-
Also supports "https://licenses.nuget.org" followed by a license expression.
416+
Also supports "https://licenses.nuget.org" followed by a license expression as well as minor
417+
variants such as SPDX short Identifier, and typos.
418+
419+
Split regex examples::
420+
421+
>>> _split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
422+
['', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
423+
>>> _split_spdx_lid("licenses.nuget.org / MIT")
424+
['', 'licenses.nuget.org / ', 'MIT']
425+
>>> _split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
426+
['licenseUrl:https://', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
427+
>>> _split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
428+
['', 'SPDX-license-Identifier: ', 'MIT OR Unlicense']
429+
>>> _split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
430+
['', 'SPDX-license-Identifer: ', 'MIT OR Unlicense']
431+
>>> _split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
432+
['', 'SPDX short Identifer : ', 'MIT OR Unlicense']
433+
>>> _split_spdx_lid("For OR Unlicense")
434+
['For OR Unlicense']
435+
>>> _split_spdx_lid(" REM DNL SPDX short Identifer : MIT OR Unlicense")
436+
[' REM DNL ', 'SPDX short Identifer : ', 'MIT OR Unlicense']
437+
438+
Split full examples::
439+
440+
>>> split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
441+
('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
442+
>>> split_spdx_lid("licenses.nuget.org / MIT")
443+
('licenses.nuget.org / ', 'MIT')
444+
>>> split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
445+
('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
446+
>>> split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
447+
('SPDX-license-Identifier: ', 'MIT OR Unlicense')
448+
>>> split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
449+
('SPDX-license-Identifer: ', 'MIT OR Unlicense')
450+
>>> split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
451+
('SPDX short Identifer : ', 'MIT OR Unlicense')
452+
>>> split_spdx_lid("For OR Unlicense")
453+
(None, 'For OR Unlicense')
412454
"""
413455
segments = _split_spdx_lid(text)
414-
expression = segments[-1]
415-
if len(segments) > 1:
416-
return segments[-2], expression
456+
if len(segments) == 3:
457+
# we matched on split OK with exactly three segments
458+
_, prefix, expression = segments
459+
return prefix, expression
417460
else:
418-
segments = _nuget_split_spdx_lid(text)
419-
expression = segments[-1]
420-
if len(segments) > 1:
421-
return segments[-2], expression
422-
else:
423-
return None, text
461+
return None, text
424462

src/licensedcode/query.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,6 @@ def logger_debug(*args):
102102
# on a single line (e.g. minified JS or CSS).
103103
MAX_TOKEN_PER_LINE = 25
104104

105-
106105
# Break query in runs if there are `LINES_THRESHOLD` number of empty
107106
# or non-legalese/junk lines
108107
LINES_THRESHOLD = 4
@@ -248,19 +247,23 @@ def __init__(
248247
# TODO: consider using an intbitset
249248
self.shorts_and_digits_pos = set()
250249

251-
# list of the three SPDX-License-Identifier tokens to identify to detect
250+
# list of the base SPDX-License-Identifier tokens to identify and detect
252251
# a line for SPDX id matching.
253252
# note: this will not match anything if the index is not properly set
254253
dic_get = idx.dictionary.get
255254
spdxid = [dic_get(u'spdx'), dic_get(u'license'), dic_get(u'identifier')]
256255

256+
# "SPDX Short identifier" is also an unfortunate thing in the wild
257+
# both with and without dash
258+
spdxid2 = [dic_get(u'spdx'), dic_get(u'short'), dic_get(u'identifier')]
259+
257260
# There's also other spdx license identifiers like NuGet license URLs
258261
# Like: `https://licenses.nuget.org/(LGPL-2.0-only WITH FLTK-exception OR Apache-2.0+)`
259262
nuget_spdx_id = [dic_get(u'licenses'), dic_get(u'nuget'), dic_get(u'org')]
260263

261264
# None, None None: this is mostly a possible issue in test mode
262265
self.spdx_lid_token_ids = [
263-
x for x in [spdxid, nuget_spdx_id, ] if x != [None, None, None]
266+
x for x in [spdxid, nuget_spdx_id, spdxid2] if None not in x
264267
]
265268

266269
# list of tuple (original line text, start known pos, end known pos) for
@@ -497,7 +500,7 @@ def tokens_by_line(
497500
spdx_start_offset = 2
498501

499502
if spdx_start_offset is not None:
500-
503+
501504
# keep the line, start/end known pos for SPDX matching
502505
spdx_prefix, spdx_expression = split_spdx_lid(line)
503506
spdx_text = ''.join([spdx_prefix or '', spdx_expression])

0 commit comments

Comments
 (0)