@@ -392,33 +392,71 @@ def clean_text(text):
392392
393393
394394_split_spdx_lid = re .compile (
395- '(spd[xz][\\ -\\ s]+lin?[cs]en?[sc]es?[\\ -\\ s]+identifi?er\\ s*:?\\ s*)' ,
396- re .IGNORECASE ).split
397-
398- _nuget_split_spdx_lid = re .compile (
399- '(licenses(?:\\ .|\\ s)+nuget(?:\\ .|\\ s)+org\\ s*:?\\ s*)' ,
400- re .IGNORECASE ).split
395+ r'('
396+ r'(?:'
397+ r'spd[xz][_\-\s]+'
398+ r'(?:lin?[cs]en?[sc]es?|short)[_\-\s]+'
399+ 'identifi?ers?\s*:?'
400+ r'|'
401+ r'licenses[\.\s]+nuget[\.\s]+org\s*/?'
402+ r')\s*'
403+ r')' ,
404+ re .IGNORECASE ,
405+ ).split
401406
402407
403408def split_spdx_lid (text ):
404409 """
405- Split text if it contains an "SPDX license identifier". Return a 2-tuple if if there is an SPDX
410+ Split text if it contains an "SPDX license identifier". Return a 2-tuple if there is an SPDX
406411 license identifier where the first item contains the "SPDX license identifier" text proper and
407412 the second item contains the remainder of the line (expected to be a license expression).
408413 Otherwise return a 2-tuple where the first item is None and the second item contains the
409414 original text.
410415
411- Also supports "https://licenses.nuget.org" followed by a license expression.
416+ Also supports "https://licenses.nuget.org" followed by a license expression as well as minor
417+ variants such as SPDX short Identifier, and typos.
418+
419+ Split regex examples::
420+
421+ >>> _split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
422+ ['', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
423+ >>> _split_spdx_lid("licenses.nuget.org / MIT")
424+ ['', 'licenses.nuget.org / ', 'MIT']
425+ >>> _split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
426+ ['licenseUrl:https://', 'licenses.nuget.org/', 'MIT%20OR%20Unlicense']
427+ >>> _split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
428+ ['', 'SPDX-license-Identifier: ', 'MIT OR Unlicense']
429+ >>> _split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
430+ ['', 'SPDX-license-Identifer: ', 'MIT OR Unlicense']
431+ >>> _split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
432+ ['', 'SPDX short Identifer : ', 'MIT OR Unlicense']
433+ >>> _split_spdx_lid("For OR Unlicense")
434+ ['For OR Unlicense']
435+ >>> _split_spdx_lid(" REM DNL SPDX short Identifer : MIT OR Unlicense")
436+ [' REM DNL ', 'SPDX short Identifer : ', 'MIT OR Unlicense']
437+
438+ Split full examples::
439+
440+ >>> split_spdx_lid("licenses.nuget.org/MIT%20OR%20Unlicense")
441+ ('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
442+ >>> split_spdx_lid("licenses.nuget.org / MIT")
443+ ('licenses.nuget.org / ', 'MIT')
444+ >>> split_spdx_lid("licenseUrl:https://licenses.nuget.org/MIT%20OR%20Unlicense")
445+ ('licenses.nuget.org/', 'MIT%20OR%20Unlicense')
446+ >>> split_spdx_lid("SPDX-license-Identifier: MIT OR Unlicense")
447+ ('SPDX-license-Identifier: ', 'MIT OR Unlicense')
448+ >>> split_spdx_lid("SPDX-license-Identifer: MIT OR Unlicense")
449+ ('SPDX-license-Identifer: ', 'MIT OR Unlicense')
450+ >>> split_spdx_lid("SPDX short Identifer : MIT OR Unlicense")
451+ ('SPDX short Identifer : ', 'MIT OR Unlicense')
452+ >>> split_spdx_lid("For OR Unlicense")
453+ (None, 'For OR Unlicense')
412454 """
413455 segments = _split_spdx_lid (text )
414- expression = segments [- 1 ]
415- if len (segments ) > 1 :
416- return segments [- 2 ], expression
456+ if len (segments ) == 3 :
457+ # we matched on split OK with exactly three segments
458+ _ , prefix , expression = segments
459+ return prefix , expression
417460 else :
418- segments = _nuget_split_spdx_lid (text )
419- expression = segments [- 1 ]
420- if len (segments ) > 1 :
421- return segments [- 2 ], expression
422- else :
423- return None , text
461+ return None , text
424462
0 commit comments