Skip to content

Commit 2b0c7be

Browse files
authored
Merge pull request #58 from nexB/56-parse-and-validate-spdx-v2
Parse and validate SPDX license expressions
2 parents 88977ce + 3487a13 commit 2b0c7be

File tree

6 files changed

+20335
-6
lines changed

6 files changed

+20335
-6
lines changed

src/license_expression/__init__.py

Lines changed: 159 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"""
2020

2121
import itertools
22+
import json
2223
import re
2324
import string
2425
from collections import defaultdict
@@ -27,6 +28,9 @@
2728
from copy import copy
2829
from copy import deepcopy
2930
from functools import total_ordering
31+
from os.path import abspath
32+
from os.path import dirname
33+
from os.path import join
3034

3135
import boolean
3236
from boolean import Expression as LicenseExpression
@@ -51,6 +55,12 @@
5155
from license_expression._pyahocorasick import Trie as AdvancedTokenizer
5256
from license_expression._pyahocorasick import Token
5357

58+
59+
# Absolute path of the directory that contains this module.
curr_dir = dirname(abspath(__file__))

# The "data" subdirectory sits next to this module.
data_dir = join(curr_dir, 'data')

# Default location of the vendored ScanCode LicenseDB index JSON file.
vendored_scancode_licensedb_index_location = join(
    data_dir,
    'scancode-licensedb-index.json',
)
62+
63+
5464
# append new error codes to PARSE_ERRORS by monkey patching
5565
PARSE_EXPRESSION_NOT_UNICODE = 100
5666
if PARSE_EXPRESSION_NOT_UNICODE not in PARSE_ERRORS:
@@ -116,6 +126,50 @@ class ExpressionParseError(ParseError, ExpressionError):
116126
).finditer
117127

118128

129+
class ExpressionInfo:
    """
    Hold the outcome of validating one license expression.

    Licensing.validate() returns an instance of this class describing the
    license expression it was given.

    Fields:
    - original_expression: str.
        - The license expression as it was passed into Licensing.validate().
    - normalized_expression: str.
        - The license expression string, set only when a valid license
          expression was passed into `validate()`.
    - errors: list
        - Error messages collected while validating the license expression.
    - invalid_symbols: list
        - Symbols from the license expression that are invalid: license keys
          that are unknown or not used in the right context, or symbols that
          make the syntax incorrect.
    """

    def __init__(
        self,
        original_expression,
        normalized_expression=None,
        errors=None,
        invalid_symbols=None,
    ):
        # A missing or empty list argument is replaced by a fresh list so
        # that instances never share the same list object by default.
        if not errors:
            errors = []
        if not invalid_symbols:
            invalid_symbols = []

        self.original_expression = original_expression
        self.normalized_expression = normalized_expression
        self.errors = errors
        self.invalid_symbols = invalid_symbols

    def __repr__(self):
        # One fragment per output line, joined without any separator.
        fragments = [
            'ExpressionInfo(\n',
            f' original_expression={self.original_expression!r},\n',
            f' normalized_expression={self.normalized_expression!r},\n',
            f' errors={self.errors!r},\n',
            f' invalid_symbols={self.invalid_symbols!r}\n',
            ')',
        ]
        return ''.join(fragments)
171+
172+
119173
class Licensing(boolean.BooleanAlgebra):
120174
"""
121175
Licensing defines a mini language to parse, validate and compare license
@@ -355,6 +409,12 @@ def unknown_license_keys(self, expression, unique=True, **kwargs):
355409
symbols = self.unknown_license_symbols(expression, unique=False, **kwargs)
356410
return self._keys(symbols, unique)
357411

412+
def validate_license_keys(self, expression):
    """
    Raise an ExpressionError listing every unknown license key found in
    `expression`. Return None when there are no unknown keys.
    """
    unknown = self.unknown_license_keys(expression, unique=True)
    if not unknown:
        return

    joined_keys = ', '.join(unknown)
    raise ExpressionError('Unknown license key(s): {}'.format(joined_keys))
417+
358418
def parse(self, expression, validate=False, strict=False, simple=False, **kwargs):
359419
"""
360420
Return a new license LicenseExpression object by parsing a license
@@ -422,10 +482,7 @@ def parse(self, expression, validate=False, strict=False, simple=False, **kwargs
422482
raise ExpressionError('expression must be a LicenseExpression once parsed.')
423483

424484
if validate:
425-
unknown_keys = self.unknown_license_keys(expression, unique=True)
426-
if unknown_keys:
427-
msg = 'Unknown license key(s): {}'.format(', '.join(unknown_keys))
428-
raise ExpressionError(msg)
485+
self.validate_license_keys(expression)
429486

430487
return expression
431488

@@ -617,6 +674,104 @@ def dedup(self, expression):
617674
raise Exception(f'Unknown expression type: {expression!r}')
618675
return deduped
619676

677+
def validate(self, expression, strict=True, **kwargs):
    """
    Return an ExpressionInfo object that contains information about
    the validation of an `expression` license expression string.

    If the syntax and license keys of `expression` are valid, then
    `ExpressionInfo.normalized_expression` is set.

    If an error was encountered when validating `expression`,
    `ExpressionInfo.errors` will be populated with strings containing the
    error message that has occurred. If an error has occurred due to unknown
    license keys or an invalid license symbol, the offending keys or symbols
    will be present in `ExpressionInfo.invalid_symbols`.

    If `strict` is True, validation error messages will be included if, in a
    "WITH" expression such as "XXX with ZZZ", the XXX symbol has
    `is_exception` set to True or the ZZZ symbol has `is_exception` set to
    False. This checks that symbols are used strictly as intended.

    Extra keyword arguments are accepted but are not used by this method.
    """
    expression_info = ExpressionInfo(original_expression=str(expression))

    # Check `expression` type and syntax
    try:
        parsed_expression = self.parse(expression, strict=strict)
    except ExpressionError as e:
        expression_info.errors.append(str(e))
        # An ExpressionError raised with only a message may not carry a
        # `token_string` attribute: record the offending token only when
        # the exception actually provides one.
        if hasattr(e, 'token_string'):
            expression_info.invalid_symbols.append(e.token_string)
        return expression_info

    # Check `expression` keys (validate)
    try:
        self.validate_license_keys(expression)
    except ExpressionError as e:
        expression_info.errors.append(str(e))
        unknown_keys = self.unknown_license_keys(expression)
        expression_info.invalid_symbols.extend(unknown_keys)
        return expression_info

    # Every failure path above returns early, so reaching this point means
    # no error and no invalid symbol was recorded.
    expression_info.normalized_expression = str(parsed_expression)
    return expression_info
723+
724+
725+
def get_license_index(license_index_location=vendored_scancode_licensedb_index_location):
    """
    Return a list of dictionaries that contain license key information from
    the JSON file at `license_index_location`.

    The default value of `license_index_location` points to a vendored copy
    of the license index from https://scancode-licensedb.aboutcode.org/
    """
    # Decode explicitly as UTF-8 so that reading the JSON file does not
    # depend on the platform default locale encoding.
    with open(license_index_location, encoding='utf-8') as f:
        return json.load(f)
735+
736+
737+
def load_licensing_from_license_index(license_index):
    """
    Return a Licensing object built from `license_index`, an iterable of
    mappings. Each mapping is passed as keyword arguments to LicenseSymbol
    to create one symbol.
    """
    return Licensing([LicenseSymbol(**attributes) for attributes in license_index])
744+
745+
746+
def build_licensing(license_index):
    """
    Return a Licensing object that has been loaded with license keys.

    `license_index` is an iterable of mappings. Entries whose `is_deprecated`
    value is true are skipped. For each remaining entry, `license_key` is
    used as the symbol key and `is_exception` as its exception flag.
    """
    lics = [
        {
            'key': entry.get('license_key', ''),
            # Default the boolean flag to False rather than an empty string
            # so a missing value has the same type as a present one.
            'is_exception': entry.get('is_exception', False),
        }
        for entry in license_index
        if not entry.get('is_deprecated', False)
    ]
    return load_licensing_from_license_index(lics)
757+
758+
759+
def build_spdx_licensing(license_index):
    """
    Return a Licensing object that has been loaded with SPDX license keys.

    `license_index` is an iterable of mappings. Entries without an
    `spdx_license_key` value and entries whose `is_deprecated` value is true
    are skipped. For each remaining entry, `spdx_license_key` is used as the
    symbol key, `other_spdx_license_keys` as its aliases and `is_exception`
    as its exception flag.
    """
    # Massage data such that SPDX license key is the primary license key
    lics = [
        {
            'key': entry.get('spdx_license_key', ''),
            'aliases': entry.get('other_spdx_license_keys', []),
            # Default the boolean flag to False rather than an empty string
            # so a missing value has the same type as a present one.
            'is_exception': entry.get('is_exception', False),
        }
        for entry in license_index
        if entry.get('spdx_license_key')
        and not entry.get('is_deprecated', False)
    ]
    return load_licensing_from_license_index(lics)
774+
620775

621776
def build_symbols_from_unknown_tokens(tokens):
622777
"""

0 commit comments

Comments
 (0)