Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 2 additions & 1 deletion src/licensedcode/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,11 +545,12 @@ def validate_spdx_license_keys(license_expression, licensing):
try:
parsed.render(template='{symbol.wrapped.spdx_license_key}')
except AttributeError:
msg = f"Error rendering SPDX license key for: {key}"
messages.append(msg)
pass

if messages:
raise InvalidLicenseKeyError(messages)
raise InvalidLicenseKeyError(f"ERROR in parsing license_expression: {license_expression}: type: {type(license_expression)} :{messages}")


class InvalidLicenseKeyError(Exception):
Expand Down
105 changes: 89 additions & 16 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,12 @@ class LicenseDetection:
'using the SPDX license expression syntax and ScanCode license keys.')
)

license_expression_spdx = attr.ib(
default=None,
metadata=dict(
help='Full license expression string with SPDX license keys.')
)

matches = attr.ib(
default=attr.Factory(list),
metadata=dict(
Expand Down Expand Up @@ -248,8 +254,17 @@ def from_matches(
detection_log=detection_log,
)
detection.identifier = detection.identifier_with_expression
detection.license_expression_spdx = detection.spdx_license_expression()
return detection

def spdx_license_expression(self):
    """
    Return the SPDX license expression string for this detection.

    The string is built from this object's ``license_expression`` with
    ``build_spdx_license_expression``, using the ``licensing`` object taken
    from the license cache.
    """
    # Imported at call time, as in the original code.
    # NOTE(review): presumably this avoids a circular import with
    # licensedcode.cache -- confirm.
    from licensedcode.cache import build_spdx_license_expression
    from licensedcode.cache import get_cache

    expression = self.license_expression
    licensing = get_cache().licensing
    spdx_expression = build_spdx_license_expression(
        license_expression=expression,
        licensing=licensing,
    )
    return str(spdx_expression)

def __eq__(self, other):
return (
isinstance(other, LicenseDetection)
Expand Down Expand Up @@ -515,6 +530,7 @@ def from_license_detection_mapping(

detection = cls(
license_expression=license_detection_mapping["license_expression"],
license_expression_spdx=license_detection_mapping["license_expression_spdx"],
detection_log=license_detection_mapping.get("detection_log", []) or None,
identifier=license_detection_mapping["identifier"],
matches=matches,
Expand Down Expand Up @@ -590,6 +606,12 @@ class LicenseMatchFromResult(LicenseMatch):
help='Text which was matched')
)

matched_text_diagnostics = attr.ib(
default=None,
metadata=dict(
help='Text which was matched, with extra diagnostics information.')
)

def score(self):
    """
    Return the score of this match as stored in the ``match_score`` attribute.
    """
    stored_score = self.match_score
    return stored_score

Expand All @@ -615,15 +637,18 @@ def from_dict(cls, license_match_mapping):
"""
rule = Rule.from_match_data(license_match_mapping)
matched_text = license_match_mapping.get("matched_text") or None
matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None

return cls(
from_file=license_match_mapping["from_file"],
start_line=license_match_mapping["start_line"],
end_line=license_match_mapping["end_line"],
match_score=license_match_mapping["score"],
matched_length=license_match_mapping["matched_length"],
match_coverage=license_match_mapping["match_coverage"],
matcher=license_match_mapping["matcher"],
text=matched_text,
matched_text_diagnostics=matched_text_diagnostics,
rule=rule,
qspan=None,
ispan=None,
Expand All @@ -642,35 +667,57 @@ def to_dict(
include_text=False,
license_text_diagnostics=False,
whole_lines=True,
rule_details=False,
):
"""
Return a "result" scan data built from a LicenseMatch object.
"""
matched_text = None
if include_text:
matched_text = self.matched_text

result = {}

# Detection Level Information
result['score'] = self.score()
result['license_expression'] = self.rule.license_expression
result['license_expression_spdx'] = self.rule.spdx_license_expression()
result['from_file'] = self.from_file
result['start_line'] = self.start_line
result['end_line'] = self.end_line
if rule_details:
result.update(self.rule.get_flags_mapping())
result['matcher'] = self.matcher
result['score'] = self.score()
result['matched_length'] = self.len()
if rule_details:
result["rule_length"] = self.rule.length
result['match_coverage'] = self.coverage()
result['matcher'] = self.matcher

# LicenseDB Level Information (Rule that was matched)
result['license_expression'] = self.rule.license_expression
result['rule_identifier'] = self.rule.identifier
result['rule_relevance'] = self.rule.relevance
result['rule_identifier'] = self.rule.identifier
result['rule_url'] = self.rule.rule_url
if rule_details:
result["rule_notes"] = self.rule.notes
result["referenced_filenames"] = self.rule.referenced_filenames
if include_text and self.matched_text:
result['matched_text'] = self.matched_text
if license_text_diagnostics and self.matched_text_diagnostics:
result['matched_text_diagnostics'] = self.matched_text_diagnostics
if rule_details:
result["rule_text"] = self.rule.text

if include_text:
result['matched_text'] = matched_text
return result


def populate_matches_with_path(matches, path):
    """
    Given a `matches` list of license match mappings, populate the `from_file`
    item of each mapping with `path`, the path of the origin file for that
    license match, unless `from_file` is already set.

    The mappings are updated in place and nothing is returned. A mapping that
    has no `from_file` key is handled like one with an empty value rather than
    raising a KeyError.
    """
    for match in matches:
        # A `from_file` that is already populated means the match comes from
        # another file and is kept as-is. An empty or missing value means the
        # match is native to the original resource, so it gets that
        # resource's path.
        if not match.get("from_file"):
            match["from_file"] = path


def collect_license_detections(codebase, include_license_clues=True):
"""
Return a list of LicenseDetectionFromResult object rehydrated from
Expand All @@ -680,7 +727,10 @@ def collect_license_detections(codebase, include_license_clues=True):
according to their license detections. This is required because package fields
are populated in package plugin, which runs before the license plugin, and thus
the license plugin step where unknown references to other files are dereferenced
does not show up automatically in package attributes.

Also populate from_file attributes with resource paths for matches which have
origin in the same file.
"""
has_packages = hasattr(codebase.root, 'package_data')
has_licenses = hasattr(codebase.root, 'license_detections')
Expand All @@ -692,7 +742,11 @@ def collect_license_detections(codebase, include_license_clues=True):
resource_license_detections = []
if has_licenses:
license_detections = getattr(resource, 'license_detections', []) or []
for detection in license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
license_clues = getattr(resource, 'license_clues', []) or []
populate_matches_with_path(matches=license_clues, path=resource.path)
codebase.save_resource(resource)

if license_detections:
license_detection_objects = detections_from_license_detection_mappings(
Expand Down Expand Up @@ -729,6 +783,9 @@ def collect_license_detections(codebase, include_license_clues=True):

package_license_detections = package["license_detections"]
if package_license_detections:
for detection in package_license_detections:
populate_matches_with_path(matches=detection["matches"], path=resource.path)
modified = True
package_license_detection_mappings.extend(package_license_detections)
detection_is_same, license_expression = verify_package_license_expression(
license_detection_mappings=package_license_detections,
Expand Down Expand Up @@ -828,6 +885,7 @@ class UniqueDetection:
"""
identifier = attr.ib(default=None)
license_expression = attr.ib(default=None)
license_expression_spdx = attr.ib(default=None)
detection_count = attr.ib(default=None)
matches = attr.ib(default=attr.Factory(list))
detection_log = attr.ib(default=attr.Factory(list))
Expand Down Expand Up @@ -860,12 +918,14 @@ def get_unique_detections(cls, license_detections):
for match in detection.matches
]
))
detection.license_expression_spdx = detection.spdx_license_expression()
detection.identifier = detection.identifier_with_expression

unique_license_detections.append(
cls(
identifier=detection.identifier,
license_expression=detection.license_expression,
license_expression_spdx=detection.license_expression_spdx,
detection_log=detection_log or [],
matches=detection.matches,
detection_count=len(file_regions),
Expand All @@ -875,7 +935,11 @@ def get_unique_detections(cls, license_detections):

return unique_license_detections

def to_dict(self, license_diagnostics):
def to_dict(self,
include_text=False,
license_text_diagnostics=False,
license_diagnostics=False,
):

def dict_fields(attr, value):

Expand All @@ -890,11 +954,20 @@ def dict_fields(attr, value):

return True

return attr.asdict(self, filter=dict_fields)
detection_mapping = attr.asdict(self, filter=dict_fields)
detection_mapping["sample_matches"] = [
match.to_dict(
include_text=include_text,
license_text_diagnostics=license_text_diagnostics,
)
for match in self.matches
]
return detection_mapping

def get_license_detection_object(self):
return LicenseDetection(
license_expression=self.license_expression,
license_expression_spdx=self.license_expression_spdx,
detection_log=self.detection_log,
matches=self.matches,
identifier=self.identifier,
Expand Down
17 changes: 10 additions & 7 deletions src/licensedcode/licenses_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def process_codebase(self, codebase, **kwargs):
Collect the ``license_references`` and ``rule_references``
list of data mappings and add to the ``codebase``.
"""
include_files = 'license' in kwargs
include_packages = 'package' in kwargs
include_files = hasattr(codebase.attributes, 'license_detections')
include_packages = hasattr(codebase.attributes, 'packages')

license_references, rule_references = collect_license_and_rule_references(
codebase=codebase,
Expand All @@ -86,17 +86,24 @@ def collect_license_and_rule_references(codebase, include_packages=True, include
Return a two-tuple of (``license_references``, ``license_rule_references``)
sorted lists of unique mappings collected from a ``codebase``.
"""
if TRACE:
logger_debug(f'include_packages: {include_packages}, include_files: {include_files}')

license_keys = set()
rules_by_identifier = {}

if include_packages:
pks, prules = collect_references_from_packages(codebase)
license_keys.update(pks)
if TRACE:
logger_debug(f'collect_references_from_packages: license keys: {pks}')
logger_debug(f'collect_references_from_packages: rules by id: {prules}')
rules_by_identifier.update(prules)

if include_files:
pks, prules = collect_references_from_files(codebase)
if TRACE:
logger_debug(f'collect_references_from_files: license keys: {pks}')
logger_debug(f'collect_references_from_files: rules by id: {prules}')
license_keys.update(pks)
rules_by_identifier.update(prules)

Expand Down Expand Up @@ -140,10 +147,6 @@ def collect_references_from_packages(codebase):
if expression:
license_keys.update(licensing.license_keys(expression))

detections = getattr(resource, 'license_detections', []) or []
rules_by_id = build_rules_from_detection_data(detections)
rules_by_identifier.update(rules_by_id)

for rule in rules_by_identifier.values():
# TODO: consider using the expression object directly instead
expo = rule.license_expression
Expand Down
43 changes: 29 additions & 14 deletions src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,17 @@ class LicenseMatch(object):
metadata=dict(help='match end line, 1-based')
)

from_file = attr.ib(
default=None,
metadata=dict(
help='File path where this LicenseMatch was originally detected. '
'This needs to be stored as we bring over LicenseMatches from '
'other files into LicenseDetection objects now, and we need '
'to track the origin for these to be able to determine easily '
'which are native to that file.'
)
)

query = attr.ib(
default=None,
metadata=dict(help='Query object for this match')
Expand Down Expand Up @@ -722,7 +733,7 @@ def matched_text(
highlight=True,
highlight_matched='{}',
highlight_not_matched='[{}]',
_usecache=True
_usecache=True,
):
"""
Return the matched text for this match or an empty string if no query
Expand Down Expand Up @@ -762,39 +773,43 @@ def to_dict(
spdx_license_url=SPDX_LICENSE_URL,
include_text=False,
license_text_diagnostics=False,
whole_lines=True,
whole_lines=False,
file_path=None,
):
"""
Return a "result" scan data built from a LicenseMatch object.
"""
matched_text = None
matched_text_diagnostics = None

if include_text:
if license_text_diagnostics:
matched_text = self.matched_text(whole_lines=False, highlight=True)
matched_text_diagnostics = self.matched_text(whole_lines=False, highlight=True)

if whole_lines:
matched_text = self.matched_text(whole_lines=True, highlight=False)
else:
if whole_lines:
matched_text = self.matched_text(whole_lines=True, highlight=False)
else:
matched_text = self.matched_text(whole_lines=False, highlight=False)
matched_text = self.matched_text(whole_lines=False, highlight=False)

result = {}

# Detection Level Information
result['score'] = self.score()
result['license_expression'] = self.rule.license_expression
result['spdx_license_expression'] = self.rule.spdx_license_expression()
result['from_file'] = file_path
result['start_line'] = self.start_line
result['end_line'] = self.end_line
result['matcher'] = self.matcher
result['score'] = self.score()
result['matched_length'] = self.len()
result['match_coverage'] = self.coverage()
result['matcher'] = self.matcher

# LicenseDB Level Information (Rule that was matched)
result['license_expression'] = self.rule.license_expression
result['rule_identifier'] = self.rule.identifier
result['rule_relevance'] = self.rule.relevance
result['rule_identifier'] = self.rule.identifier
result['rule_url'] = self.rule.rule_url

if include_text:
result['matched_text'] = matched_text
if license_text_diagnostics:
result['matched_text_diagnostics'] = matched_text_diagnostics
return result

def get_highlighted_text(self, trace=TRACE_HIGHLIGHTED_TEXT):
Expand Down
Loading