aboutcode-org · AyanSinhaMahapatra · Jan 16, 2024 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/src/licensedcode/cache.py b/src/licensedcode/cache.py
@@ -545,11 +545,12 @@ def validate_spdx_license_keys(license_expression, licensing):
         try:
             parsed.render(template='{symbol.wrapped.spdx_license_key}')
         except AttributeError:
+            msg = f"Error rendering SPDX license key for: {key}"
             messages.append(msg)
             pass
 
     if messages:
-        raise InvalidLicenseKeyError(messages)
+        raise InvalidLicenseKeyError(f"ERROR in parsing license_expression: {license_expression}: type: {type(license_expression)} :{messages}")
 
 
 class InvalidLicenseKeyError(Exception):

diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
@@ -169,6 +169,12 @@ class LicenseDetection:
             'using the SPDX license expression syntax and ScanCode license keys.')
     )
 
+    license_expression_spdx = attr.ib(
+        default=None,
+        metadata=dict(
+            help='Full license expression string with SPDX license keys.')
+    )
+
     matches = attr.ib(
         default=attr.Factory(list),
         metadata=dict(
@@ -248,8 +254,17 @@ def from_matches(
             detection_log=detection_log,
         )
         detection.identifier = detection.identifier_with_expression
+        detection.license_expression_spdx = detection.spdx_license_expression()
         return detection
 
+    def spdx_license_expression(self):
+        """Return the license expression of this detection as a string using SPDX license keys."""
+        from licensedcode.cache import build_spdx_license_expression
+        from licensedcode.cache import get_cache
+        spdx_expression = build_spdx_license_expression(
+            license_expression=self.license_expression, licensing=get_cache().licensing)
+        return str(spdx_expression)
+
     def __eq__(self, other):
         return (
             isinstance(other, LicenseDetection)
@@ -515,6 +530,7 @@ def from_license_detection_mapping(
 
         detection = cls(
             license_expression=license_detection_mapping["license_expression"],
+            license_expression_spdx=license_detection_mapping["license_expression_spdx"],
             detection_log=license_detection_mapping.get("detection_log", []) or None,
             identifier=license_detection_mapping["identifier"],
             matches=matches,
@@ -590,6 +606,12 @@ class LicenseMatchFromResult(LicenseMatch):
             help='Text which was matched')
     )
 
+    matched_text_diagnostics = attr.ib(
+        default=None,
+        metadata=dict(
+            help='Text which was matched, with extra diagnostics information.')
+    )
+
     def score(self):
         return self.match_score
 
@@ -615,15 +637,18 @@ def from_dict(cls, license_match_mapping):
         """
         rule = Rule.from_match_data(license_match_mapping)
         matched_text = license_match_mapping.get("matched_text") or None
+        matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None
 
         return cls(
+            from_file=license_match_mapping["from_file"],
             start_line=license_match_mapping["start_line"],
             end_line=license_match_mapping["end_line"],
             match_score=license_match_mapping["score"],
             matched_length=license_match_mapping["matched_length"],
             match_coverage=license_match_mapping["match_coverage"],
             matcher=license_match_mapping["matcher"],
             text=matched_text,
+            matched_text_diagnostics=matched_text_diagnostics,
             rule=rule,
             qspan=None,
             ispan=None,
@@ -642,35 +667,57 @@ def to_dict(
         include_text=False,
         license_text_diagnostics=False,
         whole_lines=True,
+        rule_details=False,
     ):
         """
         Return a "result" scan data built from a LicenseMatch object.
         """
-        matched_text = None
-        if include_text:
-            matched_text = self.matched_text
-
         result = {}
 
-        # Detection Level Information
-        result['score'] = self.score()
+        result['license_expression'] = self.rule.license_expression
+        result['license_expression_spdx'] = self.rule.spdx_license_expression()
+        result['from_file'] = self.from_file
         result['start_line'] = self.start_line
         result['end_line'] = self.end_line
+        if rule_details:
+            result.update(self.rule.get_flags_mapping())
+        result['matcher'] = self.matcher
+        result['score'] = self.score()
         result['matched_length'] = self.len()
+        if rule_details:
+            result["rule_length"] = self.rule.length
         result['match_coverage'] = self.coverage()
-        result['matcher'] = self.matcher
-
-        # LicenseDB Level Information (Rule that was matched)
-        result['license_expression'] = self.rule.license_expression
-        result['rule_identifier'] = self.rule.identifier
         result['rule_relevance'] = self.rule.relevance
+        result['rule_identifier'] = self.rule.identifier
         result['rule_url'] = self.rule.rule_url
+        if rule_details:
+            result["rule_notes"] = self.rule.notes
+            result["referenced_filenames"] = self.rule.referenced_filenames
+        if include_text and self.matched_text:
+            result['matched_text'] = self.matched_text
+        if license_text_diagnostics and self.matched_text_diagnostics:
+            result['matched_text_diagnostics'] = self.matched_text_diagnostics
+        if rule_details:
+            result["rule_text"] = self.rule.text
 
-        if include_text:
-            result['matched_text'] = matched_text
         return result
 
 
+def populate_matches_with_path(matches, path):
+    """
+    Given a `matches` list of license match mappings, set the "from_file"
+    key of each match that has no origin yet to `path`, the path of the
+    resource these matches were collected from. Update `matches` in place.
+    """
+    for match in matches:
+        # A match with "from_file" already set was brought over from another
+        # file and keeps that origin. A missing or empty "from_file" means the
+        # match is native to this resource; use .get() so that mappings
+        # without the key at all do not raise a KeyError.
+        if not match.get("from_file"):
+            match["from_file"] = path
+
+
 def collect_license_detections(codebase, include_license_clues=True):
     """
     Return a list of LicenseDetectionFromResult object rehydrated from
@@ -680,7 +727,10 @@ def collect_license_detections(codebase, include_license_clues=True):
     according to their license detections. This is required because package fields
     are populated in package plugin, which runs before the license plugin, and thus
     the license plugin step where unknown references to other files are dereferenced
-    does not show up automatically in package attributes. 
+    does not show up automatically in package attributes.
+
+    Also populate from_file attributes with resource paths for matches which have
+    origin in the same file.
     """
     has_packages = hasattr(codebase.root, 'package_data')
     has_licenses = hasattr(codebase.root, 'license_detections')
@@ -692,7 +742,11 @@ def collect_license_detections(codebase, include_license_clues=True):
         resource_license_detections = []
         if has_licenses:
             license_detections = getattr(resource, 'license_detections', []) or []
+            for detection in license_detections:
+                populate_matches_with_path(matches=detection["matches"], path=resource.path)
             license_clues = getattr(resource, 'license_clues', []) or []
+            populate_matches_with_path(matches=license_clues, path=resource.path)
+            codebase.save_resource(resource)
 
             if license_detections:
                 license_detection_objects = detections_from_license_detection_mappings(
@@ -729,6 +783,9 @@ def collect_license_detections(codebase, include_license_clues=True):
 
                 package_license_detections = package["license_detections"]
                 if package_license_detections:
+                    for detection in package_license_detections:
+                        populate_matches_with_path(matches=detection["matches"], path=resource.path)
+                        modified = True
                     package_license_detection_mappings.extend(package_license_detections)
                     detection_is_same, license_expression = verify_package_license_expression(
                         license_detection_mappings=package_license_detections,
@@ -828,6 +885,7 @@ class UniqueDetection:
     """
     identifier = attr.ib(default=None)
     license_expression = attr.ib(default=None)
+    license_expression_spdx = attr.ib(default=None)
     detection_count = attr.ib(default=None)
     matches = attr.ib(default=attr.Factory(list))
     detection_log = attr.ib(default=attr.Factory(list))
@@ -860,12 +918,14 @@ def get_unique_detections(cls, license_detections):
                         for match in detection.matches
                     ]
                 ))
+                detection.license_expression_spdx = detection.spdx_license_expression()
                 detection.identifier = detection.identifier_with_expression
 
             unique_license_detections.append(
                 cls(
                     identifier=detection.identifier,
                     license_expression=detection.license_expression,
+                    license_expression_spdx=detection.license_expression_spdx,
                     detection_log=detection_log or [],
                     matches=detection.matches,
                     detection_count=len(file_regions),
@@ -875,7 +935,11 @@ def get_unique_detections(cls, license_detections):
 
         return unique_license_detections
 
-    def to_dict(self, license_diagnostics):
+    def to_dict(self,
+        include_text=False,
+        license_text_diagnostics=False,
+        license_diagnostics=False,
+    ):
 
         def dict_fields(attr, value):
 
@@ -890,11 +954,20 @@ def dict_fields(attr, value):
 
             return True
 
-        return attr.asdict(self, filter=dict_fields)
+        detection_mapping = attr.asdict(self, filter=dict_fields)
+        detection_mapping["sample_matches"] = [
+            match.to_dict(
+                include_text=include_text,
+                license_text_diagnostics=license_text_diagnostics,
+            )
+            for match in self.matches
+        ]
+        return detection_mapping
 
     def get_license_detection_object(self):
         return LicenseDetection(
             license_expression=self.license_expression,
+            license_expression_spdx=self.license_expression_spdx,
             detection_log=self.detection_log,
             matches=self.matches,
             identifier=self.identifier,

diff --git a/src/licensedcode/licenses_reference.py b/src/licensedcode/licenses_reference.py
@@ -69,8 +69,8 @@ def process_codebase(self, codebase, **kwargs):
         Collect the ``license_references`` and ``rule_references``
         list of data mappings and add to the ``codebase``.
         """
-        include_files = 'license' in kwargs
-        include_packages = 'package' in kwargs
+        include_files = hasattr(codebase.attributes, 'license_detections')
+        include_packages = hasattr(codebase.attributes, 'packages')
 
         license_references, rule_references = collect_license_and_rule_references(
             codebase=codebase,
@@ -86,17 +86,25 @@ def collect_license_and_rule_references(codebase, include_packages=True, include
     Return a two-tuple of (``license_references``, ``license_rule_references``)
     sorted lists of unique mappings collected from a ``codebase``.
     """
+    if TRACE:
+        logger_debug(f'include_packages: {include_packages}, include_files: {include_files}')
 
     license_keys = set()
     rules_by_identifier = {}
 
     if include_packages:
         pks, prules = collect_references_from_packages(codebase)
+        if TRACE:
+            logger_debug(f'collect_references_from_packages: license keys: {pks}')
+            logger_debug(f'collect_references_from_packages: rules by id: {prules}')
         license_keys.update(pks)
         rules_by_identifier.update(prules)
 
     if include_files:
         pks, prules = collect_references_from_files(codebase)
+        if TRACE:
+            logger_debug(f'collect_references_from_files: license keys: {pks}')
+            logger_debug(f'collect_references_from_files: rules by id: {prules}')
         license_keys.update(pks)
         rules_by_identifier.update(prules)
 
@@ -140,10 +148,6 @@ def collect_references_from_packages(codebase):
             if expression:
                 license_keys.update(licensing.license_keys(expression))
 
-        detections = getattr(resource, 'license_detections', []) or []
-        rules_by_id = build_rules_from_detection_data(detections)
-        rules_by_identifier.update(rules_by_id)
-
     for rule in rules_by_identifier.values():
         # TODO: consider using the expresion object directly instead
         expo = rule.license_expression

diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -223,6 +223,17 @@ class LicenseMatch(object):
         metadata=dict(help='match end line, 1-based')
     )
 
+    from_file = attr.ib(
+        default=None,
+        metadata=dict(
+            help='File path where this LicenseMatch was originally detected. '
+                 'This needs to be stored as we bring over LicenseMatches from '
+                 'other files into LicenseDetection objects now, and we need '
+                 'to track the origin for these to be able to determine easily '
+                 'which are native to that file.'
+        )
+    )
+
     query = attr.ib(
         default=None,
         metadata=dict(help='Query object for this match')
@@ -722,7 +733,7 @@ def matched_text(
         highlight=True,
         highlight_matched='{}',
         highlight_not_matched='[{}]',
-        _usecache=True
+        _usecache=True,
     ):
         """
         Return the matched text for this match or an empty string if no query
@@ -762,39 +773,43 @@ def to_dict(
         spdx_license_url=SPDX_LICENSE_URL,
         include_text=False,
         license_text_diagnostics=False,
-        whole_lines=True,
+        whole_lines=False,
+        file_path=None,
     ):
         """
         Return a "result" scan data built from a LicenseMatch object.
         """
         matched_text = None
+        matched_text_diagnostics = None
+
         if include_text:
             if license_text_diagnostics:
-                matched_text = self.matched_text(whole_lines=False, highlight=True)
+                matched_text_diagnostics = self.matched_text(whole_lines=False, highlight=True)
+
+            if whole_lines:
+                matched_text = self.matched_text(whole_lines=True, highlight=False)
             else:
-                if whole_lines:
-                    matched_text = self.matched_text(whole_lines=True, highlight=False)
-                else:
-                    matched_text = self.matched_text(whole_lines=False, highlight=False)
+                matched_text = self.matched_text(whole_lines=False, highlight=False)
 
         result = {}
 
-        # Detection Level Information
-        result['score'] = self.score()
+        result['license_expression'] = self.rule.license_expression
+        result['license_expression_spdx'] = self.rule.spdx_license_expression()
+        result['from_file'] = file_path
         result['start_line'] = self.start_line
         result['end_line'] = self.end_line
+        result['matcher'] = self.matcher
+        result['score'] = self.score()
         result['matched_length'] = self.len()
         result['match_coverage'] = self.coverage()
-        result['matcher'] = self.matcher
-
-        # LicenseDB Level Information (Rule that was matched)
-        result['license_expression'] = self.rule.license_expression
-        result['rule_identifier'] = self.rule.identifier
         result['rule_relevance'] = self.rule.relevance
+        result['rule_identifier'] = self.rule.identifier
         result['rule_url'] = self.rule.rule_url
 
         if include_text:
             result['matched_text'] = matched_text
+        if license_text_diagnostics:
+            result['matched_text_diagnostics'] = matched_text_diagnostics
         return result
 
     def get_highlighted_text(self, trace=TRACE_HIGHLIGHTED_TEXT):