Skip to content

Commit ac04984

Browse files
Add maven license detection updates
Add a new config variable for datafile handlers, which if enabled will run a package-ecosystem specific implementation of license detection. Added a function to implement this for maven which only keeps the relevant information and runs license detection on the whole license statement and not on the respective values/attributes one-by-one. Enabled this for maven to improve license detection there. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent d5e811b commit ac04984

File tree

215 files changed

+1628
-3284
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

215 files changed

+1628
-3284
lines changed

src/packagedcode/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,3 +254,12 @@ def get_package_handler(package_data):
254254
raise UnknownPackageDatasource(package_data)
255255
return ppc
256256

257+
258+
# Package data classes that ship their own, ecosystem-specific implementation
# of license detection for their extracted license statement.
PACKAGE_DATA_CLASS_WITH_CUSTOM_LICENSE_DETECTION = [
    maven.MavenPackageData,
]

# Lookup table of the classes listed above, keyed by their ``datasource_id``.
PACKAGE_DATA_CLASS_BY_DATASOURCE_ID = {
    handler_class.datasource_id: handler_class
    for handler_class in PACKAGE_DATA_CLASS_WITH_CUSTOM_LICENSE_DETECTION
}

src/packagedcode/licensing.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
from summarycode.classify import LEGAL_STARTS_ENDS
3434
from summarycode.classify import README_STARTS_ENDS
3535

36+
import saneyaml
37+
3638

3739
"""
3840
Detect and normalize licenses as found in package manifests data.
@@ -683,7 +685,7 @@ def get_normalized_license_detections(
683685
license_detections.extend(detections)
684686

685687
else:
686-
extracted_license_statement = repr(extracted_license)
688+
extracted_license_statement = saneyaml.dump(extracted_license)
687689
license_detections = get_license_detections_for_extracted_license_statement(
688690
extracted_license_statement=extracted_license_statement,
689691
try_as_expression=try_as_expression,
@@ -725,7 +727,7 @@ def get_normalized_license_detections(
725727
license_detections.extend(detections)
726728

727729
else:
728-
extracted_license_statement = repr(extracted_license_item)
730+
extracted_license_statement = saneyaml.dump(extracted_license_item)
729731

730732
detections = get_license_detections_for_extracted_license_statement(
731733
extracted_license_statement=extracted_license_statement,
@@ -749,6 +751,7 @@ def get_license_detections_and_expression(
749751
try_as_expression=True,
750752
approximate=True,
751753
expression_symbols=None,
754+
datasource_id = None,
752755
):
753756
"""
754757
Given a text `extracted_license_statement` return a list of LicenseDetection objects.
@@ -764,22 +767,33 @@ def get_license_detections_and_expression(
764767
Return None if the `query_string` is empty. Return "unknown" as a license
765768
expression if there is a `query_string` but nothing was detected.
766769
"""
770+
from packagedcode import PACKAGE_DATA_CLASS_BY_DATASOURCE_ID
771+
767772
detection_data = []
768773
license_expression = None
769774

770775
if not extracted_license_statement:
771776
return detection_data, license_expression
772777

773-
license_detections = get_normalized_license_detections(
774-
extracted_license=extracted_license_statement,
775-
try_as_expression=try_as_expression,
776-
approximate=approximate,
777-
expression_symbols=expression_symbols,
778-
)
778+
if datasource_id in PACKAGE_DATA_CLASS_BY_DATASOURCE_ID:
779+
package_data_class = PACKAGE_DATA_CLASS_BY_DATASOURCE_ID.get(datasource_id, None)
780+
license_detections = package_data_class.get_license_detections_for_extracted_license_statement(
781+
extracted_license=extracted_license_statement,
782+
try_as_expression=try_as_expression,
783+
approximate=approximate,
784+
expression_symbols=expression_symbols,
785+
)
786+
else:
787+
license_detections = get_normalized_license_detections(
788+
extracted_license=extracted_license_statement,
789+
try_as_expression=try_as_expression,
790+
approximate=approximate,
791+
expression_symbols=expression_symbols,
792+
)
779793

780794
if not license_detections:
781795
if not isinstance(extracted_license_statement, str):
782-
extracted_license_statement = repr(extracted_license_statement)
796+
extracted_license_statement = saneyaml.dump(extracted_license_statement)
783797
license_detection = get_unknown_license_detection(query_string=extracted_license_statement)
784798
license_detections = [license_detection]
785799

@@ -808,7 +822,7 @@ def get_license_detections_for_extracted_license_statement(
808822
return []
809823

810824
if not isinstance(extracted_license_statement, str):
811-
extracted_license_statement = repr(extracted_license_statement)
825+
extracted_license_statement = saneyaml.dump(extracted_license_statement)
812826

813827
matches, matched_as_expression = get_license_matches_for_extracted_license_statement(
814828
query_string=extracted_license_statement,

src/packagedcode/maven.py

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import os.path
1212
from pprint import pformat
1313

14-
import attr
1514
import javaproperties
1615
import lxml
1716
from packageurl import PackageURL
@@ -26,6 +25,8 @@
2625
from textcode import analysis
2726
from typecode import contenttype
2827

28+
import saneyaml
29+
2930
TRACE = False
3031

3132
logger = logging.getLogger(__name__)
@@ -1153,7 +1154,7 @@ def _parse(
11531154
))
11541155

11551156
# FIXME: there are still other data to map in a PackageData
1156-
return models.PackageData(
1157+
return MavenPackageData(
11571158
datasource_id=datasource_id,
11581159
type=package_type,
11591160
primary_language=primary_language,
@@ -1171,6 +1172,63 @@ def _parse(
11711172
**urls,
11721173
)
11731174

1175+
class MavenPackageData(models.PackageData):
    """
    Package data for Maven POMs with a Maven-specific license detection that
    runs on the whole license statement at once instead of on each of its
    values one by one.
    """

    datasource_id = 'maven_pom'

    @staticmethod
    def get_license_detections_for_extracted_license_statement(
        extracted_license,
        try_as_expression=True,
        approximate=True,
        expression_symbols=None,
    ):
        """
        Return the license detections for the ``extracted_license`` of a POM.

        When ``extracted_license`` is a list of mappings, each mapping is
        reduced to its relevant items: the "distribution" key is always
        dropped and the "name", "url" and "comments" keys are dropped when
        their value is empty. The reduced list is then dumped as a single
        YAML text and license detection is run once on that text.

        Any other ``extracted_license`` structure is delegated unchanged to
        ``get_normalized_license_detections``.

        ``try_as_expression``, ``approximate`` and ``expression_symbols`` are
        passed through as-is to the underlying detection function.

        The ``extracted_license`` argument is never modified.
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import with packagedcode.licensing).
        from packagedcode.licensing import get_normalized_license_detections
        from packagedcode.licensing import get_license_detections_for_extracted_license_statement

        if not MavenPackageData.check_extracted_license_statement_structure(extracted_license):
            return get_normalized_license_detections(
                extracted_license=extracted_license,
                try_as_expression=try_as_expression,
                approximate=approximate,
                expression_symbols=expression_symbols,
            )

        # Build new mappings rather than popping keys from the entries of a
        # shallow list copy: popping would mutate the caller's own mappings
        # and would raise a KeyError on any entry missing one of these keys.
        dropped_when_empty = ("name", "url", "comments")
        cleaned_license_entries = [
            {
                key: value
                for key, value in license_entry.items()
                if key != "distribution"
                and (value or key not in dropped_when_empty)
            }
            for license_entry in extracted_license
        ]

        extracted_license_statement = saneyaml.dump(cleaned_license_entries)

        return get_license_detections_for_extracted_license_statement(
            extracted_license_statement=extracted_license_statement,
            try_as_expression=try_as_expression,
            approximate=approximate,
            expression_symbols=expression_symbols,
        )

    @staticmethod
    def check_extracted_license_statement_structure(extracted_license):
        """
        Return True if ``extracted_license`` is a list whose items are all
        mappings (an empty list qualifies), and False otherwise.
        """
        return (
            isinstance(extracted_license, list)
            and all(isinstance(item, dict) for item in extracted_license)
        )
1231+
11741232

11751233
def build_vcs_and_code_view_urls(scm):
11761234
"""

src/packagedcode/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,7 +879,8 @@ def get_license_detections_and_expression(self):
879879

880880
return get_license_detections_and_expression(
881881
extracted_license_statement=self.extracted_license_statement,
882-
default_relation_license=default_relation_license
882+
default_relation_license=default_relation_license,
883+
datasource_id=self.datasource_id,
883884
)
884885

885886

tests/formattedcode/data/common/manifests-expected.json

Lines changed: 20 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -36,40 +36,34 @@
3636
"end_line": 1,
3737
"matched_length": 8,
3838
"match_coverage": 100.0,
39-
"matcher": "1-hash",
39+
"matcher": "2-aho",
4040
"license_expression": "cddl-1.0",
4141
"rule_identifier": "cddl-1.0.RULE",
4242
"rule_relevance": 100,
4343
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0.RULE",
44-
"matched_text": "Common Development and Distribution License (CDDL) v1.0"
45-
}
46-
],
47-
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5"
48-
},
49-
{
50-
"license_expression": "cddl-1.0",
51-
"matches": [
44+
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0"
45+
},
5246
{
5347
"score": 100.0,
54-
"start_line": 1,
55-
"end_line": 1,
48+
"start_line": 2,
49+
"end_line": 2,
5650
"matched_length": 7,
5751
"match_coverage": 100.0,
58-
"matcher": "1-hash",
52+
"matcher": "2-aho",
5953
"license_expression": "cddl-1.0",
6054
"rule_identifier": "cddl-1.0_4.RULE",
6155
"rule_relevance": 100,
6256
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_4.RULE",
63-
"matched_text": "http://www.sun.com/cddl/cddl.html"
57+
"matched_text": " url: http://www.sun.com/cddl/cddl.html"
6458
}
6559
],
66-
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83"
60+
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880"
6761
}
6862
],
6963
"other_license_expression": null,
7064
"other_license_expression_spdx": null,
7165
"other_license_detections": [],
72-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n comments:\n distribution: repo\n",
66+
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
7367
"notice_text": null,
7468
"source_packages": [
7569
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
@@ -480,18 +474,13 @@
480474
"license_expression": "apache-2.0",
481475
"detection_count": 1
482476
},
483-
{
484-
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5",
485-
"license_expression": "cddl-1.0",
486-
"detection_count": 1
487-
},
488477
{
489478
"identifier": "cddl_1_0-c6dbef4d-659c-289f-5ee9-1ca0278edad6",
490479
"license_expression": "cddl-1.0",
491480
"detection_count": 1
492481
},
493482
{
494-
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83",
483+
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880",
495484
"license_expression": "cddl-1.0",
496485
"detection_count": 1
497486
},
@@ -653,40 +642,34 @@
653642
"end_line": 1,
654643
"matched_length": 8,
655644
"match_coverage": 100.0,
656-
"matcher": "1-hash",
645+
"matcher": "2-aho",
657646
"license_expression": "cddl-1.0",
658647
"rule_identifier": "cddl-1.0.RULE",
659648
"rule_relevance": 100,
660649
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0.RULE",
661-
"matched_text": "Common Development and Distribution License (CDDL) v1.0"
662-
}
663-
],
664-
"identifier": "cddl_1_0-9893b55c-3b2b-4ee8-a932-6c6c93a63fc5"
665-
},
666-
{
667-
"license_expression": "cddl-1.0",
668-
"matches": [
650+
"matched_text": "- name: Common Development and Distribution License (CDDL) v1.0"
651+
},
669652
{
670653
"score": 100.0,
671-
"start_line": 1,
672-
"end_line": 1,
654+
"start_line": 2,
655+
"end_line": 2,
673656
"matched_length": 7,
674657
"match_coverage": 100.0,
675-
"matcher": "1-hash",
658+
"matcher": "2-aho",
676659
"license_expression": "cddl-1.0",
677660
"rule_identifier": "cddl-1.0_4.RULE",
678661
"rule_relevance": 100,
679662
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/cddl-1.0_4.RULE",
680-
"matched_text": "http://www.sun.com/cddl/cddl.html"
663+
"matched_text": " url: http://www.sun.com/cddl/cddl.html"
681664
}
682665
],
683-
"identifier": "cddl_1_0-ef82fc8c-50cb-6f35-1814-d2eb0bc13e83"
666+
"identifier": "cddl_1_0-dd3dd7df-afca-6a5e-492c-f7b279fdd880"
684667
}
685668
],
686669
"other_license_expression": null,
687670
"other_license_expression_spdx": null,
688671
"other_license_detections": [],
689-
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n comments:\n distribution: repo\n",
672+
"extracted_license_statement": "- name: Common Development and Distribution License (CDDL) v1.0\n url: http://www.sun.com/cddl/cddl.html\n",
690673
"notice_text": null,
691674
"source_packages": [
692675
"pkg:maven/javax.persistence/[email protected]?classifier=sources"
@@ -1397,7 +1380,7 @@
13971380
"rule_identifier": "pypi_gnu_lesser_general_public_license_v3.RULE",
13981381
"rule_relevance": 100,
13991382
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/pypi_gnu_lesser_general_public_license_v3.RULE",
1400-
"matched_text": "['License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)']"
1383+
"matched_text": "- 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)'"
14011384
}
14021385
],
14031386
"identifier": "lgpl_3_0-272571eb-5e68-95b6-ddb0-71de2d8df321"

0 commit comments

Comments
 (0)