|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +# |
| 3 | +# Copyright (c) nexB Inc. and others. All rights reserved. |
| 4 | +# ScanCode is a trademark of nexB Inc. |
| 5 | +# SPDX-License-Identifier: Apache-2.0 |
| 6 | +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. |
| 7 | +# See https://github.com/nexB/scancode-toolkit for support or download. |
| 8 | +# See https://aboutcode.org for more information about nexB OSS projects. |
| 9 | +# |
| 10 | + |
| 11 | +import click |
| 12 | + |
| 13 | +from licensedcode.tokenize import ngrams |
| 14 | + |
| 15 | +import synclic |
| 16 | + |
| 17 | +""" |
| 18 | +A script to generate false-positive license detection rules from lists of SPDX |
| 19 | +licenses. |
| 20 | +
|
| 21 | +Common license detection tools use lists of SPDX license ids to support their operations. |
| 22 | +As a result, we get a lot of matched licenses when scanning these lists and, in these cases, the matches are false positives. |
| 23 | +
|
| 24 | +Here we fetch all released SPDX license lists and generate false positives |
| 25 | +using these approaches to have a reasonable set of combinations of license ids |
| 26 | +as found in the wild: |
| 27 | +
|
| 28 | +1. For each SPDX license list release, we consider these lists: |
| 29 | + - all IDs |
| 30 | + - all non-deprecated IDs |
| 31 | + - all licenses |
| 32 | + - all non-deprecated licenses |
| 33 | + - all exceptions |
| 34 | + - all non-deprecated exceptions |
| 35 | +
|
| 36 | +We generate lists of ids only and lists of ids and names. |
| 37 | +
|
| 38 | +2. For each of these lists we sort them: |
| 39 | + - respecting case |
| 40 | + - ignoring case |
| 41 | +
|
| 42 | +3. For each of these sorted lists we collect sub-sequences of 6 licenses, one |
| 43 | + per line, and generate a false positive RULE from each sub-sequence. |
| 44 | +
|
| 45 | +If a RULE already exists, it will be skipped. |
| 46 | +""" |
| 47 | + |
# Module-wide tracing flag; cli() overwrites it from the --trace option.
TRACE = False

# Text of one generated rule: a separator line, the rule data fields, a "---"
# line and then a "{}" placeholder filled in with the rule text via
# str.format().
template = (
    '----------------------------------------\n'
    'is_false_positive: yes\n'
    'notes: a sequence of SPDX license ids and names is not a license\n'
    '---\n'
    '{}\n'
)
| 56 | + |
| 57 | + |
def _group_spdx_licenses(spdx_licenses):
    """
    Return a list of six lists of license objects built from the
    ``spdx_licenses`` iterable, in this order: all licenses and exceptions,
    all non-deprecated licenses and exceptions, licenses, exceptions,
    non-deprecated licenses and non-deprecated exceptions.

    Each object must have ``is_deprecated`` and ``is_exception`` attributes.
    """
    all_licenses_and_exceptions = []
    all_licenses_and_exceptions_non_deprecated = []
    licenses = []
    exceptions = []
    licenses_non_deprecated = []
    exceptions_non_deprecated = []

    for lspdx in spdx_licenses:
        is_deprecated = lspdx.is_deprecated

        all_licenses_and_exceptions.append(lspdx)
        if not is_deprecated:
            all_licenses_and_exceptions_non_deprecated.append(lspdx)

        if lspdx.is_exception:
            exceptions.append(lspdx)
            if not is_deprecated:
                exceptions_non_deprecated.append(lspdx)
        else:
            licenses.append(lspdx)
            if not is_deprecated:
                licenses_non_deprecated.append(lspdx)

    return [
        all_licenses_and_exceptions,
        all_licenses_and_exceptions_non_deprecated,
        licenses,
        exceptions,
        licenses_non_deprecated,
        exceptions_non_deprecated,
    ]


def _sorted_ids_and_names(lic_list):
    """
    Return a list of four lists of strings built from the ``lic_list`` list of
    license objects: the ids, then the "id name" strings, sorted by id
    respecting case; then the same two lists sorted by id ignoring case.

    Each object must have ``spdx_license_key`` and ``name`` attributes.
    """
    sort_keys = (
        lambda lic: lic.spdx_license_key,
        lambda lic: lic.spdx_license_key.lower(),
    )

    sorted_lists = []
    for sort_key in sort_keys:
        ordered = sorted(lic_list, key=sort_key)
        sorted_lists.append([lic.spdx_license_key for lic in ordered])
        sorted_lists.append([f'{lic.spdx_license_key} {lic.name}' for lic in ordered])
    return sorted_lists


@click.command()
@click.argument(
    # Directory handed to synclic.SpdxSource as its external base directory.
    'license_dir', type=click.Path(), metavar='DIR')

@click.argument(
    # File where the generated false-positive rules are written.
    'output', type=click.Path(), metavar='FILE')

@click.option(
    '--commitish', type=str, default=None,
    help='An optional commitish to use for SPDX license data instead of the latest release.')

@click.option(
    # When provided, the lines of this file are used as the single list of
    # texts and no SPDX license data is fetched.
    '--from-list', default=None, type=click.Path(), metavar='LIST_FILE',
    help='Use file with a list of entries to ignore instead')

@click.option(
    '-n', '--ngrams-length', type=int, default=6,
    help='Number of elements in a sub-sequence when generating a rule.')

@click.option(
    '-t', '--trace', is_flag=True, default=False,
    help='Print execution trace.')

@click.help_option('-h', '--help')
def cli(license_dir, output, commitish=None, from_list=None, trace=False, ngrams_length=6):
    """
    Generate ScanCode false-positive license detection rules from lists of SPDX
    license. Save these in FILE for use with buildrules.

    the `spdx` directory is used as a temp store for fetched SPDX licenses.
    """
    # NOTE: the docstring above is displayed by click as the command help text
    # and is therefore kept unchanged.
    global TRACE
    TRACE = trace

    if not from_list:
        spdx_source = synclic.SpdxSource(external_base_dir=license_dir)

        spdx_by_key = spdx_source.get_licenses(
            commitish=commitish,
            skip_oddities=False,
        )

        # Each group of licenses contributes four sorted lists of texts, and
        # the resulting order drives the order of the rules in the output.
        lists_of_sorted_licenses = []
        for lic_list in _group_spdx_licenses(spdx_by_key.values()):
            lists_of_sorted_licenses.extend(_sorted_ids_and_names(lic_list))

    else:
        # Read explicitly as UTF-8 rather than with the platform default
        # encoding so that non-ASCII entries are decoded the same everywhere.
        with open(from_list, encoding='utf-8') as inp:
            lists_of_sorted_licenses = [inp.read().splitlines()]

    # License names may contain non-ASCII characters: always write UTF-8
    # instead of relying on the platform default encoding.
    with open(output, 'w', encoding='utf-8') as o:
        for lic_list in lists_of_sorted_licenses:
            write_ngrams(texts=lic_list, output=o, ngram_length=ngrams_length)

        # Closing separator line after the last rule.
        o.write('----------------------------------------\n')
| 159 | + |
def write_ngrams(texts, output, _seen=set(), ngram_length=6):
    """
    Write to the ``output`` file-like object one ``template``-formatted entry
    for each ngram of ``ngram_length`` items of the ``texts`` list. The items
    of an ngram are joined with one item per line.

    An ngram text already present in ``_seen`` is skipped and every written
    text is added to ``_seen``. The default ``_seen`` set is created once and
    shared by all the calls that do not pass their own set, such that a given
    text is written only once across these calls.
    """
    for ngram in ngrams(texts, ngram_length=ngram_length):
        rule_text = '\n'.join(ngram)
        if rule_text not in _seen:
            _seen.add(rule_text)
            output.write(template.format(rule_text))
| 170 | + |
# Run the command line interface only when this file is executed as a script.
if __name__ == "__main__":
    cli()
0 commit comments