Skip to content

Commit f3e6753

Browse files
authored
[Fix CI] Improve license parsing in ddev validation (#22559)
* Improve license parsing * Add changelog * fix(licenses): Fix validation of SPDX identifiers containing -or- and -and- - Update _OP_SPLIT regex to use negative lookbehind/lookahead assertions - Prevent matching "-or-" or "-and-" inside identifiers like "GPL-2.0-or-later" - Fix fallback logic to check per-chunk instead of entire parts list - Ensure all license identifiers are extracted from compound expressions Rationale: The _OP_SPLIT regex was incorrectly splitting SPDX identifiers like GPL-2.0-or-later into fragments (GPL-2.0- and -later) because \bOR\b with re.IGNORECASE matched the lowercase "or" between hyphens. This caused valid licenses with -or-later or -and- segments to be rejected by ddev validate licenses. Additionally, the fallback logic bug prevented proper extraction of multiple license identifiers from compound expressions, resulting in only the first identifier being extracted. This commit made by [/dd:git:commit:atomic](https://github.com/DataDog/claude-marketplace/tree/main/dd/commands/git/commit/atomic.md) * Add type hint to parts * fix(licenses): Remove comma as a separator in license expressions - Remove comma from _OP_SPLIT regex separator list - Fixes incorrect splitting of license names containing commas - Addresses validation failures for licenses like "Apache License, Version 2.0" Rationale: Commas are not part of the SPDX expression syntax (only AND, OR, and WITH are valid operators). When commas appear in license strings, they are part of the license name itself, not expression separators. The previous implementation incorrectly split "Apache License, Version 2.0" into ["Apache License", "Version 2.0"], causing validation failures. This commit made by [/dd:git:commit:atomic](https://github.com/DataDog/claude-marketplace/tree/main/dd/commands/git/commit/atomic.md)
1 parent 5fa76fc commit f3e6753

File tree

2 files changed

+38
-7
lines changed

2 files changed

+38
-7
lines changed

ddev/changelog.d/22559.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve license parsing in validation

ddev/src/ddev/cli/validate/licenses.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,22 @@
33
# Licensed under a 3-clause BSD style license (see LICENSE)
44
from __future__ import annotations
55

6+
import re
67
from typing import TYPE_CHECKING
78

89
import click
910

1011
if TYPE_CHECKING:
1112
from ddev.cli.application import Application
1213

14+
# Split license expressions on operators (AND, OR, and/or) and separators (/, +).
15+
# Use negative lookbehind (?<!-) and lookahead (?!-) to avoid matching "-or-" or "-and-"
16+
# inside SPDX identifiers like "GPL-2.0-or-later" or "LGPL-2.1-or-later".
17+
# NOTE: splitting with this pattern can yield empty strings. For example, "and/or" is
# normally consumed piecewise by the earlier alternatives ("and", "/", "or"), leaving empty
# pieces in between, and a trailing "+" leaves an empty final piece. Consumers of
# `_OP_SPLIT.split()` therefore need to discard empty chunks.
_OP_SPLIT = re.compile(r'\s*(?:(?<!-)\b(?:AND|OR)\b(?!-)|/|\+|\band/or\b)\s*', re.IGNORECASE)
18+
# SPDX ids are typically: letters/digits plus . + -
19+
# and may appear as LicenseRef-* / DocumentRef-*:LicenseRef-*
20+
# NOTE: the optional "DocumentRef-...:" prefix is the pattern's only capturing group, so
# `_ID.findall()` returns just that group's text (an empty string whenever the prefix is
# absent) rather than the full identifier. Use `finditer()` / `match.group(0)` when the
# whole match is needed.
_ID = re.compile(r"(DocumentRef-[A-Za-z0-9.+-]+:)?LicenseRef-[A-Za-z0-9.+-]+|[A-Za-z0-9.+-]+")
21+
1322

1423
def format_attribution_line(package_name, license_id, package_copyright):
1524
if ',' in package_copyright:
@@ -330,6 +339,33 @@ def read_file_lines(file, encoding='utf-8'):
330339
return f.readlines()
331340

332341

342+
def split_license_expression_simple(expr: str) -> list[str]:
    """Split a license expression into the individual license names it mentions.

    The expression is cut on the separators recognised by ``_OP_SPLIT``.
    Parentheses are treated as plain whitespace, and anything from a ``WITH``
    keyword onwards is dropped so that only the license on its left is kept.

    Every remaining chunk is returned whole, with internal whitespace
    collapsed, so multi-word names such as ``Apache License, Version 2.0``
    stay intact. The one exception is a chunk containing
    ``DocumentRef-*:LicenseRef-*`` references: those references are returned
    on their own, in full.

    Args:
        expr: Raw license string.

    Returns:
        The license names in order of appearance; an empty list when ``expr``
        contains no license text.
    """
    # Parentheses only group sub-expressions, so treat them as whitespace.
    expr = expr.replace("(", " ").replace(")", " ")

    # Compiled once per call instead of once per chunk.
    with_clause = re.compile(r"\s+\bWITH\b\s+", re.IGNORECASE)

    parts: list[str] = []
    for chunk in _OP_SPLIT.split(expr):
        chunk = chunk.strip()
        if not chunk:
            # The separator pattern can produce empty pieces; ignore them.
            continue

        # Handle "WITH" exceptions by taking the left side license id.
        chunk = with_clause.split(chunk)[0].strip()

        # `_ID` has a single capturing group (the optional DocumentRef prefix), so
        # `findall()` would return only that prefix -- truncating the reference to
        # "DocumentRef-...:" -- and an empty string for every other token. Read the
        # full match through `group(0)` instead.
        document_refs = [match.group(0) for match in _ID.finditer(chunk) if match.group(1)]
        if document_refs:
            parts.extend(document_refs)
            continue

        # Otherwise keep the whole chunk as one license name, normalising whitespace.
        name = re.sub(r'\s+', ' ', chunk).strip()
        if name:
            parts.append(name)
    return parts
367+
368+
333369
@click.command(short_help='Validate third-party license list')
334370
@click.option('--sync', '-s', is_flag=True, help='Generate the `LICENSE-3rdparty.csv` file')
335371
@click.pass_obj
@@ -420,13 +456,7 @@ def licenses(app: Application, sync: bool):
420456
for package_license in data['licenses']:
421457
package_license = package_license.strip('"')
422458

423-
expanded_licenses = []
424-
for separator in (' and/or ', '/', ' OR ', ' or '):
425-
if separator in package_license:
426-
expanded_licenses.extend(package_license.split(separator))
427-
break
428-
else:
429-
expanded_licenses.append(package_license)
459+
expanded_licenses = split_license_expression_simple(package_license)
430460

431461
for expanded_license in expanded_licenses:
432462
normalized_license = expanded_license.lower()

0 commit comments

Comments
 (0)