Skip to content

Commit d2aa2ba

Browse files
committed
fix: try to detect licensetexts, or pass silently
Signed-off-by: Jan Kowalleck <[email protected]>
1 parent 78133ab commit d2aa2ba

15 files changed

+263
-98
lines changed
Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
# This file is part of CycloneDX Python
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# SPDX-License-Identifier: Apache-2.0
16+
# Copyright (c) OWASP Foundation. All Rights Reserved.
17+
18+
from sys import getdefaultencoding
19+
20+
from chardet import detect as chardetect
21+
22+
23+
def bytes2str(data: bytes, *, errors: str = 'strict') -> str:
    """Decode ``data`` to text, guessing its encoding.

    The encoding is guessed by ``chardet``; if no guess is available, the
    interpreter's default encoding is used. If the guessed encoding has no
    codec in this Python installation, the default encoding is used as well.

    :param data: the raw bytes to decode.
    :param errors: error handler name passed on to ``bytes.decode``.
    :return: the decoded text.
    :raises UnicodeDecodeError: if decoding fails and ``errors`` demands raising.
    """
    fallback = getdefaultencoding()
    # see https://docs.python.org/3/library/codecs.html#standard-encodings
    encoding = (chardetect(data)['encoding'] or fallback).replace(
        # replace Windows-encoding with code-page
        'Windows-', 'cp')
    try:
        return data.decode(encoding, errors)
    except LookupError:
        # The detector may name an encoding that Python ships no codec for
        # (e.g. EUC-TW). Callers only expect UnicodeDecodeError, so retry with
        # the default encoding rather than leaking an unrelated LookupError.
        return data.decode(fallback, errors)

cyclonedx_py/_internal/utils/io.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,20 +15,14 @@
1515
# SPDX-License-Identifier: Apache-2.0
1616
# Copyright (c) OWASP Foundation. All Rights Reserved.
1717

18-
from sys import getdefaultencoding
1918
from tempfile import NamedTemporaryFile
2019
from typing import BinaryIO
2120

22-
from chardet import detect as chardetect
21+
from .bytes import bytes2str
2322

2423

2524
def io2str(io: BinaryIO, *, errors: str = 'strict') -> str:
26-
data = io.read()
27-
# see https://docs.python.org/3/library/codecs.html#standard-encodings
28-
encoding = (chardetect(data)['encoding'] or getdefaultencoding()).replace(
29-
# replace Windows-encoding with code-page
30-
'Windows-', 'cp')
31-
return data.decode(encoding, errors)
25+
return bytes2str(io.read(), errors=errors)
3226

3327

3428
def io2file(io: BinaryIO, *, errors: str = 'strict') -> str:

cyclonedx_py/_internal/utils/pep639.py

Lines changed: 43 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,13 @@
2323

2424
from base64 import b64encode
2525
from os.path import join
26-
from typing import TYPE_CHECKING, Generator, Set, Union
26+
from typing import TYPE_CHECKING, Generator
2727

2828
from cyclonedx.factory.license import LicenseFactory
2929
from cyclonedx.model import AttachedText, Encoding
3030
from cyclonedx.model.license import DisjunctiveLicense, LicenseAcknowledgement
3131

32-
from .io import io2str
32+
from .bytes import bytes2str
3333
from .mimetypes import guess_type
3434

3535
if TYPE_CHECKING: # pragma: no cover
@@ -38,102 +38,56 @@
3838

3939
from cyclonedx.model.license import License
4040

41-
42-
def _try_load(dist: 'Distribution', metadir: str, filename: str) -> Union[str, None]:
43-
# Might raise NotImplementedError in theory
44-
# but nothing we can do in that case.
45-
try:
46-
candidate = dist.locate_file(join(metadir, filename))
47-
except NotImplementedError:
48-
return None
49-
50-
if not candidate:
51-
return None
52-
53-
try:
54-
with open(str(candidate), 'rb') as fin:
55-
return io2str(fin)
56-
except FileNotFoundError:
57-
pass
58-
return None
59-
60-
61-
def handle_bad_license_file_encoding(
62-
dist: 'Distribution',
63-
lfile: str,
64-
logger: 'Logger'
65-
) -> Union[str, None]:
66-
# Distribution has no method to find the actual metadata dir,
67-
# e.g. dist-info or egg-info.
68-
# So we mimic the logic in PathDistribution and check both subdirs
69-
content: Union[str, None] = None
70-
for metadir in ('.dist-info', '.egg-info'):
71-
content = _try_load(dist, metadir, lfile)
72-
if content:
73-
break
74-
75-
if content is None:
76-
logger.debug('Error: license file %r for dist %r is not UTF-8 encoded',
77-
lfile, dist.metadata['Name'])
78-
return content
79-
80-
81-
def gather_license_texts(
82-
dist: 'Distribution',
83-
lfiles: Set[str],
84-
logger: 'Logger'
85-
) -> Generator['License', None, None]:
86-
lack = LicenseAcknowledgement.DECLARED
87-
for mlfile in lfiles:
88-
# see spec: https://peps.python.org/pep-0639/#add-license-file-field
89-
# latest spec rev: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020 # noqa: E501
90-
91-
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
92-
# but in practice, other locations are used, too.
93-
# loop over the candidate location and pick the first one found.
94-
content = None
95-
for loc in ('licenses', 'license_files', '.'):
96-
path = join(loc, mlfile)
97-
try:
98-
content = dist.read_text(path)
99-
except UnicodeDecodeError:
100-
# Malformed, try harder
101-
content = handle_bad_license_file_encoding(dist, path, logger)
102-
103-
if content is not None:
104-
break
105-
else:
106-
logger.debug('Error: failed to read license file %r for dist %r',
107-
mlfile, dist.metadata['Name'])
108-
continue
109-
110-
encoding = None
111-
content_type = guess_type(mlfile) or AttachedText.DEFAULT_CONTENT_TYPE
112-
# per default, license files are human-readable texts.
113-
if not content_type.startswith('text/'):
114-
encoding = Encoding.BASE_64
115-
content = b64encode(content.encode('utf-8')).decode('ascii')
116-
yield DisjunctiveLicense(
117-
name=f'declared license file: {mlfile}',
118-
acknowledgement=lack,
119-
text=AttachedText(
120-
content=content,
121-
encoding=encoding,
122-
content_type=content_type
123-
))
41+
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
42+
# but in practice, other locations are used, too.
43+
_LICENSE_LOCATIONS = ('licenses', 'license_files', '')
12444

12545

12646
def dist2licenses(
12747
dist: 'Distribution',
12848
gather_text: bool,
12949
logger: 'Logger'
13050
) -> Generator['License', None, None]:
51+
lfac = LicenseFactory()
52+
lack = LicenseAcknowledgement.DECLARED
13153
metadata = dist.metadata # see https://packaging.python.org/en/latest/specifications/core-metadata/
13254
if (lexp := metadata['License-Expression']) is not None:
133-
lfac = LicenseFactory()
134-
lack = LicenseAcknowledgement.DECLARED
13555
# see spec: https://peps.python.org/pep-0639/#add-license-expression-field
13656
yield lfac.make_from_string(lexp,
13757
license_acknowledgement=lack)
138-
if gather_text and (lfiles := set(fn for fn in metadata.get_all('License-File', ()))):
139-
yield from gather_license_texts(dist, lfiles, logger)
58+
if gather_text:
59+
for mlfile in set(metadata.get_all('License-File', ())):
60+
# see spec: https://peps.python.org/pep-0639/#add-license-file-field
61+
# latest spec rev: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020 # noqa: E501
62+
content = None
63+
for mlpath in _LICENSE_LOCATIONS:
64+
try:
65+
content = dist.read_text(join(mlpath, mlfile))
66+
except UnicodeDecodeError as err:
67+
try:
68+
content = bytes2str(err.object)
69+
except UnicodeDecodeError:
70+
pass
71+
else:
72+
break # for-loop
73+
else:
74+
if content is not None:
75+
break # for-loop
76+
if content is None: # pragma: no cover
77+
logger.debug('Error: failed to read license file %r for dist %r',
78+
mlfile, metadata['Name'])
79+
continue
80+
encoding = None
81+
content_type = guess_type(mlfile) or AttachedText.DEFAULT_CONTENT_TYPE
82+
# per default, license files are human-readable texts.
83+
if not content_type.startswith('text/'):
84+
encoding = Encoding.BASE_64
85+
content = b64encode(content.encode('utf-8')).decode('ascii')
86+
yield DisjunctiveLicense(
87+
name=f'declared license file: {mlfile}',
88+
acknowledgement=lack,
89+
text=AttachedText(
90+
content=content,
91+
encoding=encoding,
92+
content_type=content_type
93+
))

tests/_data/infiles/environment/with-license-pep639/init.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,8 @@ def main() -> None:
7474
# with expression-like License AND License-File
7575
'cryptography==43.0.1', # https://github.com/CycloneDX/cyclonedx-python/issues/826
7676
# with possibly unexpected license files
77-
"../../_helpers/local_pckages/with-license-pep639_regression-issue868", # https://github.com/CycloneDX/cyclonedx-python/issues/868
77+
# https://github.com/CycloneDX/cyclonedx-python/issues/868
78+
"../../_helpers/local_pckages/with-license-pep639_regression-issue868",
7879
)
7980

8081

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.1.xml.bin

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.2.json.bin

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.2.xml.bin

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.3.json.bin

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.3.xml.bin

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.4.json.bin

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)