Skip to content

Commit 01c8854

Browse files
author
Michael Schlenker
committed
feat: Handle misencoded license text files gracefully.
Signed-off-by: Michael Schlenker <[email protected]>
1 parent 1293b7e commit 01c8854

File tree

1 file changed

+99
-31
lines changed

1 file changed

+99
-31
lines changed

cyclonedx_py/_internal/utils/pep639.py

Lines changed: 99 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,13 @@
2323

2424
from base64 import b64encode
2525
from os.path import join
26-
from typing import TYPE_CHECKING, Generator
26+
from typing import TYPE_CHECKING, Generator, Set, Union
2727

2828
from cyclonedx.factory.license import LicenseFactory
2929
from cyclonedx.model import AttachedText, Encoding
3030
from cyclonedx.model.license import DisjunctiveLicense, LicenseAcknowledgement
3131

32+
from .io import io2str
3233
from .mimetypes import guess_type
3334

3435
if TYPE_CHECKING: # pragma: no cover
@@ -38,43 +39,110 @@
3839
from cyclonedx.model.license import License
3940

4041

42+
def handle_bad_license_file_encoding(
    dist: 'Distribution',
    lfile: str,
    logger: 'Logger'
) -> Union[str, None]:
    """Try to recover the text of a license file that failed UTF-8 decoding.

    The file is looked up on disk through ``dist.locate_file`` and its raw
    bytes are handed to ``io2str`` for decoding. This is a best-effort
    fallback: any failure to locate, open or decode the file results in
    ``None`` rather than an exception.

    :param dist: distribution the license file belongs to.
    :param lfile: path of the license file, relative to the metadata directory.
    :param logger: receives a debug message when no text could be recovered.
    :return: the decoded text, or ``None`` when nothing could be recovered.
    """

    def try_load(dist: 'Distribution', metadir: str, filename: str) -> Union[str, None]:
        """Return the decoded content of ``metadir/filename``, or ``None`` if unavailable."""
        # Might raise NotImplementedError in theory
        # but nothing we can do in that case.
        try:
            candidate = dist.locate_file(join(metadir, filename))
        except NotImplementedError:
            return None

        if not candidate:
            return None

        try:
            with open(str(candidate), 'rb') as fin:
                return io2str(fin)
        except (OSError, UnicodeDecodeError):
            # Missing file, directory, permission problem, or a payload the
            # fallback decoder cannot decode either: give up on this candidate
            # instead of aborting the whole run.
            return None

    # Distribution has no method to find the actual metadata dir,
    # e.g. dist-info or egg-info.
    # So we mimic the logic in PathDistribution and check both subdirs
    # NOTE(review): real metadata directories are normally named
    # '<name>-<version>.dist-info' / '.egg-info'; confirm that these literal
    # directory names can ever resolve to an existing file.
    content: Union[str, None] = None
    for metadir in ('.dist-info', '.egg-info'):
        content = try_load(dist, metadir, lfile)
        # Compare against None (not truthiness) so that an empty but readable
        # file found in the first directory is not overwritten by the second.
        if content is not None:
            break

    if content is None:
        logger.debug('Error: license file %r for dist %r is not UTF-8 encoded',
                     lfile, dist.metadata['Name'])
    return content
79+
80+
81+
def gather_license_texts(
    dist: 'Distribution',
    lfiles: Set[str],
    logger: 'Logger'
) -> Generator['License', None, None]:
    """Yield one declared license, with attached text, per readable license file.

    :param dist: distribution whose metadata directory holds the license files.
    :param lfiles: license file names as declared in the ``License-File`` metadata.
    :param logger: receives a debug message for every file that cannot be read.
    :return: generator of ``DisjunctiveLicense`` objects; files that cannot be
        read are skipped.
    """
    acknowledgement = LicenseAcknowledgement.DECLARED
    # see spec: https://peps.python.org/pep-0639/#add-license-file-field
    # latest spec rev: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020  # noqa: E501
    #
    # per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
    # but in practice, other locations are used, too.
    # The candidate locations are probed in order; the first hit wins.
    search_dirs = ('licenses', 'license_files', '.')

    for license_file in lfiles:
        text = None
        undecodable_path = None
        for search_dir in search_dirs:
            candidate = join(search_dir, license_file)
            try:
                text = dist.read_text(candidate)
            except UnicodeDecodeError:
                # The file exists but is not valid UTF-8: stop probing and
                # remember where it was found.
                undecodable_path = candidate
                break
            if text is not None:
                break

        if text is None and undecodable_path:  # pragma: no cover
            # Try a little harder
            text = handle_bad_license_file_encoding(dist, undecodable_path, logger)

        if text is None:  # pragma: no cover
            logger.debug('Error: failed to read license file %r for dist %r',
                         license_file, dist.metadata['Name'])
            continue

        content_type = guess_type(license_file) or AttachedText.DEFAULT_CONTENT_TYPE
        # per default, license files are human-readable texts.
        if content_type.startswith('text/'):
            encoding = None
        else:
            encoding = Encoding.BASE_64
            text = b64encode(text.encode('utf-8')).decode('ascii')

        attachment = AttachedText(
            content=text,
            encoding=encoding,
            content_type=content_type
        )
        yield DisjunctiveLicense(
            name=f'declared license file: {license_file}',
            acknowledgement=acknowledgement,
            text=attachment
        )
132+
133+
41134
def dist2licenses(
    dist: 'Distribution',
    gather_text: bool,
    logger: 'Logger'
) -> Generator['License', None, None]:
    """Yield the licenses declared in the PEP 639 metadata of a distribution.

    :param dist: distribution whose core metadata is inspected.
    :param gather_text: when true, also yield one license per declared
        ``License-File`` entry, with the file's text attached.
    :param logger: passed on to the license-text gathering for debug output.
    :return: generator over the license built from ``License-Expression`` (if
        that field is present), followed by the license-file based licenses
        (if requested and any are declared).
    """
    metadata = dist.metadata  # see https://packaging.python.org/en/latest/specifications/core-metadata/
    if (lexp := metadata['License-Expression']) is not None:
        # see spec: https://peps.python.org/pep-0639/#add-license-expression-field
        yield LicenseFactory().make_from_string(
            lexp,
            license_acknowledgement=LicenseAcknowledgement.DECLARED)
    # De-duplicate the declared file names; skip the gathering entirely when
    # none are declared.
    if gather_text and (lfiles := {str(fn) for fn in metadata.get_all('License-File', ())}):
        yield from gather_license_texts(dist, lfiles, logger)

0 commit comments

Comments
 (0)