Skip to content

Commit 180c601

Browse files
committed
Be more careful about file reading
If SCons reads a file to interpret the contents, codecs are a concern. The File node class has a get_text_contents() method which makes a best effort at decoding bytes data, but there are other places that don't get their file contents via that method, and so should do their own careful decoding - but don't, they just read as text and hope it's okay. Move the decode-bytes portion out of File.get_text_contents() to SCons.Util.to_Text() so that everyone that needs this can call it. Add a couple of additional known BOM codes (after consulting Python's codecs module). Note that while get_text_contents acts on nodes, the new (moved) routine to_Text acts on passed bytes, so it can be used in a non-Node context as well - for example the Java tool initializer reads a file and tries to decode it, and can get it wrong (see SCons#3569), this change provides it some help. Fixes SCons#3569 Fixes SCons#4462 Signed-off-by: Mats Wichmann <[email protected]>
1 parent 7e120e8 commit 180c601

File tree

9 files changed

+69
-43
lines changed

9 files changed

+69
-43
lines changed

CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ RELEASE VERSION/DATE TO BE FILLED IN LATER
8080
Fixes #4468.
8181
- Fix bad typing in Action.py: process() and strfunction().
8282
- Add Pseudo() to global functions, had been omitted. Fixes #4474.
83+
- Improve handling of file data that SCons itself processes - try
84+
harder to decode non-UTF-8 text. SCons.Util.to_Text now exists
85+
to convert a byte stream, such as "raw" file data. Fixes #3569, #4462.
8386

8487

8588
RELEASE 4.6.0 - Sun, 19 Nov 2023 17:22:20 -0700

RELEASE.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ FIXES
5454
make sure decoding of bytes doesn't fail.
5555
- Documentation indicated that both Pseudo() and env.Pseudo() were usable,
5656
but Pseudo() did not work; is now enabled.
57+
- Improve handling of file data that SCons itself processes - as in
58+
scanners - try harder to decode non-UTF-8 text.
5759

5860
IMPROVEMENTS
5961
------------

SCons/Node/FS.py

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1057,7 +1057,7 @@ def get_contents(self):
10571057
contents of the file."""
10581058
return SCons.Node._get_contents_map[self._func_get_contents](self)
10591059

1060-
def get_text_contents(self):
1060+
def get_text_contents(self) -> str:
10611061
"""Fetch the decoded text contents of a Unicode encoded Entry.
10621062
10631063
Since this should return the text contents from the file
@@ -1073,6 +1073,7 @@ def get_text_contents(self):
10731073
# hand or catch the exception.
10741074
return ''
10751075
else:
1076+
# now we're a different node type, call its method to get the text.
10761077
return self.get_text_contents()
10771078

10781079
def must_be_same(self, klass) -> None:
@@ -2751,38 +2752,13 @@ def get_contents(self) -> bytes:
27512752
return SCons.Node._get_contents_map[self._func_get_contents](self)
27522753

27532754
def get_text_contents(self) -> str:
2754-
"""Return the contents of the file in text form.
2755-
2756-
This attempts to figure out what the encoding of the text is
2757-
based upon the BOM bytes, and then decodes the contents so that
2758-
it's a valid python string.
2759-
"""
2760-
contents = self.get_contents()
2761-
# The behavior of various decode() methods and functions
2762-
# w.r.t. the initial BOM bytes is different for different
2763-
# encodings and/or Python versions. ('utf-8' does not strip
2764-
# them, but has a 'utf-8-sig' which does; 'utf-16' seems to
2765-
# strip them; etc.) Just sidestep all the complication by
2766-
# explicitly stripping the BOM before we decode().
2767-
if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
2768-
return contents[len(codecs.BOM_UTF8):].decode('utf-8')
2769-
if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
2770-
return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
2771-
if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
2772-
return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
2773-
try:
2774-
return contents.decode('utf-8')
2775-
except UnicodeDecodeError as e:
2776-
try:
2777-
return contents.decode('latin-1')
2778-
except UnicodeDecodeError as e:
2779-
return contents.decode('utf-8', errors='backslashreplace')
2755+
"""Return the contents of the file as text."""
2756+
return SCons.Util.to_Text(self.get_contents())
27802757

27812758
def get_content_hash(self) -> str:
2782-
"""
2783-
Compute and return the hash for this file.
2784-
"""
2759+
"""Compute and return the hash for this file."""
27852760
if not self.rexists():
2761+
# special marker to help distinguish from empty file
27862762
return hash_signature(SCons.Util.NOFILE)
27872763
fname = self.rfile().get_abspath()
27882764
try:

SCons/Scanner/C.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,9 @@ def find_include_file(self, t):
5858
self.missing.append((fname, self.current_file))
5959
return result
6060

61-
def read_file(self, file):
61+
def read_file(self, file) -> str:
6262
try:
63-
with open(str(file.rfile())) as fp:
64-
return fp.read()
63+
return file.rfile().get_text_contents()
6564
except OSError as e:
6665
self.missing.append((file, self.current_file))
6766
return ''
@@ -209,10 +208,9 @@ def find_include_file(self, t):
209208
self.missing.append((fname, self.current_file))
210209
return result
211210

212-
def read_file(self, file):
211+
def read_file(self, file) -> str:
213212
try:
214-
with open(str(file.rfile())) as fp:
215-
return fp.read()
213+
return file.rfile().get_text_contents()
216214
except OSError:
217215
self.missing.append((file, self.current_file))
218216
return ""

SCons/Tool/JavaCommon.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
from pathlib import Path
3030
from typing import List
3131

32+
import SCons.Util
33+
3234
java_parsing = True
3335

3436
default_java_version = '1.4'
@@ -451,8 +453,8 @@ def parseToken(self, token):
451453

452454

453455
def parse_java_file(fn, version=default_java_version):
454-
with open(fn, encoding='utf-8') as f:
455-
data = f.read()
456+
with open(fn, "rb") as f:
457+
data = SCons.Util.to_Text(f.read())
456458
return parse_java(data, version)
457459

458460

SCons/Tool/JavaCommonTests.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,9 @@ def test_file_parser(self) -> None:
7474
{
7575
public static void main(String[] args)
7676
{
77-
/* This tests that unicde is handled . */
77+
/* This tests that unicode is handled . */
7878
String hello1 = new String("ఎత్తువెడల్పు");
79+
/* and even smart quotes “like this” ‘and this’ */
7980
}
8081
}
8182
"""

SCons/Util/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
to_String,
8282
to_String_for_subst,
8383
to_String_for_signature,
84+
to_Text,
8485
to_bytes,
8586
to_str,
8687
get_env_bool,

SCons/Util/sctypes.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Routines which check types and do type conversions.
88
"""
99

10+
import codecs
1011
import os
1112
import pprint
1213
import re
@@ -187,7 +188,11 @@ def to_String( # pylint: disable=redefined-outer-name,redefined-builtin
187188
UserString=UserString,
188189
BaseStringTypes=BaseStringTypes,
189190
) -> str:
190-
"""Return a string version of obj."""
191+
"""Return a string version of obj.
192+
193+
Use this for data likely to be well-behaved. Use
194+
:func:`to_Text` for unknown file data that needs to be decoded.
195+
"""
191196
if isinstance(obj, BaseStringTypes):
192197
# Early out when already a string!
193198
return obj
@@ -244,6 +249,42 @@ def to_String_for_signature( # pylint: disable=redefined-outer-name,redefined-b
244249
return f()
245250

246251

252+
def to_Text(data: bytes) -> str:
    """Return bytes data converted to text.

    Useful for whole-file reads where the data needs some interpretation,
    particularly for Scanners. Attempts to figure out what the encoding of
    the text is based upon the BOM bytes, and then decodes the contents so
    that it's a valid python string.

    Args:
        data: the raw bytes to decode, for example the result of reading
            a file opened in binary mode.

    Returns:
        The decoded text. If a recognized BOM is present it is stripped
        and undecodable sequences are rendered as backslash escapes.
        Without a BOM, UTF-8 is tried first, then the locale's preferred
        encoding, then latin-1.
    """
    # Order matters: the UTF-32-LE BOM (FF FE 00 00) begins with the
    # UTF-16-LE BOM (FF FE), so the UTF-32 entries have to be checked
    # before the UTF-16 ones or UTF-32-LE data is mistaken for UTF-16-LE.
    # (The residual ambiguity - UTF-16-LE text whose first character is
    # U+0000 - is inherent to BOM sniffing and not a realistic text file.)
    _encoding_map = [
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF32_LE, 'utf-32le'),
        (codecs.BOM_UTF32_BE, 'utf-32be'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'utf-16be'),
    ]

    # First look for Byte-order-mark sequences to identify the encoding.
    # Strip these since some codecs do, some don't.
    for bom, encoding in _encoding_map:
        if data.startswith(bom):
            return data[len(bom):].decode(encoding, errors='backslashreplace')

    # If we didn't see a BOM, try UTF-8, then the "preferred" encoding
    # (the files might be written on this system), then finally latin-1.
    # TODO: possibly should be a way for the build to set an encoding.
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        try:
            import locale
            prefencoding = locale.getpreferredencoding()
            return data.decode(prefencoding)
        except (UnicodeDecodeError, LookupError):
            # latin-1 maps every byte value, so this is the last resort.
            return data.decode('latin-1', errors='backslashreplace')
287+
247288
def get_env_bool(env, name: str, default: bool=False) -> bool:
248289
"""Convert a construction variable to bool.
249290

SCons/cpp.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import os
2727
import re
2828

29+
import SCons.Util
30+
2931
# First "subsystem" of regular expressions that we set up:
3032
#
3133
# Stuff to turn the C preprocessor directives in a file's contents into
@@ -401,9 +403,9 @@ def find_include_file(self, t):
401403
return f
402404
return None
403405

404-
def read_file(self, file):
405-
with open(file) as f:
406-
return f.read()
406+
def read_file(self, file) -> str:
407+
with open(file, 'rb') as f:
408+
return SCons.Util.to_Text(f.read())
407409

408410
# Start and stop processing include lines.
409411

0 commit comments

Comments
 (0)