Be more careful about file reading

mwichmann · mwichmann · commit 180c601ade46 · 2024-02-08T09:28:29.000-07:00
If SCons reads a file to interpret the contents, codecs are a concern. The File node class has a get_text_contents() method which makes a best effort at decoding bytes data, but there are other places that don't get their file contents via that method, and so should do their own careful decoding - but don't, they just read as text and hope it's okay. Move the decode-bytes portion out of File.get_text_contents() to SCons.Util.to_Text() so that everyone that needs this can call it. Add a couple of additional known BOM codes (after consulting Python's codecs module). Note that while get_text_contents acts on nodes, the new (moved) routine to_Text acts on passed bytes, so it can be used in a non-Node context as well - for example the Java tool initializer reads a file and tries to decode it, and can get it wrong (see SCons#3569), this change provides it some help. Fixes SCons#3569 Fixes SCons#4462 Signed-off-by: Mats Wichmann <mats@linux.com>
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -80,6 +80,9 @@ RELEASE  VERSION/DATE TO BE FILLED IN LATER
       Fixes #4468.
     - Fix bad typing in Action.py: process() and strfunction().
     - Add Pseudo() to global functions, had been omitted. Fixes #4474.
+    - Improve handling of file data that SCons itself processes - try
+      harder to decode non-UTF-8 text. SCons.Util.to_Text now exists
+      to convert a byte stream, such as "raw" file data.  Fixes #3569, #4462.
 
 
 RELEASE 4.6.0 -  Sun, 19 Nov 2023 17:22:20 -0700
diff --git a/RELEASE.txt b/RELEASE.txt
@@ -54,6 +54,8 @@ FIXES
   make sure decoding of bytes doesn't fail.
 - Documentation indicated that both Pseudo() and env.Pseudo() were usable,
   but Pseudo() did not work; is now enabled.
+- Improve handling of file data that SCons itself processes - as in
+  scanners - try harder to decode non-UTF-8 text.
 
 IMPROVEMENTS
 ------------
diff --git a/SCons/Node/FS.py b/SCons/Node/FS.py
@@ -1057,7 +1057,7 @@ def get_contents(self):
         contents of the file."""
         return SCons.Node._get_contents_map[self._func_get_contents](self)
 
-    def get_text_contents(self):
+    def get_text_contents(self) -> str:
         """Fetch the decoded text contents of a Unicode encoded Entry.
 
         Since this should return the text contents from the file
@@ -1073,6 +1073,7 @@ def get_text_contents(self):
             # hand or catch the exception.
             return ''
         else:
+            # now we're a different node type, call its method to get the text.
             return self.get_text_contents()
 
     def must_be_same(self, klass) -> None:
@@ -2751,38 +2752,13 @@ def get_contents(self) -> bytes:
         return SCons.Node._get_contents_map[self._func_get_contents](self)
 
     def get_text_contents(self) -> str:
-        """Return the contents of the file in text form.
-
-        This attempts to figure out what the encoding of the text is
-        based upon the BOM bytes, and then decodes the contents so that
-        it's a valid python string.
-        """
-        contents = self.get_contents()
-        # The behavior of various decode() methods and functions
-        # w.r.t. the initial BOM bytes is different for different
-        # encodings and/or Python versions.  ('utf-8' does not strip
-        # them, but has a 'utf-8-sig' which does; 'utf-16' seems to
-        # strip them; etc.)  Just sidestep all the complication by
-        # explicitly stripping the BOM before we decode().
-        if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
-            return contents[len(codecs.BOM_UTF8):].decode('utf-8')
-        if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
-            return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
-        if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
-            return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
-        try:
-            return contents.decode('utf-8')
-        except UnicodeDecodeError as e:
-            try:
-                return contents.decode('latin-1')
-            except UnicodeDecodeError as e:
-                return contents.decode('utf-8', errors='backslashreplace')
+        """Return the contents of the file as text."""
+        return SCons.Util.to_Text(self.get_contents())
 
     def get_content_hash(self) -> str:
-        """
-        Compute and return the hash for this file.
-        """
+        """Compute and return the hash for this file."""
         if not self.rexists():
+            # special marker to help distinguish from empty file
             return hash_signature(SCons.Util.NOFILE)
         fname = self.rfile().get_abspath()
         try:
diff --git a/SCons/Scanner/C.py b/SCons/Scanner/C.py
@@ -58,10 +58,9 @@ def find_include_file(self, t):
             self.missing.append((fname, self.current_file))
         return result
 
-    def read_file(self, file):
+    def read_file(self, file) -> str:
         try:
-            with open(str(file.rfile())) as fp:
-                return fp.read()
+            return file.rfile().get_text_contents()
         except OSError as e:
             self.missing.append((file, self.current_file))
             return ''
@@ -209,10 +208,9 @@ def find_include_file(self, t):
             self.missing.append((fname, self.current_file))
         return result
 
-    def read_file(self, file):
+    def read_file(self, file) -> str:
         try:
-            with open(str(file.rfile())) as fp:
-                return fp.read()
+            return file.rfile().get_text_contents()
         except OSError:
             self.missing.append((file, self.current_file))
             return ""
diff --git a/SCons/Tool/JavaCommon.py b/SCons/Tool/JavaCommon.py
@@ -29,6 +29,8 @@
 from pathlib import Path
 from typing import List
 
+import SCons.Util
+
 java_parsing = True
 
 default_java_version = '1.4'
@@ -451,8 +453,8 @@ def parseToken(self, token):
 
 
     def parse_java_file(fn, version=default_java_version):
-        with open(fn, encoding='utf-8') as f:
-            data = f.read()
+        with open(fn, "rb") as f:
+            data = SCons.Util.to_Text(f.read())
         return parse_java(data, version)
 
 
diff --git a/SCons/Tool/JavaCommonTests.py b/SCons/Tool/JavaCommonTests.py
@@ -74,8 +74,9 @@ def test_file_parser(self) -> None:
 {
      public static void main(String[] args)
      {
-        /* This tests that unicde is handled . */
+        /* This tests that unicode is handled . */
         String hello1 = new String("ఎత్తువెడల్పు");
+        /* and even smart quotes “like this” ‘and this’ */
      }
 }
 """
diff --git a/SCons/Util/__init__.py b/SCons/Util/__init__.py
@@ -81,6 +81,7 @@
     to_String,
     to_String_for_subst,
     to_String_for_signature,
+    to_Text,
     to_bytes,
     to_str,
     get_env_bool,
diff --git a/SCons/Util/sctypes.py b/SCons/Util/sctypes.py
@@ -7,6 +7,7 @@
 Routines which check types and do type conversions.
 """
 
+import codecs
 import os
 import pprint
 import re
@@ -187,7 +188,11 @@ def to_String(  # pylint: disable=redefined-outer-name,redefined-builtin
     UserString=UserString,
     BaseStringTypes=BaseStringTypes,
 ) -> str:
-    """Return a string version of obj."""
+    """Return a string version of obj.
+
+    Use this for data likely to be well-behaved. Use
+    :func:`to_Text` for unknown file data that needs to be decoded.
+    """
     if isinstance(obj, BaseStringTypes):
         # Early out when already a string!
         return obj
@@ -244,6 +249,42 @@ def to_String_for_signature(  # pylint: disable=redefined-outer-name,redefined-b
     return f()
 
 
+def to_Text(data: bytes) -> str:
+    """Return bytes data converted to text.
+
+    Useful for whole-file reads where the data needs some interpretation,
+    particularly for Scanners.  A leading BOM, if present, selects the
+    codec and is stripped; otherwise UTF-8, the locale's preferred
+    encoding and finally latin-1 are tried, so a str is always returned.
+    """
+    # UTF-32 goes before UTF-16: BOM_UTF32_LE starts with BOM_UTF16_LE.
+    _encoding_map = [
+        (codecs.BOM_UTF8, 'utf-8'),
+        (codecs.BOM_UTF32_LE, 'utf-32le'),
+        (codecs.BOM_UTF32_BE, 'utf-32be'),
+        (codecs.BOM_UTF16_LE, 'utf-16le'),
+        (codecs.BOM_UTF16_BE, 'utf-16be'),
+    ]
+
+    # First look for Byte-order-mark sequences to identify the encoding.
+    # Strip these since some codecs do, some don't.
+    for bom, encoding in _encoding_map:
+        if data.startswith(bom):
+            return data[len(bom):].decode(encoding, errors='backslashreplace')
+
+    # If we didn't see a BOM, try UTF-8, then the "preferred" encoding
+    # (the files might be written on this system), then finally latin-1.
+    # TODO: possibly should be a way for the build to set an encoding.
+    try:
+        return data.decode('utf-8')
+    except UnicodeDecodeError:
+        try:
+            import locale
+            return data.decode(locale.getpreferredencoding())
+        except (UnicodeDecodeError, LookupError):
+            return data.decode('latin-1', errors='backslashreplace')
+
+
 def get_env_bool(env, name: str, default: bool=False) -> bool:
     """Convert a construction variable to bool.
 
diff --git a/SCons/cpp.py b/SCons/cpp.py
@@ -26,6 +26,8 @@
 import os
 import re
 
+import SCons.Util
+
 # First "subsystem" of regular expressions that we set up:
 #
 # Stuff to turn the C preprocessor directives in a file's contents into
@@ -401,9 +403,9 @@ def find_include_file(self, t):
                 return f
         return None
 
-    def read_file(self, file):
-        with open(file) as f:
-            return f.read()
+    def read_file(self, file) -> str:
+        with open(file, 'rb') as f:
+            return SCons.Util.to_Text(f.read())
 
     # Start and stop processing include lines.
 

Original file line number	Diff line number	Diff line change
`@@ -74,8 +74,9 @@ def test_file_parser(self) -> None:`
`74`	`74`	`{`
`75`	`75`	`public static void main(String[] args)`
`76`	`76`	`{`
`77`		`- /* This tests that unicde is handled . */`
	`77`	`+ /* This tests that unicode is handled . */`
`78`	`78`	`String hello1 = new String("ఎత్తువెడల్పు");`
	`79`	`+ /* and even smart quotes “like this” ‘and this’ */`
`79`	`80`	`}`
`80`	`81`	`}`
`81`	`82`	`"""`