Avoid file encoding issues when loading cpp extensions (pytorch#138565)

izmttk · malfet · pytorchmergebot · commit 4d8090cabb55 · 2024-10-28T14:06:34.000Z
I've found that when using `torch.utils.cpp_extension.load` on my Windows system, decoding errors occur when my .cpp/.cu files contain certain non-English characters. `test.py`: ```py from torch.utils.cpp_extension import load my_lib = load(name='my_cuda_kernel', sources=['my_cuda_kernel.cu'], extra_cuda_cflags=['-O2', '-std=c++17']) # ...... ``` `my_cuda_kernel.cu`: ```cpp #include <torch/types.h> #include <torch/extension.h> // 向量化 <------ some Chinese characters // ...... ``` Errors will be reported as: ``` Traceback (most recent call last): File "E:\test\test.py", line 8, in <module> my_lib = load( ^^^^^ File "C:\Users\XXX\AppData\Roaming\Python\Python311\site-packages\torch\utils\cpp_extension.py", line 1314, in load return _jit_compile( ^^^^^^^^^^^^^ File "C:\Users\XXX\AppData\Roaming\Python\Python311\site-packages\torch\utils\cpp_extension.py", line 1680, in _jit_compile version = JIT_EXTENSION_VERSIONER.bump_version_if_changed( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\XXX\AppData\Roaming\Python\Python311\site-packages\torch\utils\_cpp_extension_versioner.py", line 46, in bump_version_if_changed hash_value = hash_source_files(hash_value, source_files) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\XXX\AppData\Roaming\Python\Python311\site-packages\torch\utils\_cpp_extension_versioner.py", line 17, in hash_source_files hash_value = update_hash(hash_value, file.read()) ^^^^^^^^^^^ UnicodeDecodeError: 'gbk' codec can't decode byte 0x96 in position 141: illegal multibyte sequence ``` The root cause is that Python's `open()` in text mode decodes with a platform-dependent default encoding (`gbk` on this machine), so reading fails when a file contains bytes that are not valid in that encoding. 
PyTorch uses the file contents to generate a hash string: https://github.com/pytorch/pytorch/blob/60c14330411de8f52bfb28d6406f1822edaad944/torch/utils/_cpp_extension_versioner.py#L16-L17 On my Windows machine the default encoding is `gbk`, but all of my cpp files are saved as `utf-8`. I think there is a simple solution to this problem: open the files in binary mode, which avoids text decoding (and therefore any encoding mismatch) entirely. It works perfectly on my computer. ```diff - with open(filename) as file: + with open(filename, 'rb') as file: hash_value = update_hash(hash_value, file.read()) ``` Pull Request resolved: pytorch#138565 Approved by: https://github.com/malfet, https://github.com/janeyx99 Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py
@@ -1,6 +1,7 @@
 # Owner(s): ["module: cpp-extensions"]
 
 import glob
+import locale
 import os
 import re
 import shutil
@@ -529,6 +530,40 @@ def compile(code):
         module = compile("int f() { return 789; }")
         self.assertEqual(module.f(), 789)
 
+    # getlocale() may report a None encoding if undetermined; treat it as non-UTF-8.
+    @unittest.skipIf(
+        "utf" not in (locale.getlocale()[1] or "").lower(), "Only test in UTF-8 locale"
+    )
+    def test_load_with_non_platform_default_encoding(self):
+        # ExtensionVersioner hashes source files; reading them in text mode fails
+        # when a file's encoding differs from the locale's. CI rarely runs in a
+        # non-UTF-8 locale, so test the reverse: GBK source under a UTF-8 locale.
+        cpp_source = """
+        #include <torch/extension.h>
+
+        // Non-latin1 character test: 字符.
+        // It will cause utf-8 decoding error.
+
+        int f() { return 123; }
+        PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+            m.def("f", &f, "f");
+        }
+        """
+
+        build_dir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, build_dir, ignore_errors=True)
+        src_path = os.path.join(build_dir, "main.cpp")
+
+        with open(src_path, encoding="gbk", mode="w") as f:
+            f.write(cpp_source)
+
+        module = torch.utils.cpp_extension.load(
+            name="non_default_encoding",
+            sources=src_path,
+            verbose=True,
+        )
+        self.assertEqual(module.f(), 123)
+
     def test_cpp_frontend_module_has_same_output_as_python(self, dtype=torch.double):
         extension = torch.utils.cpp_extension.load(
             name="cpp_frontend_extension",
diff --git a/torch/utils/_cpp_extension_versioner.py b/torch/utils/_cpp_extension_versioner.py
@@ -13,7 +13,7 @@ def update_hash(seed, value):
 
 def hash_source_files(hash_value, source_files):
     for filename in source_files:
-        with open(filename) as file:
+        with open(filename, 'rb') as file:  # raw bytes: no locale-dependent decoding
             hash_value = update_hash(hash_value, file.read())
     return hash_value