Make read_file and write_file accept unicode strings on Windows (#2949) (#3009)

fmassa · peterjc123 · web-flow · commit 98521c7bd34c · 2020-11-19T15:39:33.000+01:00
* Make read_file accept unicode strings on Windows

* More fixes

* Remove definitions from source files

* Move string definitions to header

* Add checks

* Fix comments

* Update macro

* Fix comments

* Fix lint

* include windows header

* Change func signature in header

* Use from_blob

* Fix fread calls

* Fix clang format

* Fix missing return

* Avoid copy

Co-authored-by: peterjc123 <peterghost86@gmail.com>
diff --git a/test/test_image.py b/test/test_image.py
@@ -221,6 +221,18 @@ def test_read_file(self):
                 RuntimeError, "No such file or directory: 'tst'"):
             read_file('tst')
 
+    def test_read_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            with open(fpath, 'wb') as f:
+                f.write(content)
+
+            data = read_file(fpath)
+            expected = torch.tensor(list(content), dtype=torch.uint8)
+            self.assertTrue(data.equal(expected))
+            os.unlink(fpath)
+
     def test_write_file(self):
         with get_tmp_dir() as d:
             fname, content = 'test1.bin', b'TorchVision\211\n'
@@ -233,6 +245,18 @@ def test_write_file(self):
             self.assertEqual(content, saved_content)
             os.unlink(fpath)
 
+    def test_write_file_non_ascii(self):
+        with get_tmp_dir() as d:
+            fname, content = '日本語(Japanese).bin', b'TorchVision\211\n'
+            fpath = os.path.join(d, fname)
+            content_tensor = torch.tensor(list(content), dtype=torch.uint8)
+            write_file(fpath, content_tensor)
+
+            with open(fpath, 'rb') as f:
+                saved_content = f.read()
+            self.assertEqual(content, saved_content)
+            os.unlink(fpath)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/torchvision/csrc/cpu/image/read_write_file_cpu.cpp b/torchvision/csrc/cpu/image/read_write_file_cpu.cpp
@@ -1,17 +1,40 @@
 #include "read_write_file_cpu.h"
 
-// According to
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
-// we should use _stat64 for 64-bit file size on Windows.
 #ifdef _WIN32
-#define VISION_STAT _stat64
-#else
-#define VISION_STAT stat
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+
+std::wstring utf8_decode(const std::string& str) {
+  if (str.empty()) {
+    return std::wstring();
+  }
+  int size_needed = MultiByteToWideChar(
+      CP_UTF8, 0, str.c_str(), static_cast<int>(str.size()), NULL, 0);
+  TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode");
+  std::wstring wstrTo(size_needed, 0);
+  MultiByteToWideChar(
+      CP_UTF8,
+      0,
+      str.c_str(),
+      static_cast<int>(str.size()),
+      &wstrTo[0],
+      size_needed);
+  return wstrTo;
+}
 #endif
 
-torch::Tensor read_file(std::string filename) {
-  struct VISION_STAT stat_buf;
-  int rc = VISION_STAT(filename.c_str(), &stat_buf);
+torch::Tensor read_file(const std::string& filename) {
+#ifdef _WIN32
+  // According to
+  // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=vs-2019,
+  // we should use struct __stat64 and _wstat64 for 64-bit file size on Windows.
+  struct __stat64 stat_buf;
+  auto fileW = utf8_decode(filename);
+  int rc = _wstat64(fileW.c_str(), &stat_buf);
+#else
+  struct stat stat_buf;
+  int rc = stat(filename.c_str(), &stat_buf);
+#endif
   // errno is a variable defined in errno.h
   TORCH_CHECK(
       rc == 0, "[Errno ", errno, "] ", strerror(errno), ": '", filename, "'");
@@ -21,9 +44,20 @@ torch::Tensor read_file(std::string filename) {
   TORCH_CHECK(size > 0, "Expected a non empty file");
 
 #ifdef _WIN32
-  auto data =
-      torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8)
-          .clone();
+  // TODO: Once torch::from_file handles UTF-8 paths correctly, we should move
+  // back to use the following implementation since it uses file mapping.
+  //   auto data =
+  //       torch::from_file(filename, /*shared=*/false, /*size=*/size,
+  //       torch::kU8).clone()
+  FILE* infile = _wfopen(fileW.c_str(), L"rb");
+
+  TORCH_CHECK(infile != nullptr, "Error opening input file");
+
+  auto data = torch::empty({size}, torch::kU8);
+  auto dataBytes = data.data_ptr<uint8_t>();
+
+  fread(dataBytes, sizeof(uint8_t), size, infile);
+  fclose(infile);
 #else
   auto data =
       torch::from_file(filename, /*shared=*/false, /*size=*/size, torch::kU8);
@@ -32,7 +66,7 @@ torch::Tensor read_file(std::string filename) {
   return data;
 }
 
-void write_file(std::string filename, torch::Tensor& data) {
+void write_file(const std::string& filename, torch::Tensor& data) {
   // Check that the input tensor is on CPU
   TORCH_CHECK(data.device() == torch::kCPU, "Input tensor should be on CPU");
 
@@ -44,7 +78,12 @@ void write_file(std::string filename, torch::Tensor& data) {
 
   auto fileBytes = data.data_ptr<uint8_t>();
   auto fileCStr = filename.c_str();
+#ifdef _WIN32
+  auto fileW = utf8_decode(filename);
+  FILE* outfile = _wfopen(fileW.c_str(), L"wb");
+#else
   FILE* outfile = fopen(fileCStr, "wb");
+#endif
 
   TORCH_CHECK(outfile != NULL, "Error opening output file");
 
diff --git a/torchvision/csrc/cpu/image/read_write_file_cpu.h b/torchvision/csrc/cpu/image/read_write_file_cpu.h
@@ -4,6 +4,6 @@
 #include <sys/stat.h>
 #include <torch/torch.h>
 
-C10_EXPORT torch::Tensor read_file(std::string filename);
+C10_EXPORT torch::Tensor read_file(const std::string& filename);
 
-C10_EXPORT void write_file(std::string filename, torch::Tensor& data);
+C10_EXPORT void write_file(const std::string& filename, torch::Tensor& data);