|
| 1 | +Modified patch to apply to vendored onnx sources |
| 2 | + |
| 3 | + |
| 4 | +From 66b7fb630903fdcf3e83b6b6d56d82e904264a20 Mon Sep 17 00:00:00 2001 |
| 5 | +From: liqun Fu < [email protected]> |
| 6 | +Date: Mon, 19 Feb 2024 11:12:40 -0800 |
| 7 | +Subject: [PATCH] Fix path sanitization bypass leading to arbitrary read |
| 8 | + (#5917) |
| 9 | + |
| 10 | +Signed-off-by: liqunfu < [email protected]> |
| 11 | +Signed-off-by: liqun Fu < [email protected]> |
| 12 | +Co-authored-by: Justin Chu < [email protected]> |
| 13 | +--- |
| 14 | + third_party/onnx/onnx/checker.cc | 163 ++++++++++-------- |
| 15 | + third_party/onnx/onnx/checker.h | 5 +- |
| 16 | + third_party/onnx/onnx/common/path.h | 17 +- |
| 17 | + third_party/onnx/onnx/cpp2py_export.cc | 2 + |
| 18 | + third_party/onnx/onnx/external_data_helper.py | 17 +- |
| 19 | + .../onnx/onnx/test/test_external_data.py | 47 +++++ |
| 20 | + 6 files changed, 160 insertions(+), 91 deletions(-) |
| 21 | + |
| 22 | +diff --git a/third_party/onnx/onnx/checker.cc b/third_party/onnx/onnx/checker.cc |
| 23 | +index 0c81c87e..38a068dd 100644 |
| 24 | +--- a/third_party/onnx/onnx/checker.cc |
| 25 | ++++ b/third_party/onnx/onnx/checker.cc |
| 26 | +@@ -4,7 +4,6 @@ |
| 27 | + |
| 28 | + #include "onnx/checker.h" |
| 29 | + #include "onnx/common/file_utils.h" |
| 30 | +-#include "onnx/common/path.h" |
| 31 | + #include "onnx/defs/schema.h" |
| 32 | + #include "onnx/defs/tensor_proto_util.h" |
| 33 | + #include "onnx/proto_utils.h" |
| 34 | +@@ -129,80 +128,7 @@ void check_tensor(const TensorProto& tensor, const CheckerContext& ctx) { |
| 35 | + for (const StringStringEntryProto& entry : tensor.external_data()) { |
| 36 | + if (entry.has_key() && entry.has_value() && entry.key() == "location") { |
| 37 | + has_location = true; |
| 38 | +-#ifdef _WIN32 |
| 39 | +- auto file_path = std::filesystem::path(utf8str_to_wstring(entry.value())); |
| 40 | +- if (file_path.is_absolute()) { |
| 41 | +- fail_check( |
| 42 | +- "Location of external TensorProto ( tensor name: ", |
| 43 | +- tensor.name(), |
| 44 | +- ") should be a relative path, but it is an absolute path: ", |
| 45 | +- entry.value()); |
| 46 | +- } |
| 47 | +- auto relative_path = file_path.lexically_normal().make_preferred().wstring(); |
| 48 | +- // Check that normalized relative path contains ".." on Windows. |
| 49 | +- if (relative_path.find(L"..", 0) != std::string::npos) { |
| 50 | +- fail_check( |
| 51 | +- "Data of TensorProto ( tensor name: ", |
| 52 | +- tensor.name(), |
| 53 | +- ") should be file inside the ", |
| 54 | +- ctx.get_model_dir(), |
| 55 | +- ", but the '", |
| 56 | +- entry.value(), |
| 57 | +- "' points outside the directory"); |
| 58 | +- } |
| 59 | +- std::wstring data_path = path_join(utf8str_to_wstring(ctx.get_model_dir()), relative_path); |
| 60 | +- struct _stat buff; |
| 61 | +- if (_wstat(data_path.c_str(), &buff) != 0) { |
| 62 | +- fail_check( |
| 63 | +- "Data of TensorProto ( tensor name: ", |
| 64 | +- tensor.name(), |
| 65 | +- ") should be stored in ", |
| 66 | +- entry.value(), |
| 67 | +- ", but it doesn't exist or is not accessible."); |
| 68 | +- } |
| 69 | +-#else // POSIX |
| 70 | +- if (entry.value().empty()) { |
| 71 | +- fail_check("Location of external TensorProto ( tensor name: ", tensor.name(), ") should not be empty."); |
| 72 | +- } else if (entry.value()[0] == '/') { |
| 73 | +- fail_check( |
| 74 | +- "Location of external TensorProto ( tensor name: ", |
| 75 | +- tensor.name(), |
| 76 | +- ") should be a relative path, but it is an absolute path: ", |
| 77 | +- entry.value()); |
| 78 | +- } |
| 79 | +- std::string relative_path = clean_relative_path(entry.value()); |
| 80 | +- // Check that normalized relative path contains ".." on POSIX |
| 81 | +- if (relative_path.find("..", 0) != std::string::npos) { |
| 82 | +- fail_check( |
| 83 | +- "Data of TensorProto ( tensor name: ", |
| 84 | +- tensor.name(), |
| 85 | +- ") should be file inside the ", |
| 86 | +- ctx.get_model_dir(), |
| 87 | +- ", but the '", |
| 88 | +- entry.value(), |
| 89 | +- "' points outside the directory"); |
| 90 | +- } |
| 91 | +- std::string data_path = path_join(ctx.get_model_dir(), relative_path); |
| 92 | +- // use stat to check whether the file exists |
| 93 | +- struct stat buffer; |
| 94 | +- if (stat((data_path).c_str(), &buffer) != 0) { |
| 95 | +- fail_check( |
| 96 | +- "Data of TensorProto ( tensor name: ", |
| 97 | +- tensor.name(), |
| 98 | +- ") should be stored in ", |
| 99 | +- data_path, |
| 100 | +- ", but it doesn't exist or is not accessible."); |
| 101 | +- } |
| 102 | +- // Do not allow symlinks or directories. |
| 103 | +- if (!S_ISREG(buffer.st_mode)) { |
| 104 | +- fail_check( |
| 105 | +- "Data of TensorProto ( tensor name: ", |
| 106 | +- tensor.name(), |
| 107 | +- ") should be stored in ", |
| 108 | +- data_path, |
| 109 | +- ", but it is not regular file."); |
| 110 | +- } |
| 111 | +-#endif |
| 112 | ++ resolve_external_data_location(ctx.get_model_dir(), entry.value(), tensor.name()); |
| 113 | + } |
| 114 | + } |
| 115 | + if (!has_location) { |
| 116 | +@@ -1028,6 +954,93 @@ void check_model(const ModelProto& model, bool full_check) { |
| 117 | + } |
| 118 | + } |
| 119 | + |
| 120 | ++std::string resolve_external_data_location( |
| 121 | ++ const std::string& base_dir, |
| 122 | ++ const std::string& location, |
| 123 | ++ const std::string& tensor_name) { |
| 124 | ++#ifdef _WIN32 |
| 125 | ++ auto file_path = std::filesystem::path(utf8str_to_wstring(location)); |
| 126 | ++ if (file_path.is_absolute()) { |
| 127 | ++ fail_check( |
| 128 | ++ "Location of external TensorProto ( tensor name: ", |
| 129 | ++ tensor_name, |
| 130 | ++ ") should be a relative path, but it is an absolute path: ", |
| 131 | ++ location); |
| 132 | ++ } |
| 133 | ++ auto relative_path = file_path.lexically_normal().make_preferred().wstring(); |
| 134 | ++ // Check that normalized relative path contains ".." on Windows. |
| 135 | ++ if (relative_path.find(L"..", 0) != std::string::npos) { |
| 136 | ++ fail_check( |
| 137 | ++ "Data of TensorProto ( tensor name: ", |
| 138 | ++ tensor_name, |
| 139 | ++ ") should be file inside the ", |
| 140 | ++ base_dir, |
| 141 | ++ ", but the '", |
| 142 | ++ location, |
| 143 | ++ "' points outside the directory"); |
| 144 | ++ } |
| 145 | ++ std::wstring data_path = path_join(utf8str_to_wstring(base_dir), relative_path); |
| 146 | ++ struct _stat64 buff; |
| 147 | ++ if (data_path.empty() || (data_path[0] != '#' && _wstat64(data_path.c_str(), &buff) != 0)) { |
| 148 | ++ fail_check( |
| 149 | ++ "Data of TensorProto ( tensor name: ", |
| 150 | ++ tensor_name, |
| 151 | ++ ") should be stored in ", |
| 152 | ++ location, |
| 153 | ++ ", but it doesn't exist or is not accessible."); |
| 154 | ++ } |
| 155 | ++ return wstring_to_utf8str(data_path); |
| 156 | ++#else // POSIX |
| 157 | ++ if (location.empty()) { |
| 158 | ++ fail_check("Location of external TensorProto ( tensor name: ", tensor_name, ") should not be empty."); |
| 159 | ++ } else if (location[0] == '/') { |
| 160 | ++ fail_check( |
| 161 | ++ "Location of external TensorProto ( tensor name: ", |
| 162 | ++ tensor_name, |
| 163 | ++ ") should be a relative path, but it is an absolute path: ", |
| 164 | ++ location); |
| 165 | ++ } |
| 166 | ++ std::string relative_path = clean_relative_path(location); |
| 167 | ++ // Check that normalized relative path contains ".." on POSIX |
| 168 | ++ if (relative_path.find("..", 0) != std::string::npos) { |
| 169 | ++ fail_check( |
| 170 | ++ "Data of TensorProto ( tensor name: ", |
| 171 | ++ tensor_name, |
| 172 | ++ ") should be file inside the ", |
| 173 | ++ base_dir, |
| 174 | ++ ", but the '", |
| 175 | ++ location, |
| 176 | ++ "' points outside the directory"); |
| 177 | ++ } |
| 178 | ++ std::string data_path = path_join(base_dir, relative_path); |
| 179 | ++ // use stat64 to check whether the file exists |
| 180 | ++#if defined(__APPLE__) || defined(__wasm__) || !defined(__GLIBC__) |
| 181 | ++ struct stat buffer; // APPLE, wasm and non-glic stdlibs do not have stat64 |
| 182 | ++ if (data_path.empty() || (data_path[0] != '#' && stat((data_path).c_str(), &buffer) != 0)) { |
| 183 | ++#else |
| 184 | ++ struct stat64 buffer; // All POSIX under glibc except APPLE and wasm have stat64 |
| 185 | ++ if (data_path.empty() || (data_path[0] != '#' && stat64((data_path).c_str(), &buffer) != 0)) { |
| 186 | ++#endif |
| 187 | ++ fail_check( |
| 188 | ++ "Data of TensorProto ( tensor name: ", |
| 189 | ++ tensor_name, |
| 190 | ++ ") should be stored in ", |
| 191 | ++ data_path, |
| 192 | ++ ", but it doesn't exist or is not accessible."); |
| 193 | ++ } |
| 194 | ++ // Do not allow symlinks or directories. |
| 195 | ++ if (data_path.empty() || (data_path[0] != '#' && !S_ISREG(buffer.st_mode))) { |
| 196 | ++ fail_check( |
| 197 | ++ "Data of TensorProto ( tensor name: ", |
| 198 | ++ tensor_name, |
| 199 | ++ ") should be stored in ", |
| 200 | ++ data_path, |
| 201 | ++ ", but it is not regular file."); |
| 202 | ++ } |
| 203 | ++ return data_path; |
| 204 | ++#endif |
| 205 | ++} |
| 206 | ++ |
| 207 | + std::set<std::string> experimental_ops = { |
| 208 | + "ATen", |
| 209 | + "Affine", |
| 210 | +diff --git a/third_party/onnx/onnx/checker.h b/third_party/onnx/onnx/checker.h |
| 211 | +index 05eeaad0..50381aae 100644 |
| 212 | +--- a/third_party/onnx/onnx/checker.h |
| 213 | ++++ b/third_party/onnx/onnx/checker.h |
| 214 | +@@ -148,7 +148,10 @@ void check_model_local_functions( |
| 215 | + |
| 216 | + void check_model(const ModelProto& model, bool full_check = false); |
| 217 | + void check_model(const std::string& model_path, bool full_check = false); |
| 218 | +- |
| 219 | ++std::string resolve_external_data_location( |
| 220 | ++ const std::string& base_dir, |
| 221 | ++ const std::string& location, |
| 222 | ++ const std::string& tensor_name); |
| 223 | + bool check_is_experimental_op(const NodeProto& node); |
| 224 | + |
| 225 | + } // namespace checker |
| 226 | +diff --git a/third_party/onnx/onnx/common/path.h b/third_party/onnx/onnx/common/path.h |
| 227 | +index f71b5b12..3d69e448 100644 |
| 228 | +--- a/third_party/onnx/onnx/common/path.h |
| 229 | ++++ b/third_party/onnx/onnx/common/path.h |
| 230 | +@@ -30,12 +30,23 @@ inline std::wstring utf8str_to_wstring(const std::string& utf8str) { |
| 231 | + if (utf8str.size() > INT_MAX) { |
| 232 | + fail_check("utf8str_to_wstring: string is too long for converting to wstring."); |
| 233 | + } |
| 234 | +- int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0); |
| 235 | ++ int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), static_cast<int>(utf8str.size()), NULL, 0); |
| 236 | + std::wstring ws_str(size_required, 0); |
| 237 | +- MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_str[0], size_required); |
| 238 | ++ MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), static_cast<int>(utf8str.size()), &ws_str[0], size_required); |
| 239 | + return ws_str; |
| 240 | + } |
| 241 | +- |
| 242 | ++inline std::string wstring_to_utf8str(const std::wstring& ws_str) { |
| 243 | ++ if (ws_str.size() > INT_MAX) { |
| 244 | ++ fail_check("wstring_to_utf8str: string is too long for converting to UTF-8."); |
| 245 | ++ } |
| 246 | ++ int size_required = |
| 247 | ++ WideCharToMultiByte(CP_UTF8, 0, ws_str.c_str(), static_cast<int>(ws_str.size()), NULL, 0, NULL, NULL); |
| 248 | ++ std::string utf8str(size_required, 0); |
| 249 | ++ WideCharToMultiByte( |
| 250 | ++ CP_UTF8, 0, ws_str.c_str(), static_cast<int>(ws_str.size()), &utf8str[0], size_required, NULL, NULL); |
| 251 | ++ return utf8str; |
| 252 | ++} |
| 253 | ++ |
| 254 | + #else |
| 255 | + std::string path_join(const std::string& origin, const std::string& append); |
| 256 | + // TODO: also use std::filesystem::path for clean_relative_path after ONNX has supported C++17 for POSIX |
| 257 | +diff --git a/third_party/onnx/onnx/cpp2py_export.cc b/third_party/onnx/onnx/cpp2py_export.cc |
| 258 | +index f6e1738e..02aabcab 100644 |
| 259 | +--- a/third_party/onnx/onnx/cpp2py_export.cc |
| 260 | ++++ b/third_party/onnx/onnx/cpp2py_export.cc |
| 261 | +@@ -395,6 +395,8 @@ PYBIND11_MODULE(onnx_cpp2py_export, onnx_cpp2py_export) { |
| 262 | + "path"_a, |
| 263 | + "full_check"_a = false); |
| 264 | + |
| 265 | ++ checker.def("_resolve_external_data_location", &checker::resolve_external_data_location); |
| 266 | ++ |
| 267 | + // Submodule `version_converter` |
| 268 | + auto version_converter = onnx_cpp2py_export.def_submodule("version_converter"); |
| 269 | + version_converter.doc() = "VersionConverter submodule"; |
| 270 | +diff --git a/third_party/onnx/onnx/external_data_helper.py b/third_party/onnx/onnx/external_data_helper.py |
| 271 | +index 54c0f403..6763b11d 100644 |
| 272 | +--- a/third_party/onnx/onnx/external_data_helper.py |
| 273 | ++++ b/third_party/onnx/onnx/external_data_helper.py |
| 274 | +@@ -5,7 +5,8 @@ import sys |
| 275 | + import uuid |
| 276 | + from itertools import chain |
| 277 | + from typing import Callable, Iterable, Optional |
| 278 | +- |
| 279 | ++ |
| 280 | ++import onnx.onnx_cpp2py_export.checker as c_checker |
| 281 | + from .onnx_pb import AttributeProto, GraphProto, ModelProto, TensorProto |
| 282 | + |
| 283 | + |
| 284 | +@@ -37,9 +38,9 @@ def load_external_data_for_tensor(tensor: TensorProto, base_dir: str) -> None: |
| 285 | + base_dir: directory that contains the external data. |
| 286 | + """ |
| 287 | + info = ExternalDataInfo(tensor) |
| 288 | +- file_location = _sanitize_path(info.location) |
| 289 | +- external_data_file_path = os.path.join(base_dir, file_location) |
| 290 | +- |
| 291 | ++ external_data_file_path = c_checker._resolve_external_data_location( # type: ignore[attr-defined] |
| 292 | ++ base_dir, info.location, tensor.name |
| 293 | ++ ) |
| 294 | + with open(external_data_file_path, "rb") as data_file: |
| 295 | + if info.offset: |
| 296 | + data_file.seek(info.offset) |
| 297 | +@@ -251,14 +252,6 @@ def _get_attribute_tensors(onnx_model_proto: ModelProto) -> Iterable[TensorProto |
| 298 | + yield from _get_attribute_tensors_from_graph(onnx_model_proto.graph) |
| 299 | + |
| 300 | + |
| 301 | +-def _sanitize_path(path: str) -> str: |
| 302 | +- """Remove path components which would allow traversing up a directory tree from a base path. |
| 303 | +- |
| 304 | +- Note: This method is currently very basic and should be expanded. |
| 305 | +- """ |
| 306 | +- return path.lstrip("/.") |
| 307 | +- |
| 308 | +- |
| 309 | + def _is_valid_filename(filename: str) -> bool: |
| 310 | + """Utility to check whether the provided filename is valid.""" |
| 311 | + exp = re.compile('^[^<>:;,?"*|/]+$') |
| 312 | +diff --git a/third_party/onnx/onnx/test/test_external_data.py b/third_party/onnx/onnx/test/test_external_data.py |
| 313 | +index 5ba6b9cf..6e06312b 100644 |
| 314 | +--- a/third_party/onnx/onnx/test/test_external_data.py |
| 315 | ++++ b/third_party/onnx/onnx/test/test_external_data.py |
| 316 | +@@ -1,4 +1,5 @@ |
| 317 | + # SPDX-License-Identifier: Apache-2.0 |
| 318 | ++import itertools |
| 319 | + import os |
| 320 | + import os.path as Path |
| 321 | + import shutil |
| 322 | +@@ -186,6 +187,52 @@ class TestLoadExternalDataSingleFile(TestLoadExternalDataBase): |
| 323 | + attribute_tensor = new_model.graph.node[0].attribute[0].t |
| 324 | + self.assertTrue(np.allclose(to_array(attribute_tensor), self.attribute_value)) |
| 325 | + |
| 326 | ++ @parameterized.parameterized.expand(itertools.product((True, False), (True, False))) |
| 327 | ++ def test_save_external_invalid_single_file_data_and_check( |
| 328 | ++ self, use_absolute_path: bool, use_model_path: bool |
| 329 | ++ ) -> None: |
| 330 | ++ model = onnx.load_model(self.model_filename, self.serialization_format) |
| 331 | ++ |
| 332 | ++ model_dir = os.path.join(self.temp_dir, "save_copy") |
| 333 | ++ os.mkdir(model_dir) |
| 334 | ++ |
| 335 | ++ traversal_external_data_dir = os.path.join( |
| 336 | ++ self.temp_dir, "invlid_external_data" |
| 337 | ++ ) |
| 338 | ++ os.mkdir(traversal_external_data_dir) |
| 339 | ++ |
| 340 | ++ if use_absolute_path: |
| 341 | ++ traversal_external_data_location = os.path.join( |
| 342 | ++ traversal_external_data_dir, "tensors.bin" |
| 343 | ++ ) |
| 344 | ++ else: |
| 345 | ++ traversal_external_data_location = "../invlid_external_data/tensors.bin" |
| 346 | ++ |
| 347 | ++ external_data_dir = os.path.join(self.temp_dir, "external_data") |
| 348 | ++ os.mkdir(external_data_dir) |
| 349 | ++ new_model_filepath = os.path.join(model_dir, "model.onnx") |
| 350 | ++ |
| 351 | ++ def convert_model_to_external_data_no_check(model: ModelProto, location: str): |
| 352 | ++ for tensor in model.graph.initializer: |
| 353 | ++ if tensor.HasField("raw_data"): |
| 354 | ++ set_external_data(tensor, location) |
| 355 | ++ |
| 356 | ++ convert_model_to_external_data_no_check( |
| 357 | ++ model, |
| 358 | ++ location=traversal_external_data_location, |
| 359 | ++ ) |
| 360 | ++ |
| 361 | ++ onnx.save_model(model, new_model_filepath, self.serialization_format) |
| 362 | ++ if use_model_path: |
| 363 | ++ with self.assertRaises(onnx.checker.ValidationError): |
| 364 | ++ _ = onnx.load_model(new_model_filepath, self.serialization_format) |
| 365 | ++ else: |
| 366 | ++ onnx_model = onnx.load_model( |
| 367 | ++ new_model_filepath, self.serialization_format, load_external_data=False |
| 368 | ++ ) |
| 369 | ++ with self.assertRaises(onnx.checker.ValidationError): |
| 370 | ++ load_external_data_for_model(onnx_model, external_data_dir) |
| 371 | ++ |
| 372 | + |
| 373 | + class TestSaveAllTensorsAsExternalData(TestLoadExternalDataBase): |
| 374 | + def setUp(self) -> None: |
| 375 | +-- |
| 376 | +2.25.1 |
| 377 | + |
0 commit comments