Skip to content

Commit b3b3d09

Browse files
laramielcopybara-github
authored andcommitted
Improve tensorstore file:// uri parsing
* Relative file uris are now disallowed Before file://host/path would parse the authority as a path element, allowing relative uris. This is incorrect and these are no longer permitted. This may break users of uri-syntax expecting relative uris. * Windows drive path uri handling improved. Uris like `file:///c:/foo` which include windows drive letters would create illegal windows paths; this has now been corrected. Fixes: #239 PiperOrigin-RevId: 784309212 Change-Id: Ie341e9d2950b16c679857f2f0c8a7f4480c5a1bd
1 parent be45594 commit b3b3d09

File tree

18 files changed

+216
-68
lines changed

18 files changed

+216
-68
lines changed

python/tensorstore/kvstore.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,9 +1392,9 @@ Example of constructing from the :json:schema:`JSON representation<KvStore>`:
13921392
13931393
Example of constructing from a :json:schema:`URL<KvStoreUrl>`:
13941394
1395-
>>> spec = ts.KvStore.Spec('file://tmp/data/')
1395+
>>> spec = ts.KvStore.Spec('file:///tmp/data/')
13961396
>>> spec
1397-
KvStore.Spec({'driver': 'file', 'path': 'tmp/data/'})
1397+
KvStore.Spec({'driver': 'file', 'path': '/tmp/data/'})
13981398
13991399
)");
14001400

python/tensorstore/tests/kvstore_test.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"""Tests for tensorstore.KvStore."""
1515

1616
import copy
17+
import pathlib
1718
import pickle
1819
import tempfile
1920

@@ -103,17 +104,18 @@ def test_copy_range_to_ocdbt_memory():
103104
def test_copy_range_to_ocdbt_file():
104105
context = ts.Context()
105106
with tempfile.TemporaryDirectory() as dir_path:
107+
base_url = pathlib.Path(dir_path).resolve().as_uri()
106108
child_spec = {
107109
'driver': 'ocdbt',
108-
'base': f'file://{dir_path}/child',
110+
'base': f'{base_url}/child',
109111
}
110112
child = ts.KvStore.open(child_spec, context=context).result()
111113
for k in ['a', 'b', 'c']:
112114
child[k] = f'value_{k}'
113115

114116
parent_spec = {
115117
'driver': 'ocdbt',
116-
'base': f'file://{dir_path}',
118+
'base': base_url,
117119
}
118120
parent = ts.KvStore.open(parent_spec, context=context).result()
119121
child.experimental_copy_range_to(parent).result()
@@ -134,12 +136,11 @@ def test_copy_range_to_memory_fails():
134136
def test_copy_range_to_file_fails():
135137
context = ts.Context()
136138
with tempfile.TemporaryDirectory() as dir_path:
137-
child = ts.KvStore.open(
138-
f'file://{dir_path}/child/', context=context
139-
).result()
139+
base_url = pathlib.Path(dir_path).resolve().as_uri()
140+
child = ts.KvStore.open(f'{base_url}/child/', context=context).result()
140141
for k in ['a', 'b', 'c']:
141142
child[k] = f'value_{k}'
142-
parent = ts.KvStore.open(f'file://{dir_path}', context=context).result()
143+
parent = ts.KvStore.open(base_url, context=context).result()
143144
with pytest.raises(NotImplementedError):
144145
child.experimental_copy_range_to(parent).result()
145146

python/tensorstore/tests/tensorstore_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@
1414
"""Tests for tensorstore.TensorStore."""
1515

1616
import copy
17+
import pathlib
1718
import pickle
1819
import re
1920
import tempfile
20-
import time
21-
2221
import numpy as np
2322
import pytest
2423
import tensorstore as ts
@@ -592,8 +591,10 @@ def validate_spec(s):
592591

593592

594593
async def test_non_utf8_error():
595-
with pytest.raises(ValueError, match='.*local file "\\\\xfa.*'):
596-
await ts.open({"driver": "zarr", "kvstore": "file://%fa"})
594+
with tempfile.TemporaryDirectory() as dir_path:
595+
base_url = pathlib.Path(dir_path).resolve()
596+
with pytest.raises(ValueError, match='.*local file ".*\\\\xfa.*'):
597+
await ts.open({"driver": "zarr", "kvstore": base_url.as_uri() + "/%fa"})
597598

598599

599600
async def test_write_batch():

tensorstore/driver/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ tensorstore_cc_library(
245245
"//tensorstore/internal:lock_collection",
246246
"//tensorstore/internal:nditerable",
247247
"//tensorstore/internal:nditerable_transformed_array",
248+
"//tensorstore/internal:uri_utils",
248249
"//tensorstore/internal/testing:dynamic",
249250
"//tensorstore/internal/testing:json_gtest",
250251
"//tensorstore/internal/testing:queue_testutil",

tensorstore/driver/auto/BUILD

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ tensorstore_cc_test(
6565
"//tensorstore:spec",
6666
"//tensorstore:transaction",
6767
"//tensorstore/driver/zarr3",
68-
"//tensorstore/internal/os:file_util",
68+
"//tensorstore/internal:uri_utils",
6969
"//tensorstore/internal/testing:json_gtest",
7070
"//tensorstore/internal/testing:scoped_directory",
7171
"//tensorstore/kvstore",
@@ -75,6 +75,8 @@ tensorstore_cc_test(
7575
"//tensorstore/kvstore/zip",
7676
"//tensorstore/util:status_testutil",
7777
"@abseil-cpp//absl/status",
78+
"@abseil-cpp//absl/strings",
79+
"@abseil-cpp//absl/strings:cord",
7880
"@googletest//:gtest_main",
7981
"@nlohmann_json//:json",
8082
],

tensorstore/driver/auto/driver_test.cc

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,20 @@
1414

1515
#include <stdint.h>
1616

17+
#include <string>
18+
#include <string_view>
19+
1720
#include <gmock/gmock.h>
1821
#include <gtest/gtest.h>
1922
#include "absl/status/status.h"
23+
#include "absl/strings/cord.h"
24+
#include "absl/strings/str_cat.h"
2025
#include <nlohmann/json.hpp>
2126
#include "tensorstore/context.h"
2227
#include "tensorstore/data_type.h"
2328
#include "tensorstore/internal/testing/json_gtest.h"
2429
#include "tensorstore/internal/testing/scoped_directory.h"
30+
#include "tensorstore/internal/uri_utils.h"
2531
#include "tensorstore/kvstore/kvstore.h"
2632
#include "tensorstore/kvstore/operations.h"
2733
#include "tensorstore/open.h"
@@ -46,6 +52,12 @@ auto MakeInitial(const Context &context) {
4652
tensorstore::OpenMode::create);
4753
}
4854

55+
template <typename... Args>
56+
std::string AsFileUri(std::string_view path, Args... args) {
57+
return absl::StrCat("file://", tensorstore::internal::OsPathToUriPath(path),
58+
std::forward<Args>(args)...);
59+
}
60+
4961
TEST(AutoTest, ExplicitDriver) {
5062
auto context = Context::Default();
5163
TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto store_initial,
@@ -185,12 +197,12 @@ TEST(AutoTest, MemoryOcdbtZarr3) {
185197

186198
TEST(AutoTest, FileZarr3) {
187199
ScopedTemporaryDirectory tempdir;
188-
TENSORSTORE_ASSERT_OK(tensorstore::Open("file://" + tempdir.path() + "|zarr3",
200+
TENSORSTORE_ASSERT_OK(tensorstore::Open(AsFileUri(tempdir.path(), "|zarr3"),
189201
tensorstore::dtype_v<int32_t>,
190202
tensorstore::Schema::Shape({5}),
191203
tensorstore::OpenMode::create)
192204
.result());
193-
TENSORSTORE_ASSERT_OK(tensorstore::Open("file://" + tempdir.path()).result());
205+
TENSORSTORE_ASSERT_OK(tensorstore::Open(AsFileUri(tempdir.path())).result());
194206
}
195207

196208
TEST(AutoTest, MemoryZarr3Transaction) {

tensorstore/driver/auto/index.rst

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ Auto-detecting an array
5757
A :ref:`zarr3<driver/zarr3>` TensorStore can be detected from its path:
5858

5959
>>> # Create new array
60-
>>> await ts.open("file://tmp/dataset.zarr|zarr3",
60+
>>> await ts.open("file:///tmp/dataset.zarr|zarr3",
6161
... dtype="int32",
6262
... shape=[5],
6363
... create=True)
@@ -72,7 +72,7 @@ A :ref:`zarr3<driver/zarr3>` TensorStore can be detected from its path:
7272
},
7373
'driver': 'zarr3',
7474
'dtype': 'int32',
75-
'kvstore': {'driver': 'file', 'path': 'tmp/dataset.zarr/'},
75+
'kvstore': {'driver': 'file', 'path': '/tmp/dataset.zarr/'},
7676
'metadata': {
7777
'chunk_grid': {'configuration': {'chunk_shape': [5]}, 'name': 'regular'},
7878
'chunk_key_encoding': {'name': 'default'},
@@ -86,7 +86,7 @@ A :ref:`zarr3<driver/zarr3>` TensorStore can be detected from its path:
8686
'transform': {'input_exclusive_max': [[5]], 'input_inclusive_min': [0]},
8787
})
8888
>>> # Open with auto-detection
89-
>>> await ts.open("file://tmp/dataset.zarr")
89+
>>> await ts.open("file:///tmp/dataset.zarr")
9090
TensorStore({
9191
'context': {
9292
'cache_pool': {},
@@ -98,7 +98,7 @@ A :ref:`zarr3<driver/zarr3>` TensorStore can be detected from its path:
9898
},
9999
'driver': 'zarr3',
100100
'dtype': 'int32',
101-
'kvstore': {'driver': 'file', 'path': 'tmp/dataset.zarr/'},
101+
'kvstore': {'driver': 'file', 'path': '/tmp/dataset.zarr/'},
102102
'metadata': {
103103
'chunk_grid': {'configuration': {'chunk_shape': [5]}, 'name': 'regular'},
104104
'chunk_key_encoding': {'name': 'default'},
@@ -115,41 +115,41 @@ A :ref:`zarr3<driver/zarr3>` TensorStore can be detected from its path:
115115
Explicitly constructing a :py:obj:`~tensorstore.Spec` demonstrates the
116116
explicit syntax for using the ``auto`` driver:
117117

118-
>>> ts.Spec("file://tmp/dataset|auto")
119-
Spec({'driver': 'auto', 'kvstore': {'driver': 'file', 'path': 'tmp/dataset'}})
120-
>>> ts.Spec("file://tmp/dataset")
121-
Spec({'driver': 'auto', 'kvstore': {'driver': 'file', 'path': 'tmp/dataset'}})
118+
>>> ts.Spec("file:///tmp/dataset|auto")
119+
Spec({'driver': 'auto', 'kvstore': {'driver': 'file', 'path': '/tmp/dataset'}})
120+
>>> ts.Spec("file:///tmp/dataset")
121+
Spec({'driver': 'auto', 'kvstore': {'driver': 'file', 'path': '/tmp/dataset'}})
122122

123123
Chaining TensorStore adapters
124124
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
125125

126126
TensorStore adapters like :ref:`cast<driver/cast>` can also be
127127
used in conjunction with format auto-detection:
128128

129-
>>> ts.Spec("file://tmp/dataset.zarr|cast:int64")
129+
>>> ts.Spec("file:///tmp/dataset.zarr|cast:int64")
130130
Spec({
131131
'base': {
132132
'driver': 'auto',
133-
'kvstore': {'driver': 'file', 'path': 'tmp/dataset.zarr'},
133+
'kvstore': {'driver': 'file', 'path': '/tmp/dataset.zarr'},
134134
},
135135
'driver': 'cast',
136136
'dtype': 'int64',
137137
})
138-
>>> ts.Spec("file://tmp/dataset.zarr|auto|cast:int64")
138+
>>> ts.Spec("file:///tmp/dataset.zarr|auto|cast:int64")
139139
Spec({
140140
'base': {
141141
'driver': 'auto',
142-
'kvstore': {'driver': 'file', 'path': 'tmp/dataset.zarr'},
142+
'kvstore': {'driver': 'file', 'path': '/tmp/dataset.zarr'},
143143
},
144144
'driver': 'cast',
145145
'dtype': 'int64',
146146
})
147-
>>> await ts.open("file://tmp/dataset.zarr|cast:int64")
147+
>>> await ts.open("file:///tmp/dataset.zarr|cast:int64")
148148
TensorStore({
149149
'base': {
150150
'driver': 'zarr3',
151151
'dtype': 'int32',
152-
'kvstore': {'driver': 'file', 'path': 'tmp/dataset.zarr/'},
152+
'kvstore': {'driver': 'file', 'path': '/tmp/dataset.zarr/'},
153153
'metadata': {
154154
'chunk_grid': {
155155
'configuration': {'chunk_shape': [5]},
@@ -186,7 +186,7 @@ Multiple steps of auto-detection are also possible. Here, a
186186
from the path to the OCDBT database.
187187

188188
>>> # Create new array within new OCDBT database
189-
>>> await ts.open("file://tmp/dataset.ocdbt|ocdbt|zarr3",
189+
>>> await ts.open("file:///tmp/dataset.ocdbt|ocdbt|zarr3",
190190
... dtype="int32",
191191
... shape=[5],
192192
... create=True)
@@ -203,7 +203,7 @@ from the path to the OCDBT database.
203203
'driver': 'zarr3',
204204
'dtype': 'int32',
205205
'kvstore': {
206-
'base': {'driver': 'file', 'path': 'tmp/dataset.ocdbt/'},
206+
'base': {'driver': 'file', 'path': '/tmp/dataset.ocdbt/'},
207207
'config': {
208208
'compression': {'id': 'zstd'},
209209
'max_decoded_node_bytes': 8388608,
@@ -226,7 +226,7 @@ from the path to the OCDBT database.
226226
'transform': {'input_exclusive_max': [[5]], 'input_inclusive_min': [0]},
227227
})
228228
>>> # Open with auto-detection
229-
>>> await ts.open("file://tmp/dataset.ocdbt")
229+
>>> await ts.open("file:///tmp/dataset.ocdbt")
230230
TensorStore({
231231
'context': {
232232
'cache_pool': {},
@@ -240,7 +240,7 @@ from the path to the OCDBT database.
240240
'driver': 'zarr3',
241241
'dtype': 'int32',
242242
'kvstore': {
243-
'base': {'driver': 'file', 'path': 'tmp/dataset.ocdbt/'},
243+
'base': {'driver': 'file', 'path': '/tmp/dataset.ocdbt/'},
244244
'config': {
245245
'compression': {'id': 'zstd'},
246246
'max_decoded_node_bytes': 8388608,
@@ -268,7 +268,7 @@ of the OCDBT database:
268268

269269
>>> # Create new array within new OCDBT database
270270
>>> await ts.open(
271-
... "file://tmp/dataset2.ocdbt|ocdbt:path/within/database|zarr3",
271+
... "file:///tmp/dataset2.ocdbt|ocdbt:path/within/database|zarr3",
272272
... dtype="int32",
273273
... shape=[5],
274274
... create=True)
@@ -285,7 +285,7 @@ of the OCDBT database:
285285
'driver': 'zarr3',
286286
'dtype': 'int32',
287287
'kvstore': {
288-
'base': {'driver': 'file', 'path': 'tmp/dataset2.ocdbt/'},
288+
'base': {'driver': 'file', 'path': '/tmp/dataset2.ocdbt/'},
289289
'config': {
290290
'compression': {'id': 'zstd'},
291291
'max_decoded_node_bytes': 8388608,
@@ -309,10 +309,10 @@ of the OCDBT database:
309309
'transform': {'input_exclusive_max': [[5]], 'input_inclusive_min': [0]},
310310
})
311311
>>> # Open with auto-detection
312-
>>> await ts.open("file://tmp/dataset2.ocdbt")
312+
>>> await ts.open("file:///tmp/dataset2.ocdbt")
313313
Traceback (most recent call last):
314314
...
315-
ValueError: FAILED_PRECONDITION: Error opening "auto" driver: Failed to detect format for "" in OCDBT database at local file "tmp/dataset2.ocdbt/"...
315+
ValueError: FAILED_PRECONDITION: Error opening "auto" driver: Failed to detect format for "" in OCDBT database at local file "/tmp/dataset2.ocdbt/"...
316316

317317
.. _multi-step-auto-detection-algorithm:
318318

tensorstore/driver/driver_testutil.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
#include "tensorstore/internal/testing/json_gtest.h"
7373
#include "tensorstore/internal/testing/random_seed.h"
7474
#include "tensorstore/internal/testing/scoped_directory.h"
75+
#include "tensorstore/internal/uri_utils.h"
7576
#include "tensorstore/json_serialization_options_base.h"
7677
#include "tensorstore/kvstore/generation.h"
7778
#include "tensorstore/kvstore/memory/memory_key_value_store.h"
@@ -168,8 +169,11 @@ void TestTensorStoreDriverSpecRoundtrip(
168169
ReplaceStringInJson(options.create_spec, tempdir_key, tempdir->path());
169170
ReplaceStringInJson(options.minimal_spec, tempdir_key, tempdir->path());
170171
ReplaceStringInJson(options.full_base_spec, tempdir_key, tempdir->path());
171-
options.url =
172-
absl::StrReplaceAll(options.url, {{tempdir_key, tempdir->path()}});
172+
173+
// For the URL, the tempdir path must begin with a leading slash.
174+
options.url = absl::StrReplaceAll(
175+
options.url,
176+
{{tempdir_key, internal::OsPathToUriPath(tempdir->path())}});
173177
}
174178
Transaction transaction(mode);
175179
auto context = Context::Default();

tensorstore/internal/os/error_code.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#ifndef TENSORSTORE_INTERNAL_OS_ERROR_CODE_H_
1616
#define TENSORSTORE_INTERNAL_OS_ERROR_CODE_H_
1717

18+
#include <cassert>
1819
#include <cerrno>
1920
#include <string>
2021
#include <string_view>
@@ -71,6 +72,7 @@ absl::Status StatusWithOsError(
7172
absl::StatusCode status_code, OsErrorCode error_code, //
7273
A a = {}, B b = {}, C c = {}, D d = {}, E e = {}, F f = {},
7374
SourceLocation loc = tensorstore::SourceLocation::current()) {
75+
assert(status_code != absl::StatusCode::kOk);
7476
absl::Status status(
7577
status_code,
7678
tensorstore::StrCat(a, b, c, d, e, f, " [OS error ", error_code, ": ",

tensorstore/internal/uri_utils.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
#include "absl/strings/ascii.h"
2727
#include "absl/strings/match.h"
2828
#include "absl/strings/str_cat.h"
29+
#include "absl/strings/str_join.h"
30+
#include "absl/strings/str_split.h"
2931
#include "tensorstore/internal/ascii_set.h"
3032

3133
namespace tensorstore {
@@ -224,5 +226,33 @@ std::optional<HostPort> SplitHostPort(std::string_view host_port) {
224226
return HostPort{host_port.substr(0, colon), host_port.substr(colon + 1)};
225227
}
226228

229+
std::string OsPathToUriPath(std::string_view path) {
230+
bool is_root_path =
231+
(path.size() > 2 && absl::ascii_isalpha(path[0]) && path[1] == ':') ||
232+
(!path.empty() && path[0] == '/');
233+
#ifdef _WIN32
234+
constexpr const char kPathSeparator[] = "/\\";
235+
#else
236+
constexpr const char kPathSeparator[] = "/";
237+
#endif
238+
auto splitter = absl::StrSplit(path, absl::ByAnyChar(kPathSeparator));
239+
auto it = splitter.begin();
240+
if (it->empty()) it++;
241+
std::string joined_path = absl::StrJoin(it, splitter.end(), "/");
242+
if (is_root_path) {
243+
joined_path.insert(0, 1, '/');
244+
}
245+
return PercentEncodeUriPath(joined_path);
246+
}
247+
248+
std::string UriPathToOsPath(std::string_view path) {
249+
std::string result = PercentDecode(path);
250+
if (result.size() > 3 && result[0] == '/' && absl::ascii_isalpha(result[1]) &&
251+
result[2] == ':') {
252+
return result.substr(1);
253+
}
254+
return result;
255+
}
256+
227257
} // namespace internal
228258
} // namespace tensorstore

0 commit comments

Comments
 (0)