Skip to content

Commit 1615c6f

Browse files
jbms authored and copybara-github committed
Add OCDBT undump functions
These functions can be used for certain advanced manipulations of the OCDBT database. PiperOrigin-RevId: 845388227 Change-Id: I02150d3db11229eca506b481f60bf7208b90b787
1 parent 07ee013 commit 1615c6f

File tree

7 files changed

+334
-11
lines changed

7 files changed

+334
-11
lines changed

python/tensorstore/ocdbt.cc

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,106 @@ Dumps the internal representation of an OCDBT database.
191191
... btree["entries"][1]["indirect_value"]).result()
192192
b'ce'
193193
194+
)");
195+
196+
ocdbt_m.def(
197+
"undump_manifest",
198+
[](::nlohmann::json dumped_manifest) -> py::bytes {
199+
auto encoded_manifest =
200+
ValueOrThrow(internal_ocdbt::UndumpManifest(dumped_manifest));
201+
return py::bytes(std::string(encoded_manifest));
202+
},
203+
py::arg("dumped_manifest"),
204+
R"(
205+
Converts a JSON manifest dump to the on-disk manifest format.
206+
207+
Args:
208+
dumped_manifest: The JSON dump of the manifest in the format returned by
209+
`.dump`.
210+
211+
Returns:
212+
The on-disk representation of the manifest.
213+
214+
Group:
215+
OCDBT
216+
217+
Examples:
218+
---------
219+
220+
>>> store = ts.KvStore.open({
221+
... "driver": "ocdbt",
222+
... "base": "memory://"
223+
... }).result()
224+
>>> store["a"] = b"b"
225+
>>> manifest = ts.ocdbt.dump(store.base).result()
226+
>>> encoded = ts.ocdbt.undump_manifest(manifest)
227+
>>> assert encoded == store.base["manifest.ocdbt"]
228+
229+
)");
230+
ocdbt_m.def(
231+
"undump_version_tree_node",
232+
[](::nlohmann::json dumped_config,
233+
::nlohmann::json dumped_node) -> py::bytes {
234+
auto encoded_version_tree_node =
235+
ValueOrThrow(internal_ocdbt::UndumpVersionTreeNode(dumped_config,
236+
dumped_node));
237+
return py::bytes(std::string(encoded_version_tree_node));
238+
},
239+
py::arg("dumped_config"), py::arg("dumped_node"),
240+
R"(
241+
Converts a JSON version tree node dump to the on-disk version tree node
242+
format.
243+
244+
Args:
245+
dumped_config: The JSON dump of the version tree node's config in the format
246+
returned by `.dump`.
247+
dumped_node: The JSON dump of the version tree node in the format returned by
248+
`.dump`.
249+
250+
Returns:
251+
The on-disk representation of the version tree node.
252+
253+
Group:
254+
OCDBT
255+
256+
Examples:
257+
---------
258+
259+
>>> store = ts.KvStore.open({
260+
... "driver": "ocdbt",
261+
... "config": {
262+
... "version_tree_arity_log2": 1
263+
... },
264+
... "base": "memory://"
265+
... }).result()
266+
>>> store["a"] = b"b"
267+
>>> store["a"] = b"c"
268+
>>> manifest = ts.ocdbt.dump(store.base).result()
269+
>>> manifest
270+
{'config': {'compression': {'id': 'zstd'},
271+
'max_decoded_node_bytes': 8388608,
272+
'max_inline_value_bytes': 100,
273+
'uuid': '...',
274+
'version_tree_arity_log2': 1},
275+
'version_tree_nodes': [{'commit_time': ...,
276+
'generation_number': 2,
277+
'height': 1,
278+
'location': 'versionnode::d/...',
279+
'num_generations': 2}],
280+
'versions': [{'commit_time': ...,
281+
'generation_number': 3,
282+
'root': {'location': 'btreenode::d/...',
283+
'statistics': {'num_indirect_value_bytes': 0,
284+
'num_keys': 1,
285+
'num_tree_bytes': 35}},
286+
'root_height': 0}]}
287+
>>> version_tree_node_json = ts.ocdbt.dump(
288+
... store.base, manifest["version_tree_nodes"][0]["location"]).result()
289+
>>> encoded = ts.ocdbt.undump_version_tree_node(manifest["config"],
290+
... version_tree_node_json)
291+
>>> isinstance(encoded, bytes)
292+
True
293+
194294
)");
195295
});
196296
}

python/tensorstore/ocdbt.pyi

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@ from __future__ import annotations
22
import tensorstore
33
import typing
44

5-
__all__ = ["DistributedCoordinatorServer", "dump"]
5+
__all__ = [
6+
"DistributedCoordinatorServer",
7+
"dump",
8+
"undump_manifest",
9+
"undump_version_tree_node",
10+
]
611

712

813
class DistributedCoordinatorServer:
@@ -106,3 +111,90 @@ def dump(
106111
... btree["entries"][1]["indirect_value"]).result()
107112
b'ce'
108113
"""
114+
115+
116+
def undump_manifest(dumped_manifest: typing.Any) -> bytes:
117+
"""
118+
Converts a JSON manifest dump to the on-disk manifest format.
119+
120+
Args:
121+
dumped_manifest: The JSON dump of the manifest in the format returned by
122+
`.dump`.
123+
124+
Returns:
125+
The on-disk representation of the manifest.
126+
127+
Group:
128+
OCDBT
129+
130+
Examples:
131+
---------
132+
133+
>>> store = ts.KvStore.open({
134+
... "driver": "ocdbt",
135+
... "base": "memory://"
136+
... }).result()
137+
>>> store["a"] = b"b"
138+
>>> manifest = ts.ocdbt.dump(store.base).result()
139+
>>> encoded = ts.ocdbt.undump_manifest(manifest)
140+
>>> assert encoded == store.base["manifest.ocdbt"]
141+
"""
142+
143+
144+
def undump_version_tree_node(
145+
dumped_config: typing.Any, dumped_node: typing.Any
146+
) -> bytes:
147+
"""
148+
Converts a JSON version tree node dump to the on-disk version tree node
149+
format.
150+
151+
Args:
152+
dumped_config: The JSON dump of the version tree node's config in the format
153+
returned by `.dump`.
154+
dumped_node: The JSON dump of the version tree node in the format returned by
155+
`.dump`.
156+
157+
Returns:
158+
The on-disk representation of the version tree node.
159+
160+
Group:
161+
OCDBT
162+
163+
Examples:
164+
---------
165+
166+
>>> store = ts.KvStore.open({
167+
... "driver": "ocdbt",
168+
... "config": {
169+
... "version_tree_arity_log2": 1
170+
... },
171+
... "base": "memory://"
172+
... }).result()
173+
>>> store["a"] = b"b"
174+
>>> store["a"] = b"c"
175+
>>> manifest = ts.ocdbt.dump(store.base).result()
176+
>>> manifest
177+
{'config': {'compression': {'id': 'zstd'},
178+
'max_decoded_node_bytes': 8388608,
179+
'max_inline_value_bytes': 100,
180+
'uuid': '...',
181+
'version_tree_arity_log2': 1},
182+
'version_tree_nodes': [{'commit_time': ...,
183+
'generation_number': 2,
184+
'height': 1,
185+
'location': 'versionnode::d/...',
186+
'num_generations': 2}],
187+
'versions': [{'commit_time': ...,
188+
'generation_number': 3,
189+
'root': {'location': 'btreenode::d/...',
190+
'statistics': {'num_indirect_value_bytes': 0,
191+
'num_keys': 1,
192+
'num_tree_bytes': 35}},
193+
'root_height': 0}]}
194+
>>> version_tree_node_json = ts.ocdbt.dump(
195+
... store.base, manifest["version_tree_nodes"][0]["location"]).result()
196+
>>> encoded = ts.ocdbt.undump_version_tree_node(manifest["config"],
197+
... version_tree_node_json)
198+
>>> isinstance(encoded, bytes)
199+
True
200+
"""

tensorstore/kvstore/ocdbt/config.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <atomic>
2020
#include <string_view>
21+
#include <tuple>
2122
#include <type_traits>
2223

2324
#include "absl/status/status.h"
@@ -254,5 +255,14 @@ ConfigConstraints ConfigState::GetConstraints() const {
254255
return constraints_;
255256
}
256257

258+
bool operator==(const ConfigConstraints& lhs, const ConfigConstraints& rhs) {
259+
return std::tie(lhs.uuid, lhs.manifest_kind, lhs.max_inline_value_bytes,
260+
lhs.max_decoded_node_bytes, lhs.version_tree_arity_log2,
261+
lhs.compression) ==
262+
std::tie(rhs.uuid, rhs.manifest_kind, rhs.max_inline_value_bytes,
263+
rhs.max_decoded_node_bytes, rhs.version_tree_arity_log2,
264+
rhs.compression);
265+
}
266+
257267
} // namespace internal_ocdbt
258268
} // namespace tensorstore

tensorstore/kvstore/ocdbt/format/BUILD

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ tensorstore_cc_library(
118118
deps = [
119119
":format",
120120
"//tensorstore:json_serialization_options_base",
121-
"//tensorstore/internal:path",
121+
"//tensorstore/internal:ascii_set",
122122
"//tensorstore/internal:uri_utils",
123123
"//tensorstore/internal/json:value_as",
124124
"//tensorstore/internal/json_binding",
@@ -129,7 +129,6 @@ tensorstore_cc_library(
129129
"//tensorstore/util:status",
130130
"//tensorstore/util:str_cat",
131131
"@abseil-cpp//absl/status",
132-
"@abseil-cpp//absl/strings",
133132
"@abseil-cpp//absl/strings:cord",
134133
"@nlohmann_json//:json",
135134
"@re2",

tensorstore/kvstore/ocdbt/format/dump.cc

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,19 @@
2222

2323
#include "absl/status/status.h"
2424
#include "absl/strings/cord.h"
25-
#include "absl/strings/escaping.h"
2625
#include <nlohmann/json.hpp>
2726
#include "re2/re2.h"
27+
#include "tensorstore/internal/ascii_set.h"
2828
#include "tensorstore/internal/json/value_as.h"
2929
#include "tensorstore/internal/json_binding/bindable.h"
3030
#include "tensorstore/internal/json_binding/json_binding.h"
3131
#include "tensorstore/internal/json_binding/std_array.h"
3232
#include "tensorstore/internal/json_binding/std_variant.h"
33-
#include "tensorstore/internal/path.h"
3433
#include "tensorstore/internal/uri_utils.h"
3534
#include "tensorstore/json_serialization_options_base.h"
3635
#include "tensorstore/kvstore/ocdbt/config.h"
3736
#include "tensorstore/kvstore/ocdbt/format/btree.h"
37+
#include "tensorstore/kvstore/ocdbt/format/config.h"
3838
#include "tensorstore/kvstore/ocdbt/format/indirect_data_reference.h"
3939
#include "tensorstore/kvstore/ocdbt/format/manifest.h"
4040
#include "tensorstore/kvstore/ocdbt/format/version_tree.h"
@@ -72,7 +72,7 @@ namespace jb = tensorstore::internal_json_binding;
7272
constexpr auto ConfigBinder = jb::Compose<ConfigConstraints>(
7373
[](auto is_loading, const auto& options, auto* obj, auto* constraints) {
7474
if constexpr (is_loading) {
75-
CreateConfig(constraints, *obj);
75+
TENSORSTORE_RETURN_IF_ERROR(CreateConfig(*constraints, {}, *obj));
7676
if (ConfigConstraints(*obj) != *constraints) {
7777
return absl::InvalidArgumentError("Config is not fully specified");
7878
}
@@ -92,11 +92,16 @@ static inline constexpr internal::AsciiSet
9292
constexpr auto LabeledIndirectDataReferenceBinder =
9393
[](auto is_loading, const auto& options, auto* obj, auto* j) {
9494
if constexpr (is_loading) {
95-
if (auto* s = j->template get_ptr<const std::string*>()) {
96-
TENSORSTORE_ASSIGN_OR_RETURN(*obj,
97-
LabeledIndirectDataReference::Parse(*s));
95+
if (j->is_discarded()) {
96+
*obj = LabeledIndirectDataReference{IndirectDataKind::kBtreeNode,
97+
IndirectDataReference::Missing()};
9898
} else {
99-
return internal_json::ExpectedError(*j, "string");
99+
if (auto* s = j->template get_ptr<const std::string*>()) {
100+
TENSORSTORE_ASSIGN_OR_RETURN(
101+
*obj, LabeledIndirectDataReference::Parse(*s));
102+
} else {
103+
return internal_json::ExpectedError(*j, "string");
104+
}
100105
}
101106
} else {
102107
if (obj->location.IsMissing()) {
@@ -121,6 +126,11 @@ constexpr auto IndirectDataReferenceBinder(IndirectDataKind kind) {
121126
return jb::Compose<LabeledIndirectDataReference>(
122127
[kind](auto is_loading, const auto& options, auto* obj, auto* j) {
123128
if constexpr (is_loading) {
129+
if (j->kind != kind) {
130+
return absl::InvalidArgumentError(tensorstore::StrCat(
131+
"Indirect data reference kind mismatch: expected ", kind,
132+
", got ", j->kind));
133+
}
124134
*obj = j->location;
125135
} else {
126136
j->location = *obj;
@@ -273,5 +283,21 @@ ::nlohmann::json Dump(const VersionTreeNode& node) {
273283
return jb::ToJson(node, VersionTreeNodeBinder).value();
274284
}
275285

286+
Result<absl::Cord> UndumpManifest(::nlohmann::json dumped_manifest) {
287+
TENSORSTORE_ASSIGN_OR_RETURN(
288+
auto manifest, jb::FromJson<Manifest>(dumped_manifest, ManifestBinder));
289+
return EncodeManifest(manifest, /*encode_as_single=*/false);
290+
}
291+
292+
Result<absl::Cord> UndumpVersionTreeNode(::nlohmann::json dumped_config,
293+
::nlohmann::json dumped_node) {
294+
TENSORSTORE_ASSIGN_OR_RETURN(
295+
auto config, jb::FromJson<Config>(dumped_config, ConfigBinder));
296+
TENSORSTORE_ASSIGN_OR_RETURN(
297+
auto node,
298+
jb::FromJson<VersionTreeNode>(dumped_node, VersionTreeNodeBinder));
299+
return EncodeVersionTreeNode(config, node);
300+
}
301+
276302
} // namespace internal_ocdbt
277303
} // namespace tensorstore

tensorstore/kvstore/ocdbt/format/dump.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,23 @@ ::nlohmann::json Dump(const BtreeNode& node);
6262
/// Dumps a version tree node to a JSON representation.
6363
::nlohmann::json Dump(const VersionTreeNode& node);
6464

65+
/// Converts a JSON manifest dump to the on-disk manifest format.
66+
///
67+
/// \param dumped_manifest The JSON dump of the manifest.
68+
///
69+
/// \return The on-disk representation of the manifest.
70+
Result<absl::Cord> UndumpManifest(::nlohmann::json dumped_manifest);
71+
72+
/// Converts a JSON version tree node dump to the on-disk version tree node
73+
/// format.
74+
///
75+
/// \param dumped_config The JSON dump of the version tree node's config.
76+
/// \param dumped_node The JSON dump of the version tree node.
77+
///
78+
/// \return The on-disk representation of the version tree node.
79+
Result<absl::Cord> UndumpVersionTreeNode(::nlohmann::json dumped_config,
80+
::nlohmann::json dumped_node);
81+
6582
} // namespace internal_ocdbt
6683
} // namespace tensorstore
6784

0 commit comments

Comments
 (0)