Skip to content

Commit 4fa75d6

Browse files
committed
[store] add get_tensor_into() and batch_get_tensor_into()
Signed-off-by: Cruz Zhao <[email protected]>
1 parent 5c3d04f commit 4fa75d6

File tree

4 files changed

+216
-6
lines changed

4 files changed

+216
-6
lines changed

mooncake-integration/store/store_py.cpp

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,176 @@ class MooncakeStorePyWrapper {
503503
return results_list;
504504
}
505505

506+
int64_t get_tensor_into(const std::string &key, uintptr_t buffer_ptr,
507+
size_t size) {
508+
void *buffer = reinterpret_cast<void *>(buffer_ptr);
509+
if (!is_client_initialized()) {
510+
LOG(ERROR) << "Client is not initialized";
511+
return to_py_ret(ErrorCode::INVALID_PARAMS);
512+
}
513+
514+
if (use_dummy_client_) {
515+
LOG(ERROR) << "get_tensor is not supported for dummy client now";
516+
return to_py_ret(ErrorCode::INVALID_PARAMS);
517+
}
518+
519+
try {
520+
// Section with GIL released
521+
py::gil_scoped_release release_gil;
522+
auto total_length = store_->get_into_internal(key, buffer, size);
523+
if (!total_length.has_value()) {
524+
py::gil_scoped_acquire acquire_gil;
525+
return to_py_ret(ErrorCode::INVALID_PARAMS);
526+
}
527+
528+
TensorMetadata metadata;
529+
// Copy data from buffer to contiguous memory
530+
memcpy(&metadata, static_cast<char *>(buffer),
531+
sizeof(TensorMetadata));
532+
533+
if (metadata.ndim < 0 || metadata.ndim > 4) {
534+
py::gil_scoped_acquire acquire_gil;
535+
LOG(ERROR) << "Invalid tensor metadata: ndim=" << metadata.ndim;
536+
return to_py_ret(ErrorCode::INVALID_PARAMS);
537+
}
538+
539+
TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
540+
if (dtype_enum == TensorDtype::UNKNOWN) {
541+
py::gil_scoped_acquire acquire_gil;
542+
LOG(ERROR) << "Unknown tensor dtype!";
543+
return to_py_ret(ErrorCode::INVALID_PARAMS);
544+
}
545+
546+
size_t tensor_size = total_length.value() - sizeof(TensorMetadata);
547+
if (tensor_size == 0) {
548+
py::gil_scoped_acquire acquire_gil;
549+
LOG(ERROR) << "Invalid data format: no tensor data found";
550+
return to_py_ret(ErrorCode::INVALID_PARAMS);
551+
}
552+
553+
py::gil_scoped_acquire acquire_gil;
554+
// Convert bytes to tensor using torch.from_numpy
555+
pybind11::object np_array;
556+
int dtype_index = static_cast<int>(dtype_enum);
557+
if (dtype_index < 0 ||
558+
dtype_index >= static_cast<int>(array_creators.size())) {
559+
LOG(ERROR) << "Unsupported dtype enum: " << dtype_index;
560+
return to_py_ret(ErrorCode::INVALID_PARAMS);
561+
}
562+
563+
return total_length.value();
564+
565+
} catch (const pybind11::error_already_set &e) {
566+
LOG(ERROR) << "Failed to get tensor data: " << e.what();
567+
return to_py_ret(ErrorCode::INVALID_PARAMS);
568+
}
569+
}
570+
571+
pybind11::list batch_get_tensor_into(const std::vector<std::string> &keys,
572+
const std::vector<uintptr_t> &buffer_ptrs,
573+
const std::vector<size_t> &sizes) {
574+
std::vector<void *> buffers;
575+
buffers.reserve(buffer_ptrs.size());
576+
for (uintptr_t ptr : buffer_ptrs) {
577+
buffers.push_back(reinterpret_cast<void *>(ptr));
578+
}
579+
580+
if (!is_client_initialized()) {
581+
LOG(ERROR) << "Client is not initialized";
582+
py::list empty_list;
583+
for (size_t i = 0; i < keys.size(); ++i) {
584+
empty_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
585+
}
586+
return empty_list;
587+
}
588+
589+
if (use_dummy_client_) {
590+
LOG(ERROR) << "batch_get_tensor is not supported for dummy client "
591+
"now";
592+
py::list empty_list;
593+
for (size_t i = 0; i < keys.size(); ++i) {
594+
empty_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
595+
}
596+
return empty_list;
597+
}
598+
599+
// Phase 1: Batch Get Buffers (GIL Released)
600+
py::gil_scoped_release release_gil;
601+
// This internal call already handles logging for query failures
602+
auto total_lengths =
603+
store_->batch_get_into_internal(keys, buffers, sizes);
604+
605+
py::list results_list;
606+
try {
607+
py::gil_scoped_acquire acquire_gil;
608+
auto torch = torch_module();
609+
610+
for (size_t i = 0; i < total_lengths.size(); i++) {
611+
const auto &buffer = buffers[i];
612+
if (!buffer) {
613+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
614+
continue;
615+
}
616+
617+
auto total_length = total_lengths[i];
618+
if (!total_length.has_value()) {
619+
LOG(ERROR) << "Invalid data format: insufficient data for"
620+
"metadata";
621+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
622+
continue;
623+
}
624+
if (total_length.value() <=
625+
static_cast<long>(sizeof(TensorMetadata))) {
626+
LOG(ERROR) << "Invalid data format: insufficient data for "
627+
"metadata";
628+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
629+
continue;
630+
}
631+
632+
TensorMetadata metadata;
633+
memcpy(&metadata, static_cast<char *>(buffer),
634+
sizeof(TensorMetadata));
635+
636+
if (metadata.ndim < 0 || metadata.ndim > 4) {
637+
LOG(ERROR)
638+
<< "Invalid tensor metadata: ndim=" << metadata.ndim;
639+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
640+
continue;
641+
}
642+
643+
TensorDtype dtype_enum =
644+
static_cast<TensorDtype>(metadata.dtype);
645+
if (dtype_enum == TensorDtype::UNKNOWN) {
646+
LOG(ERROR) << "Unknown tensor dtype!";
647+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
648+
continue;
649+
}
650+
651+
size_t tensor_size =
652+
total_length.value() - sizeof(TensorMetadata);
653+
if (tensor_size == 0) {
654+
LOG(ERROR) << "Invalid data format: no tensor data found";
655+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
656+
continue;
657+
}
658+
659+
int dtype_index = static_cast<int>(dtype_enum);
660+
if (dtype_index < 0 ||
661+
dtype_index >= static_cast<int>(array_creators.size())) {
662+
LOG(ERROR) << "Unsupported dtype enum: " << dtype_index;
663+
results_list.append(to_py_ret(ErrorCode::INVALID_PARAMS));
664+
continue;
665+
}
666+
667+
results_list.append(total_length.value());
668+
}
669+
} catch (const pybind11::error_already_set &e) {
670+
LOG(ERROR) << "Failed during batch tensor deserialization: "
671+
<< e.what();
672+
}
673+
return results_list;
674+
}
675+
506676
int put_tensor_with_tp(const std::string &key, pybind11::object tensor,
507677
int tp_rank = 0, int tp_size = 1,
508678
int split_dim = 0) {
@@ -1241,6 +1411,15 @@ PYBIND11_MODULE(store, m) {
12411411
.def("pub_tensor", &MooncakeStorePyWrapper::pub_tensor, py::arg("key"),
12421412
py::arg("tensor"), py::arg("config") = ReplicateConfig{},
12431413
"Publish a PyTorch tensor with configurable replication settings")
1414+
.def("get_tensor_into", &MooncakeStorePyWrapper::get_tensor_into,
1415+
py::arg("key"), py::arg("buffer_ptr"), py::arg("size"),
1416+
"Get tensor directly into a pre-allocated buffer")
1417+
.def("batch_get_tensor_into",
1418+
&MooncakeStorePyWrapper::batch_get_tensor_into, py::arg("keys"),
1419+
py::arg("buffer_ptrs"), py::arg("sizes"),
1420+
"Get tensors directly into pre-allocated buffers for "
1421+
"multiple "
1422+
"keys")
12441423
.def(
12451424
"register_buffer",
12461425
[](MooncakeStorePyWrapper &self, uintptr_t buffer_ptr,

mooncake-store/include/dummy_client.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,16 @@ class DummyClient : public PyClient {
7171

7272
int unregister_buffer(void *buffer);
7373

74+
tl::expected<int64_t, ErrorCode> get_into_internal(const std::string &key,
75+
void *buffer,
76+
size_t size);
77+
7478
int64_t get_into(const std::string &key, void *buffer, size_t size);
7579

80+
std::vector<tl::expected<int64_t, ErrorCode>> batch_get_into_internal(
81+
const std::vector<std::string> &keys,
82+
const std::vector<void *> &buffers, const std::vector<size_t> &sizes);
83+
7684
std::vector<int64_t> batch_get_into(const std::vector<std::string> &keys,
7785
const std::vector<void *> &buffers,
7886
const std::vector<size_t> &sizes);
@@ -221,4 +229,4 @@ class DummyClient : public PyClient {
221229
volatile bool connected_ = false;
222230
};
223231

224-
} // namespace mooncake
232+
} // namespace mooncake

mooncake-store/include/pyclient.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,17 @@ class PyClient {
5353

5454
virtual int unregister_buffer(void *buffer) = 0;
5555

56+
virtual tl::expected<int64_t, ErrorCode> get_into_internal(
57+
const std::string &key, void *buffer, size_t size) = 0;
58+
5659
virtual int64_t get_into(const std::string &key, void *buffer,
5760
size_t size) = 0;
5861

62+
virtual std::vector<tl::expected<int64_t, ErrorCode>>
63+
batch_get_into_internal(const std::vector<std::string> &keys,
64+
const std::vector<void *> &buffers,
65+
const std::vector<size_t> &sizes) = 0;
66+
5967
virtual std::vector<int64_t> batch_get_into(
6068
const std::vector<std::string> &keys,
6169
const std::vector<void *> &buffers,

mooncake-store/src/dummy_client.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,12 @@ std::vector<std::shared_ptr<BufferHandle>> DummyClient::batch_get_buffer(
511511
return std::vector<std::shared_ptr<BufferHandle>>();
512512
}
513513

514+
// Stub: direct-into-buffer get is not yet supported by the dummy client.
// Always fails with INVALID_PARAMS until implemented; callers must treat
// this path as unavailable.
tl::expected<int64_t, ErrorCode> DummyClient::get_into_internal(
    const std::string& key, void* buffer, size_t size) {
    // TODO: implement this function
    return tl::unexpected(ErrorCode::INVALID_PARAMS);
}
519+
514520
int64_t DummyClient::get_into(const std::string& key, void* buffer,
515521
size_t size) {
516522
// TODO: implement this function
@@ -548,16 +554,25 @@ int DummyClient::put_from(const std::string& key, void* buffer, size_t size,
548554
return -1;
549555
}
550556

551-
std::vector<int64_t> DummyClient::batch_get_into(
552-
const std::vector<std::string>& keys, const std::vector<void*>& buffer_ptrs,
553-
const std::vector<size_t>& sizes) {
557+
std::vector<tl::expected<int64_t, ErrorCode>>
558+
DummyClient::batch_get_into_internal(const std::vector<std::string>& keys,
559+
const std::vector<void*>& buffer_ptrs,
560+
const std::vector<size_t>& sizes) {
554561
std::vector<uint64_t> buffers;
555562
for (auto ptr : buffer_ptrs) {
556563
buffers.push_back(reinterpret_cast<uint64_t>(ptr));
557564
}
558-
auto internal_results =
565+
auto results =
559566
invoke_batch_rpc<&RealClient::batch_get_into_dummy_helper, int64_t>(
560567
keys.size(), keys, buffers, sizes, client_id_);
568+
569+
return results;
570+
}
571+
572+
std::vector<int64_t> DummyClient::batch_get_into(
573+
const std::vector<std::string>& keys, const std::vector<void*>& buffer_ptrs,
574+
const std::vector<size_t>& sizes) {
575+
auto internal_results = batch_get_into_internal(keys, buffer_ptrs, sizes);
561576
std::vector<int64_t> results;
562577
results.reserve(internal_results.size());
563578

@@ -688,4 +703,4 @@ void DummyClient::ping_thread_main() {
688703
}
689704
}
690705

691-
} // namespace mooncake
706+
} // namespace mooncake

0 commit comments

Comments
 (0)