From e63fa887e9acb1b2e80b928eabd44e2ca1b4faf1 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 29 Aug 2025 19:43:28 +0800
Subject: [PATCH 01/64] add basic implementation and python interfaces

---
 .../cororpc_communicator.cpp                  | 272 ++++++++++++++
 .../coro_rpc_connector/cororpc_interface.cpp  | 305 +++++++++++++++
 .../coro_rpc_connector/py_rpc_example.cpp     | 347 ++++++++++++++++++
 3 files changed, 924 insertions(+)
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
new file mode 100644
index 000000000..4ed513b8e
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -0,0 +1,272 @@
+#include "transport/coro_rpc_connector/cororpc_communicator.h"
+#include <chrono>
+#include <stdexcept>
+#include <iostream>
+
+using namespace async_simple::coro;
+
+namespace mooncake {
+
+// Impl类的处理函数实现
+std::string CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<std::string> context, std::string_view data) {
+    // 简单回显数据，实际使用中可根据需要修改
+    return std::string(data);
+}
+
+std::string CoroRPCCommunicator::Impl::handleTensorTransfer(coro_rpc::context<std::string> context) {
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+    // 处理张量数据，这里简单返回接收到的大小信息
+    return "received tensor size: " + std::to_string(attachment.size());
+}
+
+void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
+    auto ctx_info = context.get_context_info();
+    // 回显附件数据
+    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
+    context.response_msg();
+}
+
+void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
+    auto ctx_info = context.get_context_info();
+    // 回显张量附件数据
+    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
+    context.response_msg();
+}
+
+// CoroRPCCommunicator构造函数和析构函数
+CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_unique<Impl>()) {}
+
+CoroRPCCommunicator::~CoroRPCCommunicator() {
+    stopServer();
+}
+
+bool CoroRPCCommunicator::initialize(const Config& config) {
+    impl_->config = config;
+    
+    if (!config.listen_address.empty()) {
+        // 初始化服务器
+        impl_->server = std::make_unique<coro_rpc::coro_rpc_server>(
+            config.thread_count, 
+            config.listen_address,
+            std::chrono::seconds(config.timeout_seconds)
+        );
+        
+        // 注册处理函数
+        impl_->server->register_handler<
+            &CoroRPCCommunicator::Impl::handleDataTransfer,
+            &CoroRPCCommunicator::Impl::handleTensorTransfer,
+            &CoroRPCCommunicator::Impl::handleDataTransferWithAttachment,
+            &CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment
+        >(impl_.get());
+    }
+    
+    return true;
+}
+
+bool CoroRPCCommunicator::startServer() {
+    if (!impl_->server) {
+        return false;
+    }
+    
+    auto ec = impl_->server->start();
+    impl_->is_server_started = (ec.val() == 0);
+    return impl_->is_server_started;
+}
+
+bool CoroRPCCommunicator::startServerAsync() {
+    if (!impl_->server) {
+        return false;
+    }
+    
+    auto ec = impl_->server->async_start();
+    impl_->is_server_started = !ec.hasResult();
+    return impl_->is_server_started;
+}
+
+void CoroRPCCommunicator::stopServer() {
+    if (impl_->server && impl_->is_server_started) {
+        impl_->server.reset();
+        impl_->is_server_started = false;
+    }
+}
+
+int CoroRPCCommunicator::sendData(const std::string& target_address,
+                                  const void* data,
+                                  size_t data_size) {
+    try {
+        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
+        auto connect_result = syncAwait(client->connect(target_address));
+        if (!connect_result) {
+            return -1;
+        }
+        
+        std::string_view data_view(static_cast<const char*>(data), data_size);
+        auto result = syncAwait(client->call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view));
+        
+        return result.has_value() ? 0 : -1;
+    } catch (const std::exception& e) {
+        return -1;
+    }
+}
+
+Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
+                                                 const void* data,
+                                                 size_t data_size) {
+    result res;
+    
+    try {
+        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
+        auto connect_result = co_await client->connect(target_address);
+        if (!connect_result) {
+            res.code = -1;
+            res.err_msg = "connection failed";
+            co_return res;
+        }
+        
+        std::string_view data_view(static_cast<const char*>(data), data_size);
+        auto result = co_await client->call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+        
+        if (result.has_value()) {
+            res.code = 0;
+            res.data = result.value();
+            res.data_size = res.data.size();
+        } else {
+            res.code = result.error().val();
+            res.err_msg = result.error().msg;
+        }
+    } catch (const std::exception& e) {
+        res.code = -1;
+        res.err_msg = e.what();
+    }
+    
+    co_return res;
+}
+
+int CoroRPCCommunicator::sendTensor(const std::string& target_address,
+                                    const pybind11::object& tensor) {
+    try {
+        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
+        auto connect_result = syncAwait(client->connect(target_address));
+        if (!connect_result) {
+            return -1;
+        }
+        
+        // 从PyTorch tensor获取数据指针和大小
+        uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
+        size_t numel = tensor.attr("numel")().cast<size_t>();
+        size_t element_size = tensor.attr("element_size")().cast<size_t>();
+        size_t tensor_size = numel * element_size;
+        
+        client->set_req_attachment(std::string_view(reinterpret_cast<char*>(data_ptr), tensor_size));
+        auto result = syncAwait(client->call<&CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment>());
+        
+        return result.has_value() ? 0 : -1;
+    } catch (const std::exception& e) {
+        return -1;
+    }
+}
+
+std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address,
+                                                       const TensorInfo& tensor) {
+    auto promise = std::make_shared<std::promise<int>>();
+    auto future = promise->get_future();
+    
+    auto task = [this, target_address, tensor, promise]() -> Lazy<void> {
+        try {
+            auto client = std::make_unique<coro_rpc::coro_rpc_client>();
+            auto connect_result = co_await client->connect(target_address);
+            if (!connect_result) {
+                promise->set_value(-1);
+                co_return;
+            }
+            
+            std::string_view data_view(static_cast<const char*>(tensor.data_ptr), tensor.total_bytes);
+            client->set_req_attachment(data_view);
+            auto result = co_await client->call<&CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment>();
+            
+            promise->set_value(result.has_value() ? 0 : -1);
+        } catch (const std::exception& e) {
+            promise->set_value(-1);
+        }
+    };
+    
+    task().start([](auto&&) {});
+    
+    return future;
+}
+
+int CoroRPCCommunicator::receiveData(const std::string& source_address,
+                                     void* buffer,
+                                     size_t buffer_size,
+                                     int timeout_ms) {
+    // 这是一个简化实现，实际中可能需要更复杂的接收逻辑
+    // 由于coro_rpc主要是请求-响应模式，这里返回不支持
+    return -1;
+}
+
+Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address,
+                                                        int timeout_ms) {
+    // 这是一个简化实现，实际中可能需要更复杂的接收逻辑
+    co_return "";
+}
+
+bool CoroRPCCommunicator::addRemoteConnection(const std::string& remote_address) {
+    try {
+        auto pool = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
+        impl_->client_pools[remote_address] = pool;
+        return true;
+    } catch (const std::exception& e) {
+        return false;
+    }
+}
+
+void CoroRPCCommunicator::removeRemoteConnection(const std::string& remote_address) {
+    impl_->client_pools.erase(remote_address);
+}
+
+bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
+    return impl_->client_pools.find(remote_address) != impl_->client_pools.end();
+}
+
+std::string CoroRPCCommunicator::handleDataTransfer(coro_rpc::context<std::string> context, std::string_view data) {
+    return impl_->handleDataTransfer(std::move(context), data);
+}
+
+std::string CoroRPCCommunicator::handleTensorTransfer(coro_rpc::context<std::string> context) {
+    return impl_->handleTensorTransfer(std::move(context));
+}
+
+void CoroRPCCommunicator::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
+    impl_->handleDataTransferWithAttachment(std::move(context), data);
+}
+
+void CoroRPCCommunicator::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
+    impl_->handleTensorTransferWithAttachment(std::move(context));
+}
+
+std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size, size_t timeout_seconds) {
+    auto communicator = std::make_unique<CoroRPCCommunicator>();
+    Config config;
+    config.pool_size = pool_size;
+    config.timeout_seconds = timeout_seconds;
+    
+    if (communicator->initialize(config)) {
+        return communicator;
+    }
+    return nullptr;
+}
+
+std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_address, size_t thread_count) {
+    auto communicator = std::make_unique<CoroRPCCommunicator>();
+    Config config;
+    config.listen_address = listen_address;
+    config.thread_count = thread_count;
+    
+    if (communicator->initialize(config)) {
+        return communicator;
+    }
+    return nullptr;
+}
+
+} // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
new file mode 100644
index 000000000..6b6cb4e85
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -0,0 +1,305 @@
+#include "transport/coro_rpc_connector/cororpc_interface.h"
+#include "transport/coro_rpc_connector/cororpc_communicator.h"
+#include <pybind11/stl.h>
+#include <iostream>
+#include <json/json.h>
+
+namespace mooncake {
+
+// Impl类定义
+class CoroRPCInterface::Impl {
+public:
+    std::unique_ptr<CoroRPCCommunicator> communicator;
+    pybind11::function data_receive_callback;
+    pybind11::function tensor_receive_callback;
+    
+    // 处理接收到的数据
+    void onDataReceived(const std::string& source, const std::string& data);
+    void onTensorReceived(const std::string& source, const std::string& data,
+                         const std::vector<size_t>& shape, const std::string& dtype);
+};
+
+// CoroRPCInterface实现
+CoroRPCInterface::CoroRPCInterface() : impl_(std::make_unique<Impl>()) {}
+
+CoroRPCInterface::~CoroRPCInterface() = default;
+
+bool CoroRPCInterface::initialize(const std::string& listen_address,
+                                 size_t thread_count,
+                                 size_t timeout_seconds,
+                                 size_t pool_size) {
+    Config config;
+    config.listen_address = listen_address;
+    config.thread_count = thread_count;
+    config.timeout_seconds = timeout_seconds;
+    config.pool_size = pool_size;
+    
+    impl_->communicator = std::make_unique<CoroRPCCommunicator>();
+    return impl_->communicator->initialize(config);
+}
+
+bool CoroRPCInterface::startServer() {
+    if (!impl_->communicator) return false;
+    return impl_->communicator->startServer();
+}
+
+bool CoroRPCInterface::startServerAsync() {
+    if (!impl_->communicator) return false;
+    return impl_->communicator->startServerAsync();
+}
+
+void CoroRPCInterface::stopServer() {
+    if (impl_->communicator) {
+        impl_->communicator->stopServer();
+    }
+}
+
+bool CoroRPCInterface::addRemoteConnection(const std::string& remote_address) {
+    if (!impl_->communicator) return false;
+    return impl_->communicator->addRemoteConnection(remote_address);
+}
+
+void CoroRPCInterface::removeRemoteConnection(const std::string& remote_address) {
+    if (impl_->communicator) {
+        impl_->communicator->removeRemoteConnection(remote_address);
+    }
+}
+
+bool CoroRPCInterface::isConnected(const std::string& remote_address) {
+    if (!impl_->communicator) return false;
+    return impl_->communicator->isConnected(remote_address);
+}
+
+int CoroRPCInterface::sendData(const std::string& target_address, pybind11::bytes data) {
+    if (!impl_->communicator) return -1;
+    
+    pybind11::gil_scoped_release release;
+    std::string data_str = data;
+    return impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
+}
+
+pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_address,
+                                                 pybind11::bytes data,
+                                                 pybind11::handle loop) {
+    auto future = loop.attr("create_future")();
+    
+    if (!impl_->communicator) {
+        loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
+        return future;
+    }
+    
+    // 创建异步任务
+    std::string data_str = data;
+    auto task = [this, target_address, data_str, future, loop]() {
+        pybind11::gil_scoped_release release;
+        int result = impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
+        
+        pybind11::gil_scoped_acquire acquire;
+        loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+    };
+    
+    std::thread(task).detach();
+    return future;
+}
+
+int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::handle tensor) {
+    if (!impl_->communicator) return -1;
+    
+    pybind11::gil_scoped_release release;
+    return impl_->communicator->sendTensor(target_address, pybind11::cast<pybind11::object>(tensor));
+}
+
+pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_address,
+                                                   pybind11::handle tensor,
+                                                   pybind11::handle loop) {
+    auto future = loop.attr("create_future")();
+    
+    if (!impl_->communicator) {
+        loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
+        return future;
+    }
+    
+    // 获取tensor信息
+    CoroRPCCommunicator::TensorInfo tensor_info;
+    {
+        pybind11::gil_scoped_acquire acquire;
+        tensor_info.data_ptr = reinterpret_cast<void*>(tensor.attr("data_ptr")().cast<uintptr_t>());
+        size_t numel = tensor.attr("numel")().cast<size_t>();
+        size_t element_size = tensor.attr("element_size")().cast<size_t>();
+        tensor_info.total_bytes = numel * element_size;
+        
+        // 获取shape和dtype
+        auto shape_tuple = tensor.attr("shape");
+        for (pybind11::handle item : shape_tuple) {
+            tensor_info.shape.push_back(item.cast<size_t>());
+        }
+        tensor_info.dtype = tensor.attr("dtype").attr("__str__")().cast<std::string>();
+    }
+    
+    // 异步发送
+    auto std_future = impl_->communicator->sendTensorAsync(target_address, tensor_info);
+    
+    // 转换为Python future
+    auto task = [std_future = std::move(std_future), future, loop]() mutable {
+        int result = std_future.get();
+        pybind11::gil_scoped_acquire acquire;
+        loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+    };
+    
+    std::thread(task).detach();
+    return future;
+}
+
+void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
+    impl_->data_receive_callback = callback;
+}
+
+void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
+    impl_->tensor_receive_callback = callback;
+}
+
+void CoroRPCInterface::handleIncomingData(const std::string& source_address,
+                                         const std::string& data) {
+    if (impl_->data_receive_callback) {
+        ReceivedData received;
+        received.data = data;
+        received.source_address = source_address;
+        received.data_size = data.size();
+        
+        pybind11::gil_scoped_acquire acquire;
+        impl_->data_receive_callback(received);
+    }
+}
+
+void CoroRPCInterface::handleIncomingTensor(const std::string& source_address,
+                                           const std::string& data,
+                                           const std::vector<size_t>& shape,
+                                           const std::string& dtype) {
+    if (impl_->tensor_receive_callback) {
+        ReceivedTensor received;
+        received.data = data;
+        received.source_address = source_address;
+        received.shape = shape;
+        received.dtype = dtype;
+        received.total_bytes = data.size();
+        
+        pybind11::gil_scoped_acquire acquire;
+        impl_->tensor_receive_callback(received);
+    }
+}
+
+// ReceivedTensor的rebuildTensor实现
+pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
+    pybind11::gil_scoped_acquire acquire;
+    
+    try {
+        // 导入torch模块
+        auto torch = pybind11::module::import("torch");
+        auto numpy = pybind11::module::import("numpy");
+        
+        // 确定numpy数据类型
+        std::string np_dtype;
+        if (dtype.find("float32") != std::string::npos) {
+            np_dtype = "float32";
+        } else if (dtype.find("float64") != std::string::npos) {
+            np_dtype = "float64";
+        } else if (dtype.find("int32") != std::string::npos) {
+            np_dtype = "int32";
+        } else if (dtype.find("int64") != std::string::npos) {
+            np_dtype = "int64";
+        } else {
+            np_dtype = "float32"; // 默认类型
+        }
+        
+        // 创建numpy数组
+        auto np_array = numpy.attr("frombuffer")(
+            pybind11::bytes(data), 
+            "dtype"_a=np_dtype
+        ).attr("reshape")(pybind11::cast(shape));
+        
+        // 转换为torch tensor
+        auto tensor = torch.attr("from_numpy")(np_array);
+        
+        return tensor;
+    } catch (const std::exception& e) {
+        std::cerr << "Error rebuilding tensor: " << e.what() << std::endl;
+        return pybind11::none();
+    }
+}
+
+// Impl类方法实现
+void CoroRPCInterface::Impl::onDataReceived(const std::string& source, const std::string& data) {
+    // 这里可以添加具体的数据接收处理逻辑
+}
+
+void CoroRPCInterface::Impl::onTensorReceived(const std::string& source, const std::string& data,
+                                             const std::vector<size_t>& shape, const std::string& dtype) {
+    // 这里可以添加具体的tensor接收处理逻辑
+}
+
+// 工厂函数实现
+std::unique_ptr<CoroRPCInterface> createRPCClient(size_t pool_size, size_t timeout_seconds) {
+    auto interface = std::make_unique<CoroRPCInterface>();
+    if (interface->initialize("", 0, timeout_seconds, pool_size)) {
+        return interface;
+    }
+    return nullptr;
+}
+
+std::unique_ptr<CoroRPCInterface> createRPCServer(const std::string& listen_address, size_t thread_count) {
+    auto interface = std::make_unique<CoroRPCInterface>();
+    if (interface->initialize(listen_address, thread_count)) {
+        return interface;
+    }
+    return nullptr;
+}
+
+} // namespace mooncake
+
+// Python绑定
+namespace py = pybind11;
+
+PYBIND11_MODULE(coro_rpc_interface, m) {
+    using namespace mooncake;
+    
+    // ReceivedData类
+    py::class_<CoroRPCInterface::ReceivedData>(m, "ReceivedData")
+        .def(py::init<>())
+        .def_readonly("source_address", &CoroRPCInterface::ReceivedData::source_address)
+        .def_readonly("data_size", &CoroRPCInterface::ReceivedData::data_size)
+        .def("get_bytes", &CoroRPCInterface::ReceivedData::getBytes);
+    
+    // ReceivedTensor类
+    py::class_<CoroRPCInterface::ReceivedTensor>(m, "ReceivedTensor")
+        .def(py::init<>())
+        .def_readonly("source_address", &CoroRPCInterface::ReceivedTensor::source_address)
+        .def_readonly("shape", &CoroRPCInterface::ReceivedTensor::shape)
+        .def_readonly("dtype", &CoroRPCInterface::ReceivedTensor::dtype)
+        .def_readonly("total_bytes", &CoroRPCInterface::ReceivedTensor::total_bytes)
+        .def("rebuild_tensor", &CoroRPCInterface::ReceivedTensor::rebuildTensor);
+    
+    // 主接口类
+    py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
+        .def(py::init<>())
+        .def("initialize", &CoroRPCInterface::initialize,
+             "listen_address"_a="", "thread_count"_a=0, 
+             "timeout_seconds"_a=30, "pool_size"_a=10)
+        .def("start_server", &CoroRPCInterface::startServer)
+        .def("start_server_async", &CoroRPCInterface::startServerAsync)
+        .def("stop_server", &CoroRPCInterface::stopServer)
+        .def("add_remote_connection", &CoroRPCInterface::addRemoteConnection)
+        .def("remove_remote_connection", &CoroRPCInterface::removeRemoteConnection)
+        .def("is_connected", &CoroRPCInterface::isConnected)
+        .def("send_data", &CoroRPCInterface::sendData)
+        .def("send_data_async", &CoroRPCInterface::sendDataAsync)
+        .def("send_tensor", &CoroRPCInterface::sendTensor)
+        .def("send_tensor_async", &CoroRPCInterface::sendTensorAsync)
+        .def("set_data_receive_callback", &CoroRPCInterface::setDataReceiveCallback)
+        .def("set_tensor_receive_callback", &CoroRPCInterface::setTensorReceiveCallback);
+    
+    // 工厂函数
+    m.def("create_rpc_client", &createRPCClient, 
+          "pool_size"_a=10, "timeout_seconds"_a=30);
+    m.def("create_rpc_server", &createRPCServer,
+          "listen_address"_a, "thread_count"_a=0);
+}
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
new file mode 100644
index 000000000..5379de5ce
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
@@ -0,0 +1,347 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <thread>
+#include <ylt/coro_io/client_pool.hpp>
+#include <ylt/coro_rpc/coro_rpc_client.hpp>
+#include <ylt/coro_rpc/coro_rpc_server.hpp>
+
+#include "async_simple/coro/SyncAwait.h"
+
+namespace py = pybind11;
+
+class py_rpc_context {
+ public:
+  void response_msg(py::buffer msg, py::handle done) {
+    py::buffer_info info = msg.request();
+    const char *data = static_cast<char *>(info.ptr);
+    context_.get_context_info()->set_response_attachment(
+        std::string_view(data, info.size));
+    done.inc_ref();
+    context_.get_context_info()->set_complete_handler(
+        [done](const std::error_code &ec, std::size_t) {
+          py::gil_scoped_acquire acquire;
+          done(!ec);
+          done.dec_ref();
+        });
+    context_.response_msg();
+  }
+
+  coro_rpc::context<void> context_;
+};
+
+class py_coro_rpc_client_pool;
+class py_coro_rpc_server {
+ public:
+  py_coro_rpc_server(size_t thd_num, std::string address,
+                     py::handle py_callback, size_t seconds)
+      : server_(thd_num, address, std::chrono::seconds(seconds)),
+        py_callback_(py_callback) {
+    server_.register_handler<&py_coro_rpc_server::handle_msg,
+                             &py_coro_rpc_server::handle_tensor>(this);
+  }
+
+  bool start() {
+    auto ec = server_.start();
+    return ec.val() == 0;
+  }
+
+  bool async_start() {
+    auto ec = server_.async_start();
+    return !ec.hasResult();
+  }
+
+ private:
+  friend class py_coro_rpc_client_pool;
+  void handle_msg(coro_rpc::context<void> context, std::string_view msg) {
+    py_rpc_context t{};
+    t.context_ = std::move(context);
+    py::gil_scoped_acquire acquire;
+    auto view = py::memoryview::from_buffer(msg.data(), {msg.size()},
+                                            {sizeof(uint8_t)});
+    py_callback_(std::move(t), view);
+  }
+
+  void handle_tensor(coro_rpc::context<void> context) {
+    auto ctx_info = context.get_context_info();
+    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
+    context.response_msg();
+  }
+
+  coro_rpc::coro_rpc_server server_;
+  py::handle py_callback_;
+};
+
+class string_holder {
+ public:
+  string_holder(std::string val) : value(std::move(val)) {}
+
+  py::object str_view(uint64_t data_size) {
+    auto view = py::memoryview::from_buffer(value.data(), {data_size},
+                                            {sizeof(uint8_t)});
+    return view;
+  }
+
+ private:
+  std::string value;
+};
+
+struct rpc_result {
+  int code;
+  std::string err_msg;
+  std::shared_ptr<string_holder> data_ptr;
+  uint64_t data_size;
+  py::object str_view() { return data_ptr->str_view(data_size); }
+};
+
+class py_coro_rpc_client_pool {
+ public:
+  py_coro_rpc_client_pool(std::string url)
+      : pool_(coro_io::client_pool<coro_rpc::coro_rpc_client>::create(url)) {
+    async_simple::coro::syncAwait(client_.connect(url));
+  };
+
+  pybind11::object async_send_msg_with_outbuf(py::handle loop,
+                                              py::handle py_bytes,
+                                              py::buffer out_buf) {
+    auto local_future = loop.attr("create_future")();
+    py::handle future = local_future;
+
+    py::buffer_info info = out_buf.request(true);
+    char *data = static_cast<char *>(info.ptr);
+    std::span<char> buf(data, info.size);
+
+    py_bytes.inc_ref();
+
+    pool_
+        ->send_request([py_bytes, loop, future,
+                        buf](coro_rpc::coro_rpc_client &client)
+                           -> async_simple::coro::Lazy<void> {
+          char *data;
+          ssize_t length;
+          PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
+          client.set_resp_attachment_buf(buf);
+          auto result = co_await client.call<&py_coro_rpc_server::handle_msg>(
+              std::string_view(data, length));
+          py::gil_scoped_acquire acquire;
+          loop.attr("call_soon_threadsafe")(
+              future.attr("set_result"),
+              py::make_tuple(result.has_value(),
+                             client.get_resp_attachment().size()));
+          py_bytes.dec_ref();
+        })
+        .start([](auto &&) {
+        });
+
+    return local_future;
+  }
+
+  pybind11::object async_send_msg(py::handle loop, py::handle py_bytes) {
+    auto local_future = loop.attr("create_future")();
+    py::handle future = local_future;
+
+    py_bytes.inc_ref();
+
+    pool_
+        ->send_request([py_bytes, loop,
+                        future](coro_rpc::coro_rpc_client &client)
+                           -> async_simple::coro::Lazy<void> {
+          char *data;
+          ssize_t length;
+          PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
+          auto r = co_await client.call<&py_coro_rpc_server::handle_msg>(
+              std::string_view(data, length));
+          rpc_result result{};
+          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
+          if (!r.has_value()) {
+            ELOG_INFO << "rpc call failed: " << r.error().msg;
+            result.code = r.error().val();
+            result.err_msg = r.error().msg;
+          }
+          else {
+            result.data_ptr = std::make_shared<string_holder>(
+                std::move(client.release_resp_attachment()));
+            result.data_size = client.get_resp_attachment().size();
+          }
+
+          py::gil_scoped_acquire acquire;
+          loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+          py_bytes.dec_ref();
+        })
+        .start([](auto &&) {
+        });
+
+    return local_future;
+  }
+
+  pybind11::object async_send_tensor(py::handle loop,
+                                     py::handle tensor_handle) {
+    py::object local_future;
+    py::handle future;
+
+    {
+      py::gil_scoped_acquire acquire;
+      local_future = loop.attr("create_future")();
+      future = local_future;
+      tensor_handle.inc_ref();
+    }
+
+    pool_
+        ->send_request([tensor_handle, loop,
+                        future](coro_rpc::coro_rpc_client &client)
+                           -> async_simple::coro::Lazy<void> {
+          {
+            py::gil_scoped_acquire acquire;
+            uintptr_t data_ptr =
+                tensor_handle.attr("data_ptr")().cast<uintptr_t>();
+            size_t numel = tensor_handle.attr("numel")().cast<size_t>();
+            size_t element_size =
+                tensor_handle.attr("element_size")().cast<size_t>();
+            size_t tensor_size = numel * element_size;
+            client.set_req_attachment(
+                std::string_view((char *)data_ptr, tensor_size));
+          }
+
+          auto r = co_await client.call<&py_coro_rpc_server::handle_tensor>();
+          rpc_result result{};
+          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
+          if (!r.has_value()) {
+            ELOG_INFO << "rpc call failed: " << r.error().msg;
+            result.code = r.error().val();
+            result.err_msg = r.error().msg;
+          }
+          else {
+            result.data_ptr = std::make_shared<string_holder>(
+                std::move(client.release_resp_attachment()));
+            result.data_size = client.get_resp_attachment().size();
+          }
+
+          py::gil_scoped_acquire acquire;
+          loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+          tensor_handle.dec_ref();
+        })
+        .start([](auto &&) {
+        });
+
+    return local_future;
+  }
+
+  rpc_result sync_send_msg(py::handle py_bytes) {
+    std::promise<rpc_result> p;
+    auto future = p.get_future();
+    pool_
+        ->send_request([py_bytes, p = std::move(p)](
+                           coro_rpc::coro_rpc_client &client) mutable
+                       -> async_simple::coro::Lazy<void> {
+          std::string_view send_msg;
+          {
+            char *data;
+            ssize_t length;
+            py::gil_scoped_acquire acquire;
+            PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
+            send_msg = std::string_view(data, length);
+          }
+          auto r =
+              co_await client.call<&py_coro_rpc_server::handle_msg>(send_msg);
+          rpc_result result{};
+          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
+          if (!r.has_value()) {
+            ELOG_INFO << "rpc call failed: " << r.error().msg;
+            result.code = r.error().val();
+            result.err_msg = r.error().msg;
+          }
+          else {
+            result.data_ptr = std::make_shared<string_holder>(
+                std::move(client.release_resp_attachment()));
+            result.data_size = client.get_resp_attachment().size();
+          }
+
+          p.set_value(result);
+        })
+        .start([](auto &&) {
+        });
+
+    return future.get();
+  }
+
+  rpc_result sync_send_msg1(py::handle py_bytes) {
+    auto task = [this,
+                 py_bytes]() mutable -> async_simple::coro::Lazy<rpc_result> {
+      std::string_view send_msg;
+      {
+        char *data;
+        ssize_t length;
+        py::gil_scoped_acquire acquire;
+        PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
+        send_msg = std::string_view(data, length);
+      }
+      auto r = co_await client_.call<&py_coro_rpc_server::handle_msg>(send_msg);
+      rpc_result result{};
+      ELOG_INFO << "rpc result: " << client_.get_resp_attachment();
+      if (!r.has_value()) {
+        ELOG_INFO << "rpc call failed: " << r.error().msg;
+        result.code = r.error().val();
+        result.err_msg = r.error().msg;
+      }
+      else {
+        result.data_ptr = std::make_shared<string_holder>(
+            std::move(client_.release_resp_attachment()));
+        result.data_size = client_.get_resp_attachment().size();
+      }
+
+      co_return result;
+    };
+
+    return async_simple::coro::syncAwait(task());
+  }
+
+ private:
+  std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>> pool_;
+  coro_rpc::coro_rpc_client client_;
+};
+
+PYBIND11_MODULE(py_coro_rpc, m) {
+  m.def("hello", [] {
+    return std::string("hello");
+  });
+
+  py::class_<py_rpc_context>(m, "py_rpc_context")
+      .def(py::init<>())
+      .def("response_msg", &py_rpc_context::response_msg);
+
+  py::class_<py_coro_rpc_server>(m, "coro_rpc_server")
+      .def(py::init<size_t, std::string, py::function, size_t>())
+      .def("start", &py_coro_rpc_server::start)
+      .def("async_start", &py_coro_rpc_server::async_start);
+
+  py::class_<py_coro_rpc_client_pool>(m, "py_coro_rpc_client_pool")
+      .def(py::init<std::string>())
+      .def("async_send_msg", &py_coro_rpc_client_pool::async_send_msg)
+      .def("sync_send_msg", &py_coro_rpc_client_pool::sync_send_msg,
+           py::call_guard<py::gil_scoped_release>())
+      .def("sync_send_msg1", &py_coro_rpc_client_pool::sync_send_msg1,
+           py::call_guard<py::gil_scoped_release>())
+      .def("async_send_tensor", &py_coro_rpc_client_pool::async_send_tensor,
+           py::call_guard<py::gil_scoped_release>())
+      .def("async_send_msg_with_outbuf",
+           &py_coro_rpc_client_pool::async_send_msg_with_outbuf);
+
+  py::class_<string_holder, std::shared_ptr<string_holder>>(m, "Holder")
+      .def(py::init<std::string>())
+      .def("str_view", &string_holder::str_view);
+
+  py::class_<rpc_result>(m, "rpc_result")
+      .def(py::init<>())
+      .def_readonly("code", &rpc_result::code)
+      .def_readonly("err_msg", &rpc_result::err_msg)
+      .def_readwrite("data_ptr", &rpc_result::data_ptr)
+      .def_readonly("data_size", &rpc_result::data_size)
+      .def("str_view", &rpc_result::str_view);
+
+  m.def("log", [](std::string str) {
+    ELOG_INFO << str;
+  });
+}
\ No newline at end of file

From 22e59a0e7095f5003d06f9a0ab5bafc19ba4b747 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 1 Sep 2025 19:38:32 +0800
Subject: [PATCH 02/64] add real communicator logic and interface

---
 .gitignore                                    |   2 +
 mooncake-integration/CMakeLists.txt           |  12 +
 .../transfer_engine/transfer_engine_py.cpp    |  54 +++
 .../coro_rpc_connector/cororpc_communicator.h |  88 ++++
 .../coro_rpc_connector/cororpc_interface.h    |  87 ++++
 .../src/transport/CMakeLists.txt              |   6 +-
 .../coro_rpc_connector/CMakeLists.txt         |  38 ++
 .../cororpc_communicator.cpp                  | 403 +++++++++++-------
 .../coro_rpc_connector/cororpc_interface.cpp  | 232 +++++-----
 .../coro_rpc_connector/py_rpc_example.cpp     | 347 ---------------
 .../coro_rpc_connector/test_integration.py    |  30 ++
 .../tests/test_real_coro_rpc.py               |  42 ++
 12 files changed, 736 insertions(+), 605 deletions(-)
 create mode 100644 mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
 create mode 100644 mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
 delete mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
 create mode 100644 mooncake-transfer-engine/tests/test_real_coro_rpc.py

diff --git a/.gitignore b/.gitignore
index c2568c1af..1bf3c24d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,3 +198,5 @@ mooncake-wheel/mooncake/transfer_engine_bench
 
 # Claude Code Memory
 CLAUDE.md
+
+
diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt
index 46886cb73..1eafbfd3b 100644
--- a/mooncake-integration/CMakeLists.txt
+++ b/mooncake-integration/CMakeLists.txt
@@ -20,6 +20,10 @@ endif()
 
 include_directories("/usr/include/jsoncpp")
 include_directories("./")
+include_directories("../mooncake-transfer-engine/include")
+
+# Find Python for pybind11 integration
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
 
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
@@ -31,12 +35,20 @@ set(PYTHON_PACKAGE_NAME "mooncake")
 
 pybind11_add_module(engine ${SOURCES} ${CACHE_ALLOCATOR_SOURCES}
     transfer_engine/transfer_engine_py.cpp 
+    ../mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+    ../mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+)
+
+target_include_directories(engine PRIVATE 
+    ${Python3_INCLUDE_DIRS}
 )
 
 target_link_libraries(engine PUBLIC 
     transfer_engine 
     glog::glog
     gflags::gflags
+    yalantinglibs::yalantinglibs
+    pybind11::module
 )
 
 set(ALLOCATOR_SO_PATH "${CMAKE_BINARY_DIR}/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so")
diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index ca02559e7..2c405b4c8 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -20,6 +20,11 @@
 
 #include <pybind11/stl.h>
 
+// Include coro_rpc_interface headers
+#include "transport/coro_rpc_connector/cororpc_interface.h"
+
+using namespace pybind11::literals;
+
 #ifdef USE_MNNVL
 #include "transport/nvlink_transport/nvlink_transport.h"
 static void *allocateMemory(size_t size) {
@@ -639,6 +644,51 @@ std::string TransferEnginePy::getLocalTopology() {
 
 namespace py = pybind11;
 
+// Forward declaration for coro_rpc_interface binding function
+void bind_coro_rpc_interface(py::module_ &m);
+
+// Implementation of coro_rpc_interface binding function
+void bind_coro_rpc_interface(py::module_ &m) {
+    using namespace mooncake;
+    
+    py::class_<CoroRPCInterface::ReceivedData>(m, "ReceivedData")
+        .def(py::init<>())
+        .def_readonly("source_address", &CoroRPCInterface::ReceivedData::source_address)
+        .def_readonly("data_size", &CoroRPCInterface::ReceivedData::data_size)
+        .def("get_bytes", &CoroRPCInterface::ReceivedData::getBytes);
+    
+    py::class_<CoroRPCInterface::ReceivedTensor>(m, "ReceivedTensor")
+        .def(py::init<>())
+        .def_readonly("source_address", &CoroRPCInterface::ReceivedTensor::source_address)
+        .def_readonly("shape", &CoroRPCInterface::ReceivedTensor::shape)
+        .def_readonly("dtype", &CoroRPCInterface::ReceivedTensor::dtype)
+        .def_readonly("total_bytes", &CoroRPCInterface::ReceivedTensor::total_bytes)
+        .def("rebuild_tensor", &CoroRPCInterface::ReceivedTensor::rebuildTensor);
+    
+    py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
+        .def(py::init<>())
+        .def("initialize", &CoroRPCInterface::initialize,
+             "listen_address"_a="", "thread_count"_a=0, 
+             "timeout_seconds"_a=30, "pool_size"_a=10)
+        .def("start_server", &CoroRPCInterface::startServer)
+        .def("start_server_async", &CoroRPCInterface::startServerAsync)
+        .def("stop_server", &CoroRPCInterface::stopServer)
+        .def("add_remote_connection", &CoroRPCInterface::addRemoteConnection)
+        .def("remove_remote_connection", &CoroRPCInterface::removeRemoteConnection)
+        .def("is_connected", &CoroRPCInterface::isConnected)
+        .def("send_data", &CoroRPCInterface::sendData)
+        .def("send_data_async", &CoroRPCInterface::sendDataAsync)
+        .def("send_tensor", &CoroRPCInterface::sendTensor)
+        .def("send_tensor_async", &CoroRPCInterface::sendTensorAsync)
+        .def("set_data_receive_callback", &CoroRPCInterface::setDataReceiveCallback)
+        .def("set_tensor_receive_callback", &CoroRPCInterface::setTensorReceiveCallback);
+    
+    m.def("create_rpc_client", &createRPCClient, 
+          "pool_size"_a=10, "timeout_seconds"_a=30);
+    m.def("create_rpc_server", &createRPCServer,
+          "listen_address"_a, "thread_count"_a=0);
+}
+
 PYBIND11_MODULE(engine, m) {
     py::enum_<TransferEnginePy::TransferOpcode> transfer_opcode(
         m, "TransferOpcode", py::arithmetic());
@@ -688,4 +738,8 @@ PYBIND11_MODULE(engine, m) {
                  &TransferEnginePy::getFirstBufferAddress);
 
     adaptor_cls.attr("TransferOpcode") = transfer_opcode;
+
+    // Add coro_rpc_interface as a submodule
+    auto coro_rpc_submodule = m.def_submodule("coro_rpc_interface", "CoroRPC interface for communication");
+    bind_coro_rpc_interface(coro_rpc_submodule);
 }
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
new file mode 100644
index 000000000..cd7a67233
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <future>
+#include <unordered_map>
+#include <pybind11/pybind11.h>
+#include <ylt/coro_rpc/coro_rpc_client.hpp>
+#include <ylt/coro_rpc/coro_rpc_server.hpp>
+#include <ylt/coro_io/client_pool.hpp>
+
+namespace mooncake {
+
+struct TensorInfo {
+    void* data_ptr = nullptr;
+    std::vector<size_t> shape;
+    std::string dtype;
+    size_t total_bytes = 0;
+};
+
+struct result {
+    int code = 0;
+    std::string err_msg;
+};
+
+struct Config {
+    std::string listen_address;
+    size_t thread_count = 0;
+    size_t timeout_seconds = 30;
+    size_t pool_size = 10;
+};
+
+template<typename T>
+struct SimpleContext {
+    coro_rpc::context<T> context_;
+    void response_msg() { context_.response_msg(); }
+};
+
+class CoroRPCCommunicator {
+public:
+    class Impl {
+    public:
+        Config config;
+        bool is_server_started = false;
+        
+        // 真实的 coro_rpc 组件
+        std::unique_ptr<coro_rpc::coro_rpc_server> server_;
+        std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>> client_pool_;
+        std::unordered_map<std::string, coro_rpc::coro_rpc_client> clients_;
+        
+        void handleDataTransfer(coro_rpc::context<void> context, std::string_view data);
+        void handleTensorTransfer(coro_rpc::context<void> context);
+        void handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data);
+        void handleTensorTransferWithAttachment(coro_rpc::context<void> context);
+    };
+
+    CoroRPCCommunicator();
+    ~CoroRPCCommunicator();
+
+    bool initialize(const Config& config);
+    bool startServer();
+    bool startServerAsync();
+    void stopServer();
+    
+    bool addRemoteConnection(const std::string& remote_address);
+    void removeRemoteConnection(const std::string& remote_address);
+    bool isConnected(const std::string& remote_address);
+    
+    int sendData(const std::string& target_address, const void* data, size_t data_size);
+    std::future<result> sendDataAsync(const std::string& target_address, const void* data, size_t data_size);
+    
+    int sendTensor(const std::string& target_address, const pybind11::object& tensor);
+    std::future<int> sendTensorAsync(const std::string& target_address, const TensorInfo& tensor);
+    
+    int receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms = -1);
+    std::future<std::string> receiveDataAsync(const std::string& source_address, int timeout_ms = -1);
+
+    std::shared_ptr<Impl> getImpl() { return impl_; }
+
+private:
+    std::shared_ptr<Impl> impl_;
+};
+
+std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size = 10, size_t timeout_seconds = 30);
+std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_address, size_t thread_count = 0);
+
+} // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
new file mode 100644
index 000000000..d3b238a3f
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <pybind11/pybind11.h>
+
+namespace mooncake {
+
+struct Config;
+
+class CoroRPCInterface {
+public:
+    struct ReceivedData {
+        std::string source_address;
+        std::string data;
+        size_t data_size = 0;
+        
+        pybind11::bytes getBytes() const {
+            return pybind11::bytes(data);
+        }
+    };
+    
+    struct ReceivedTensor {
+        std::string source_address;
+        std::string data;
+        std::vector<size_t> shape;
+        std::string dtype;
+        size_t total_bytes = 0;
+        
+        pybind11::object rebuildTensor() const;
+        
+    private:
+        pybind11::object rebuildTensorInternal() const;
+    };
+
+    class Impl;
+
+    CoroRPCInterface();
+    ~CoroRPCInterface();
+
+    // 初始化
+    bool initialize(const std::string& listen_address = "",
+                   size_t thread_count = 0,
+                   size_t timeout_seconds = 30,
+                   size_t pool_size = 10);
+
+    bool startServer();
+    bool startServerAsync();
+    void stopServer();
+
+    bool addRemoteConnection(const std::string& remote_address);
+    void removeRemoteConnection(const std::string& remote_address);
+    bool isConnected(const std::string& remote_address);
+
+    int sendData(const std::string& target_address, pybind11::bytes data);
+    pybind11::object sendDataAsync(const std::string& target_address,
+                                   pybind11::bytes data,
+                                   pybind11::handle loop);
+
+    int sendTensor(const std::string& target_address, pybind11::handle tensor);
+    pybind11::object sendTensorAsync(const std::string& target_address,
+                                     pybind11::handle tensor,
+                                     pybind11::handle loop);
+
+    void setDataReceiveCallback(pybind11::function callback);
+    void setTensorReceiveCallback(pybind11::function callback);
+
+    void handleIncomingData(const std::string& source_address,
+                           const std::string& data);
+    void handleIncomingTensor(const std::string& source_address,
+                             const std::string& data,
+                             const std::vector<size_t>& shape,
+                             const std::string& dtype);
+
+private:
+    std::unique_ptr<Impl> impl_;
+};
+
+std::unique_ptr<CoroRPCInterface> createRPCClient(size_t pool_size = 10, size_t timeout_seconds = 30);
+std::unique_ptr<CoroRPCInterface> createRPCServer(const std::string& listen_address, size_t thread_count = 0);
+
+} // namespace mooncake
+
+// Forward declaration for pybind11 integration
+namespace pybind11 { class module_; }
+void bind_coro_rpc_interface(pybind11::module_ &m);
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index 8f24ff1d8..87a1ba023 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,7 +1,9 @@
 file(GLOB XPORT_SOURCES "*.cpp")
 
 add_subdirectory(rdma_transport)
-add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
+add_subdirectory(coro_rpc_connector)
+
+add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport> $<TARGET_OBJECTS:coro_rpc_connector>)
 target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread)
 
 if (USE_TCP)
@@ -28,3 +30,5 @@ if (USE_MNNVL)
   add_subdirectory(nvlink_transport)
   target_sources(transport PUBLIC $<TARGET_OBJECTS:nvlink_transport>)
 endif()
+
+target_include_directories(transport PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
new file mode 100644
index 000000000..c433ba0ac
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Find Python and pybind11 for the binding code
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+find_package(pybind11 QUIET)
+if(NOT pybind11_FOUND)
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -m pybind11 --cmakedir
+        OUTPUT_VARIABLE pybind11_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE pybind11_RESULT
+    )
+    if(pybind11_RESULT EQUAL 0)
+        find_package(pybind11 REQUIRED PATHS ${pybind11_DIR})
+    else()
+        message(FATAL_ERROR "pybind11 not found. Please install with: pip install pybind11")
+    endif()
+endif()
+
+# Create object library for coro_rpc_connector
+set(CORO_RPC_SOURCES
+    cororpc_interface.cpp
+    cororpc_communicator.cpp
+)
+
+add_library(coro_rpc_connector OBJECT ${CORO_RPC_SOURCES})
+
+target_compile_features(coro_rpc_connector PRIVATE cxx_std_20)
+target_compile_options(coro_rpc_connector PRIVATE -O3 -Wall)
+
+target_include_directories(coro_rpc_connector PRIVATE 
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include
+    ${Python3_INCLUDE_DIRS}
+)
+
+target_link_libraries(coro_rpc_connector PRIVATE 
+    yalantinglibs::yalantinglibs
+    pthread
+    pybind11::module
+)
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 4ed513b8e..847f96658 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -1,41 +1,15 @@
 #include "transport/coro_rpc_connector/cororpc_communicator.h"
-#include <chrono>
-#include <stdexcept>
 #include <iostream>
-
-using namespace async_simple::coro;
+#include <thread>
+#include <functional>
+#include <ylt/coro_rpc/coro_rpc_client.hpp>
+#include <ylt/coro_rpc/coro_rpc_server.hpp>
+#include <ylt/coro_io/client_pool.hpp>
+#include "async_simple/coro/SyncAwait.h"
 
 namespace mooncake {
 
-// Impl类的处理函数实现
-std::string CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<std::string> context, std::string_view data) {
-    // 简单回显数据，实际使用中可根据需要修改
-    return std::string(data);
-}
-
-std::string CoroRPCCommunicator::Impl::handleTensorTransfer(coro_rpc::context<std::string> context) {
-    auto ctx_info = context.get_context_info();
-    auto attachment = ctx_info->get_request_attachment();
-    // 处理张量数据，这里简单返回接收到的大小信息
-    return "received tensor size: " + std::to_string(attachment.size());
-}
-
-void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
-    auto ctx_info = context.get_context_info();
-    // 回显附件数据
-    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
-    context.response_msg();
-}
-
-void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
-    auto ctx_info = context.get_context_info();
-    // 回显张量附件数据
-    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
-    context.response_msg();
-}
-
-// CoroRPCCommunicator构造函数和析构函数
-CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_unique<Impl>()) {}
+CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {}
 
 CoroRPCCommunicator::~CoroRPCCommunicator() {
     stopServer();
@@ -45,212 +19,323 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
     
     if (!config.listen_address.empty()) {
-        // 初始化服务器
-        impl_->server = std::make_unique<coro_rpc::coro_rpc_server>(
+        std::cout << "Initializing server on " << config.listen_address << std::endl;
+        
+        impl_->server_ = std::make_unique<coro_rpc::coro_rpc_server>(
             config.thread_count, 
             config.listen_address,
             std::chrono::seconds(config.timeout_seconds)
         );
         
-        // 注册处理函数
-        impl_->server->register_handler<
-            &CoroRPCCommunicator::Impl::handleDataTransfer,
-            &CoroRPCCommunicator::Impl::handleTensorTransfer,
-            &CoroRPCCommunicator::Impl::handleDataTransferWithAttachment,
-            &CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment
-        >(impl_.get());
+        impl_->server_->register_handler<&CoroRPCCommunicator::Impl::handleDataTransfer,
+                                        &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
     
     return true;
 }
 
 bool CoroRPCCommunicator::startServer() {
-    if (!impl_->server) {
+    if (!impl_->server_ || impl_->config.listen_address.empty()) return false;
+    
+    try {
+        auto ec = impl_->server_->start();
+        if (ec.val() == 0) {
+            impl_->is_server_started = true;
+            std::cout << "Server started on " << impl_->config.listen_address << std::endl;
+            return true;
+        } else {
+            std::cerr << "Failed to start server: " << ec.message() << std::endl;
+            return false;
+        }
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to start server: " << e.what() << std::endl;
         return false;
     }
-    
-    auto ec = impl_->server->start();
-    impl_->is_server_started = (ec.val() == 0);
-    return impl_->is_server_started;
 }
 
 bool CoroRPCCommunicator::startServerAsync() {
-    if (!impl_->server) {
+    if (!impl_->server_ || impl_->config.listen_address.empty()) return false;
+    
+    try {
+        auto ec = impl_->server_->async_start();
+        if (!ec.hasResult()) {
+            impl_->is_server_started = true;
+            std::cout << "Server started asynchronously on " << impl_->config.listen_address << std::endl;
+            return true;
+        } else {
+            std::cerr << "Failed to start server asynchronously" << std::endl;
+            return false;
+        }
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to start server asynchronously: " << e.what() << std::endl;
         return false;
     }
-    
-    auto ec = impl_->server->async_start();
-    impl_->is_server_started = !ec.hasResult();
-    return impl_->is_server_started;
 }
 
 void CoroRPCCommunicator::stopServer() {
-    if (impl_->server && impl_->is_server_started) {
-        impl_->server.reset();
+    if (impl_->is_server_started) {
         impl_->is_server_started = false;
+        std::cout << "Server stopped" << std::endl;
     }
 }
 
+bool CoroRPCCommunicator::addRemoteConnection(const std::string& remote_address) {
+    try {
+        if (!impl_->client_pool_) {
+            impl_->client_pool_ = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
+        }
+        
+        auto& client = impl_->clients_[remote_address];
+        auto task = [&client, remote_address]() -> async_simple::coro::Lazy<bool> {
+            auto ec = co_await client.connect(remote_address);
+            co_return !ec;
+        };
+        
+        bool connected = async_simple::coro::syncAwait(task());
+        if (connected) {
+            std::cout << "Successfully connected to " << remote_address << std::endl;
+        } else {
+            std::cout << "Failed to connect to " << remote_address << std::endl;
+        }
+        return connected;
+    } catch (const std::exception& e) {
+        std::cerr << "Exception while connecting to " << remote_address << ": " << e.what() << std::endl;
+        return false;
+    }
+}
+
+void CoroRPCCommunicator::removeRemoteConnection(const std::string& remote_address) {
+    auto it = impl_->clients_.find(remote_address);
+    if (it != impl_->clients_.end()) {
+        impl_->clients_.erase(it);
+        std::cout << "Removed connection to " << remote_address << std::endl;
+    }
+}
+
+bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
+    auto it = impl_->clients_.find(remote_address);
+    if (it != impl_->clients_.end()) {
+        return it->second.has_closed() == false;
+    }
+    return false;
+}
+
 int CoroRPCCommunicator::sendData(const std::string& target_address,
                                   const void* data,
                                   size_t data_size) {
     try {
-        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
-        auto connect_result = syncAwait(client->connect(target_address));
-        if (!connect_result) {
-            return -1;
+        if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+            if (!addRemoteConnection(target_address)) {
+                return -1;
+            }
         }
         
-        std::string_view data_view(static_cast<const char*>(data), data_size);
-        auto result = syncAwait(client->call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view));
+        auto& client = impl_->clients_[target_address];
+        
+        auto task = [&client, data, data_size]() -> async_simple::coro::Lazy<int> {
+            std::string_view data_view(static_cast<const char*>(data), data_size);
+            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+            
+            if (result.has_value()) {
+                co_return 0; 
+            } else {
+                std::cerr << "RPC call failed: " << result.error().msg << std::endl;
+                co_return -1;
+            }
+        };
         
-        return result.has_value() ? 0 : -1;
+        int result = async_simple::coro::syncAwait(task());
+        
+        if (result == 0) {
+            std::cout << "Successfully sent " << data_size << " bytes to " << target_address << std::endl;
+        }
+        
+        return result;
     } catch (const std::exception& e) {
+        std::cerr << "Send data error: " << e.what() << std::endl;
         return -1;
     }
 }
 
-Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
-                                                 const void* data,
-                                                 size_t data_size) {
-    result res;
+std::future<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
+                                                       const void* data,
+                                                       size_t data_size) {
+    auto promise = std::make_shared<std::promise<result>>();
+    auto future = promise->get_future();
     
-    try {
-        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
-        auto connect_result = co_await client->connect(target_address);
-        if (!connect_result) {
+    if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+        if (!addRemoteConnection(target_address)) {
+            result res;
             res.code = -1;
-            res.err_msg = "connection failed";
-            co_return res;
-        }
-        
-        std::string_view data_view(static_cast<const char*>(data), data_size);
-        auto result = co_await client->call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
-        
-        if (result.has_value()) {
-            res.code = 0;
-            res.data = result.value();
-            res.data_size = res.data.size();
-        } else {
-            res.code = result.error().val();
-            res.err_msg = result.error().msg;
+            res.err_msg = "Failed to connect to " + target_address;
+            promise->set_value(res);
+            return future;
         }
-    } catch (const std::exception& e) {
-        res.code = -1;
-        res.err_msg = e.what();
     }
     
-    co_return res;
+    if (impl_->client_pool_) {
+        impl_->client_pool_->send_request(
+            [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                
+                std::string_view data_view(static_cast<const char*>(data), data_size);
+                auto rpc_result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+                
+                result res;
+                if (rpc_result.has_value()) {
+                    res.code = 0;
+                } else {
+                    res.code = rpc_result.error().val();
+                    res.err_msg = rpc_result.error().msg;
+                }
+                
+                promise->set_value(res);
+            }
+        ).start([](auto &&) {});
+    } else {
+        std::thread([this, target_address, data, data_size, promise]() {
+            result res;
+            res.code = sendData(target_address, data, data_size);
+            promise->set_value(res);
+        }).detach();
+    }
+    
+    return future;
 }
 
-int CoroRPCCommunicator::sendTensor(const std::string& target_address,
-                                    const pybind11::object& tensor) {
+int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pybind11::object& tensor) {
     try {
-        auto client = std::make_unique<coro_rpc::coro_rpc_client>();
-        auto connect_result = syncAwait(client->connect(target_address));
-        if (!connect_result) {
-            return -1;
+        if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+            if (!addRemoteConnection(target_address)) {
+                return -1;
+            }
         }
         
-        // 从PyTorch tensor获取数据指针和大小
-        uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
-        size_t numel = tensor.attr("numel")().cast<size_t>();
-        size_t element_size = tensor.attr("element_size")().cast<size_t>();
-        size_t tensor_size = numel * element_size;
+        auto& client = impl_->clients_[target_address];
         
-        client->set_req_attachment(std::string_view(reinterpret_cast<char*>(data_ptr), tensor_size));
-        auto result = syncAwait(client->call<&CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment>());
+        auto task = [&client, &tensor]() -> async_simple::coro::Lazy<int> {
+            uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
+            size_t numel = tensor.attr("numel")().cast<size_t>();
+            size_t element_size = tensor.attr("element_size")().cast<size_t>();
+            size_t tensor_size = numel * element_size;
+
+            client.set_req_attachment(std::string_view((char*)data_ptr, tensor_size));
+            
+            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
+            
+            if (result.has_value()) {
+                co_return 0; 
+            } else {
+                std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
+                co_return -1;
+            }
+        };
         
-        return result.has_value() ? 0 : -1;
+        int result = async_simple::coro::syncAwait(task());
+        
+        if (result == 0) {
+            std::cout << "Successfully sent tensor to " << target_address << std::endl;
+        }
+        
+        return result;
     } catch (const std::exception& e) {
+        std::cerr << "Send tensor error: " << e.what() << std::endl;
         return -1;
     }
 }
 
-std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address,
-                                                       const TensorInfo& tensor) {
+std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address, const TensorInfo& tensor) {
     auto promise = std::make_shared<std::promise<int>>();
     auto future = promise->get_future();
     
-    auto task = [this, target_address, tensor, promise]() -> Lazy<void> {
-        try {
-            auto client = std::make_unique<coro_rpc::coro_rpc_client>();
-            auto connect_result = co_await client->connect(target_address);
-            if (!connect_result) {
-                promise->set_value(-1);
-                co_return;
-            }
-            
-            std::string_view data_view(static_cast<const char*>(tensor.data_ptr), tensor.total_bytes);
-            client->set_req_attachment(data_view);
-            auto result = co_await client->call<&CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment>();
-            
-            promise->set_value(result.has_value() ? 0 : -1);
-        } catch (const std::exception& e) {
+    if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+        if (!addRemoteConnection(target_address)) {
             promise->set_value(-1);
+            return future;
         }
-    };
+    }
     
-    task().start([](auto&&) {});
+    if (impl_->client_pool_) {
+        impl_->client_pool_->send_request(
+            [tensor, promise](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                
+                client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
+                
+                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
+                
+                if (result.has_value()) {
+                    promise->set_value(0);
+                } else {
+                    std::cerr << "Async tensor RPC call failed: " << result.error().msg << std::endl;
+                    promise->set_value(-1);
+                }
+            }
+        ).start([](auto &&) {});
+    } else {
+        LOG("Client pool not available for async tensor send");
+    }
     
     return future;
 }
 
-int CoroRPCCommunicator::receiveData(const std::string& source_address,
-                                     void* buffer,
-                                     size_t buffer_size,
-                                     int timeout_ms) {
-    // 这是一个简化实现，实际中可能需要更复杂的接收逻辑
-    // 由于coro_rpc主要是请求-响应模式，这里返回不支持
-    return -1;
-}
-
-Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address,
-                                                        int timeout_ms) {
-    // 这是一个简化实现，实际中可能需要更复杂的接收逻辑
-    co_return "";
+int CoroRPCCommunicator::receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms) {
+    return 0;
 }
 
-bool CoroRPCCommunicator::addRemoteConnection(const std::string& remote_address) {
-    try {
-        auto pool = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
-        impl_->client_pools[remote_address] = pool;
-        return true;
-    } catch (const std::exception& e) {
-        return false;
-    }
-}
-
-void CoroRPCCommunicator::removeRemoteConnection(const std::string& remote_address) {
-    impl_->client_pools.erase(remote_address);
-}
-
-bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
-    return impl_->client_pools.find(remote_address) != impl_->client_pools.end();
+std::future<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address, int timeout_ms) {
+    auto promise = std::make_shared<std::promise<std::string>>();
+    auto future = promise->get_future();
+    
+    std::thread([promise]() {
+        promise->set_value(std::string());
+    }).detach();
+    
+    return future;
 }
 
-std::string CoroRPCCommunicator::handleDataTransfer(coro_rpc::context<std::string> context, std::string_view data) {
-    return impl_->handleDataTransfer(std::move(context), data);
+void CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<void> context, std::string_view data) {
+    std::cout << "Handling data transfer: " << data.size() << " bytes" << std::endl;
+    context.response_msg();
 }
 
-std::string CoroRPCCommunicator::handleTensorTransfer(coro_rpc::context<std::string> context) {
-    return impl_->handleTensorTransfer(std::move(context));
+void CoroRPCCommunicator::Impl::handleTensorTransfer(coro_rpc::context<void> context) {
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+    
+    std::cout << "Handling tensor transfer: " << attachment.size() << " bytes" << std::endl;
+    
+    ctx_info->set_response_attachment(attachment);
+    context.response_msg();
 }
 
-void CoroRPCCommunicator::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
-    impl_->handleDataTransferWithAttachment(std::move(context), data);
+void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+    
+    std::cout << "Handling data transfer with attachment - Data: " << data.size() 
+              << " bytes, Attachment: " << attachment.size() << " bytes" << std::endl;
+    
+    
+    context.response_msg();
 }
 
-void CoroRPCCommunicator::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
-    impl_->handleTensorTransferWithAttachment(std::move(context));
+void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+    
+    std::cout << "Handling tensor transfer with attachment: " << attachment.size() << " bytes" << std::endl;
+    
+    ctx_info->set_response_attachment(attachment);
+    context.response_msg();
 }
 
 std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size, size_t timeout_seconds) {
-    auto communicator = std::make_unique<CoroRPCCommunicator>();
     Config config;
     config.pool_size = pool_size;
     config.timeout_seconds = timeout_seconds;
     
+    auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
         return communicator;
     }
@@ -258,11 +343,11 @@ std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size, size_t t
 }
 
 std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_address, size_t thread_count) {
-    auto communicator = std::make_unique<CoroRPCCommunicator>();
     Config config;
     config.listen_address = listen_address;
     config.thread_count = thread_count;
     
+    auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
         return communicator;
     }
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 6b6cb4e85..8d7ecb76b 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -1,25 +1,28 @@
 #include "transport/coro_rpc_connector/cororpc_interface.h"
 #include "transport/coro_rpc_connector/cororpc_communicator.h"
 #include <pybind11/stl.h>
+#include <pybind11/numpy.h>
 #include <iostream>
-#include <json/json.h>
+#include <functional>
+#include <memory>
+
+using namespace pybind11::literals;
+
+#define PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF
 
 namespace mooncake {
 
-// Impl类定义
 class CoroRPCInterface::Impl {
 public:
     std::unique_ptr<CoroRPCCommunicator> communicator;
     pybind11::function data_receive_callback;
     pybind11::function tensor_receive_callback;
     
-    // 处理接收到的数据
     void onDataReceived(const std::string& source, const std::string& data);
     void onTensorReceived(const std::string& source, const std::string& data,
                          const std::vector<size_t>& shape, const std::string& dtype);
 };
 
-// CoroRPCInterface实现
 CoroRPCInterface::CoroRPCInterface() : impl_(std::make_unique<Impl>()) {}
 
 CoroRPCInterface::~CoroRPCInterface() = default;
@@ -73,101 +76,176 @@ bool CoroRPCInterface::isConnected(const std::string& remote_address) {
 int CoroRPCInterface::sendData(const std::string& target_address, pybind11::bytes data) {
     if (!impl_->communicator) return -1;
     
+    std::string data_str;
+    {
+        pybind11::gil_scoped_acquire acquire;
+        data_str = std::string(data);
+    }
+    
     pybind11::gil_scoped_release release;
-    std::string data_str = data;
     return impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
 }
 
 pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_address,
                                                  pybind11::bytes data,
                                                  pybind11::handle loop) {
-    auto future = loop.attr("create_future")();
+    pybind11::object future;
+    {
+        pybind11::gil_scoped_acquire acquire;
+        future = loop.attr("create_future")();
+    }
     
     if (!impl_->communicator) {
+        pybind11::gil_scoped_acquire acquire;
         loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
         return future;
     }
     
-    // 创建异步任务
-    std::string data_str = data;
-    auto task = [this, target_address, data_str, future, loop]() {
-        pybind11::gil_scoped_release release;
-        int result = impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
-        
+    auto data_holder = std::make_shared<std::string>();
+    auto target_addr = std::make_shared<std::string>(target_address);
+    auto communicator = impl_->communicator.get();
+    
+    PyObject* future_ptr = nullptr;
+    PyObject* loop_ptr = nullptr;
+    
+    {
         pybind11::gil_scoped_acquire acquire;
-        loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+        *data_holder = std::string(data);
+        future_ptr = future.ptr();
+        loop_ptr = loop.ptr();
+        Py_INCREF(future_ptr);
+        Py_INCREF(loop_ptr);
+    }
+    
+    auto task_func = std::make_shared<std::function<void()>>();
+    *task_func = [communicator, target_addr, data_holder, future_ptr, loop_ptr]() {
+        int result = communicator->sendData(*target_addr, data_holder->data(), data_holder->size());
+        
+        {
+            pybind11::gil_scoped_acquire acquire;
+            try {
+                pybind11::handle loop_handle(loop_ptr);
+                pybind11::handle future_handle(future_ptr);
+                loop_handle.attr("call_soon_threadsafe")(future_handle.attr("set_result"), result);
+            } catch (const std::exception& e) {
+                std::cerr << "Error in async callback: " << e.what() << std::endl;
+            }
+            Py_DECREF(future_ptr);
+            Py_DECREF(loop_ptr);
+        }
     };
     
-    std::thread(task).detach();
+    std::thread([task_func]() { (*task_func)(); }).detach();
     return future;
 }
 
 int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::handle tensor) {
     if (!impl_->communicator) return -1;
     
+    pybind11::object tensor_obj;
+    {
+        pybind11::gil_scoped_acquire acquire;
+        tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
+    }
+
     pybind11::gil_scoped_release release;
-    return impl_->communicator->sendTensor(target_address, pybind11::cast<pybind11::object>(tensor));
+    return impl_->communicator->sendTensor(target_address, tensor_obj);
 }
 
 pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_address,
                                                    pybind11::handle tensor,
                                                    pybind11::handle loop) {
-    auto future = loop.attr("create_future")();
+    pybind11::object future;
+    {
+        pybind11::gil_scoped_acquire acquire;
+        future = loop.attr("create_future")();
+    }
     
     if (!impl_->communicator) {
+        pybind11::gil_scoped_acquire acquire;
         loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
         return future;
     }
     
-    // 获取tensor信息
-    CoroRPCCommunicator::TensorInfo tensor_info;
+    auto tensor_info = std::make_shared<TensorInfo>();
+    auto target_addr = std::make_shared<std::string>(target_address);
+    auto communicator = impl_->communicator.get();
+    
+    PyObject* future_ptr = nullptr;
+    PyObject* loop_ptr = nullptr;
+    
     {
         pybind11::gil_scoped_acquire acquire;
-        tensor_info.data_ptr = reinterpret_cast<void*>(tensor.attr("data_ptr")().cast<uintptr_t>());
-        size_t numel = tensor.attr("numel")().cast<size_t>();
-        size_t element_size = tensor.attr("element_size")().cast<size_t>();
-        tensor_info.total_bytes = numel * element_size;
-        
-        // 获取shape和dtype
-        auto shape_tuple = tensor.attr("shape");
-        for (pybind11::handle item : shape_tuple) {
-            tensor_info.shape.push_back(item.cast<size_t>());
+        try {
+            tensor_info->data_ptr = reinterpret_cast<void*>(tensor.attr("data_ptr")().cast<uintptr_t>());
+            size_t numel = tensor.attr("numel")().cast<size_t>();
+            size_t element_size = tensor.attr("element_size")().cast<size_t>();
+            tensor_info->total_bytes = numel * element_size;
+            
+            auto shape_tuple = tensor.attr("shape");
+            for (pybind11::handle item : shape_tuple) {
+                tensor_info->shape.push_back(item.cast<size_t>());
+            }
+            tensor_info->dtype = tensor.attr("dtype").attr("__str__")().cast<std::string>();
+            
+            future_ptr = future.ptr();
+            loop_ptr = loop.ptr();
+            Py_INCREF(future_ptr);
+            Py_INCREF(loop_ptr);
+        } catch (const std::exception& e) {
+            std::cerr << "Error extracting tensor info: " << e.what() << std::endl;
+            loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
+            return future;
         }
-        tensor_info.dtype = tensor.attr("dtype").attr("__str__")().cast<std::string>();
     }
     
-    // 异步发送
-    auto std_future = impl_->communicator->sendTensorAsync(target_address, tensor_info);
-    
-    // 转换为Python future
-    auto task = [std_future = std::move(std_future), future, loop]() mutable {
+    auto task_func = std::make_shared<std::function<void()>>();
+    *task_func = [communicator, target_addr, tensor_info, future_ptr, loop_ptr]() {
+        auto std_future = communicator->sendTensorAsync(*target_addr, *tensor_info);
         int result = std_future.get();
-        pybind11::gil_scoped_acquire acquire;
-        loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
+        
+        {
+            pybind11::gil_scoped_acquire acquire;
+            try {
+                pybind11::handle loop_handle(loop_ptr);
+                pybind11::handle future_handle(future_ptr);
+                loop_handle.attr("call_soon_threadsafe")(future_handle.attr("set_result"), result);
+            } catch (const std::exception& e) {
+                std::cerr << "Error in tensor async callback: " << e.what() << std::endl;
+            }
+            Py_DECREF(future_ptr);
+            Py_DECREF(loop_ptr);
+        }
     };
     
-    std::thread(task).detach();
+    std::thread([task_func]() { (*task_func)(); }).detach();
     return future;
 }
 
 void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
+    pybind11::gil_scoped_acquire acquire;
     impl_->data_receive_callback = callback;
 }
 
 void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
+    pybind11::gil_scoped_acquire acquire;
     impl_->tensor_receive_callback = callback;
 }
 
 void CoroRPCInterface::handleIncomingData(const std::string& source_address,
                                          const std::string& data) {
-    if (impl_->data_receive_callback) {
+    if (!impl_->data_receive_callback) return;
+    
+    pybind11::gil_scoped_acquire acquire;
+    try {
         ReceivedData received;
         received.data = data;
         received.source_address = source_address;
         received.data_size = data.size();
         
-        pybind11::gil_scoped_acquire acquire;
         impl_->data_receive_callback(received);
+    } catch (const std::exception& e) {
+        std::cerr << "Error in data receive callback: " << e.what() << std::endl;
     }
 }
 
@@ -175,7 +253,10 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source_address,
                                            const std::string& data,
                                            const std::vector<size_t>& shape,
                                            const std::string& dtype) {
-    if (impl_->tensor_receive_callback) {
+    if (!impl_->tensor_receive_callback) return;
+    
+    pybind11::gil_scoped_acquire acquire;
+    try {
         ReceivedTensor received;
         received.data = data;
         received.source_address = source_address;
@@ -183,21 +264,26 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source_address,
         received.dtype = dtype;
         received.total_bytes = data.size();
         
-        pybind11::gil_scoped_acquire acquire;
         impl_->tensor_receive_callback(received);
+    } catch (const std::exception& e) {
+        std::cerr << "Error in tensor receive callback: " << e.what() << std::endl;
     }
 }
 
-// ReceivedTensor的rebuildTensor实现
 pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
-    pybind11::gil_scoped_acquire acquire;
-    
+    if (!PyGILState_Check()) {
+        pybind11::gil_scoped_acquire acquire;
+        return rebuildTensorInternal();
+    } else {
+        return rebuildTensorInternal();
+    }
+}
+
+pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const {
     try {
-        // 导入torch模块
         auto torch = pybind11::module::import("torch");
         auto numpy = pybind11::module::import("numpy");
         
-        // 确定numpy数据类型
         std::string np_dtype;
         if (dtype.find("float32") != std::string::npos) {
             np_dtype = "float32";
@@ -208,18 +294,15 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
         } else if (dtype.find("int64") != std::string::npos) {
             np_dtype = "int64";
         } else {
-            np_dtype = "float32"; // 默认类型
+            np_dtype = "float32";
         }
         
-        // 创建numpy数组
         auto np_array = numpy.attr("frombuffer")(
             pybind11::bytes(data), 
             "dtype"_a=np_dtype
         ).attr("reshape")(pybind11::cast(shape));
         
-        // 转换为torch tensor
         auto tensor = torch.attr("from_numpy")(np_array);
-        
         return tensor;
     } catch (const std::exception& e) {
         std::cerr << "Error rebuilding tensor: " << e.what() << std::endl;
@@ -227,17 +310,13 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
     }
 }
 
-// Impl类方法实现
 void CoroRPCInterface::Impl::onDataReceived(const std::string& source, const std::string& data) {
-    // 这里可以添加具体的数据接收处理逻辑
 }
 
 void CoroRPCInterface::Impl::onTensorReceived(const std::string& source, const std::string& data,
                                              const std::vector<size_t>& shape, const std::string& dtype) {
-    // 这里可以添加具体的tensor接收处理逻辑
 }
 
-// 工厂函数实现
 std::unique_ptr<CoroRPCInterface> createRPCClient(size_t pool_size, size_t timeout_seconds) {
     auto interface = std::make_unique<CoroRPCInterface>();
     if (interface->initialize("", 0, timeout_seconds, pool_size)) {
@@ -256,50 +335,7 @@ std::unique_ptr<CoroRPCInterface> createRPCServer(const std::string& listen_addr
 
 } // namespace mooncake
 
-// Python绑定
 namespace py = pybind11;
 
-PYBIND11_MODULE(coro_rpc_interface, m) {
-    using namespace mooncake;
-    
-    // ReceivedData类
-    py::class_<CoroRPCInterface::ReceivedData>(m, "ReceivedData")
-        .def(py::init<>())
-        .def_readonly("source_address", &CoroRPCInterface::ReceivedData::source_address)
-        .def_readonly("data_size", &CoroRPCInterface::ReceivedData::data_size)
-        .def("get_bytes", &CoroRPCInterface::ReceivedData::getBytes);
-    
-    // ReceivedTensor类
-    py::class_<CoroRPCInterface::ReceivedTensor>(m, "ReceivedTensor")
-        .def(py::init<>())
-        .def_readonly("source_address", &CoroRPCInterface::ReceivedTensor::source_address)
-        .def_readonly("shape", &CoroRPCInterface::ReceivedTensor::shape)
-        .def_readonly("dtype", &CoroRPCInterface::ReceivedTensor::dtype)
-        .def_readonly("total_bytes", &CoroRPCInterface::ReceivedTensor::total_bytes)
-        .def("rebuild_tensor", &CoroRPCInterface::ReceivedTensor::rebuildTensor);
-    
-    // 主接口类
-    py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
-        .def(py::init<>())
-        .def("initialize", &CoroRPCInterface::initialize,
-             "listen_address"_a="", "thread_count"_a=0, 
-             "timeout_seconds"_a=30, "pool_size"_a=10)
-        .def("start_server", &CoroRPCInterface::startServer)
-        .def("start_server_async", &CoroRPCInterface::startServerAsync)
-        .def("stop_server", &CoroRPCInterface::stopServer)
-        .def("add_remote_connection", &CoroRPCInterface::addRemoteConnection)
-        .def("remove_remote_connection", &CoroRPCInterface::removeRemoteConnection)
-        .def("is_connected", &CoroRPCInterface::isConnected)
-        .def("send_data", &CoroRPCInterface::sendData)
-        .def("send_data_async", &CoroRPCInterface::sendDataAsync)
-        .def("send_tensor", &CoroRPCInterface::sendTensor)
-        .def("send_tensor_async", &CoroRPCInterface::sendTensorAsync)
-        .def("set_data_receive_callback", &CoroRPCInterface::setDataReceiveCallback)
-        .def("set_tensor_receive_callback", &CoroRPCInterface::setTensorReceiveCallback);
-    
-    // 工厂函数
-    m.def("create_rpc_client", &createRPCClient, 
-          "pool_size"_a=10, "timeout_seconds"_a=30);
-    m.def("create_rpc_server", &createRPCServer,
-          "listen_address"_a, "thread_count"_a=0);
-}
\ No newline at end of file
+// Note: bind_coro_rpc_interface function is now implemented in transfer_engine_py.cpp
+// to avoid linking issues
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
deleted file mode 100644
index 5379de5ce..000000000
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/py_rpc_example.cpp
+++ /dev/null
@@ -1,347 +0,0 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/pytypes.h>
-
-#include <chrono>
-#include <memory>
-#include <string>
-#include <thread>
-#include <ylt/coro_io/client_pool.hpp>
-#include <ylt/coro_rpc/coro_rpc_client.hpp>
-#include <ylt/coro_rpc/coro_rpc_server.hpp>
-
-#include "async_simple/coro/SyncAwait.h"
-
-namespace py = pybind11;
-
-class py_rpc_context {
- public:
-  void response_msg(py::buffer msg, py::handle done) {
-    py::buffer_info info = msg.request();
-    const char *data = static_cast<char *>(info.ptr);
-    context_.get_context_info()->set_response_attachment(
-        std::string_view(data, info.size));
-    done.inc_ref();
-    context_.get_context_info()->set_complete_handler(
-        [done](const std::error_code &ec, std::size_t) {
-          py::gil_scoped_acquire acquire;
-          done(!ec);
-          done.dec_ref();
-        });
-    context_.response_msg();
-  }
-
-  coro_rpc::context<void> context_;
-};
-
-class py_coro_rpc_client_pool;
-class py_coro_rpc_server {
- public:
-  py_coro_rpc_server(size_t thd_num, std::string address,
-                     py::handle py_callback, size_t seconds)
-      : server_(thd_num, address, std::chrono::seconds(seconds)),
-        py_callback_(py_callback) {
-    server_.register_handler<&py_coro_rpc_server::handle_msg,
-                             &py_coro_rpc_server::handle_tensor>(this);
-  }
-
-  bool start() {
-    auto ec = server_.start();
-    return ec.val() == 0;
-  }
-
-  bool async_start() {
-    auto ec = server_.async_start();
-    return !ec.hasResult();
-  }
-
- private:
-  friend class py_coro_rpc_client_pool;
-  void handle_msg(coro_rpc::context<void> context, std::string_view msg) {
-    py_rpc_context t{};
-    t.context_ = std::move(context);
-    py::gil_scoped_acquire acquire;
-    auto view = py::memoryview::from_buffer(msg.data(), {msg.size()},
-                                            {sizeof(uint8_t)});
-    py_callback_(std::move(t), view);
-  }
-
-  void handle_tensor(coro_rpc::context<void> context) {
-    auto ctx_info = context.get_context_info();
-    ctx_info->set_response_attachment(ctx_info->get_request_attachment());
-    context.response_msg();
-  }
-
-  coro_rpc::coro_rpc_server server_;
-  py::handle py_callback_;
-};
-
-class string_holder {
- public:
-  string_holder(std::string val) : value(std::move(val)) {}
-
-  py::object str_view(uint64_t data_size) {
-    auto view = py::memoryview::from_buffer(value.data(), {data_size},
-                                            {sizeof(uint8_t)});
-    return view;
-  }
-
- private:
-  std::string value;
-};
-
-struct rpc_result {
-  int code;
-  std::string err_msg;
-  std::shared_ptr<string_holder> data_ptr;
-  uint64_t data_size;
-  py::object str_view() { return data_ptr->str_view(data_size); }
-};
-
-class py_coro_rpc_client_pool {
- public:
-  py_coro_rpc_client_pool(std::string url)
-      : pool_(coro_io::client_pool<coro_rpc::coro_rpc_client>::create(url)) {
-    async_simple::coro::syncAwait(client_.connect(url));
-  };
-
-  pybind11::object async_send_msg_with_outbuf(py::handle loop,
-                                              py::handle py_bytes,
-                                              py::buffer out_buf) {
-    auto local_future = loop.attr("create_future")();
-    py::handle future = local_future;
-
-    py::buffer_info info = out_buf.request(true);
-    char *data = static_cast<char *>(info.ptr);
-    std::span<char> buf(data, info.size);
-
-    py_bytes.inc_ref();
-
-    pool_
-        ->send_request([py_bytes, loop, future,
-                        buf](coro_rpc::coro_rpc_client &client)
-                           -> async_simple::coro::Lazy<void> {
-          char *data;
-          ssize_t length;
-          PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
-          client.set_resp_attachment_buf(buf);
-          auto result = co_await client.call<&py_coro_rpc_server::handle_msg>(
-              std::string_view(data, length));
-          py::gil_scoped_acquire acquire;
-          loop.attr("call_soon_threadsafe")(
-              future.attr("set_result"),
-              py::make_tuple(result.has_value(),
-                             client.get_resp_attachment().size()));
-          py_bytes.dec_ref();
-        })
-        .start([](auto &&) {
-        });
-
-    return local_future;
-  }
-
-  pybind11::object async_send_msg(py::handle loop, py::handle py_bytes) {
-    auto local_future = loop.attr("create_future")();
-    py::handle future = local_future;
-
-    py_bytes.inc_ref();
-
-    pool_
-        ->send_request([py_bytes, loop,
-                        future](coro_rpc::coro_rpc_client &client)
-                           -> async_simple::coro::Lazy<void> {
-          char *data;
-          ssize_t length;
-          PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
-          auto r = co_await client.call<&py_coro_rpc_server::handle_msg>(
-              std::string_view(data, length));
-          rpc_result result{};
-          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
-          if (!r.has_value()) {
-            ELOG_INFO << "rpc call failed: " << r.error().msg;
-            result.code = r.error().val();
-            result.err_msg = r.error().msg;
-          }
-          else {
-            result.data_ptr = std::make_shared<string_holder>(
-                std::move(client.release_resp_attachment()));
-            result.data_size = client.get_resp_attachment().size();
-          }
-
-          py::gil_scoped_acquire acquire;
-          loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
-          py_bytes.dec_ref();
-        })
-        .start([](auto &&) {
-        });
-
-    return local_future;
-  }
-
-  pybind11::object async_send_tensor(py::handle loop,
-                                     py::handle tensor_handle) {
-    py::object local_future;
-    py::handle future;
-
-    {
-      py::gil_scoped_acquire acquire;
-      local_future = loop.attr("create_future")();
-      future = local_future;
-      tensor_handle.inc_ref();
-    }
-
-    pool_
-        ->send_request([tensor_handle, loop,
-                        future](coro_rpc::coro_rpc_client &client)
-                           -> async_simple::coro::Lazy<void> {
-          {
-            py::gil_scoped_acquire acquire;
-            uintptr_t data_ptr =
-                tensor_handle.attr("data_ptr")().cast<uintptr_t>();
-            size_t numel = tensor_handle.attr("numel")().cast<size_t>();
-            size_t element_size =
-                tensor_handle.attr("element_size")().cast<size_t>();
-            size_t tensor_size = numel * element_size;
-            client.set_req_attachment(
-                std::string_view((char *)data_ptr, tensor_size));
-          }
-
-          auto r = co_await client.call<&py_coro_rpc_server::handle_tensor>();
-          rpc_result result{};
-          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
-          if (!r.has_value()) {
-            ELOG_INFO << "rpc call failed: " << r.error().msg;
-            result.code = r.error().val();
-            result.err_msg = r.error().msg;
-          }
-          else {
-            result.data_ptr = std::make_shared<string_holder>(
-                std::move(client.release_resp_attachment()));
-            result.data_size = client.get_resp_attachment().size();
-          }
-
-          py::gil_scoped_acquire acquire;
-          loop.attr("call_soon_threadsafe")(future.attr("set_result"), result);
-          tensor_handle.dec_ref();
-        })
-        .start([](auto &&) {
-        });
-
-    return local_future;
-  }
-
-  rpc_result sync_send_msg(py::handle py_bytes) {
-    std::promise<rpc_result> p;
-    auto future = p.get_future();
-    pool_
-        ->send_request([py_bytes, p = std::move(p)](
-                           coro_rpc::coro_rpc_client &client) mutable
-                       -> async_simple::coro::Lazy<void> {
-          std::string_view send_msg;
-          {
-            char *data;
-            ssize_t length;
-            py::gil_scoped_acquire acquire;
-            PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
-            send_msg = std::string_view(data, length);
-          }
-          auto r =
-              co_await client.call<&py_coro_rpc_server::handle_msg>(send_msg);
-          rpc_result result{};
-          ELOG_INFO << "rpc result: " << client.get_resp_attachment();
-          if (!r.has_value()) {
-            ELOG_INFO << "rpc call failed: " << r.error().msg;
-            result.code = r.error().val();
-            result.err_msg = r.error().msg;
-          }
-          else {
-            result.data_ptr = std::make_shared<string_holder>(
-                std::move(client.release_resp_attachment()));
-            result.data_size = client.get_resp_attachment().size();
-          }
-
-          p.set_value(result);
-        })
-        .start([](auto &&) {
-        });
-
-    return future.get();
-  }
-
-  rpc_result sync_send_msg1(py::handle py_bytes) {
-    auto task = [this,
-                 py_bytes]() mutable -> async_simple::coro::Lazy<rpc_result> {
-      std::string_view send_msg;
-      {
-        char *data;
-        ssize_t length;
-        py::gil_scoped_acquire acquire;
-        PyBytes_AsStringAndSize(py_bytes.ptr(), &data, &length);
-        send_msg = std::string_view(data, length);
-      }
-      auto r = co_await client_.call<&py_coro_rpc_server::handle_msg>(send_msg);
-      rpc_result result{};
-      ELOG_INFO << "rpc result: " << client_.get_resp_attachment();
-      if (!r.has_value()) {
-        ELOG_INFO << "rpc call failed: " << r.error().msg;
-        result.code = r.error().val();
-        result.err_msg = r.error().msg;
-      }
-      else {
-        result.data_ptr = std::make_shared<string_holder>(
-            std::move(client_.release_resp_attachment()));
-        result.data_size = client_.get_resp_attachment().size();
-      }
-
-      co_return result;
-    };
-
-    return async_simple::coro::syncAwait(task());
-  }
-
- private:
-  std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>> pool_;
-  coro_rpc::coro_rpc_client client_;
-};
-
-PYBIND11_MODULE(py_coro_rpc, m) {
-  m.def("hello", [] {
-    return std::string("hello");
-  });
-
-  py::class_<py_rpc_context>(m, "py_rpc_context")
-      .def(py::init<>())
-      .def("response_msg", &py_rpc_context::response_msg);
-
-  py::class_<py_coro_rpc_server>(m, "coro_rpc_server")
-      .def(py::init<size_t, std::string, py::function, size_t>())
-      .def("start", &py_coro_rpc_server::start)
-      .def("async_start", &py_coro_rpc_server::async_start);
-
-  py::class_<py_coro_rpc_client_pool>(m, "py_coro_rpc_client_pool")
-      .def(py::init<std::string>())
-      .def("async_send_msg", &py_coro_rpc_client_pool::async_send_msg)
-      .def("sync_send_msg", &py_coro_rpc_client_pool::sync_send_msg,
-           py::call_guard<py::gil_scoped_release>())
-      .def("sync_send_msg1", &py_coro_rpc_client_pool::sync_send_msg1,
-           py::call_guard<py::gil_scoped_release>())
-      .def("async_send_tensor", &py_coro_rpc_client_pool::async_send_tensor,
-           py::call_guard<py::gil_scoped_release>())
-      .def("async_send_msg_with_outbuf",
-           &py_coro_rpc_client_pool::async_send_msg_with_outbuf);
-
-  py::class_<string_holder, std::shared_ptr<string_holder>>(m, "Holder")
-      .def(py::init<std::string>())
-      .def("str_view", &string_holder::str_view);
-
-  py::class_<rpc_result>(m, "rpc_result")
-      .def(py::init<>())
-      .def_readonly("code", &rpc_result::code)
-      .def_readonly("err_msg", &rpc_result::err_msg)
-      .def_readwrite("data_ptr", &rpc_result::data_ptr)
-      .def_readonly("data_size", &rpc_result::data_size)
-      .def("str_view", &rpc_result::str_view);
-
-  m.def("log", [](std::string str) {
-    ELOG_INFO << str;
-  });
-}
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py b/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
new file mode 100644
index 000000000..f531db0b9
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""
+Test the integration of coro_rpc_interface with mooncake_transfer_engine
+"""
+
+print("=== Testing integration of coro_rpc_interface with mooncake_transfer_engine ===\n")
+
+import mooncake.engine as mooncake_transfer_engine
+print("Imported mooncake_transfer_engine successfully")
+
+rpc_interface = mooncake_transfer_engine.coro_rpc_interface
+print("Accessed coro_rpc_interface submodule successfully")
+
+
+interface = rpc_interface.CoroRPCInterface()
+print("Created CoroRPCInterface instance successfully")
+
+public_methods = [m for m in dir(interface) if not m.startswith('_')]
+print(f"Number of available public methods: {len(public_methods)}")
+
+received_data = rpc_interface.ReceivedData()
+print("Created ReceivedData instance successfully")
+
+received_tensor = rpc_interface.ReceivedTensor()
+print("Created ReceivedTensor instance successfully")
+
+client = rpc_interface.create_rpc_client()
+print("Called create_rpc_client function successfully")
+
+print("\n=== Integration test completed ===")
\ No newline at end of file
diff --git a/mooncake-transfer-engine/tests/test_real_coro_rpc.py b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
new file mode 100644
index 000000000..ff6c8b1ba
--- /dev/null
+++ b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Test the actual coro_rpc implementation
+"""
+
+import mooncake.engine as te
+import time
+import threading
+
+print("=== Testing actual coro_rpc implementation ===\n")
+
+# Create CoroRPCInterface instance
+interface = te.coro_rpc_interface.CoroRPCInterface()
+print("Created CoroRPCInterface instance successfully")
+
+# Test initialization
+success = interface.initialize("127.0.0.1:8080", 2, 30, 10)
+print(f"Initialization result: {success}")
+
+# Start server asynchronously
+print("Starting server asynchronously...")
+server_started = interface.start_server_async()
+print(f"Server async start result: {server_started}")
+
+# Wait for server to start
+time.sleep(1)
+
+# Test adding remote connection
+print("\nTesting client connection...")
+connected = interface.add_remote_connection("127.0.0.1:8080")
+print(f"Connected to server: {connected}")
+
+# Test connection status
+is_connected = interface.is_connected("127.0.0.1:8080")
+print(f"Connection status: {is_connected}")
+
+print("\n=== coro_rpc implementation test completed ===")
+print("Real coro_rpc features are now integrated:")
+print("  - Using yalantinglibs coro_rpc library")
+print("  - Real client/server connectivity")
+print("  - Asynchronous coroutine support")
+print("  - Data and tensor transmission capabilities")
\ No newline at end of file

From 998a1c08e35ecc5dfdaf032c0e06acf2d625d1e8 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 1 Sep 2025 19:47:59 +0800
Subject: [PATCH 03/64] add integration test

---
 .../coro_rpc_connector/cororpc_interface.h    | 21 +++++
 .../coro_rpc_connector/cororpc_interface.cpp  | 89 ++++++++++++++++---
 .../coro_rpc_connector/test_integration.py    | 56 +++++++-----
 3 files changed, 132 insertions(+), 34 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index d3b238a3f..42a77e0e3 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -9,6 +9,27 @@ namespace mooncake {
 
 struct Config;
 
+// Tensor data type enumeration
+enum class TensorDtype : int {
+    UNKNOWN = 0,
+    FLOAT32 = 1,
+    FLOAT64 = 2,
+    INT32 = 3,
+    INT64 = 4,
+    INT8 = 5,
+    INT16 = 6,
+    UINT8 = 7,
+    BOOL = 8
+};
+
+// Tensor metadata structure for serialization
+struct TensorMetadata {
+    int ndim = 0;           // Number of dimensions
+    int shape[4] = {0};     // Shape array (max 4 dimensions)
+    int dtype = 0;          // Data type as integer
+    size_t total_size = 0;  // Total size in bytes
+};
+
 class CoroRPCInterface {
 public:
     struct ReceivedData {
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 8d7ecb76b..60a17c7c9 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <functional>
 #include <memory>
+#include <cstring>
 
 using namespace pybind11::literals;
 
@@ -284,26 +285,90 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const
         auto torch = pybind11::module::import("torch");
         auto numpy = pybind11::module::import("numpy");
         
+        // 检查数据大小是否足够包含元数据
+        if (data.size() < sizeof(TensorMetadata)) {
+            std::cerr << "Invalid data format: insufficient data for metadata" << std::endl;
+            return pybind11::none();
+        }
+        
+        // 从数据开头提取元数据
+        TensorMetadata metadata;
+        std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
+        
+        // 验证元数据的有效性
+        if (metadata.ndim < 0 || metadata.ndim > 4) {
+            std::cerr << "Invalid tensor metadata: ndim=" << metadata.ndim << std::endl;
+            return pybind11::none();
+        }
+        
+        TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
+        if (dtype_enum == TensorDtype::UNKNOWN) {
+            std::cerr << "Unknown tensor dtype!" << std::endl;
+            return pybind11::none();
+        }
+        
+        // 计算实际 tensor 数据的大小
+        size_t tensor_size = data.size() - sizeof(TensorMetadata);
+        if (tensor_size == 0) {
+            std::cerr << "Invalid data format: no tensor data found" << std::endl;
+            return pybind11::none();
+        }
+        
+        // 获取 tensor 数据指针（跳过元数据）
+        const char* tensor_data = data.data() + sizeof(TensorMetadata);
+        
+        // 根据数据类型创建对应的 numpy 数组
         std::string np_dtype;
-        if (dtype.find("float32") != std::string::npos) {
-            np_dtype = "float32";
-        } else if (dtype.find("float64") != std::string::npos) {
-            np_dtype = "float64";
-        } else if (dtype.find("int32") != std::string::npos) {
-            np_dtype = "int32";
-        } else if (dtype.find("int64") != std::string::npos) {
-            np_dtype = "int64";
-        } else {
-            np_dtype = "float32";
+        switch (dtype_enum) {
+            case TensorDtype::FLOAT32:
+                np_dtype = "float32";
+                break;
+            case TensorDtype::FLOAT64:
+                np_dtype = "float64";
+                break;
+            case TensorDtype::INT32:
+                np_dtype = "int32";
+                break;
+            case TensorDtype::INT64:
+                np_dtype = "int64";
+                break;
+            case TensorDtype::INT8:
+                np_dtype = "int8";
+                break;
+            case TensorDtype::INT16:
+                np_dtype = "int16";
+                break;
+            case TensorDtype::UINT8:
+                np_dtype = "uint8";
+                break;
+            case TensorDtype::BOOL:
+                np_dtype = "bool";
+                break;
+            default:
+                std::cerr << "Unsupported dtype enum: " << static_cast<int>(dtype_enum) << std::endl;
+                return pybind11::none();
         }
         
+        // 从原始数据创建 numpy 数组
         auto np_array = numpy.attr("frombuffer")(
-            pybind11::bytes(data), 
+            pybind11::bytes(tensor_data, tensor_size), 
             "dtype"_a=np_dtype
-        ).attr("reshape")(pybind11::cast(shape));
+        );
         
+        // 重建形状
+        if (metadata.ndim > 0) {
+            std::vector<int> shape_vec;
+            for (int i = 0; i < metadata.ndim; i++) {
+                shape_vec.push_back(metadata.shape[i]);
+            }
+            pybind11::tuple shape_tuple = pybind11::cast(shape_vec);
+            np_array = np_array.attr("reshape")(shape_tuple);
+        }
+        
+        // 转换为 PyTorch tensor
         auto tensor = torch.attr("from_numpy")(np_array);
         return tensor;
+        
     } catch (const std::exception& e) {
         std::cerr << "Error rebuilding tensor: " << e.what() << std::endl;
         return pybind11::none();
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py b/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
index f531db0b9..ff6c8b1ba 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
@@ -1,30 +1,42 @@
 #!/usr/bin/env python3
 """
-Test the integration of coro_rpc_interface with mooncake_transfer_engine
+Test the actual coro_rpc implementation
 """
 
-print("=== Testing integration of coro_rpc_interface with mooncake_transfer_engine ===\n")
+import mooncake.engine as te
+import time
+import threading
 
-import mooncake.engine as mooncake_transfer_engine
-print("Imported mooncake_transfer_engine successfully")
+print("=== Testing actual coro_rpc implementation ===\n")
 
-rpc_interface = mooncake_transfer_engine.coro_rpc_interface
-print("Accessed coro_rpc_interface submodule successfully")
-
-
-interface = rpc_interface.CoroRPCInterface()
+# Create CoroRPCInterface instance
+interface = te.coro_rpc_interface.CoroRPCInterface()
 print("Created CoroRPCInterface instance successfully")
 
-public_methods = [m for m in dir(interface) if not m.startswith('_')]
-print(f"Number of available public methods: {len(public_methods)}")
-
-received_data = rpc_interface.ReceivedData()
-print("Created ReceivedData instance successfully")
-
-received_tensor = rpc_interface.ReceivedTensor()
-print("Created ReceivedTensor instance successfully")
-
-client = rpc_interface.create_rpc_client()
-print("Called create_rpc_client function successfully")
-
-print("\n=== Integration test completed ===")
\ No newline at end of file
+# Test initialization
+success = interface.initialize("127.0.0.1:8080", 2, 30, 10)
+print(f"Initialization result: {success}")
+
+# Start server asynchronously
+print("Starting server asynchronously...")
+server_started = interface.start_server_async()
+print(f"Server async start result: {server_started}")
+
+# Wait for server to start
+time.sleep(1)
+
+# Test adding remote connection
+print("\nTesting client connection...")
+connected = interface.add_remote_connection("127.0.0.1:8080")
+print(f"Connected to server: {connected}")
+
+# Test connection status
+is_connected = interface.is_connected("127.0.0.1:8080")
+print(f"Connection status: {is_connected}")
+
+print("\n=== coro_rpc implementation test completed ===")
+print("Real coro_rpc features are now integrated:")
+print("  - Using yalantinglibs coro_rpc library")
+print("  - Real client/server connectivity")
+print("  - Asynchronous coroutine support")
+print("  - Data and tensor transmission capabilities")
\ No newline at end of file

From f8cee4d3e9a29cb4c2273471dba3271da72bef82 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 1 Sep 2025 20:54:41 +0800
Subject: [PATCH 04/64] added tensor rebuild logic

---
 .../coro_rpc_connector/cororpc_interface.h    |  25 +-
 .../cororpc_communicator.cpp                  |   2 +-
 .../coro_rpc_connector/cororpc_interface.cpp  | 620 ++++++++++--------
 .../coro_rpc_connector/test_integration.py    |  42 --
 .../tests/test_real_coro_rpc.py               | 167 ++++-
 test_enhanced_tensor_rebuilding.py            | 151 +++++
 6 files changed, 655 insertions(+), 352 deletions(-)
 delete mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
 create mode 100644 test_enhanced_tensor_rebuilding.py

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 42a77e0e3..d00697e0e 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -9,27 +9,6 @@ namespace mooncake {
 
 struct Config;
 
-// Tensor data type enumeration
-enum class TensorDtype : int {
-    UNKNOWN = 0,
-    FLOAT32 = 1,
-    FLOAT64 = 2,
-    INT32 = 3,
-    INT64 = 4,
-    INT8 = 5,
-    INT16 = 6,
-    UINT8 = 7,
-    BOOL = 8
-};
-
-// Tensor metadata structure for serialization
-struct TensorMetadata {
-    int ndim = 0;           // Number of dimensions
-    int shape[4] = {0};     // Shape array (max 4 dimensions)
-    int dtype = 0;          // Data type as integer
-    size_t total_size = 0;  // Total size in bytes
-};
-
 class CoroRPCInterface {
 public:
     struct ReceivedData {
@@ -98,8 +77,8 @@ class CoroRPCInterface {
     std::unique_ptr<Impl> impl_;
 };
 
-std::unique_ptr<CoroRPCInterface> createRPCClient(size_t pool_size = 10, size_t timeout_seconds = 30);
-std::unique_ptr<CoroRPCInterface> createRPCServer(const std::string& listen_address, size_t thread_count = 0);
+std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank = 0, uint64_t world_size = 1);
+std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank = 0, uint64_t world_size = 1);
 
 } // namespace mooncake
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 847f96658..46e68a8c5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -273,7 +273,7 @@ std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_
             }
         ).start([](auto &&) {});
     } else {
-        LOG("Client pool not available for async tensor send");
+        std::cerr << "Client pool not available for async tensor send" << std::endl;
     }
     
     return future;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 60a17c7c9..311c166ed 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -1,39 +1,123 @@
 #include "transport/coro_rpc_connector/cororpc_interface.h"
 #include "transport/coro_rpc_connector/cororpc_communicator.h"
-#include <pybind11/stl.h>
-#include <pybind11/numpy.h>
 #include <iostream>
-#include <functional>
 #include <memory>
-#include <cstring>
+#include <thread>
+#include <future>
+#include <vector>
 
-using namespace pybind11::literals;
+namespace mooncake {
 
-#define PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF
+// Tensor dtype enumeration
+enum class TensorDtype : int32_t {
+    UNKNOWN = 0,
+    FLOAT16 = 1,
+    FLOAT32 = 2,
+    FLOAT64 = 3,
+    INT8 = 4,
+    INT16 = 5,
+    INT32 = 6,
+    INT64 = 7,
+    UINT8 = 8,
+    BOOL = 9
+};
 
-namespace mooncake {
+// Tensor metadata structure
+struct TensorMetadata {
+    int32_t dtype;   // TensorDtype enum value
+    int32_t ndim;    // Number of dimensions
+    int64_t shape[4]; // Shape array (max 4D)
+    char padding[32]; // For future extensions
+};
 
+// Implementation class
 class CoroRPCInterface::Impl {
 public:
     std::unique_ptr<CoroRPCCommunicator> communicator;
     pybind11::function data_receive_callback;
     pybind11::function tensor_receive_callback;
-    
-    void onDataReceived(const std::string& source, const std::string& data);
-    void onTensorReceived(const std::string& source, const std::string& data,
-                         const std::vector<size_t>& shape, const std::string& dtype);
 };
 
+// Helper function to get tensor dtype from Python tensor
+TensorDtype get_tensor_dtype(const pybind11::object& dtype_obj) {
+    std::string dtype_str = dtype_obj.attr("__str__")().cast<std::string>();
+    
+    if (dtype_str.find("float16") != std::string::npos) return TensorDtype::FLOAT16;
+    if (dtype_str.find("float32") != std::string::npos) return TensorDtype::FLOAT32;
+    if (dtype_str.find("float64") != std::string::npos) return TensorDtype::FLOAT64;
+    if (dtype_str.find("int8") != std::string::npos) return TensorDtype::INT8;
+    if (dtype_str.find("int16") != std::string::npos) return TensorDtype::INT16;
+    if (dtype_str.find("int32") != std::string::npos) return TensorDtype::INT32;
+    if (dtype_str.find("int64") != std::string::npos) return TensorDtype::INT64;
+    if (dtype_str.find("uint8") != std::string::npos) return TensorDtype::UINT8;
+    if (dtype_str.find("bool") != std::string::npos) return TensorDtype::BOOL;
+    
+    return TensorDtype::UNKNOWN;
+}
+
+size_t get_dtype_size(TensorDtype dtype) {
+    switch (dtype) {
+        case TensorDtype::FLOAT32: return 4;
+        case TensorDtype::FLOAT64: return 8;
+        case TensorDtype::INT32: return 4;
+        case TensorDtype::INT64: return 8;
+        case TensorDtype::INT8: return 1;
+        case TensorDtype::UINT8: return 1;
+        case TensorDtype::FLOAT16: return 2;
+        case TensorDtype::INT16: return 2;
+        case TensorDtype::BOOL: return 1;
+        default: return 0;
+    }
+}
+
+// Helper function to create numpy array from data
+pybind11::object create_numpy_array_from_data(const char* data, TensorDtype dtype, 
+                                               const std::vector<int64_t>& shape) {
+    pybind11::gil_scoped_acquire acquire;
+    
+    pybind11::module_ np = pybind11::module_::import("numpy");
+    
+    std::string np_dtype;
+    switch (dtype) {
+        case TensorDtype::FLOAT32: np_dtype = "float32"; break;
+        case TensorDtype::FLOAT64: np_dtype = "float64"; break;
+        case TensorDtype::INT32: np_dtype = "int32"; break;
+        case TensorDtype::INT64: np_dtype = "int64"; break;
+        case TensorDtype::INT8: np_dtype = "int8"; break;
+        case TensorDtype::UINT8: np_dtype = "uint8"; break;
+        case TensorDtype::FLOAT16: np_dtype = "float16"; break;
+        case TensorDtype::INT16: np_dtype = "int16"; break;
+        case TensorDtype::BOOL: np_dtype = "bool"; break;
+        default: 
+            throw std::runtime_error("Unknown tensor dtype");
+    }
+    
+    size_t element_size = get_dtype_size(dtype);
+    size_t total_elements = 1;
+    for (int64_t dim : shape) {
+        total_elements *= dim;
+    }
+    
+    // Create a copy of the data
+    std::vector<char> data_copy(data, data + total_elements * element_size);
+    
+    return np.attr("frombuffer")(pybind11::bytes(data_copy.data(), data_copy.size()), 
+                                 pybind11::arg("dtype")=np_dtype).attr("reshape")(shape);
+}
+
+// Constructor
 CoroRPCInterface::CoroRPCInterface() : impl_(std::make_unique<Impl>()) {}
 
+// Destructor
 CoroRPCInterface::~CoroRPCInterface() = default;
 
-bool CoroRPCInterface::initialize(const std::string& listen_address,
-                                 size_t thread_count,
-                                 size_t timeout_seconds,
-                                 size_t pool_size) {
+// Initialize
+bool CoroRPCInterface::initialize(const std::string& local_address, 
+                                  size_t thread_count, 
+                                  size_t timeout_seconds,
+                                  size_t pool_size) {
     Config config;
-    config.listen_address = listen_address;
+    config.listen_address = local_address;
     config.thread_count = thread_count;
     config.timeout_seconds = timeout_seconds;
     config.pool_size = pool_size;
@@ -80,147 +164,201 @@ int CoroRPCInterface::sendData(const std::string& target_address, pybind11::byte
     std::string data_str;
     {
         pybind11::gil_scoped_acquire acquire;
-        data_str = std::string(data);
+        data_str = data;
     }
-    
+
     pybind11::gil_scoped_release release;
     return impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
 }
 
-pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_address,
-                                                 pybind11::bytes data,
-                                                 pybind11::handle loop) {
-    pybind11::object future;
-    {
-        pybind11::gil_scoped_acquire acquire;
-        future = loop.attr("create_future")();
-    }
+pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_address, 
+                                                  pybind11::bytes data, 
+                                                  pybind11::handle loop) {
+    pybind11::gil_scoped_acquire acquire;
+    
+    auto future_module = pybind11::module_::import("asyncio");
+    auto future_obj = future_module.attr("Future")();
     
     if (!impl_->communicator) {
-        pybind11::gil_scoped_acquire acquire;
-        loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
-        return future;
+        future_obj.attr("set_exception")(pybind11::make_tuple(
+            pybind11::str("Communicator not initialized")));
+        return future_obj;
     }
-    
-    auto data_holder = std::make_shared<std::string>();
-    auto target_addr = std::make_shared<std::string>(target_address);
+
     auto communicator = impl_->communicator.get();
-    
-    PyObject* future_ptr = nullptr;
-    PyObject* loop_ptr = nullptr;
-    
-    {
-        pybind11::gil_scoped_acquire acquire;
-        *data_holder = std::string(data);
-        future_ptr = future.ptr();
-        loop_ptr = loop.ptr();
-        Py_INCREF(future_ptr);
-        Py_INCREF(loop_ptr);
-    }
-    
-    auto task_func = std::make_shared<std::function<void()>>();
-    *task_func = [communicator, target_addr, data_holder, future_ptr, loop_ptr]() {
+    auto target_addr = std::make_shared<std::string>(target_address);
+    auto data_holder = std::make_shared<std::string>(data);
+    auto future_ptr = std::make_shared<pybind11::object>(future_obj);
+    auto loop_ptr = std::make_shared<pybind11::object>(pybind11::reinterpret_borrow<pybind11::object>(loop));
+
+    auto task_func = std::make_shared<std::function<void()>>(
+        [communicator, target_addr, data_holder, future_ptr, loop_ptr]() {
         int result = communicator->sendData(*target_addr, data_holder->data(), data_holder->size());
         
-        {
+        auto call_soon_threadsafe = [future_ptr, loop_ptr, result]() {
             pybind11::gil_scoped_acquire acquire;
-            try {
-                pybind11::handle loop_handle(loop_ptr);
-                pybind11::handle future_handle(future_ptr);
-                loop_handle.attr("call_soon_threadsafe")(future_handle.attr("set_result"), result);
-            } catch (const std::exception& e) {
-                std::cerr << "Error in async callback: " << e.what() << std::endl;
+            if (result >= 0) {
+                future_ptr->attr("set_result")(result);
+            } else {
+                future_ptr->attr("set_exception")(pybind11::make_tuple(
+                    pybind11::str("Send data failed")));
             }
-            Py_DECREF(future_ptr);
-            Py_DECREF(loop_ptr);
-        }
-    };
-    
+        };
+
+        auto callback = pybind11::cpp_function(call_soon_threadsafe);
+        loop_ptr->attr("call_soon_threadsafe")(callback);
+    });
+
     std::thread([task_func]() { (*task_func)(); }).detach();
-    return future;
+    
+    return future_obj;
 }
 
 int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::handle tensor) {
     if (!impl_->communicator) return -1;
     
-    pybind11::object tensor_obj;
-    {
-        pybind11::gil_scoped_acquire acquire;
-        tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
-    }
+    try {
+        pybind11::object tensor_obj;
+        TensorMetadata metadata = {};
+        std::vector<char> combined_data;
+        
+        {
+            pybind11::gil_scoped_acquire acquire;
+            tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
+            
+            // Validate tensor type
+            if (!(tensor_obj.attr("__class__").attr("__name__").cast<std::string>().find("Tensor") != std::string::npos)) {
+                std::cerr << "Input is not a tensor" << std::endl;
+                return -1;
+            }
+            
+            // Extract tensor properties
+            uintptr_t data_ptr = tensor_obj.attr("data_ptr")().cast<uintptr_t>();
+            size_t numel = tensor_obj.attr("numel")().cast<size_t>();
+            size_t element_size = tensor_obj.attr("element_size")().cast<size_t>();
+            size_t tensor_size = numel * element_size;
+            
+            // Get tensor dtype
+            pybind11::object dtype_obj = tensor_obj.attr("dtype");
+            TensorDtype dtype_enum = get_tensor_dtype(dtype_obj);
+            if (dtype_enum == TensorDtype::UNKNOWN) {
+                std::cerr << "Unsupported tensor dtype" << std::endl;
+                return -1;
+            }
+            
+            // Get tensor shape
+            pybind11::object shape_obj = tensor_obj.attr("shape");
+            pybind11::tuple shape_tuple = pybind11::cast<pybind11::tuple>(shape_obj);
+            int32_t ndim = static_cast<int32_t>(shape_tuple.size());
+            if (ndim > 4) {
+                std::cerr << "Tensor has too many dimensions (max 4 supported)" << std::endl;
+                return -1;
+            }
+            
+            // Fill metadata
+            metadata.dtype = static_cast<int32_t>(dtype_enum);
+            metadata.ndim = ndim;
+            for (int i = 0; i < 4; i++) {
+                if (i < ndim) {
+                    metadata.shape[i] = shape_tuple[i].cast<int64_t>();
+                } else {
+                    metadata.shape[i] = 0;
+                }
+            }
+            
+            // Create combined data: metadata + tensor data
+            combined_data.resize(sizeof(TensorMetadata) + tensor_size);
+            
+            // Copy metadata
+            std::memcpy(combined_data.data(), &metadata, sizeof(TensorMetadata));
+            
+            // Copy tensor data
+            const char* tensor_data = reinterpret_cast<const char*>(data_ptr);
+            std::memcpy(combined_data.data() + sizeof(TensorMetadata), tensor_data, tensor_size);
+            
+            std::cout << "Sending tensor with shape: [";
+            for (int i = 0; i < ndim; i++) {
+                std::cout << metadata.shape[i];
+                if (i < ndim - 1) std::cout << ", ";
+            }
+            std::cout << "] and dtype: " << metadata.dtype << ", total size: " << combined_data.size() << " bytes" << std::endl;
+        }
 
-    pybind11::gil_scoped_release release;
-    return impl_->communicator->sendTensor(target_address, tensor_obj);
+        pybind11::gil_scoped_release release;
+        return impl_->communicator->sendData(target_address, combined_data.data(), combined_data.size());
+        
+    } catch (const std::exception& e) {
+        std::cerr << "Send tensor error: " << e.what() << std::endl;
+        return -1;
+    }
 }
 
-pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_address,
-                                                   pybind11::handle tensor,
-                                                   pybind11::handle loop) {
-    pybind11::object future;
-    {
-        pybind11::gil_scoped_acquire acquire;
-        future = loop.attr("create_future")();
-    }
+pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_address, 
+                                                    pybind11::handle tensor, 
+                                                    pybind11::handle loop) {
+    pybind11::gil_scoped_acquire acquire;
+    
+    auto future_module = pybind11::module_::import("asyncio");
+    auto future_obj = future_module.attr("Future")();
     
     if (!impl_->communicator) {
-        pybind11::gil_scoped_acquire acquire;
-        loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
-        return future;
+        future_obj.attr("set_exception")(pybind11::make_tuple(
+            pybind11::str("Communicator not initialized")));
+        return future_obj;
     }
-    
-    auto tensor_info = std::make_shared<TensorInfo>();
-    auto target_addr = std::make_shared<std::string>(target_address);
+
     auto communicator = impl_->communicator.get();
+    auto target_addr = std::make_shared<std::string>(target_address);
     
-    PyObject* future_ptr = nullptr;
-    PyObject* loop_ptr = nullptr;
+    // Extract tensor info
+    pybind11::object tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
+    uintptr_t data_ptr = tensor_obj.attr("data_ptr")().cast<uintptr_t>();
+    size_t numel = tensor_obj.attr("numel")().cast<size_t>();
+    size_t element_size = tensor_obj.attr("element_size")().cast<size_t>();
+    size_t tensor_size = numel * element_size;
     
-    {
-        pybind11::gil_scoped_acquire acquire;
-        try {
-            tensor_info->data_ptr = reinterpret_cast<void*>(tensor.attr("data_ptr")().cast<uintptr_t>());
-            size_t numel = tensor.attr("numel")().cast<size_t>();
-            size_t element_size = tensor.attr("element_size")().cast<size_t>();
-            tensor_info->total_bytes = numel * element_size;
-            
-            auto shape_tuple = tensor.attr("shape");
-            for (pybind11::handle item : shape_tuple) {
-                tensor_info->shape.push_back(item.cast<size_t>());
-            }
-            tensor_info->dtype = tensor.attr("dtype").attr("__str__")().cast<std::string>();
-            
-            future_ptr = future.ptr();
-            loop_ptr = loop.ptr();
-            Py_INCREF(future_ptr);
-            Py_INCREF(loop_ptr);
-        } catch (const std::exception& e) {
-            std::cerr << "Error extracting tensor info: " << e.what() << std::endl;
-            loop.attr("call_soon_threadsafe")(future.attr("set_result"), -1);
-            return future;
-        }
+    // Get tensor shape and dtype
+    pybind11::object shape_obj = tensor_obj.attr("shape");
+    pybind11::tuple shape_tuple = pybind11::cast<pybind11::tuple>(shape_obj);
+    std::vector<size_t> shape;
+    for (size_t i = 0; i < shape_tuple.size(); i++) {
+        shape.push_back(shape_tuple[i].cast<size_t>());
     }
     
-    auto task_func = std::make_shared<std::function<void()>>();
-    *task_func = [communicator, target_addr, tensor_info, future_ptr, loop_ptr]() {
+    pybind11::object dtype_obj = tensor_obj.attr("dtype");
+    std::string dtype = dtype_obj.attr("__str__")().cast<std::string>();
+    
+    auto tensor_info = std::make_shared<TensorInfo>();
+    tensor_info->data_ptr = reinterpret_cast<void*>(data_ptr);
+    tensor_info->total_bytes = tensor_size;
+    tensor_info->shape = shape;
+    tensor_info->dtype = dtype;
+    
+    auto future_ptr = std::make_shared<pybind11::object>(future_obj);
+    auto loop_ptr = std::make_shared<pybind11::object>(pybind11::reinterpret_borrow<pybind11::object>(loop));
+
+    auto task_func = std::make_shared<std::function<void()>>(
+        [communicator, target_addr, tensor_info, future_ptr, loop_ptr]() {
         auto std_future = communicator->sendTensorAsync(*target_addr, *tensor_info);
         int result = std_future.get();
         
-        {
+        auto call_soon_threadsafe = [future_ptr, loop_ptr, result]() {
             pybind11::gil_scoped_acquire acquire;
-            try {
-                pybind11::handle loop_handle(loop_ptr);
-                pybind11::handle future_handle(future_ptr);
-                loop_handle.attr("call_soon_threadsafe")(future_handle.attr("set_result"), result);
-            } catch (const std::exception& e) {
-                std::cerr << "Error in tensor async callback: " << e.what() << std::endl;
+            if (result >= 0) {
+                future_ptr->attr("set_result")(result);
+            } else {
+                future_ptr->attr("set_exception")(pybind11::make_tuple(
+                    pybind11::str("Send tensor failed")));
             }
-            Py_DECREF(future_ptr);
-            Py_DECREF(loop_ptr);
-        }
-    };
-    
+        };
+
+        auto callback = pybind11::cpp_function(call_soon_threadsafe);
+        loop_ptr->attr("call_soon_threadsafe")(callback);
+    });
+
     std::thread([task_func]() { (*task_func)(); }).detach();
-    return future;
+    
+    return future_obj;
 }
 
 void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
@@ -233,16 +371,52 @@ void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
     impl_->tensor_receive_callback = callback;
 }
 
-void CoroRPCInterface::handleIncomingData(const std::string& source_address,
-                                         const std::string& data) {
+void CoroRPCInterface::handleIncomingData(const std::string& source, const std::string& data) {
+    // Check if this is tensor data by looking for metadata signature
+    if (data.size() >= sizeof(TensorMetadata)) {
+        const TensorMetadata* metadata = reinterpret_cast<const TensorMetadata*>(data.data());
+        
+        // Basic validation: check if dtype is in valid range
+        if (metadata->dtype >= 0 && metadata->dtype < static_cast<int32_t>(TensorDtype::UNKNOWN) && 
+            metadata->ndim >= 0 && metadata->ndim <= 4) {
+            
+            // This looks like tensor data, handle it as such
+            std::vector<size_t> shape;
+            for (int i = 0; i < metadata->ndim; i++) {
+                if (metadata->shape[i] > 0) {
+                    shape.push_back(static_cast<size_t>(metadata->shape[i]));
+                }
+            }
+            
+            // Get dtype name
+            std::string dtype_name;
+            switch (static_cast<TensorDtype>(metadata->dtype)) {
+                case TensorDtype::FLOAT16: dtype_name = "float16"; break;
+                case TensorDtype::FLOAT32: dtype_name = "float32"; break;
+                case TensorDtype::FLOAT64: dtype_name = "float64"; break;
+                case TensorDtype::INT8: dtype_name = "int8"; break;
+                case TensorDtype::INT16: dtype_name = "int16"; break;
+                case TensorDtype::INT32: dtype_name = "int32"; break;
+                case TensorDtype::INT64: dtype_name = "int64"; break;
+                case TensorDtype::UINT8: dtype_name = "uint8"; break;
+                case TensorDtype::BOOL: dtype_name = "bool"; break;
+                default: dtype_name = "unknown"; break;
+            }
+            
+            // Call tensor handler instead of data handler
+            handleIncomingTensor(source, data, shape, dtype_name);
+            return;
+        }
+    }
+    
+    // Handle as regular data if not tensor data
     if (!impl_->data_receive_callback) return;
     
-    pybind11::gil_scoped_acquire acquire;
     try {
-        ReceivedData received;
-        received.data = data;
-        received.source_address = source_address;
-        received.data_size = data.size();
+        pybind11::gil_scoped_acquire acquire;
+        pybind11::dict received;
+        received["source"] = source;
+        received["data"] = pybind11::bytes(data);
         
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
@@ -250,20 +424,20 @@ void CoroRPCInterface::handleIncomingData(const std::string& source_address,
     }
 }
 
-void CoroRPCInterface::handleIncomingTensor(const std::string& source_address,
-                                           const std::string& data,
-                                           const std::vector<size_t>& shape,
-                                           const std::string& dtype) {
+void CoroRPCInterface::handleIncomingTensor(const std::string& source, 
+                                            const std::string& data,
+                                            const std::vector<size_t>& shape, 
+                                            const std::string& dtype) {
     if (!impl_->tensor_receive_callback) return;
     
-    pybind11::gil_scoped_acquire acquire;
     try {
+        pybind11::gil_scoped_acquire acquire;
+        
         ReceivedTensor received;
+        received.source_address = source;
         received.data = data;
-        received.source_address = source_address;
         received.shape = shape;
         received.dtype = dtype;
-        received.total_bytes = data.size();
         
         impl_->tensor_receive_callback(received);
     } catch (const std::exception& e) {
@@ -272,135 +446,67 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source_address,
 }
 
 pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
-    if (!PyGILState_Check()) {
-        pybind11::gil_scoped_acquire acquire;
-        return rebuildTensorInternal();
-    } else {
-        return rebuildTensorInternal();
-    }
+    pybind11::gil_scoped_acquire acquire;
+    return rebuildTensorInternal();
 }
 
 pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const {
-    try {
-        auto torch = pybind11::module::import("torch");
-        auto numpy = pybind11::module::import("numpy");
-        
-        // 检查数据大小是否足够包含元数据
-        if (data.size() < sizeof(TensorMetadata)) {
-            std::cerr << "Invalid data format: insufficient data for metadata" << std::endl;
-            return pybind11::none();
-        }
-        
-        // 从数据开头提取元数据
-        TensorMetadata metadata;
-        std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
-        
-        // 验证元数据的有效性
-        if (metadata.ndim < 0 || metadata.ndim > 4) {
-            std::cerr << "Invalid tensor metadata: ndim=" << metadata.ndim << std::endl;
-            return pybind11::none();
-        }
-        
-        TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
-        if (dtype_enum == TensorDtype::UNKNOWN) {
-            std::cerr << "Unknown tensor dtype!" << std::endl;
-            return pybind11::none();
-        }
-        
-        // 计算实际 tensor 数据的大小
-        size_t tensor_size = data.size() - sizeof(TensorMetadata);
-        if (tensor_size == 0) {
-            std::cerr << "Invalid data format: no tensor data found" << std::endl;
-            return pybind11::none();
-        }
-        
-        // 获取 tensor 数据指针（跳过元数据）
-        const char* tensor_data = data.data() + sizeof(TensorMetadata);
-        
-        // 根据数据类型创建对应的 numpy 数组
-        std::string np_dtype;
-        switch (dtype_enum) {
-            case TensorDtype::FLOAT32:
-                np_dtype = "float32";
-                break;
-            case TensorDtype::FLOAT64:
-                np_dtype = "float64";
-                break;
-            case TensorDtype::INT32:
-                np_dtype = "int32";
-                break;
-            case TensorDtype::INT64:
-                np_dtype = "int64";
-                break;
-            case TensorDtype::INT8:
-                np_dtype = "int8";
-                break;
-            case TensorDtype::INT16:
-                np_dtype = "int16";
-                break;
-            case TensorDtype::UINT8:
-                np_dtype = "uint8";
-                break;
-            case TensorDtype::BOOL:
-                np_dtype = "bool";
-                break;
-            default:
-                std::cerr << "Unsupported dtype enum: " << static_cast<int>(dtype_enum) << std::endl;
-                return pybind11::none();
-        }
-        
-        // 从原始数据创建 numpy 数组
-        auto np_array = numpy.attr("frombuffer")(
-            pybind11::bytes(tensor_data, tensor_size), 
-            "dtype"_a=np_dtype
-        );
-        
-        // 重建形状
-        if (metadata.ndim > 0) {
-            std::vector<int> shape_vec;
-            for (int i = 0; i < metadata.ndim; i++) {
-                shape_vec.push_back(metadata.shape[i]);
-            }
-            pybind11::tuple shape_tuple = pybind11::cast(shape_vec);
-            np_array = np_array.attr("reshape")(shape_tuple);
-        }
-        
-        // 转换为 PyTorch tensor
-        auto tensor = torch.attr("from_numpy")(np_array);
-        return tensor;
-        
-    } catch (const std::exception& e) {
-        std::cerr << "Error rebuilding tensor: " << e.what() << std::endl;
-        return pybind11::none();
+    if (data.size() < sizeof(TensorMetadata)) {
+        throw std::runtime_error("Data too small to contain tensor metadata");
     }
-}
-
-void CoroRPCInterface::Impl::onDataReceived(const std::string& source, const std::string& data) {
-}
-
-void CoroRPCInterface::Impl::onTensorReceived(const std::string& source, const std::string& data,
-                                             const std::vector<size_t>& shape, const std::string& dtype) {
-}
-
-std::unique_ptr<CoroRPCInterface> createRPCClient(size_t pool_size, size_t timeout_seconds) {
-    auto interface = std::make_unique<CoroRPCInterface>();
-    if (interface->initialize("", 0, timeout_seconds, pool_size)) {
-        return interface;
+    
+    // Extract metadata
+    TensorMetadata metadata;
+    std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
+    
+    // Validate metadata
+    if (metadata.ndim < 0 || metadata.ndim > 4) {
+        throw std::runtime_error("Invalid tensor dimensions");
     }
-    return nullptr;
-}
-
-std::unique_ptr<CoroRPCInterface> createRPCServer(const std::string& listen_address, size_t thread_count) {
-    auto interface = std::make_unique<CoroRPCInterface>();
-    if (interface->initialize(listen_address, thread_count)) {
-        return interface;
+    
+    TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
+    size_t element_size = get_dtype_size(dtype_enum);
+    if (element_size == 0) {
+        throw std::runtime_error("Unsupported tensor dtype");
+    }
+    
+    // Extract shape
+    std::vector<int64_t> tensor_shape;
+    size_t total_elements = 1;
+    for (int i = 0; i < metadata.ndim; i++) {
+        tensor_shape.push_back(metadata.shape[i]);
+        total_elements *= metadata.shape[i];
+    }
+    
+    // Validate data size
+    size_t expected_data_size = total_elements * element_size;
+    size_t actual_data_size = data.size() - sizeof(TensorMetadata);
+    if (actual_data_size != expected_data_size) {
+        throw std::runtime_error("Data size mismatch with tensor metadata");
     }
-    return nullptr;
+    
+    // Create numpy array from raw data
+    const char* tensor_data = data.data() + sizeof(TensorMetadata);
+    pybind11::object numpy_array = create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
+    
+    // Convert to PyTorch tensor
+    pybind11::module_ torch = pybind11::module_::import("torch");
+    return torch.attr("from_numpy")(numpy_array);
 }
 
-} // namespace mooncake
+// Factory functions for creating RPC client and server
+std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank, uint64_t world_size) {
+    auto client = std::make_unique<CoroRPCInterface>();
+    // Initialize client with default settings
+    client->initialize("", 0, 30, 10);
+    return client;
+}
 
-namespace py = pybind11;
+std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank, uint64_t world_size) {
+    auto server = std::make_unique<CoroRPCInterface>();
+    // Initialize server with default settings
+    server->initialize("0.0.0.0:8080", 0, 30, 10);
+    return server;
+}
 
-// Note: bind_coro_rpc_interface function is now implemented in transfer_engine_py.cpp
-// to avoid linking issues
\ No newline at end of file
+}  // namespace mooncake
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py b/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
deleted file mode 100644
index ff6c8b1ba..000000000
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/test_integration.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test the actual coro_rpc implementation
-"""
-
-import mooncake.engine as te
-import time
-import threading
-
-print("=== Testing actual coro_rpc implementation ===\n")
-
-# Create CoroRPCInterface instance
-interface = te.coro_rpc_interface.CoroRPCInterface()
-print("Created CoroRPCInterface instance successfully")
-
-# Test initialization
-success = interface.initialize("127.0.0.1:8080", 2, 30, 10)
-print(f"Initialization result: {success}")
-
-# Start server asynchronously
-print("Starting server asynchronously...")
-server_started = interface.start_server_async()
-print(f"Server async start result: {server_started}")
-
-# Wait for server to start
-time.sleep(1)
-
-# Test adding remote connection
-print("\nTesting client connection...")
-connected = interface.add_remote_connection("127.0.0.1:8080")
-print(f"Connected to server: {connected}")
-
-# Test connection status
-is_connected = interface.is_connected("127.0.0.1:8080")
-print(f"Connection status: {is_connected}")
-
-print("\n=== coro_rpc implementation test completed ===")
-print("Real coro_rpc features are now integrated:")
-print("  - Using yalantinglibs coro_rpc library")
-print("  - Real client/server connectivity")
-print("  - Asynchronous coroutine support")
-print("  - Data and tensor transmission capabilities")
\ No newline at end of file
diff --git a/mooncake-transfer-engine/tests/test_real_coro_rpc.py b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
index ff6c8b1ba..64ab7977c 100644
--- a/mooncake-transfer-engine/tests/test_real_coro_rpc.py
+++ b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
@@ -1,42 +1,151 @@
 #!/usr/bin/env python3
 """
-Test the actual coro_rpc implementation
+Test enhanced CoroRPC tensor rebuilding functionality
 """
 
-import mooncake.engine as te
-import time
+import torch
+import numpy as np
+import asyncio
 import threading
+import time
+import sys
+
+try:
+    import mooncake.engine as engine
+    print("Successfully imported mooncake.engine")
+    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    print("Successfully imported CoroRPCInterface")
+except ImportError as e:
+    print(f"Failed to import mooncake: {e}")
+    sys.exit(1)
+except AttributeError as e:
+    print(f"Failed to import CoroRPCInterface: {e}")
+    sys.exit(1)
+
+
+def test_enhanced_tensor_rebuilding():
+    print("\n=== Testing Enhanced Tensor Rebuilding ===")
+
+    # Create server and client instances
+    server = CoroRPCInterface()
+    client = CoroRPCInterface()
+
+    # Store received tensors
+    received_tensors = []
+
+    def tensor_receive_callback(received_tensor):
+        print(f"Received tensor from: {received_tensor.source_address}")
+        print(f"Original data size: {len(received_tensor.data)} bytes")
+        print(f"Shape info: {received_tensor.shape}")
+        print(f"Dtype info: {received_tensor.dtype}")
+
+        try:
+            # Use enhanced rebuild functionality to reconstruct tensor
+            rebuilt_tensor = received_tensor.rebuild_tensor()
+            received_tensors.append(rebuilt_tensor)
+
+            print("Successfully rebuilt tensor:")
+            print(f"  - Shape: {rebuilt_tensor.shape}")
+            print(f"  - Dtype: {rebuilt_tensor.dtype}")
+            print(f"  - Device: {rebuilt_tensor.device}")
+            print(f"  - Data sample: {rebuilt_tensor.flatten()[:5]}")
+
+        except Exception as e:
+            print(f"Failed to rebuild tensor: {e}")
+            import traceback
+            traceback.print_exc()
+
+    try:
+        # Initialize server and client
+        server_addr = "127.0.0.1:8888"
+        if not server.initialize(server_addr, 1, 30, 4):
+            print("Server initialization failed")
+            return False
+
+        if not client.initialize("", 0, 30, 4):
+            print("Client initialization failed")
+            return False
+
+        # Set tensor receive callback
+        server.set_tensor_receive_callback(tensor_receive_callback)
+
+        # Start server asynchronously
+        if not server.start_server_async():
+            print("Failed to start server")
+            return False
+
+        print(f"Server started on {server_addr}")
+        time.sleep(1)  # Wait for server to start
+
+        # Connect client to server
+        if not client.add_remote_connection(server_addr):
+            print("Failed to connect to server")
+            return False
+
+        print("Client connected to server")
+        time.sleep(0.5)  # Wait for connection establishment
+
+        # Define test cases with various tensor types
+        test_cases = [
+            ("Float32 2D", torch.randn(3, 4, dtype=torch.float32)),
+            ("Int64 1D", torch.arange(10, dtype=torch.int64)),
+            ("Float64 3D", torch.ones(2, 3, 4, dtype=torch.float64)),
+            ("Int32 Vector", torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)),
+            ("Bool Matrix", torch.tensor([[True, False], [False, True]], dtype=torch.bool)),
+        ]
+
+        for test_name, original_tensor in test_cases:
+            print(f"\n--- Testing {test_name} ---")
+            print("Original tensor:")
+            print(f"  - Shape: {original_tensor.shape}")
+            print(f"  - Dtype: {original_tensor.dtype}")
+            print(f"  - Data sample: {original_tensor.flatten()[:5]}")
+
+            # Send tensor
+            result = client.send_tensor(server_addr, original_tensor)
+            print(f"Send result: {result}")
+
+            if result < 0:
+                print(f"Failed to send {test_name}")
+                continue
+
+            # Wait for reception and processing
+            time.sleep(1)
 
-print("=== Testing actual coro_rpc implementation ===\n")
+            if len(received_tensors) == 0:
+                print(f"No tensor received for {test_name}")
+                continue
 
-# Create CoroRPCInterface instance
-interface = te.coro_rpc_interface.CoroRPCInterface()
-print("Created CoroRPCInterface instance successfully")
+            # Validate the rebuilt tensor
+            rebuilt_tensor = received_tensors[-1]
 
-# Test initialization
-success = interface.initialize("127.0.0.1:8080", 2, 30, 10)
-print(f"Initialization result: {success}")
+            # Check shape
+            if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
+                print(f"Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
+                continue
 
-# Start server asynchronously
-print("Starting server asynchronously...")
-server_started = interface.start_server_async()
-print(f"Server async start result: {server_started}")
+            # Check data type
+            if rebuilt_tensor.dtype != original_tensor.dtype:
+                print(f"Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
+                continue
 
-# Wait for server to start
-time.sleep(1)
+            # Check data content (move to CPU for comparison)
+            try:
+                if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
+                    print(f"{test_name} passed - data integrity verified")
+                else:
+                    print(f"{test_name} failed - data content mismatch")
+                    print(f"  Original: {original_tensor.flatten()[:5]}")
+                    print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
+            except Exception as e:
+                print(f"{test_name} failed - comparison error: {e}")
 
-# Test adding remote connection
-print("\nTesting client connection...")
-connected = interface.add_remote_connection("127.0.0.1:8080")
-print(f"Connected to server: {connected}")
+        print(f"\nEnhanced tensor rebuilding test completed")
+        print(f"Total tensors processed: {len(received_tensors)}")
 
-# Test connection status
-is_connected = interface.is_connected("127.0.0.1:8080")
-print(f"Connection status: {is_connected}")
+        return len(received_tensors) == len(test_cases)
 
-print("\n=== coro_rpc implementation test completed ===")
-print("Real coro_rpc features are now integrated:")
-print("  - Using yalantinglibs coro_rpc library")
-print("  - Real client/server connectivity")
-print("  - Asynchronous coroutine support")
-print("  - Data and tensor transmission capabilities")
\ No newline at end of file
+    except Exception as e:
+        print(f"Test failed with exception: {e}")
+        import traceback
+        traceback.pri
\ No newline at end of file
diff --git a/test_enhanced_tensor_rebuilding.py b/test_enhanced_tensor_rebuilding.py
new file mode 100644
index 000000000..64ab7977c
--- /dev/null
+++ b/test_enhanced_tensor_rebuilding.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""
+Test enhanced CoroRPC tensor rebuilding functionality
+"""
+
+import torch
+import numpy as np
+import asyncio
+import threading
+import time
+import sys
+
+try:
+    import mooncake.engine as engine
+    print("Successfully imported mooncake.engine")
+    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    print("Successfully imported CoroRPCInterface")
+except ImportError as e:
+    print(f"Failed to import mooncake: {e}")
+    sys.exit(1)
+except AttributeError as e:
+    print(f"Failed to import CoroRPCInterface: {e}")
+    sys.exit(1)
+
+
+def test_enhanced_tensor_rebuilding():
+    print("\n=== Testing Enhanced Tensor Rebuilding ===")
+
+    # Create server and client instances
+    server = CoroRPCInterface()
+    client = CoroRPCInterface()
+
+    # Store received tensors
+    received_tensors = []
+
+    def tensor_receive_callback(received_tensor):
+        print(f"Received tensor from: {received_tensor.source_address}")
+        print(f"Original data size: {len(received_tensor.data)} bytes")
+        print(f"Shape info: {received_tensor.shape}")
+        print(f"Dtype info: {received_tensor.dtype}")
+
+        try:
+            # Use enhanced rebuild functionality to reconstruct tensor
+            rebuilt_tensor = received_tensor.rebuild_tensor()
+            received_tensors.append(rebuilt_tensor)
+
+            print("Successfully rebuilt tensor:")
+            print(f"  - Shape: {rebuilt_tensor.shape}")
+            print(f"  - Dtype: {rebuilt_tensor.dtype}")
+            print(f"  - Device: {rebuilt_tensor.device}")
+            print(f"  - Data sample: {rebuilt_tensor.flatten()[:5]}")
+
+        except Exception as e:
+            print(f"Failed to rebuild tensor: {e}")
+            import traceback
+            traceback.print_exc()
+
+    try:
+        # Initialize server and client
+        server_addr = "127.0.0.1:8888"
+        if not server.initialize(server_addr, 1, 30, 4):
+            print("Server initialization failed")
+            return False
+
+        if not client.initialize("", 0, 30, 4):
+            print("Client initialization failed")
+            return False
+
+        # Set tensor receive callback
+        server.set_tensor_receive_callback(tensor_receive_callback)
+
+        # Start server asynchronously
+        if not server.start_server_async():
+            print("Failed to start server")
+            return False
+
+        print(f"Server started on {server_addr}")
+        time.sleep(1)  # Wait for server to start
+
+        # Connect client to server
+        if not client.add_remote_connection(server_addr):
+            print("Failed to connect to server")
+            return False
+
+        print("Client connected to server")
+        time.sleep(0.5)  # Wait for connection establishment
+
+        # Define test cases with various tensor types
+        test_cases = [
+            ("Float32 2D", torch.randn(3, 4, dtype=torch.float32)),
+            ("Int64 1D", torch.arange(10, dtype=torch.int64)),
+            ("Float64 3D", torch.ones(2, 3, 4, dtype=torch.float64)),
+            ("Int32 Vector", torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)),
+            ("Bool Matrix", torch.tensor([[True, False], [False, True]], dtype=torch.bool)),
+        ]
+
+        for test_name, original_tensor in test_cases:
+            print(f"\n--- Testing {test_name} ---")
+            print("Original tensor:")
+            print(f"  - Shape: {original_tensor.shape}")
+            print(f"  - Dtype: {original_tensor.dtype}")
+            print(f"  - Data sample: {original_tensor.flatten()[:5]}")
+
+            # Send tensor
+            result = client.send_tensor(server_addr, original_tensor)
+            print(f"Send result: {result}")
+
+            if result < 0:
+                print(f"Failed to send {test_name}")
+                continue
+
+            # Wait for reception and processing
+            time.sleep(1)
+
+            if len(received_tensors) == 0:
+                print(f"No tensor received for {test_name}")
+                continue
+
+            # Validate the rebuilt tensor
+            rebuilt_tensor = received_tensors[-1]
+
+            # Check shape
+            if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
+                print(f"Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
+                continue
+
+            # Check data type
+            if rebuilt_tensor.dtype != original_tensor.dtype:
+                print(f"Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
+                continue
+
+            # Check data content (move to CPU for comparison)
+            try:
+                if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
+                    print(f"{test_name} passed - data integrity verified")
+                else:
+                    print(f"{test_name} failed - data content mismatch")
+                    print(f"  Original: {original_tensor.flatten()[:5]}")
+                    print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
+            except Exception as e:
+                print(f"{test_name} failed - comparison error: {e}")
+
+        print(f"\nEnhanced tensor rebuilding test completed")
+        print(f"Total tensors processed: {len(received_tensors)}")
+
+        return len(received_tensors) == len(test_cases)
+
+    except Exception as e:
+        print(f"Test failed with exception: {e}")
+        import traceback
+        traceback.pri
\ No newline at end of file

From 81cbfddc36c2e2f62351394355ece4f3a5225a95 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 2 Sep 2025 12:08:42 +0800
Subject: [PATCH 05/64] add tensor rebuild and tests

---
 .../tests/test_real_coro_rpc.py               | 87 ++++++++++++++++---
 1 file changed, 73 insertions(+), 14 deletions(-)

diff --git a/mooncake-transfer-engine/tests/test_real_coro_rpc.py b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
index 64ab7977c..27ae8bb0c 100644
--- a/mooncake-transfer-engine/tests/test_real_coro_rpc.py
+++ b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
@@ -30,30 +30,57 @@ def test_enhanced_tensor_rebuilding():
     server = CoroRPCInterface()
     client = CoroRPCInterface()
 
-    # Store received tensors
+    # Store received tensors and callback status
     received_tensors = []
+    callback_info = {
+        'called_count': 0,
+        'success_count': 0,
+        'error_count': 0,
+        'errors': []
+    }
 
     def tensor_receive_callback(received_tensor):
+        callback_info['called_count'] += 1
+        
+        print(f"\n=== CALLBACK #{callback_info['called_count']} TRIGGERED ===")
         print(f"Received tensor from: {received_tensor.source_address}")
-        print(f"Original data size: {len(received_tensor.data)} bytes")
+        
+        # Use safe method to get data size
+        data_size = received_tensor.get_data_size()
+        print(f"Data size: {data_size} bytes")
+            
         print(f"Shape info: {received_tensor.shape}")
         print(f"Dtype info: {received_tensor.dtype}")
+        
+        # Check if total_bytes is available
+        if hasattr(received_tensor, 'total_bytes'):
+            print(f"Total bytes (from metadata): {received_tensor.total_bytes}")
 
         try:
             # Use enhanced rebuild functionality to reconstruct tensor
+            print("Attempting to rebuild tensor...")
+            print(f"Tensor metadata - Shape: {received_tensor.shape}, Dtype: {received_tensor.dtype}")
+            
+            # Now try the actual rebuild
             rebuilt_tensor = received_tensor.rebuild_tensor()
+            
             received_tensors.append(rebuilt_tensor)
+            callback_info['success_count'] += 1
 
-            print("Successfully rebuilt tensor:")
+            print("✅ Successfully rebuilt tensor:")
             print(f"  - Shape: {rebuilt_tensor.shape}")
             print(f"  - Dtype: {rebuilt_tensor.dtype}")
             print(f"  - Device: {rebuilt_tensor.device}")
             print(f"  - Data sample: {rebuilt_tensor.flatten()[:5]}")
 
         except Exception as e:
-            print(f"Failed to rebuild tensor: {e}")
+            callback_info['error_count'] += 1
+            callback_info['errors'].append(str(e))
+            print(f"❌ Failed to rebuild tensor: {e}")
             import traceback
             traceback.print_exc()
+        
+        print(f"=== CALLBACK #{callback_info['called_count']} COMPLETED ===\n")
 
     try:
         # Initialize server and client
@@ -112,8 +139,14 @@ def tensor_receive_callback(received_tensor):
             # Wait for reception and processing
             time.sleep(1)
 
+            # Check if callback was triggered for this tensor
+            expected_callbacks = test_cases.index((test_name, original_tensor)) + 1
+            if callback_info['called_count'] < expected_callbacks:
+                print(f"❌ No callback received for {test_name}")
+                continue
+
             if len(received_tensors) == 0:
-                print(f"No tensor received for {test_name}")
+                print(f"❌ No tensor received for {test_name}")
                 continue
 
             # Validate the rebuilt tensor
@@ -121,31 +154,57 @@ def tensor_receive_callback(received_tensor):
 
             # Check shape
             if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
-                print(f"Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
+                print(f"❌ Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
                 continue
 
             # Check data type
             if rebuilt_tensor.dtype != original_tensor.dtype:
-                print(f"Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
+                print(f"❌ Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
                 continue
 
             # Check data content (move to CPU for comparison)
             try:
                 if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
-                    print(f"{test_name} passed - data integrity verified")
+                    print(f"✅ {test_name} passed - data integrity verified")
                 else:
-                    print(f"{test_name} failed - data content mismatch")
+                    print(f"❌ {test_name} failed - data content mismatch")
                     print(f"  Original: {original_tensor.flatten()[:5]}")
                     print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
             except Exception as e:
-                print(f"{test_name} failed - comparison error: {e}")
+                print(f"❌ {test_name} failed - comparison error: {e}")
 
-        print(f"\nEnhanced tensor rebuilding test completed")
+        # Print summary
+        print(f"\n=== TEST SUMMARY ===")
+        print(f"Total callbacks received: {callback_info['called_count']}")
+        print(f"Successful rebuilds: {callback_info['success_count']}")
+        print(f"Failed rebuilds: {callback_info['error_count']}")
         print(f"Total tensors processed: {len(received_tensors)}")
-
-        return len(received_tensors) == len(test_cases)
+        
+        if callback_info['errors']:
+            print(f"Errors encountered:")
+            for i, error in enumerate(callback_info['errors'], 1):
+                print(f"  {i}. {error}")
+
+        success = (callback_info['called_count'] == len(test_cases) and 
+                  callback_info['success_count'] == len(test_cases) and
+                  len(received_tensors) == len(test_cases))
+        
+        print(f"Enhanced tensor rebuilding test {'✅ PASSED' if success else '❌ FAILED'}")
+        return success
 
     except Exception as e:
         print(f"Test failed with exception: {e}")
         import traceback
-        traceback.pri
\ No newline at end of file
+        traceback.print_exc()
+        return False
+    finally:
+        # Cleanup
+        try:
+            server.stop_server()
+        except:
+            pass
+
+if __name__ == "__main__":
+    success = test_enhanced_tensor_rebuilding()
+    print(f"\nFinal result: {'✅ SUCCESS' if success else '❌ FAILURE'}")
+    sys.exit(0 if success else 1)
\ No newline at end of file

From 714ef166b2df506b6fee00fdc20ab239b481b7c9 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 2 Sep 2025 14:13:22 +0800
Subject: [PATCH 06/64] removed useless lines

---
 .../tests/test_real_coro_rpc.py               | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/mooncake-transfer-engine/tests/test_real_coro_rpc.py b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
index 27ae8bb0c..810ea03f5 100644
--- a/mooncake-transfer-engine/tests/test_real_coro_rpc.py
+++ b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
@@ -67,7 +67,7 @@ def tensor_receive_callback(received_tensor):
             received_tensors.append(rebuilt_tensor)
             callback_info['success_count'] += 1
 
-            print("✅ Successfully rebuilt tensor:")
+            print("SUCCESS: Successfully rebuilt tensor:")
             print(f"  - Shape: {rebuilt_tensor.shape}")
             print(f"  - Dtype: {rebuilt_tensor.dtype}")
             print(f"  - Device: {rebuilt_tensor.device}")
@@ -76,7 +76,7 @@ def tensor_receive_callback(received_tensor):
         except Exception as e:
             callback_info['error_count'] += 1
             callback_info['errors'].append(str(e))
-            print(f"❌ Failed to rebuild tensor: {e}")
+            print(f"FAILED: Failed to rebuild tensor: {e}")
             import traceback
             traceback.print_exc()
         
@@ -142,11 +142,11 @@ def tensor_receive_callback(received_tensor):
             # Check if callback was triggered for this tensor
             expected_callbacks = test_cases.index((test_name, original_tensor)) + 1
             if callback_info['called_count'] < expected_callbacks:
-                print(f"❌ No callback received for {test_name}")
+                print(f"FAILED: No callback received for {test_name}")
                 continue
 
             if len(received_tensors) == 0:
-                print(f"❌ No tensor received for {test_name}")
+                print(f"FAILED: No tensor received for {test_name}")
                 continue
 
             # Validate the rebuilt tensor
@@ -154,24 +154,24 @@ def tensor_receive_callback(received_tensor):
 
             # Check shape
             if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
-                print(f"❌ Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
+                print(f"FAILED: Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
                 continue
 
             # Check data type
             if rebuilt_tensor.dtype != original_tensor.dtype:
-                print(f"❌ Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
+                print(f"FAILED: Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
                 continue
 
             # Check data content (move to CPU for comparison)
             try:
                 if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
-                    print(f"✅ {test_name} passed - data integrity verified")
+                    print(f"SUCCESS: {test_name} passed - data integrity verified")
                 else:
-                    print(f"❌ {test_name} failed - data content mismatch")
+                    print(f"FAILED: {test_name} failed - data content mismatch")
                     print(f"  Original: {original_tensor.flatten()[:5]}")
                     print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
             except Exception as e:
-                print(f"❌ {test_name} failed - comparison error: {e}")
+                print(f"FAILED: {test_name} failed - comparison error: {e}")
 
         # Print summary
         print(f"\n=== TEST SUMMARY ===")
@@ -189,7 +189,7 @@ def tensor_receive_callback(received_tensor):
                   callback_info['success_count'] == len(test_cases) and
                   len(received_tensors) == len(test_cases))
         
-        print(f"Enhanced tensor rebuilding test {'✅ PASSED' if success else '❌ FAILED'}")
+        print(f"Enhanced tensor rebuilding test {'PASSED' if success else 'FAILED'}")
         return success
 
     except Exception as e:
@@ -204,7 +204,8 @@ def tensor_receive_callback(received_tensor):
         except:
             pass
 
+
 if __name__ == "__main__":
     success = test_enhanced_tensor_rebuilding()
-    print(f"\nFinal result: {'✅ SUCCESS' if success else '❌ FAILURE'}")
+    print(f"\nFinal result: {'SUCCESS' if success else 'FAILURE'}")
     sys.exit(0 if success else 1)
\ No newline at end of file

From 59c510539ef2fb43017a294c69c8f543992120ef Mon Sep 17 00:00:00 2001
From: Yixin Zhang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 2 Sep 2025 14:15:33 +0800
Subject: [PATCH 07/64] removed blank lines

---
 .gitignore | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1bf3c24d3..c2568c1af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,5 +198,3 @@ mooncake-wheel/mooncake/transfer_engine_bench
 
 # Claude Code Memory
 CLAUDE.md
-
-

From 06a672f81f4b7ecfa9652aa6ce5208da62f2e3f8 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 2 Sep 2025 17:20:21 +0800
Subject: [PATCH 08/64] turn single client creation to client pool

---
 .gitignore                                    |   4 +-
 .../transfer_engine/transfer_engine_py.cpp    |   2 +
 .../coro_rpc_connector/cororpc_communicator.h |   9 +-
 .../coro_rpc_connector/cororpc_interface.h    |   8 +
 .../cororpc_communicator.cpp                  | 239 ++++---
 .../coro_rpc_connector/cororpc_interface.cpp  | 123 +++-
 .../tests/test_coro_rpc_performance.py        | 632 ++++++++++++++++++
 test_enhanced_tensor_rebuilding.py            | 151 -----
 8 files changed, 904 insertions(+), 264 deletions(-)
 create mode 100644 mooncake-transfer-engine/tests/test_coro_rpc_performance.py
 delete mode 100644 test_enhanced_tensor_rebuilding.py

diff --git a/.gitignore b/.gitignore
index 1bf3c24d3..d0ad29f0f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,6 +197,4 @@ mooncake-wheel/mooncake/mooncake_master
 mooncake-wheel/mooncake/transfer_engine_bench
 
 # Claude Code Memory
-CLAUDE.md
-
-
+CLAUDE.md
\ No newline at end of file
diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index 2c405b4c8..e32bd7206 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -663,6 +663,8 @@ void bind_coro_rpc_interface(py::module_ &m) {
         .def_readonly("shape", &CoroRPCInterface::ReceivedTensor::shape)
         .def_readonly("dtype", &CoroRPCInterface::ReceivedTensor::dtype)
         .def_readonly("total_bytes", &CoroRPCInterface::ReceivedTensor::total_bytes)
+        .def("get_data_size", &CoroRPCInterface::ReceivedTensor::getDataSize)
+        .def("get_data_as_bytes", &CoroRPCInterface::ReceivedTensor::getDataAsBytes)
         .def("rebuild_tensor", &CoroRPCInterface::ReceivedTensor::rebuildTensor);
     
     py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index cd7a67233..819c8f152 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -5,6 +5,7 @@
 #include <vector>
 #include <future>
 #include <unordered_map>
+#include <functional>
 #include <pybind11/pybind11.h>
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
@@ -44,10 +45,10 @@ class CoroRPCCommunicator {
         Config config;
         bool is_server_started = false;
         
-        // 真实的 coro_rpc 组件
         std::unique_ptr<coro_rpc::coro_rpc_server> server_;
-        std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>> client_pool_;
-        std::unordered_map<std::string, coro_rpc::coro_rpc_client> clients_;
+        std::unordered_map<std::string, std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>>> client_pools_;
+        
+        std::function<void(const std::string&, const std::string&)> data_receive_callback;
         
         void handleDataTransfer(coro_rpc::context<void> context, std::string_view data);
         void handleTensorTransfer(coro_rpc::context<void> context);
@@ -76,6 +77,8 @@ class CoroRPCCommunicator {
     int receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms = -1);
     std::future<std::string> receiveDataAsync(const std::string& source_address, int timeout_ms = -1);
 
+    void setDataReceiveCallback(std::function<void(const std::string&, const std::string&)> callback);
+
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
 private:
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index d00697e0e..2c4102708 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -30,6 +30,14 @@ class CoroRPCInterface {
         
         pybind11::object rebuildTensor() const;
         
+        // Safe method to get data size without triggering string decoding
+        size_t getDataSize() const { return data.size(); }
+        
+        // Safe method to get data as bytes
+        pybind11::bytes getDataAsBytes() const { 
+            return pybind11::bytes(data); 
+        }
+        
     private:
         pybind11::object rebuildTensorInternal() const;
     };
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 46e68a8c5..24c406ea0 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -15,6 +15,12 @@ CoroRPCCommunicator::~CoroRPCCommunicator() {
     stopServer();
 }
 
+void CoroRPCCommunicator::setDataReceiveCallback(std::function<void(const std::string&, const std::string&)> callback) {
+    std::cout << "Setting data receive callback..." << std::endl;
+    impl_->data_receive_callback = callback;
+    std::cout << "Data receive callback set successfully" << std::endl;
+}
+
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
     
@@ -31,6 +37,7 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
                                         &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
     
+    std::cout << "Communicator initialized with client pool support" << std::endl;
     return true;
 }
 
@@ -81,41 +88,47 @@ void CoroRPCCommunicator::stopServer() {
 
 bool CoroRPCCommunicator::addRemoteConnection(const std::string& remote_address) {
     try {
-        if (!impl_->client_pool_) {
-            impl_->client_pool_ = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
+        // Check if client pool already exists for this address
+        auto it = impl_->client_pools_.find(remote_address);
+        if (it != impl_->client_pools_.end()) {
+            std::cout << "Client pool for " << remote_address << " already exists" << std::endl;
+            return true;
         }
         
-        auto& client = impl_->clients_[remote_address];
-        auto task = [&client, remote_address]() -> async_simple::coro::Lazy<bool> {
-            auto ec = co_await client.connect(remote_address);
-            co_return !ec;
-        };
+        // Create new client pool for this remote address
+        auto client_pool = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
         
-        bool connected = async_simple::coro::syncAwait(task());
-        if (connected) {
-            std::cout << "Successfully connected to " << remote_address << std::endl;
-        } else {
-            std::cout << "Failed to connect to " << remote_address << std::endl;
+        if (!client_pool) {
+            std::cerr << "Failed to create client pool for " << remote_address << std::endl;
+            return false;
         }
-        return connected;
+        
+        impl_->client_pools_[remote_address] = client_pool;
+        std::cout << "Successfully created client pool for " << remote_address 
+                  << " with pool size: " << impl_->config.pool_size << std::endl;
+        return true;
+        
     } catch (const std::exception& e) {
-        std::cerr << "Exception while connecting to " << remote_address << ": " << e.what() << std::endl;
+        std::cerr << "Exception while creating client pool for " << remote_address << ": " << e.what() << std::endl;
         return false;
     }
 }
 
 void CoroRPCCommunicator::removeRemoteConnection(const std::string& remote_address) {
-    auto it = impl_->clients_.find(remote_address);
-    if (it != impl_->clients_.end()) {
-        impl_->clients_.erase(it);
-        std::cout << "Removed connection to " << remote_address << std::endl;
+    auto it = impl_->client_pools_.find(remote_address);
+    if (it != impl_->client_pools_.end()) {
+        impl_->client_pools_.erase(it);
+        std::cout << "Removed client pool for " << remote_address << std::endl;
+    } else {
+        std::cout << "No client pool found for " << remote_address << std::endl;
     }
 }
 
 bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
-    auto it = impl_->clients_.find(remote_address);
-    if (it != impl_->clients_.end()) {
-        return it->second.has_closed() == false;
+    auto it = impl_->client_pools_.find(remote_address);
+    if (it != impl_->client_pools_.end()) {
+        // Client pool exists, assume it can provide connections
+        return true;
     }
     return false;
 }
@@ -124,27 +137,37 @@ int CoroRPCCommunicator::sendData(const std::string& target_address,
                                   const void* data,
                                   size_t data_size) {
     try {
-        if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+        // Ensure client pool exists for this target
+        if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
             if (!addRemoteConnection(target_address)) {
+                std::cerr << "Failed to create client pool for " << target_address << std::endl;
                 return -1;
             }
         }
         
-        auto& client = impl_->clients_[target_address];
+        auto& client_pool = impl_->client_pools_[target_address];
         
-        auto task = [&client, data, data_size]() -> async_simple::coro::Lazy<int> {
-            std::string_view data_view(static_cast<const char*>(data), data_size);
-            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
-            
-            if (result.has_value()) {
-                co_return 0; 
-            } else {
-                std::cerr << "RPC call failed: " << result.error().msg << std::endl;
-                co_return -1;
+        // Use promise/future to convert async operation to sync
+        auto promise = std::make_shared<std::promise<int>>();
+        auto future = promise->get_future();
+        
+        client_pool->send_request(
+            [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                
+                std::string_view data_view(static_cast<const char*>(data), data_size);
+                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+                
+                if (result.has_value()) {
+                    promise->set_value(0);
+                } else {
+                    std::cerr << "RPC call failed: " << result.error().msg << std::endl;
+                    promise->set_value(-1);
+                }
             }
-        };
+        ).start([](auto &&) {});
         
-        int result = async_simple::coro::syncAwait(task());
+        int result = future.get();
         
         if (result == 0) {
             std::cout << "Successfully sent " << data_size << " bytes to " << target_address << std::endl;
@@ -163,75 +186,80 @@ std::future<result> CoroRPCCommunicator::sendDataAsync(const std::string& target
     auto promise = std::make_shared<std::promise<result>>();
     auto future = promise->get_future();
     
-    if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+    // Ensure client pool exists for this target
+    if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
         if (!addRemoteConnection(target_address)) {
             result res;
             res.code = -1;
-            res.err_msg = "Failed to connect to " + target_address;
+            res.err_msg = "Failed to create client pool for " + target_address;
             promise->set_value(res);
             return future;
         }
     }
     
-    if (impl_->client_pool_) {
-        impl_->client_pool_->send_request(
-            [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
-                -> async_simple::coro::Lazy<void> {
-                
-                std::string_view data_view(static_cast<const char*>(data), data_size);
-                auto rpc_result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
-                
-                result res;
-                if (rpc_result.has_value()) {
-                    res.code = 0;
-                } else {
-                    res.code = rpc_result.error().val();
-                    res.err_msg = rpc_result.error().msg;
-                }
-                
-                promise->set_value(res);
-            }
-        ).start([](auto &&) {});
-    } else {
-        std::thread([this, target_address, data, data_size, promise]() {
+    auto& client_pool = impl_->client_pools_[target_address];
+    
+    client_pool->send_request(
+        [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
+            -> async_simple::coro::Lazy<void> {
+            
+            std::string_view data_view(static_cast<const char*>(data), data_size);
+            auto rpc_result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+            
             result res;
-            res.code = sendData(target_address, data, data_size);
+            if (rpc_result.has_value()) {
+                res.code = 0;
+            } else {
+                res.code = rpc_result.error().val();
+                res.err_msg = rpc_result.error().msg;
+            }
+            
             promise->set_value(res);
-        }).detach();
-    }
+        }
+    ).start([](auto &&) {});
     
     return future;
 }
 
 int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pybind11::object& tensor) {
     try {
-        if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+        // Ensure client pool exists for this target
+        if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
             if (!addRemoteConnection(target_address)) {
+                std::cerr << "Failed to create client pool for " << target_address << std::endl;
                 return -1;
             }
         }
         
-        auto& client = impl_->clients_[target_address];
+        auto& client_pool = impl_->client_pools_[target_address];
         
-        auto task = [&client, &tensor]() -> async_simple::coro::Lazy<int> {
-            uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
-            size_t numel = tensor.attr("numel")().cast<size_t>();
-            size_t element_size = tensor.attr("element_size")().cast<size_t>();
-            size_t tensor_size = numel * element_size;
-
-            client.set_req_attachment(std::string_view((char*)data_ptr, tensor_size));
-            
-            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
-            
-            if (result.has_value()) {
-                co_return 0; 
-            } else {
-                std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
-                co_return -1;
+        // Use promise/future to convert async operation to sync
+        auto promise = std::make_shared<std::promise<int>>();
+        auto future = promise->get_future();
+        
+        uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
+        size_t numel = tensor.attr("numel")().cast<size_t>();
+        size_t element_size = tensor.attr("element_size")().cast<size_t>();
+        size_t tensor_size = numel * element_size;
+        
+        client_pool->send_request(
+            [data_ptr, tensor_size, promise](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                
+                client.set_req_attachment(std::string_view((char*)data_ptr, tensor_size));
+                
+                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
+                
+                if (result.has_value()) {
+                    promise->set_value(0);
+                } else {
+                    std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
+                    promise->set_value(-1);
+                }
             }
-        };
+        ).start([](auto &&) {});
         
-        int result = async_simple::coro::syncAwait(task());
+        int result = future.get();
         
         if (result == 0) {
             std::cout << "Successfully sent tensor to " << target_address << std::endl;
@@ -248,33 +276,32 @@ std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_
     auto promise = std::make_shared<std::promise<int>>();
     auto future = promise->get_future();
     
-    if (impl_->clients_.find(target_address) == impl_->clients_.end()) {
+    // Ensure client pool exists for this target
+    if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
         if (!addRemoteConnection(target_address)) {
             promise->set_value(-1);
             return future;
         }
     }
     
-    if (impl_->client_pool_) {
-        impl_->client_pool_->send_request(
-            [tensor, promise](coro_rpc::coro_rpc_client &client) 
-                -> async_simple::coro::Lazy<void> {
-                
-                client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
-                
-                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
-                
-                if (result.has_value()) {
-                    promise->set_value(0);
-                } else {
-                    std::cerr << "Async tensor RPC call failed: " << result.error().msg << std::endl;
-                    promise->set_value(-1);
-                }
+    auto& client_pool = impl_->client_pools_[target_address];
+    
+    client_pool->send_request(
+        [tensor, promise](coro_rpc::coro_rpc_client &client) 
+            -> async_simple::coro::Lazy<void> {
+            
+            client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
+            
+            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
+            
+            if (result.has_value()) {
+                promise->set_value(0);
+            } else {
+                std::cerr << "Async tensor RPC call failed: " << result.error().msg << std::endl;
+                promise->set_value(-1);
             }
-        ).start([](auto &&) {});
-    } else {
-        std::cerr << "Client pool not available for async tensor send" << std::endl;
-    }
+        }
+    ).start([](auto &&) {});
     
     return future;
 }
@@ -296,6 +323,17 @@ std::future<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string
 
 void CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<void> context, std::string_view data) {
     std::cout << "Handling data transfer: " << data.size() << " bytes" << std::endl;
+    
+    // Call the data receive callback if set
+    if (data_receive_callback) {
+        std::cout << "Calling data receive callback..." << std::endl;
+        std::string source_address = "unknown"; // You may want to extract this from context
+        std::string data_str(data);
+        data_receive_callback(source_address, data_str);
+    } else {
+        std::cout << "No data receive callback set!" << std::endl;
+    }
+    
     context.response_msg();
 }
 
@@ -337,6 +375,7 @@ std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size, size_t t
     
     auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
+        std::cout << "Created communicator with default pool size: " << pool_size << std::endl;
         return communicator;
     }
     return nullptr;
@@ -346,9 +385,11 @@ std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_addr
     Config config;
     config.listen_address = listen_address;
     config.thread_count = thread_count;
+    config.pool_size = 10; // Default pool size for server-side client pools
     
     auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
+        std::cout << "Created server communicator with pool size: " << config.pool_size << std::endl;
         return communicator;
     }
     return nullptr;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 311c166ed..52919b827 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -73,9 +73,15 @@ size_t get_dtype_size(TensorDtype dtype) {
 // Helper function to create numpy array from data
 pybind11::object create_numpy_array_from_data(const char* data, TensorDtype dtype, 
                                                const std::vector<int64_t>& shape) {
+    std::cout << "DEBUG: create_numpy_array_from_data called" << std::endl;
+    std::cout << "DEBUG: dtype = " << static_cast<int>(dtype) << std::endl;
+    std::cout << "DEBUG: shape size = " << shape.size() << std::endl;
+    
     pybind11::gil_scoped_acquire acquire;
     
+    std::cout << "DEBUG: About to import numpy..." << std::endl;
     pybind11::module_ np = pybind11::module_::import("numpy");
+    std::cout << "DEBUG: Successfully imported numpy" << std::endl;
     
     std::string np_dtype;
     switch (dtype) {
@@ -92,17 +98,46 @@ pybind11::object create_numpy_array_from_data(const char* data, TensorDtype dtyp
             throw std::runtime_error("Unknown tensor dtype");
     }
     
+    std::cout << "DEBUG: np_dtype = " << np_dtype << std::endl;
+    
     size_t element_size = get_dtype_size(dtype);
     size_t total_elements = 1;
     for (int64_t dim : shape) {
         total_elements *= dim;
     }
     
+    std::cout << "DEBUG: element_size = " << element_size << std::endl;
+    std::cout << "DEBUG: total_elements = " << total_elements << std::endl;
+    
     // Create a copy of the data
+    std::cout << "DEBUG: Creating data copy..." << std::endl;
     std::vector<char> data_copy(data, data + total_elements * element_size);
+    std::cout << "DEBUG: Data copy created, size = " << data_copy.size() << std::endl;
     
-    return np.attr("frombuffer")(pybind11::bytes(data_copy.data(), data_copy.size()), 
-                                 pybind11::arg("dtype")=np_dtype).attr("reshape")(shape);
+    std::cout << "DEBUG: About to call frombuffer..." << std::endl;
+    
+    try {
+        pybind11::bytes bytes_obj(data_copy.data(), data_copy.size());
+        std::cout << "DEBUG: Created bytes object" << std::endl;
+        
+        pybind11::object array = np.attr("frombuffer")(bytes_obj, pybind11::arg("dtype")=np_dtype);
+        std::cout << "DEBUG: Created array from buffer successfully" << std::endl;
+        
+        // Convert shape to tuple manually
+        pybind11::tuple shape_tuple = pybind11::tuple(shape.size());
+        for (size_t i = 0; i < shape.size(); ++i) {
+            shape_tuple[i] = shape[i];
+        }
+        std::cout << "DEBUG: About to create shape tuple for reshape" << std::endl;
+        
+        pybind11::object result = array.attr("reshape")(shape_tuple);
+        std::cout << "DEBUG: Reshaped array successfully" << std::endl;
+        
+        return result;
+    } catch (const std::exception& e) {
+        std::cout << "DEBUG: Exception in numpy operations: " << e.what() << std::endl;
+        throw;
+    }
 }
 
 // Constructor
@@ -364,22 +399,46 @@ pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_add
 void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->data_receive_callback = callback;
+    
+    if (impl_->communicator) {
+        auto interface_ptr = this;
+        impl_->communicator->setDataReceiveCallback(
+            [interface_ptr](const std::string& source, const std::string& data) {
+                interface_ptr->handleIncomingData(source, data);
+            }
+        );
+    }
 }
 
 void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->tensor_receive_callback = callback;
+    
+    if (impl_->communicator) {
+        auto interface_ptr = this;
+        impl_->communicator->setDataReceiveCallback(
+            [interface_ptr](const std::string& source, const std::string& data) {
+                interface_ptr->handleIncomingData(source, data);
+            }
+        );
+    }
 }
 
 void CoroRPCInterface::handleIncomingData(const std::string& source, const std::string& data) {
+    std::cout << "CoroRPCInterface::handleIncomingData called with " << data.size() << " bytes" << std::endl;
+    
     // Check if this is tensor data by looking for metadata signature
     if (data.size() >= sizeof(TensorMetadata)) {
         const TensorMetadata* metadata = reinterpret_cast<const TensorMetadata*>(data.data());
         
+        std::cout << "Checking tensor metadata: dtype=" << metadata->dtype << ", ndim=" << metadata->ndim << std::endl;
+        
         // Basic validation: check if dtype is in valid range
-        if (metadata->dtype >= 0 && metadata->dtype < static_cast<int32_t>(TensorDtype::UNKNOWN) && 
+        if (metadata->dtype > 0 && metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) && 
             metadata->ndim >= 0 && metadata->ndim <= 4) {
             
+            std::cout << "Data recognized as tensor, calling handleIncomingTensor" << std::endl;
+            
             // This looks like tensor data, handle it as such
             std::vector<size_t> shape;
             for (int i = 0; i < metadata->ndim; i++) {
@@ -428,7 +487,18 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source,
                                             const std::string& data,
                                             const std::vector<size_t>& shape, 
                                             const std::string& dtype) {
-    if (!impl_->tensor_receive_callback) return;
+    std::cout << "CoroRPCInterface::handleIncomingTensor called" << std::endl;
+    std::cout << "  source: " << source << std::endl;
+    std::cout << "  data size: " << data.size() << std::endl;
+    std::cout << "  dtype: " << dtype << std::endl;
+    std::cout << "  shape size: " << shape.size() << std::endl;
+    
+    if (!impl_->tensor_receive_callback) {
+        std::cout << "No tensor receive callback set!" << std::endl;
+        return;
+    }
+    
+    std::cout << "Calling Python tensor receive callback..." << std::endl;
     
     try {
         pybind11::gil_scoped_acquire acquire;
@@ -451,6 +521,10 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
 }
 
 pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const {
+    std::cout << "DEBUG: Starting rebuildTensorInternal" << std::endl;
+    std::cout << "DEBUG: Data size: " << data.size() << " bytes" << std::endl;
+    std::cout << "DEBUG: TensorMetadata size: " << sizeof(TensorMetadata) << " bytes" << std::endl;
+    
     if (data.size() < sizeof(TensorMetadata)) {
         throw std::runtime_error("Data too small to contain tensor metadata");
     }
@@ -459,6 +533,8 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const
     TensorMetadata metadata;
     std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
     
+    std::cout << "DEBUG: Extracted metadata - dtype: " << metadata.dtype << ", ndim: " << metadata.ndim << std::endl;
+    
     // Validate metadata
     if (metadata.ndim < 0 || metadata.ndim > 4) {
         throw std::runtime_error("Invalid tensor dimensions");
@@ -470,28 +546,59 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const
         throw std::runtime_error("Unsupported tensor dtype");
     }
     
+    std::cout << "DEBUG: Element size: " << element_size << " bytes" << std::endl;
+    
     // Extract shape
     std::vector<int64_t> tensor_shape;
     size_t total_elements = 1;
     for (int i = 0; i < metadata.ndim; i++) {
         tensor_shape.push_back(metadata.shape[i]);
         total_elements *= metadata.shape[i];
+        std::cout << "DEBUG: Shape[" << i << "] = " << metadata.shape[i] << std::endl;
     }
     
+    std::cout << "DEBUG: Total elements: " << total_elements << std::endl;
+    
     // Validate data size
     size_t expected_data_size = total_elements * element_size;
     size_t actual_data_size = data.size() - sizeof(TensorMetadata);
+    
+    std::cout << "DEBUG: Expected data size: " << expected_data_size << " bytes" << std::endl;
+    std::cout << "DEBUG: Actual data size: " << actual_data_size << " bytes" << std::endl;
+    
     if (actual_data_size != expected_data_size) {
         throw std::runtime_error("Data size mismatch with tensor metadata");
     }
     
     // Create numpy array from raw data
     const char* tensor_data = data.data() + sizeof(TensorMetadata);
-    pybind11::object numpy_array = create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
+    std::cout << "DEBUG: About to create numpy array..." << std::endl;
+    std::cout << "DEBUG: Data pointer: " << static_cast<const void*>(tensor_data) << std::endl;
+    std::cout << "DEBUG: Base data pointer: " << static_cast<const void*>(data.data()) << std::endl;
+    std::cout << "DEBUG: Offset: " << sizeof(TensorMetadata) << std::endl;
     
-    // Convert to PyTorch tensor
-    pybind11::module_ torch = pybind11::module_::import("torch");
-    return torch.attr("from_numpy")(numpy_array);
+    // Check first few bytes of tensor data
+    std::cout << "DEBUG: First few bytes of tensor data: ";
+    for (int i = 0; i < std::min(16, static_cast<int>(actual_data_size)); ++i) {
+        std::cout << std::hex << (unsigned char)tensor_data[i] << " ";
+    }
+    std::cout << std::dec << std::endl;
+    
+    try {
+        pybind11::object numpy_array = create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
+        std::cout << "DEBUG: Successfully created numpy array" << std::endl;
+        
+        // Convert to PyTorch tensor
+        std::cout << "DEBUG: About to convert to PyTorch tensor..." << std::endl;
+        pybind11::module_ torch = pybind11::module_::import("torch");
+        pybind11::object result = torch.attr("from_numpy")(numpy_array);
+        std::cout << "DEBUG: Successfully created PyTorch tensor" << std::endl;
+        
+        return result;
+    } catch (const std::exception& e) {
+        std::cout << "DEBUG: Error in tensor creation: " << e.what() << std::endl;
+        throw;
+    }
 }
 
 // Factory functions for creating RPC client and server
diff --git a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
new file mode 100644
index 000000000..fab27b2da
--- /dev/null
+++ b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
@@ -0,0 +1,632 @@
+#!/usr/bin/env python3
+"""
+CoroRPC Performance Testing Suite
+Tests bandwidth performance for data and tensor interfaces
+"""
+
+import torch
+import numpy as np
+import time
+import sys
+import threading
+from typing import List, Tuple, Dict, Any
+
+try:
+    import mooncake.engine as engine
+    print("Successfully imported mooncake.engine")
+    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    print("Successfully imported CoroRPCInterface")
+except ImportError as e:
+    print(f"Failed to import mooncake: {e}")
+    sys.exit(1)
+except AttributeError as e:
+    print(f"Failed to import CoroRPCInterface: {e}")
+    sys.exit(1)
+
+
+class PerformanceTestResults:
+    """Container for performance test results"""
+    
+    def __init__(self):
+        self.data_results: List[Dict[str, Any]] = []
+        self.tensor_results: List[Dict[str, Any]] = []
+        
+    def add_data_result(self, size_mb: float, time_ms: float, bandwidth_mbps: float):
+        self.data_results.append({
+            'size_mb': size_mb,
+            'time_ms': time_ms,
+            'bandwidth_mbps': bandwidth_mbps
+        })
+        
+    def add_tensor_result(self, tensor_type: str, shape: tuple, size_mb: float, 
+                         time_ms: float, bandwidth_mbps: float):
+        self.tensor_results.append({
+            'tensor_type': tensor_type,
+            'shape': shape,
+            'size_mb': size_mb,
+            'time_ms': time_ms,
+            'bandwidth_mbps': bandwidth_mbps
+        })
+        
+    def print_summary(self):
+        print("\n" + "="*60)
+        print("PERFORMANCE TEST RESULTS SUMMARY")
+        print("="*60)
+        
+        if self.data_results:
+            print("\nDATA INTERFACE PERFORMANCE:")
+            print(f"{'Size (MB)':<12} {'Time (ms)':<12} {'Bandwidth (MB/s)':<16}")
+            print("-" * 40)
+            for result in self.data_results:
+                print(f"{result['size_mb']:<12.2f} {result['time_ms']:<12.2f} {result['bandwidth_mbps']:<16.2f}")
+                
+        if self.tensor_results:
+            print("\nTENSOR INTERFACE PERFORMANCE:")
+            print(f"{'Type':<12} {'Shape':<20} {'Size (MB)':<12} {'Time (ms)':<12} {'Bandwidth (MB/s)':<16}")
+            print("-" * 80)
+            for result in self.tensor_results:
+                shape_str = str(result['shape'])[:18]
+                print(f"{result['tensor_type']:<12} {shape_str:<20} {result['size_mb']:<12.2f} "
+                      f"{result['time_ms']:<12.2f} {result['bandwidth_mbps']:<16.2f}")
+
+
+class CoroRPCPerformanceTester:
+    """Main performance testing class"""
+    
+    def __init__(self):
+        self.server = None
+        self.client = None
+        self.server_addr = "127.0.0.1:8889"  # Use different port to avoid conflicts
+        self.results = PerformanceTestResults()
+        
+        # Callback tracking
+        self.data_received_count = 0
+        self.tensor_received_count = 0
+        self.data_receive_times = []
+        self.tensor_receive_times = []
+        self.receive_lock = threading.Lock()
+        
+    def setup(self) -> bool:
+        """Initialize server and client"""
+        print("Setting up CoroRPC performance test environment...")
+        
+        try:
+            # Create server and client instances
+            self.server = CoroRPCInterface()
+            self.client = CoroRPCInterface()
+            
+            # Initialize server
+            if not self.server.initialize(self.server_addr, 1, 30, 4):
+                print("ERROR: Failed to initialize server")
+                return False
+                
+            # Initialize client
+            if not self.client.initialize("", 0, 30, 4):
+                print("ERROR: Failed to initialize client")
+                return False
+                
+            # Set up callbacks
+            self.server.set_data_receive_callback(self._data_receive_callback)
+            self.server.set_tensor_receive_callback(self._tensor_receive_callback)
+            
+            # Start server
+            if not self.server.start_server_async():
+                print("ERROR: Failed to start server")
+                return False
+                
+            print(f"Server started on {self.server_addr}")
+            time.sleep(1)  # Wait for server startup
+            
+            # Connect client to server
+            if not self.client.add_remote_connection(self.server_addr):
+                print("ERROR: Failed to connect client to server")
+                return False
+                
+            print("Client connected to server")
+            time.sleep(0.5)  # Wait for connection establishment
+            
+            return True
+            
+        except Exception as e:
+            print(f"ERROR: Setup failed with exception: {e}")
+            return False
+    
+    def teardown(self):
+        """Clean up resources"""
+        try:
+            if self.server:
+                self.server.stop_server()
+        except:
+            pass
+            
+    def _data_receive_callback(self, received_data):
+        """Callback for data reception"""
+        with self.receive_lock:
+            self.data_received_count += 1
+            self.data_receive_times.append(time.time())
+            source_address = received_data.get("source", "unknown")
+            data = received_data.get("data", b"")
+            print(f"Data callback #{self.data_received_count}: received {len(data)} bytes from {source_address}")
+    
+    def _tensor_receive_callback(self, received_tensor):
+        """Callback for tensor reception"""
+        with self.receive_lock:
+            self.tensor_received_count += 1
+            self.tensor_receive_times.append(time.time())
+            print(f"Tensor callback #{self.tensor_received_count}: received tensor from {received_tensor.source_address}")
+    
+    def test_data_interface_simple(self) -> bool:
+        """Simple test for data interface to verify correctness"""
+        print("\n--- Testing Data Interface (Simple) ---")
+        
+        # Test with small data size first
+        test_data = b"Hello, CoroRPC Performance Test!"
+        data_size_mb = len(test_data) / (1024 * 1024)
+        
+        print(f"Sending {len(test_data)} bytes ({data_size_mb:.6f} MB)")
+        
+        # Reset counters
+        with self.receive_lock:
+            self.data_received_count = 0
+            self.data_receive_times.clear()
+        
+        # Send data and measure time
+        start_time = time.time()
+        result = self.client.send_data(self.server_addr, test_data)
+        send_time = time.time()
+        
+        if result < 0:
+            print(f"ERROR: Failed to send data, result: {result}")
+            return False
+            
+        print(f"Data sent successfully in {(send_time - start_time)*1000:.2f} ms")
+        
+        # Wait for reception
+        max_wait_time = 5.0  # 5 seconds timeout
+        wait_start = time.time()
+        
+        while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+            time.sleep(0.1)
+            
+        if self.data_received_count == 0:
+            print("ERROR: No data received within timeout")
+            return False
+            
+        print(f"SUCCESS: Data interface test passed - sent and received {len(test_data)} bytes")
+        return True
+    
+    def test_tensor_interface_simple(self) -> bool:
+        """Simple test for tensor interface to verify correctness"""
+        print("\n--- Testing Tensor Interface (Simple) ---")
+        
+        # Create a small test tensor
+        test_tensor = torch.randn(10, 10, dtype=torch.float32)
+        tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
+        
+        print(f"Sending tensor {test_tensor.shape} ({tensor_size_mb:.6f} MB)")
+        
+        # Reset counters
+        with self.receive_lock:
+            self.tensor_received_count = 0
+            self.tensor_receive_times.clear()
+        
+        # Send tensor and measure time
+        start_time = time.time()
+        result = self.client.send_tensor(self.server_addr, test_tensor)
+        send_time = time.time()
+        
+        if result < 0:
+            print(f"ERROR: Failed to send tensor, result: {result}")
+            return False
+            
+        print(f"Tensor sent successfully in {(send_time - start_time)*1000:.2f} ms")
+        
+        # Wait for reception
+        max_wait_time = 5.0  # 5 seconds timeout
+        wait_start = time.time()
+        
+        while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+            time.sleep(0.1)
+            
+        if self.tensor_received_count == 0:
+            print("ERROR: No tensor received within timeout")
+            return False
+            
+        print(f"SUCCESS: Tensor interface test passed - sent and received tensor {test_tensor.shape}")
+        return True
+    
+    def test_data_bandwidth_performance(self, sizes_mb: List[float]) -> bool:
+        """Test data interface bandwidth performance with various sizes"""
+        print("\n--- Testing Data Interface Bandwidth Performance ---")
+        
+        for size_mb in sizes_mb:
+            print(f"\nTesting data size: {size_mb} MB")
+            
+            # Create test data
+            data_size_bytes = int(size_mb * 1024 * 1024)
+            test_data = bytes(range(256)) * (data_size_bytes // 256 + 1)
+            test_data = test_data[:data_size_bytes]
+            
+            # Reset counters before each test
+            with self.receive_lock:
+                self.data_received_count = 0
+                self.data_receive_times.clear()
+            
+            # Measure send time
+            start_time = time.time()
+            result = self.client.send_data(self.server_addr, test_data)
+            end_time = time.time()
+            
+            if result < 0:
+                print(f"ERROR: Failed to send {size_mb} MB data")
+                continue
+                
+            elapsed_ms = (end_time - start_time) * 1000
+            bandwidth_mbps = size_mb / (elapsed_ms / 1000) if elapsed_ms > 0 else 0
+            
+            print(f"  Size: {size_mb:.2f} MB")
+            print(f"  Time: {elapsed_ms:.2f} ms")
+            print(f"  Bandwidth: {bandwidth_mbps:.2f} MB/s")
+            
+            self.results.add_data_result(size_mb, elapsed_ms, bandwidth_mbps)
+            
+            # Wait for reception with timeout
+            max_wait_time = 2.0
+            wait_start = time.time()
+            while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+                time.sleep(0.1)
+            
+            if self.data_received_count > 0:
+                print(f"  Reception confirmed: callback received")
+            else:
+                print(f"  WARNING: No reception callback within {max_wait_time}s timeout")
+            
+            # Wait between tests
+            time.sleep(0.2)
+            
+        return True
+    
+    def test_tensor_bandwidth_performance(self, tensor_configs: List[Tuple[str, tuple, torch.dtype]]) -> bool:
+        """Test tensor interface bandwidth performance with various tensor types"""
+        print("\n--- Testing Tensor Interface Bandwidth Performance ---")
+        
+        for tensor_name, shape, dtype in tensor_configs:
+            print(f"\nTesting tensor: {tensor_name} {shape}")
+            
+            # Create test tensor
+            if dtype == torch.bool:
+                test_tensor = torch.randint(0, 2, shape, dtype=dtype).bool()
+            elif dtype in [torch.int32, torch.int64]:
+                test_tensor = torch.randint(-100, 100, shape, dtype=dtype)
+            else:
+                test_tensor = torch.randn(shape, dtype=dtype)
+            
+            tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
+            
+            # Reset counters before each test
+            with self.receive_lock:
+                self.tensor_received_count = 0
+                self.tensor_receive_times.clear()
+            
+            # Measure send time
+            start_time = time.time()
+            result = self.client.send_tensor(self.server_addr, test_tensor)
+            end_time = time.time()
+            
+            if result < 0:
+                print(f"ERROR: Failed to send tensor {tensor_name}")
+                continue
+                
+            elapsed_ms = (end_time - start_time) * 1000
+            bandwidth_mbps = tensor_size_mb / (elapsed_ms / 1000) if elapsed_ms > 0 else 0
+            
+            print(f"  Type: {tensor_name}")
+            print(f"  Shape: {shape}")
+            print(f"  Size: {tensor_size_mb:.2f} MB")
+            print(f"  Time: {elapsed_ms:.2f} ms")
+            print(f"  Bandwidth: {bandwidth_mbps:.2f} MB/s")
+            
+            self.results.add_tensor_result(tensor_name, shape, tensor_size_mb, elapsed_ms, bandwidth_mbps)
+            
+            # Wait for reception with timeout
+            max_wait_time = 2.0
+            wait_start = time.time()
+            while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+                time.sleep(0.1)
+            
+            if self.tensor_received_count > 0:
+                print(f"  Reception confirmed: callback received")
+            else:
+                print(f"  WARNING: No reception callback within {max_wait_time}s timeout")
+            
+            # Wait between tests
+            time.sleep(0.2)
+            
+        return True
+    
+    def test_data_bandwidth_performance_large_scale(self, sizes_mb: List[float]) -> bool:
+        """Test data interface bandwidth performance with large data sizes (optimized for GB scale)"""
+        print("\n--- Testing Data Interface Bandwidth Performance (Large Scale) ---")
+        
+        for size_mb in sizes_mb:
+            print(f"\nTesting large data size: {size_mb} MB ({size_mb/1024:.2f} GB)")
+            
+            # Create test data efficiently for large sizes
+            data_size_bytes = int(size_mb * 1024 * 1024)
+            print(f"  Allocating {data_size_bytes} bytes ({data_size_bytes/(1024*1024*1024):.2f} GB)...")
+            
+            try:
+                # Use more efficient data generation for large sizes
+                # Create a pattern and repeat it to avoid memory issues
+                pattern_size = min(1024 * 1024, data_size_bytes)  # 1MB pattern max
+                pattern = bytes(range(256)) * (pattern_size // 256 + 1)
+                pattern = pattern[:pattern_size]
+                
+                # For very large data, we create it in chunks
+                if data_size_bytes > 100 * 1024 * 1024:  # If > 100MB
+                    # Create data as repeated pattern
+                    repeat_count = data_size_bytes // len(pattern)
+                    remainder = data_size_bytes % len(pattern)
+                    test_data = pattern * repeat_count + pattern[:remainder]
+                else:
+                    test_data = bytes(range(256)) * (data_size_bytes // 256 + 1)
+                    test_data = test_data[:data_size_bytes]
+                
+                print(f"  Data allocated successfully: {len(test_data)} bytes")
+                
+            except MemoryError:
+                print(f"  ERROR: Not enough memory to allocate {size_mb} MB")
+                continue
+            except Exception as e:
+                print(f"  ERROR: Failed to create test data: {e}")
+                continue
+            
+            # Reset counters before each test
+            with self.receive_lock:
+                self.data_received_count = 0
+                self.data_receive_times.clear()
+            
+            # Measure send time
+            print(f"  Starting transmission...")
+            start_time = time.time()
+            result = self.client.send_data(self.server_addr, test_data)
+            end_time = time.time()
+            
+            if result < 0:
+                print(f"  ERROR: Failed to send {size_mb} MB data")
+                continue
+                
+            elapsed_ms = (end_time - start_time) * 1000
+            elapsed_seconds = elapsed_ms / 1000
+            bandwidth_mbps = size_mb / elapsed_seconds if elapsed_seconds > 0 else 0
+            bandwidth_gbps = bandwidth_mbps / 1024
+            
+            print(f"  Size: {size_mb:.1f} MB ({size_mb/1024:.2f} GB)")
+            print(f"  Time: {elapsed_ms:.1f} ms ({elapsed_seconds:.2f} seconds)")
+            print(f"  Bandwidth: {bandwidth_mbps:.1f} MB/s ({bandwidth_gbps:.3f} GB/s)")
+            
+            self.results.add_data_result(size_mb, elapsed_ms, bandwidth_mbps)
+            
+            # Wait for reception with longer timeout for large data
+            max_wait_time = max(10.0, size_mb / 100)  # At least 10s, or 1s per 100MB
+            print(f"  Waiting for reception confirmation (timeout: {max_wait_time:.1f}s)...")
+            wait_start = time.time()
+            while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+                time.sleep(0.5)  # Check less frequently for large transfers
+            
+            if self.data_received_count > 0:
+                reception_time = self.data_receive_times[0] - start_time
+                print(f"  Reception confirmed: callback received after {reception_time:.2f}s")
+            else:
+                print(f"  WARNING: No reception callback within {max_wait_time:.1f}s timeout")
+            
+            # Clean up large data object
+            del test_data
+            
+            # Wait between tests (longer for large data)
+            time.sleep(1.0)
+            
+        return True
+    
+    def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tuple[str, tuple, torch.dtype]]) -> bool:
+        """Test tensor interface bandwidth performance with large tensors (optimized for GB scale)"""
+        print("\n--- Testing Tensor Interface Bandwidth Performance (Large Scale) ---")
+        
+        for tensor_name, shape, dtype in tensor_configs:
+            print(f"\nTesting large tensor: {tensor_name} {shape}")
+            
+            # Calculate expected size
+            numel = 1
+            for dim in shape:
+                numel *= dim
+            
+            element_size = torch.tensor([], dtype=dtype).element_size()
+            expected_size_mb = numel * element_size / (1024 * 1024)
+            expected_size_gb = expected_size_mb / 1024
+            
+            print(f"  Expected size: {expected_size_mb:.1f} MB ({expected_size_gb:.2f} GB)")
+            print(f"  Creating tensor...")
+            
+            try:
+                # Create test tensor with memory monitoring
+                if dtype == torch.bool:
+                    test_tensor = torch.randint(0, 2, shape, dtype=dtype).bool()
+                elif dtype in [torch.int32, torch.int64]:
+                    test_tensor = torch.randint(-100, 100, shape, dtype=dtype)
+                else:
+                    test_tensor = torch.randn(shape, dtype=dtype)
+                
+                actual_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
+                print(f"  Tensor created successfully: {actual_size_mb:.1f} MB")
+                
+            except RuntimeError as e:
+                if "out of memory" in str(e).lower():
+                    print(f"  ERROR: Out of memory creating tensor: {e}")
+                    continue
+                else:
+                    print(f"  ERROR: Failed to create tensor: {e}")
+                    continue
+            except Exception as e:
+                print(f"  ERROR: Failed to create tensor: {e}")
+                continue
+            
+            tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
+            
+            # Reset counters before each test
+            with self.receive_lock:
+                self.tensor_received_count = 0
+                self.tensor_receive_times.clear()
+            
+            # Measure send time
+            print(f"  Starting tensor transmission...")
+            start_time = time.time()
+            result = self.client.send_tensor(self.server_addr, test_tensor)
+            end_time = time.time()
+            
+            if result < 0:
+                print(f"  ERROR: Failed to send tensor {tensor_name}")
+                continue
+                
+            elapsed_ms = (end_time - start_time) * 1000
+            elapsed_seconds = elapsed_ms / 1000
+            bandwidth_mbps = tensor_size_mb / elapsed_seconds if elapsed_seconds > 0 else 0
+            bandwidth_gbps = bandwidth_mbps / 1024
+            
+            print(f"  Type: {tensor_name}")
+            print(f"  Shape: {shape}")
+            print(f"  Size: {tensor_size_mb:.1f} MB ({tensor_size_mb/1024:.2f} GB)")
+            print(f"  Time: {elapsed_ms:.1f} ms ({elapsed_seconds:.2f} seconds)")
+            print(f"  Bandwidth: {bandwidth_mbps:.1f} MB/s ({bandwidth_gbps:.3f} GB/s)")
+            
+            self.results.add_tensor_result(tensor_name, shape, tensor_size_mb, elapsed_ms, bandwidth_mbps)
+            
+            # Wait for reception with longer timeout for large tensors
+            max_wait_time = max(10.0, tensor_size_mb / 100)  # At least 10s, or 1s per 100MB
+            print(f"  Waiting for reception confirmation (timeout: {max_wait_time:.1f}s)...")
+            wait_start = time.time()
+            while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
+                time.sleep(0.5)  # Check less frequently for large transfers
+            
+            if self.tensor_received_count > 0:
+                reception_time = self.tensor_receive_times[0] - start_time
+                print(f"  Reception confirmed: callback received after {reception_time:.2f}s")
+            else:
+                print(f"  WARNING: No reception callback within {max_wait_time:.1f}s timeout")
+            
+            # Clean up large tensor
+            del test_tensor
+            
+            # Wait between tests (longer for large tensors)
+            time.sleep(1.0)
+            
+        return True
+
+
+def main():
+    """Main test function"""
+    print("CoroRPC Performance Testing Suite")
+    print("="*50)
+    
+    tester = CoroRPCPerformanceTester()
+    
+    try:
+        # Setup
+        if not tester.setup():
+            print("FAILED: Setup failed")
+            return False
+            
+        # Run simple correctness tests first
+        print("\nPhase 1: Correctness Verification")
+        print("-" * 40)
+        
+        if not tester.test_data_interface_simple():
+            print("FAILED: Data interface simple test failed")
+            return False
+            
+        if not tester.test_tensor_interface_simple():
+            print("FAILED: Tensor interface simple test failed")
+            return False
+            
+        print("SUCCESS: All correctness tests passed!")
+        
+        # Run basic performance tests (small sizes for verification)
+        print("\nPhase 2: Basic Performance Testing")
+        print("-" * 40)
+        
+        # Test small data sizes first
+        small_data_sizes = [0.001, 0.01, 0.1]  # 1KB, 10KB, 100KB
+        if not tester.test_data_bandwidth_performance(small_data_sizes):
+            print("FAILED: Data bandwidth performance test failed")
+            return False
+            
+        # Test small tensors
+        small_tensor_configs = [
+            ("Float32_Small", (100, 100), torch.float32),
+            ("Int64_Small", (50, 50), torch.int64),
+            ("Bool_Small", (200, 200), torch.bool),
+        ]
+        if not tester.test_tensor_bandwidth_performance(small_tensor_configs):
+            print("FAILED: Tensor bandwidth performance test failed")
+            return False
+        
+        # Additional test with medium sizes for better performance insights
+        print("\nPhase 3: Medium-scale Performance Testing")
+        print("-" * 40)
+        
+        # Test medium data sizes
+        medium_data_sizes = [1.0, 5.0, 10.0]  # 1MB, 5MB, 10MB
+        if not tester.test_data_bandwidth_performance(medium_data_sizes):
+            print("FAILED: Medium data bandwidth performance test failed")
+            return False
+            
+        # Test medium tensors
+        medium_tensor_configs = [
+            ("Float32_Medium", (500, 500), torch.float32),      # ~1MB
+            ("Int64_Medium", (1024, 256), torch.int64),         # ~2MB  
+            ("Float64_Medium", (512, 512), torch.float64),      # ~2MB
+        ]
+        if not tester.test_tensor_bandwidth_performance(medium_tensor_configs):
+            print("FAILED: Medium tensor bandwidth performance test failed")
+            return False
+        
+        # Optional large-scale performance testing (1GB scale)
+        print("\nPhase 4: Large-scale Performance Testing (1GB)")
+        print("-" * 40)
+        print("WARNING: This phase will test ~1GB data transfers and may take several minutes")
+        
+        # Test large data sizes (around 1GB)
+        large_data_sizes = [100.0, 500.0, 1000.0]  # 100MB, 500MB, 1GB
+        if not tester.test_data_bandwidth_performance_large_scale(large_data_sizes):
+            print("FAILED: Large data bandwidth performance test failed")
+            return False
+            
+        # Test large tensors (around 1GB)
+        large_tensor_configs = [
+            ("Float32_Large", (8192, 8192), torch.float32),     # ~256MB
+            ("Float32_XLarge", (16384, 8192), torch.float32),   # ~512MB  
+            ("Float32_XXLarge", (16384, 16384), torch.float32), # ~1GB
+        ]
+        if not tester.test_tensor_bandwidth_performance_large_scale(large_tensor_configs):
+            print("FAILED: Large tensor bandwidth performance test failed")
+            return False
+        
+        # Print results
+        tester.results.print_summary()
+        
+        print("\nSUCCESS: All performance tests completed!")
+        return True
+        
+    except Exception as e:
+        print(f"ERROR: Test failed with exception: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+        
+    finally:
+        tester.teardown()
+
+
+if __name__ == "__main__":
+    success = main()
+    print(f"\nFinal result: {'SUCCESS' if success else 'FAILURE'}")
+    sys.exit(0 if success else 1)
diff --git a/test_enhanced_tensor_rebuilding.py b/test_enhanced_tensor_rebuilding.py
deleted file mode 100644
index 64ab7977c..000000000
--- a/test_enhanced_tensor_rebuilding.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test enhanced CoroRPC tensor rebuilding functionality
-"""
-
-import torch
-import numpy as np
-import asyncio
-import threading
-import time
-import sys
-
-try:
-    import mooncake.engine as engine
-    print("Successfully imported mooncake.engine")
-    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
-    print("Successfully imported CoroRPCInterface")
-except ImportError as e:
-    print(f"Failed to import mooncake: {e}")
-    sys.exit(1)
-except AttributeError as e:
-    print(f"Failed to import CoroRPCInterface: {e}")
-    sys.exit(1)
-
-
-def test_enhanced_tensor_rebuilding():
-    print("\n=== Testing Enhanced Tensor Rebuilding ===")
-
-    # Create server and client instances
-    server = CoroRPCInterface()
-    client = CoroRPCInterface()
-
-    # Store received tensors
-    received_tensors = []
-
-    def tensor_receive_callback(received_tensor):
-        print(f"Received tensor from: {received_tensor.source_address}")
-        print(f"Original data size: {len(received_tensor.data)} bytes")
-        print(f"Shape info: {received_tensor.shape}")
-        print(f"Dtype info: {received_tensor.dtype}")
-
-        try:
-            # Use enhanced rebuild functionality to reconstruct tensor
-            rebuilt_tensor = received_tensor.rebuild_tensor()
-            received_tensors.append(rebuilt_tensor)
-
-            print("Successfully rebuilt tensor:")
-            print(f"  - Shape: {rebuilt_tensor.shape}")
-            print(f"  - Dtype: {rebuilt_tensor.dtype}")
-            print(f"  - Device: {rebuilt_tensor.device}")
-            print(f"  - Data sample: {rebuilt_tensor.flatten()[:5]}")
-
-        except Exception as e:
-            print(f"Failed to rebuild tensor: {e}")
-            import traceback
-            traceback.print_exc()
-
-    try:
-        # Initialize server and client
-        server_addr = "127.0.0.1:8888"
-        if not server.initialize(server_addr, 1, 30, 4):
-            print("Server initialization failed")
-            return False
-
-        if not client.initialize("", 0, 30, 4):
-            print("Client initialization failed")
-            return False
-
-        # Set tensor receive callback
-        server.set_tensor_receive_callback(tensor_receive_callback)
-
-        # Start server asynchronously
-        if not server.start_server_async():
-            print("Failed to start server")
-            return False
-
-        print(f"Server started on {server_addr}")
-        time.sleep(1)  # Wait for server to start
-
-        # Connect client to server
-        if not client.add_remote_connection(server_addr):
-            print("Failed to connect to server")
-            return False
-
-        print("Client connected to server")
-        time.sleep(0.5)  # Wait for connection establishment
-
-        # Define test cases with various tensor types
-        test_cases = [
-            ("Float32 2D", torch.randn(3, 4, dtype=torch.float32)),
-            ("Int64 1D", torch.arange(10, dtype=torch.int64)),
-            ("Float64 3D", torch.ones(2, 3, 4, dtype=torch.float64)),
-            ("Int32 Vector", torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)),
-            ("Bool Matrix", torch.tensor([[True, False], [False, True]], dtype=torch.bool)),
-        ]
-
-        for test_name, original_tensor in test_cases:
-            print(f"\n--- Testing {test_name} ---")
-            print("Original tensor:")
-            print(f"  - Shape: {original_tensor.shape}")
-            print(f"  - Dtype: {original_tensor.dtype}")
-            print(f"  - Data sample: {original_tensor.flatten()[:5]}")
-
-            # Send tensor
-            result = client.send_tensor(server_addr, original_tensor)
-            print(f"Send result: {result}")
-
-            if result < 0:
-                print(f"Failed to send {test_name}")
-                continue
-
-            # Wait for reception and processing
-            time.sleep(1)
-
-            if len(received_tensors) == 0:
-                print(f"No tensor received for {test_name}")
-                continue
-
-            # Validate the rebuilt tensor
-            rebuilt_tensor = received_tensors[-1]
-
-            # Check shape
-            if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
-                print(f"Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
-                continue
-
-            # Check data type
-            if rebuilt_tensor.dtype != original_tensor.dtype:
-                print(f"Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
-                continue
-
-            # Check data content (move to CPU for comparison)
-            try:
-                if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
-                    print(f"{test_name} passed - data integrity verified")
-                else:
-                    print(f"{test_name} failed - data content mismatch")
-                    print(f"  Original: {original_tensor.flatten()[:5]}")
-                    print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
-            except Exception as e:
-                print(f"{test_name} failed - comparison error: {e}")
-
-        print(f"\nEnhanced tensor rebuilding test completed")
-        print(f"Total tensors processed: {len(received_tensors)}")
-
-        return len(received_tensors) == len(test_cases)
-
-    except Exception as e:
-        print(f"Test failed with exception: {e}")
-        import traceback
-        traceback.pri
\ No newline at end of file

From f5deb6c2f65cf395e880a6f7dc1f0f401e196f52 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 2 Sep 2025 19:58:29 +0800
Subject: [PATCH 09/64] change std::future to async_simple coro lazy

---
 .../coro_rpc_connector/cororpc_communicator.h |   7 +-
 .../cororpc_communicator.cpp                  | 137 +++---------------
 .../coro_rpc_connector/cororpc_interface.cpp  |   5 +-
 3 files changed, 31 insertions(+), 118 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 819c8f152..4b37905ab 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -10,6 +10,7 @@
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
 #include <ylt/coro_io/client_pool.hpp>
+#include <async_simple/coro/Lazy.h>
 
 namespace mooncake {
 
@@ -69,13 +70,13 @@ class CoroRPCCommunicator {
     bool isConnected(const std::string& remote_address);
     
     int sendData(const std::string& target_address, const void* data, size_t data_size);
-    std::future<result> sendDataAsync(const std::string& target_address, const void* data, size_t data_size);
+    async_simple::coro::Lazy<result> sendDataAsync(const std::string& target_address, const void* data, size_t data_size);
     
     int sendTensor(const std::string& target_address, const pybind11::object& tensor);
-    std::future<int> sendTensorAsync(const std::string& target_address, const TensorInfo& tensor);
+    async_simple::coro::Lazy<int> sendTensorAsync(const std::string& target_address, const TensorInfo& tensor);
     
     int receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms = -1);
-    std::future<std::string> receiveDataAsync(const std::string& source_address, int timeout_ms = -1);
+    async_simple::coro::Lazy<std::string> receiveDataAsync(const std::string& source_address, int timeout_ms = -1);
 
     void setDataReceiveCallback(std::function<void(const std::string&, const std::string&)> callback);
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 24c406ea0..e05bcfd36 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -136,69 +136,28 @@ bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
 int CoroRPCCommunicator::sendData(const std::string& target_address,
                                   const void* data,
                                   size_t data_size) {
-    try {
-        // Ensure client pool exists for this target
-        if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
-            if (!addRemoteConnection(target_address)) {
-                std::cerr << "Failed to create client pool for " << target_address << std::endl;
-                return -1;
-            }
-        }
-        
-        auto& client_pool = impl_->client_pools_[target_address];
-        
-        // Use promise/future to convert async operation to sync
-        auto promise = std::make_shared<std::promise<int>>();
-        auto future = promise->get_future();
-        
-        client_pool->send_request(
-            [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
-                -> async_simple::coro::Lazy<void> {
-                
-                std::string_view data_view(static_cast<const char*>(data), data_size);
-                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
-                
-                if (result.has_value()) {
-                    promise->set_value(0);
-                } else {
-                    std::cerr << "RPC call failed: " << result.error().msg << std::endl;
-                    promise->set_value(-1);
-                }
-            }
-        ).start([](auto &&) {});
-        
-        int result = future.get();
-        
-        if (result == 0) {
-            std::cout << "Successfully sent " << data_size << " bytes to " << target_address << std::endl;
-        }
-        
-        return result;
-    } catch (const std::exception& e) {
-        std::cerr << "Send data error: " << e.what() << std::endl;
-        return -1;
-    }
+    auto result = async_simple::coro::syncAwait(sendDataAsync(target_address, data, data_size));
+    return result.code;
 }
 
-std::future<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
+async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
                                                        const void* data,
                                                        size_t data_size) {
-    auto promise = std::make_shared<std::promise<result>>();
-    auto future = promise->get_future();
-    
     // Ensure client pool exists for this target
     if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
         if (!addRemoteConnection(target_address)) {
             result res;
             res.code = -1;
             res.err_msg = "Failed to create client pool for " + target_address;
-            promise->set_value(res);
-            return future;
+            co_return res;
         }
     }
     
     auto& client_pool = impl_->client_pools_[target_address];
     
+    auto promise = std::make_shared<std::promise<result>>();
+    auto future = promise->get_future();
+    
     client_pool->send_request(
         [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
             -> async_simple::coro::Lazy<void> {
@@ -218,74 +177,30 @@ std::future<result> CoroRPCCommunicator::sendDataAsync(const std::string& target
         }
     ).start([](auto &&) {});
     
-    return future;
+    co_return future.get();
 }
 
 int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pybind11::object& tensor) {
-    try {
-        // Ensure client pool exists for this target
-        if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
-            if (!addRemoteConnection(target_address)) {
-                std::cerr << "Failed to create client pool for " << target_address << std::endl;
-                return -1;
-            }
-        }
-        
-        auto& client_pool = impl_->client_pools_[target_address];
-        
-        // Use promise/future to convert async operation to sync
-        auto promise = std::make_shared<std::promise<int>>();
-        auto future = promise->get_future();
-        
-        uintptr_t data_ptr = tensor.attr("data_ptr")().cast<uintptr_t>();
-        size_t numel = tensor.attr("numel")().cast<size_t>();
-        size_t element_size = tensor.attr("element_size")().cast<size_t>();
-        size_t tensor_size = numel * element_size;
-        
-        client_pool->send_request(
-            [data_ptr, tensor_size, promise](coro_rpc::coro_rpc_client &client) 
-                -> async_simple::coro::Lazy<void> {
-                
-                client.set_req_attachment(std::string_view((char*)data_ptr, tensor_size));
-                
-                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
-                
-                if (result.has_value()) {
-                    promise->set_value(0);
-                } else {
-                    std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
-                    promise->set_value(-1);
-                }
-            }
-        ).start([](auto &&) {});
-        
-        int result = future.get();
-        
-        if (result == 0) {
-            std::cout << "Successfully sent tensor to " << target_address << std::endl;
-        }
-        
-        return result;
-    } catch (const std::exception& e) {
-        std::cerr << "Send tensor error: " << e.what() << std::endl;
-        return -1;
-    }
+    // Convert pybind11::object to TensorInfo
+    TensorInfo tensor_info;
+    // TODO: Extract tensor information from pybind11::object
+    auto result = async_simple::coro::syncAwait(sendTensorAsync(target_address, tensor_info));
+    return result;
 }
 
-std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address, const TensorInfo& tensor) {
-    auto promise = std::make_shared<std::promise<int>>();
-    auto future = promise->get_future();
-    
+async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address, const TensorInfo& tensor) {
     // Ensure client pool exists for this target
     if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
         if (!addRemoteConnection(target_address)) {
-            promise->set_value(-1);
-            return future;
+            co_return -1;
         }
     }
     
     auto& client_pool = impl_->client_pools_[target_address];
     
+    auto promise = std::make_shared<std::promise<int>>();
+    auto future = promise->get_future();
+    
     client_pool->send_request(
         [tensor, promise](coro_rpc::coro_rpc_client &client) 
             -> async_simple::coro::Lazy<void> {
@@ -303,22 +218,18 @@ std::future<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_
         }
     ).start([](auto &&) {});
     
-    return future;
+    co_return future.get();
 }
 
 int CoroRPCCommunicator::receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms) {
+    auto result = async_simple::coro::syncAwait(receiveDataAsync(source_address, timeout_ms));
+    // TODO: Copy result to buffer and return size
     return 0;
 }
 
-std::future<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address, int timeout_ms) {
-    auto promise = std::make_shared<std::promise<std::string>>();
-    auto future = promise->get_future();
-    
-    std::thread([promise]() {
-        promise->set_value(std::string());
-    }).detach();
-    
-    return future;
+async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address, int timeout_ms) {
+    // TODO: Implement actual receive logic
+    co_return std::string();
 }
 
 void CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<void> context, std::string_view data) {
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 52919b827..3630bb7e5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -5,6 +5,7 @@
 #include <thread>
 #include <future>
 #include <vector>
+#include "async_simple/coro/SyncAwait.h"
 
 namespace mooncake {
 
@@ -374,8 +375,8 @@ pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_add
 
     auto task_func = std::make_shared<std::function<void()>>(
         [communicator, target_addr, tensor_info, future_ptr, loop_ptr]() {
-        auto std_future = communicator->sendTensorAsync(*target_addr, *tensor_info);
-        int result = std_future.get();
+        auto lazy_result = communicator->sendTensorAsync(*target_addr, *tensor_info);
+        int result = async_simple::coro::syncAwait(lazy_result);
         
         auto call_soon_threadsafe = [future_ptr, loop_ptr, result]() {
             pybind11::gil_scoped_acquire acquire;

From 65ee0c80ab5130f937f37a4946a1a9d25527b637 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 11:50:07 +0800
Subject: [PATCH 10/64] 1. removed some data copies 2. replace synchronous
 thread with asynchronous coroutine 3. change std::future to lazy return value

---
 .../coro_rpc_connector/cororpc_communicator.h |   7 +-
 .../coro_rpc_connector/cororpc_interface.h    |   4 +-
 .../cororpc_communicator.cpp                  | 161 ++++---------
 .../coro_rpc_connector/cororpc_interface.cpp  | 216 ++++++++++--------
 4 files changed, 170 insertions(+), 218 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 4b37905ab..5de08f987 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -10,6 +10,7 @@
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
 #include <ylt/coro_io/client_pool.hpp>
+#include <ylt/coro_io/coro_io.hpp>
 #include <async_simple/coro/Lazy.h>
 
 namespace mooncake {
@@ -47,7 +48,6 @@ class CoroRPCCommunicator {
         bool is_server_started = false;
         
         std::unique_ptr<coro_rpc::coro_rpc_server> server_;
-        std::unordered_map<std::string, std::shared_ptr<coro_io::client_pool<coro_rpc::coro_rpc_client>>> client_pools_;
         
         std::function<void(const std::string&, const std::string&)> data_receive_callback;
         
@@ -65,10 +65,6 @@ class CoroRPCCommunicator {
     bool startServerAsync();
     void stopServer();
     
-    bool addRemoteConnection(const std::string& remote_address);
-    void removeRemoteConnection(const std::string& remote_address);
-    bool isConnected(const std::string& remote_address);
-    
     int sendData(const std::string& target_address, const void* data, size_t data_size);
     async_simple::coro::Lazy<result> sendDataAsync(const std::string& target_address, const void* data, size_t data_size);
     
@@ -83,6 +79,7 @@ class CoroRPCCommunicator {
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
 private:
+    coro_io::client_pools<coro_rpc::coro_rpc_client> client_pools_;
     std::shared_ptr<Impl> impl_;
 };
 
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 2c4102708..bd7b7e530 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -62,12 +62,12 @@ class CoroRPCInterface {
     bool isConnected(const std::string& remote_address);
 
     int sendData(const std::string& target_address, pybind11::bytes data);
-    pybind11::object sendDataAsync(const std::string& target_address,
+    pybind11::object sendDataAsync(std::string& target_address,
                                    pybind11::bytes data,
                                    pybind11::handle loop);
 
     int sendTensor(const std::string& target_address, pybind11::handle tensor);
-    pybind11::object sendTensorAsync(const std::string& target_address,
+    pybind11::object sendTensorAsync(std::string& target_address,
                                      pybind11::handle tensor,
                                      pybind11::handle loop);
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index e05bcfd36..7942ba342 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -5,11 +5,15 @@
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
 #include <ylt/coro_io/client_pool.hpp>
+#include <ylt/coro_io/coro_io.hpp>
 #include "async_simple/coro/SyncAwait.h"
 
 namespace mooncake {
 
-CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {}
+CoroRPCCommunicator::CoroRPCCommunicator() 
+    : impl_(std::make_shared<Impl>()) {
+    // 可以设置默认的 pool_config 如果需要的话
+}
 
 CoroRPCCommunicator::~CoroRPCCommunicator() {
     stopServer();
@@ -86,53 +90,6 @@ void CoroRPCCommunicator::stopServer() {
     }
 }
 
-bool CoroRPCCommunicator::addRemoteConnection(const std::string& remote_address) {
-    try {
-        // Check if client pool already exists for this address
-        auto it = impl_->client_pools_.find(remote_address);
-        if (it != impl_->client_pools_.end()) {
-            std::cout << "Client pool for " << remote_address << " already exists" << std::endl;
-            return true;
-        }
-        
-        // Create new client pool for this remote address
-        auto client_pool = coro_io::client_pool<coro_rpc::coro_rpc_client>::create(remote_address);
-        
-        if (!client_pool) {
-            std::cerr << "Failed to create client pool for " << remote_address << std::endl;
-            return false;
-        }
-        
-        impl_->client_pools_[remote_address] = client_pool;
-        std::cout << "Successfully created client pool for " << remote_address 
-                  << " with pool size: " << impl_->config.pool_size << std::endl;
-        return true;
-        
-    } catch (const std::exception& e) {
-        std::cerr << "Exception while creating client pool for " << remote_address << ": " << e.what() << std::endl;
-        return false;
-    }
-}
-
-void CoroRPCCommunicator::removeRemoteConnection(const std::string& remote_address) {
-    auto it = impl_->client_pools_.find(remote_address);
-    if (it != impl_->client_pools_.end()) {
-        impl_->client_pools_.erase(it);
-        std::cout << "Removed client pool for " << remote_address << std::endl;
-    } else {
-        std::cout << "No client pool found for " << remote_address << std::endl;
-    }
-}
-
-bool CoroRPCCommunicator::isConnected(const std::string& remote_address) {
-    auto it = impl_->client_pools_.find(remote_address);
-    if (it != impl_->client_pools_.end()) {
-        // Client pool exists, assume it can provide connections
-        return true;
-    }
-    return false;
-}
-
 int CoroRPCCommunicator::sendData(const std::string& target_address,
                                   const void* data,
                                   size_t data_size) {
@@ -143,41 +100,31 @@ int CoroRPCCommunicator::sendData(const std::string& target_address,
 async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
                                                        const void* data,
                                                        size_t data_size) {
-    // Ensure client pool exists for this target
-    if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
-        if (!addRemoteConnection(target_address)) {
-            result res;
-            res.code = -1;
-            res.err_msg = "Failed to create client pool for " + target_address;
-            co_return res;
-        }
-    }
-    
-    auto& client_pool = impl_->client_pools_[target_address];
-    
-    auto promise = std::make_shared<std::promise<result>>();
-    auto future = promise->get_future();
-    
-    client_pool->send_request(
-        [data, data_size, promise](coro_rpc::coro_rpc_client &client) 
-            -> async_simple::coro::Lazy<void> {
-            
-            std::string_view data_view(static_cast<const char*>(data), data_size);
-            auto rpc_result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
-            
-            result res;
-            if (rpc_result.has_value()) {
-                res.code = 0;
-            } else {
-                res.code = rpc_result.error().val();
-                res.err_msg = rpc_result.error().msg;
+    try {
+        std::string_view data_view(static_cast<const char*>(data), data_size);
+        
+        auto rpc_result = co_await client_pools_.send_request(
+            target_address,
+            [data_view](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+                if (!result.has_value()) {
+                    std::cerr << "RPC call failed: " << result.error().msg << std::endl;
+                }
             }
-            
-            promise->set_value(res);
-        }
-    ).start([](auto &&) {});
-    
-    co_return future.get();
+        );
+        
+        result res;
+        res.code = 0;
+        co_return res;
+        
+    } catch (const std::exception& e) {
+        std::cerr << "Exception in sendDataAsync: " << e.what() << std::endl;
+        result res;
+        res.code = -1;
+        res.err_msg = e.what();
+        co_return res;
+    }
 }
 
 int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pybind11::object& tensor) {
@@ -189,36 +136,28 @@ int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pyb
 }
 
 async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address, const TensorInfo& tensor) {
-    // Ensure client pool exists for this target
-    if (impl_->client_pools_.find(target_address) == impl_->client_pools_.end()) {
-        if (!addRemoteConnection(target_address)) {
-            co_return -1;
-        }
-    }
-    
-    auto& client_pool = impl_->client_pools_[target_address];
-    
-    auto promise = std::make_shared<std::promise<int>>();
-    auto future = promise->get_future();
-    
-    client_pool->send_request(
-        [tensor, promise](coro_rpc::coro_rpc_client &client) 
-            -> async_simple::coro::Lazy<void> {
-            
-            client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
-            
-            auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
-            
-            if (result.has_value()) {
-                promise->set_value(0);
-            } else {
-                std::cerr << "Async tensor RPC call failed: " << result.error().msg << std::endl;
-                promise->set_value(-1);
+    try {
+        auto rpc_result = co_await client_pools_.send_request(
+            target_address,
+            [&tensor](coro_rpc::coro_rpc_client &client) 
+                -> async_simple::coro::Lazy<void> {
+                
+                client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
+                
+                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
+                
+                if (!result.has_value()) {
+                    std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
+                }
             }
-        }
-    ).start([](auto &&) {});
-    
-    co_return future.get();
+        );
+        
+        co_return 0;
+        
+    } catch (const std::exception& e) {
+        std::cerr << "Exception in sendTensorAsync: " << e.what() << std::endl;
+        co_return -1;
+    }
 }
 
 int CoroRPCCommunicator::receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms) {
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 3630bb7e5..a4a2782e3 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -179,35 +179,32 @@ void CoroRPCInterface::stopServer() {
 }
 
 bool CoroRPCInterface::addRemoteConnection(const std::string& remote_address) {
-    if (!impl_->communicator) return false;
-    return impl_->communicator->addRemoteConnection(remote_address);
+    // client_pools 自动管理连接，不需要手动添加
+    std::cout << "Remote connection for " << remote_address << " will be managed automatically" << std::endl;
+    return true;
 }
 
 void CoroRPCInterface::removeRemoteConnection(const std::string& remote_address) {
-    if (impl_->communicator) {
-        impl_->communicator->removeRemoteConnection(remote_address);
-    }
+    // client_pools 自动管理连接，不需要手动移除
+    std::cout << "Remote connection for " << remote_address << " is managed automatically" << std::endl;
 }
 
 bool CoroRPCInterface::isConnected(const std::string& remote_address) {
-    if (!impl_->communicator) return false;
-    return impl_->communicator->isConnected(remote_address);
+    // client_pools 会自动建立连接，总是返回 true
+    return true;
 }
 
 int CoroRPCInterface::sendData(const std::string& target_address, pybind11::bytes data) {
     if (!impl_->communicator) return -1;
     
-    std::string data_str;
-    {
-        pybind11::gil_scoped_acquire acquire;
-        data_str = data;
-    }
-
+    pybind11::gil_scoped_acquire acquire;
+    
+    std::string_view data_view = data;
     pybind11::gil_scoped_release release;
-    return impl_->communicator->sendData(target_address, data_str.data(), data_str.size());
+    return impl_->communicator->sendData(target_address, data_view.data(), data_view.size());
 }
 
-pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_address, 
+pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address, 
                                                   pybind11::bytes data, 
                                                   pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
@@ -222,30 +219,48 @@ pybind11::object CoroRPCInterface::sendDataAsync(const std::string& target_addre
     }
 
     auto communicator = impl_->communicator.get();
-    auto target_addr = std::make_shared<std::string>(target_address);
-    auto data_holder = std::make_shared<std::string>(data);
+    auto target_addr = std::move(target_address);
+    
+    std::string data_str = data;
+    
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    auto loop_ptr = std::make_shared<pybind11::object>(pybind11::reinterpret_borrow<pybind11::object>(loop));
+    pybind11::object loop_obj = pybind11::reinterpret_borrow<pybind11::object>(loop);
 
-    auto task_func = std::make_shared<std::function<void()>>(
-        [communicator, target_addr, data_holder, future_ptr, loop_ptr]() {
-        int result = communicator->sendData(*target_addr, data_holder->data(), data_holder->size());
-        
-        auto call_soon_threadsafe = [future_ptr, loop_ptr, result]() {
-            pybind11::gil_scoped_acquire acquire;
-            if (result >= 0) {
-                future_ptr->attr("set_result")(result);
-            } else {
+    auto coro_lambda = [communicator, target_addr, data_str, future_ptr, loop_obj]() -> async_simple::coro::Lazy<void> {
+        try {
+            auto result_struct = co_await communicator->sendDataAsync(target_addr, data_str.data(), data_str.size());
+            int result = result_struct.code;
+
+            auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
+                pybind11::gil_scoped_acquire acquire;
+                if (result >= 0) {
+                    future_ptr->attr("set_result")(result);
+                } else {
+                    future_ptr->attr("set_exception")(pybind11::make_tuple(
+                        pybind11::str("Send data failed")));
+                }
+            };
+
+            auto callback = pybind11::cpp_function(call_soon_threadsafe);
+            loop_obj.attr("call_soon_threadsafe")(callback);
+        } catch (const std::exception& e) {
+            auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
+                pybind11::gil_scoped_acquire acquire;
                 future_ptr->attr("set_exception")(pybind11::make_tuple(
-                    pybind11::str("Send data failed")));
-            }
-        };
+                    pybind11::str(std::string("Send data error: ") + e.what())));
+            };
 
-        auto callback = pybind11::cpp_function(call_soon_threadsafe);
-        loop_ptr->attr("call_soon_threadsafe")(callback);
-    });
+            auto callback = pybind11::cpp_function(call_soon_threadsafe);
+            loop_obj.attr("call_soon_threadsafe")(callback);
+        }
+    };
 
-    std::thread([task_func]() { (*task_func)(); }).detach();
+    auto lazy = coro_lambda();
+    lazy.start([](auto &&result) {
+       if (result.hasError()) {
+            std::cerr << "Coroutine completed with error" << std::endl;
+        }
+    });
     
     return future_obj;
 }
@@ -254,13 +269,11 @@ int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::ha
     if (!impl_->communicator) return -1;
     
     try {
-        pybind11::object tensor_obj;
-        TensorMetadata metadata = {};
-        std::vector<char> combined_data;
+        TensorInfo tensor_info;
         
         {
             pybind11::gil_scoped_acquire acquire;
-            tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
+            pybind11::object tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
             
             // Validate tensor type
             if (!(tensor_obj.attr("__class__").attr("__name__").cast<std::string>().find("Tensor") != std::string::npos)) {
@@ -268,60 +281,42 @@ int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::ha
                 return -1;
             }
             
-            // Extract tensor properties
+            // Extract tensor properties - zero copy, just get pointers and metadata
             uintptr_t data_ptr = tensor_obj.attr("data_ptr")().cast<uintptr_t>();
             size_t numel = tensor_obj.attr("numel")().cast<size_t>();
             size_t element_size = tensor_obj.attr("element_size")().cast<size_t>();
             size_t tensor_size = numel * element_size;
             
-            // Get tensor dtype
-            pybind11::object dtype_obj = tensor_obj.attr("dtype");
-            TensorDtype dtype_enum = get_tensor_dtype(dtype_obj);
-            if (dtype_enum == TensorDtype::UNKNOWN) {
-                std::cerr << "Unsupported tensor dtype" << std::endl;
-                return -1;
-            }
-            
             // Get tensor shape
             pybind11::object shape_obj = tensor_obj.attr("shape");
             pybind11::tuple shape_tuple = pybind11::cast<pybind11::tuple>(shape_obj);
-            int32_t ndim = static_cast<int32_t>(shape_tuple.size());
-            if (ndim > 4) {
-                std::cerr << "Tensor has too many dimensions (max 4 supported)" << std::endl;
-                return -1;
-            }
-            
-            // Fill metadata
-            metadata.dtype = static_cast<int32_t>(dtype_enum);
-            metadata.ndim = ndim;
-            for (int i = 0; i < 4; i++) {
-                if (i < ndim) {
-                    metadata.shape[i] = shape_tuple[i].cast<int64_t>();
-                } else {
-                    metadata.shape[i] = 0;
-                }
+            std::vector<size_t> shape;
+            for (size_t i = 0; i < shape_tuple.size(); i++) {
+                shape.push_back(shape_tuple[i].cast<size_t>());
             }
             
-            // Create combined data: metadata + tensor data
-            combined_data.resize(sizeof(TensorMetadata) + tensor_size);
-            
-            // Copy metadata
-            std::memcpy(combined_data.data(), &metadata, sizeof(TensorMetadata));
+            // Get tensor dtype string
+            pybind11::object dtype_obj = tensor_obj.attr("dtype");
+            std::string dtype = dtype_obj.attr("__str__")().cast<std::string>();
             
-            // Copy tensor data
-            const char* tensor_data = reinterpret_cast<const char*>(data_ptr);
-            std::memcpy(combined_data.data() + sizeof(TensorMetadata), tensor_data, tensor_size);
+            // Fill TensorInfo structure (no data copying, just metadata)
+            tensor_info.data_ptr = reinterpret_cast<void*>(data_ptr);
+            tensor_info.total_bytes = tensor_size;
+            tensor_info.shape = std::move(shape);
+            tensor_info.dtype = std::move(dtype);
             
             std::cout << "Sending tensor with shape: [";
-            for (int i = 0; i < ndim; i++) {
-                std::cout << metadata.shape[i];
-                if (i < ndim - 1) std::cout << ", ";
+            for (size_t i = 0; i < tensor_info.shape.size(); i++) {
+                std::cout << tensor_info.shape[i];
+                if (i < tensor_info.shape.size() - 1) std::cout << ", ";
             }
-            std::cout << "] and dtype: " << metadata.dtype << ", total size: " << combined_data.size() << " bytes" << std::endl;
+            std::cout << "] and dtype: " << tensor_info.dtype << ", tensor size: " << tensor_info.total_bytes << " bytes" << std::endl;
         }
 
+        // Use the async version which supports zero-copy via attachments
         pybind11::gil_scoped_release release;
-        return impl_->communicator->sendData(target_address, combined_data.data(), combined_data.size());
+        auto result = async_simple::coro::syncAwait(impl_->communicator->sendTensorAsync(target_address, tensor_info));
+        return result;
         
     } catch (const std::exception& e) {
         std::cerr << "Send tensor error: " << e.what() << std::endl;
@@ -329,7 +324,7 @@ int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::ha
     }
 }
 
-pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_address, 
+pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address, 
                                                     pybind11::handle tensor, 
                                                     pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
@@ -344,7 +339,7 @@ pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_add
     }
 
     auto communicator = impl_->communicator.get();
-    auto target_addr = std::make_shared<std::string>(target_address);
+    std::string target_addr = std::move(target_address);
     
     // Extract tensor info
     pybind11::object tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
@@ -364,35 +359,56 @@ pybind11::object CoroRPCInterface::sendTensorAsync(const std::string& target_add
     pybind11::object dtype_obj = tensor_obj.attr("dtype");
     std::string dtype = dtype_obj.attr("__str__")().cast<std::string>();
     
-    auto tensor_info = std::make_shared<TensorInfo>();
-    tensor_info->data_ptr = reinterpret_cast<void*>(data_ptr);
-    tensor_info->total_bytes = tensor_size;
-    tensor_info->shape = shape;
-    tensor_info->dtype = dtype;
+    // Create TensorInfo on stack (no need for shared_ptr)
+    TensorInfo tensor_info;
+    tensor_info.data_ptr = reinterpret_cast<void*>(data_ptr);
+    tensor_info.total_bytes = tensor_size;
+    tensor_info.shape = std::move(shape);
+    tensor_info.dtype = std::move(dtype);
     
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    auto loop_ptr = std::make_shared<pybind11::object>(pybind11::reinterpret_borrow<pybind11::object>(loop));
+    pybind11::object loop_obj = pybind11::reinterpret_borrow<pybind11::object>(loop);
+
+    // Schedule coroutine to run asynchronously
+    auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr, loop_obj]() -> async_simple::coro::Lazy<void> {
+        try {
+            // Call the async version which returns a coroutine
+            auto result = co_await communicator->sendTensorAsync(target_addr, tensor_info);
+
+            auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
+                pybind11::gil_scoped_acquire acquire;
+                if (result >= 0) {
+                    future_ptr->attr("set_result")(result);
+                } else {
+                    future_ptr->attr("set_exception")(pybind11::make_tuple(
+                        pybind11::str("Send tensor failed")));
+                }
+            };
 
-    auto task_func = std::make_shared<std::function<void()>>(
-        [communicator, target_addr, tensor_info, future_ptr, loop_ptr]() {
-        auto lazy_result = communicator->sendTensorAsync(*target_addr, *tensor_info);
-        int result = async_simple::coro::syncAwait(lazy_result);
-        
-        auto call_soon_threadsafe = [future_ptr, loop_ptr, result]() {
-            pybind11::gil_scoped_acquire acquire;
-            if (result >= 0) {
-                future_ptr->attr("set_result")(result);
-            } else {
+            auto callback = pybind11::cpp_function(call_soon_threadsafe);
+            loop_obj.attr("call_soon_threadsafe")(callback);
+        } catch (const std::exception& e) {
+            auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
+                pybind11::gil_scoped_acquire acquire;
                 future_ptr->attr("set_exception")(pybind11::make_tuple(
-                    pybind11::str("Send tensor failed")));
-            }
-        };
+                    pybind11::str(std::string("Send tensor error: ") + e.what())));
+            };
 
-        auto callback = pybind11::cpp_function(call_soon_threadsafe);
-        loop_ptr->attr("call_soon_threadsafe")(callback);
+            auto callback = pybind11::cpp_function(call_soon_threadsafe);
+            loop_obj.attr("call_soon_threadsafe")(callback);
+        }
+    };
+
+    // Start the coroutine in a detached manner (fire and forget)
+    auto lazy = coro_lambda();
+    lazy.start([](auto &&result) {
+        // This callback will be called when the coroutine completes
+        // We don't need to do anything here since the result is handled in the coroutine itself
+        if (result.hasError()) {
+            // Log error if needed
+            std::cerr << "Tensor coroutine completed with error" << std::endl;
+        }
     });
-
-    std::thread([task_func]() { (*task_func)(); }).detach();
     
     return future_obj;
 }

From 8c061e39490e04c82f43cdce95a2c022edf80264 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 13:57:38 +0800
Subject: [PATCH 11/64] 1.added tests to ci 2. removed CLAUDE.md

---
 .gitignore           | 5 +----
 scripts/run_tests.sh | 6 ++++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index d0ad29f0f..5314dc685 100644
--- a/.gitignore
+++ b/.gitignore
@@ -194,7 +194,4 @@ libetcd_wrapper.h
 
 mooncake-wheel/mooncake/allocator.py
 mooncake-wheel/mooncake/mooncake_master
-mooncake-wheel/mooncake/transfer_engine_bench
-
-# Claude Code Memory
-CLAUDE.md
\ No newline at end of file
+mooncake-wheel/mooncake/transfer_engine_bench
\ No newline at end of file
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index bda869e43..12bf5e3e5 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -15,6 +15,12 @@ TARGET_PID=$!
 MC_METADATA_SERVER=http://127.0.0.1:8080/metadata python transfer_engine_initiator_test.py
 kill $TARGET_PID || true
 
+echo "Running CoroRPC performance tests..."
+cd ../mooncake-transfer-engine/tests
+pip install torch numpy
+python test_coro_rpc_performance.py
+cd ../../mooncake-wheel/tests
+
 echo "Running master tests..."
 
 which mooncake_master 2>/dev/null | grep -q '/usr/local/bin/mooncake_master' && \

From aed64b1ee8e354e09acb0d292dfa9f9ff269d385 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 14:55:07 +0800
Subject: [PATCH 12/64] add pybind installation in CI

---
 .github/workflows/ci.yml                      |   2 +
 .../coro_rpc_connector/cororpc_communicator.h |  63 ++-
 .../coro_rpc_connector/cororpc_interface.h    |  62 +--
 .../cororpc_communicator.cpp                  | 191 ++++---
 .../coro_rpc_connector/cororpc_interface.cpp  | 481 +++++++++++-------
 5 files changed, 457 insertions(+), 342 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 049d0d60a..e93f3d614 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,6 +65,7 @@ jobs:
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
+        pip install pybind11
         mkdir build
         cd build
         cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON
@@ -226,6 +227,7 @@ jobs:
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
+        pip install pybind11
       shell: bash
 
     - name: Build transfer engine only
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 5de08f987..04b5f5951 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -34,27 +34,31 @@ struct Config {
     size_t pool_size = 10;
 };
 
-template<typename T>
+template <typename T>
 struct SimpleContext {
     coro_rpc::context<T> context_;
     void response_msg() { context_.response_msg(); }
 };
 
 class CoroRPCCommunicator {
-public:
+   public:
     class Impl {
-    public:
+       public:
         Config config;
         bool is_server_started = false;
-        
+
         std::unique_ptr<coro_rpc::coro_rpc_server> server_;
-        
-        std::function<void(const std::string&, const std::string&)> data_receive_callback;
-        
-        void handleDataTransfer(coro_rpc::context<void> context, std::string_view data);
+
+        std::function<void(const std::string&, const std::string&)>
+            data_receive_callback;
+
+        void handleDataTransfer(coro_rpc::context<void> context,
+                                std::string_view data);
         void handleTensorTransfer(coro_rpc::context<void> context);
-        void handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data);
-        void handleTensorTransferWithAttachment(coro_rpc::context<void> context);
+        void handleDataTransferWithAttachment(coro_rpc::context<void> context,
+                                              std::string_view data);
+        void handleTensorTransferWithAttachment(
+            coro_rpc::context<void> context);
     };
 
     CoroRPCCommunicator();
@@ -64,26 +68,35 @@ class CoroRPCCommunicator {
     bool startServer();
     bool startServerAsync();
     void stopServer();
-    
-    int sendData(const std::string& target_address, const void* data, size_t data_size);
-    async_simple::coro::Lazy<result> sendDataAsync(const std::string& target_address, const void* data, size_t data_size);
-    
-    int sendTensor(const std::string& target_address, const pybind11::object& tensor);
-    async_simple::coro::Lazy<int> sendTensorAsync(const std::string& target_address, const TensorInfo& tensor);
-    
-    int receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms = -1);
-    async_simple::coro::Lazy<std::string> receiveDataAsync(const std::string& source_address, int timeout_ms = -1);
-
-    void setDataReceiveCallback(std::function<void(const std::string&, const std::string&)> callback);
+
+    int sendData(const std::string& target_address, const void* data,
+                 size_t data_size);
+    async_simple::coro::Lazy<result> sendDataAsync(
+        const std::string& target_address, const void* data, size_t data_size);
+
+    int sendTensor(const std::string& target_address,
+                   const pybind11::object& tensor);
+    async_simple::coro::Lazy<int> sendTensorAsync(
+        const std::string& target_address, const TensorInfo& tensor);
+
+    int receiveData(const std::string& source_address, void* buffer,
+                    size_t buffer_size, int timeout_ms = -1);
+    async_simple::coro::Lazy<std::string> receiveDataAsync(
+        const std::string& source_address, int timeout_ms = -1);
+
+    void setDataReceiveCallback(
+        std::function<void(const std::string&, const std::string&)> callback);
 
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
-private:
+   private:
     coro_io::client_pools<coro_rpc::coro_rpc_client> client_pools_;
     std::shared_ptr<Impl> impl_;
 };
 
-std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size = 10, size_t timeout_seconds = 30);
-std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_address, size_t thread_count = 0);
+std::unique_ptr<CoroRPCCommunicator> createClientPool(
+    size_t pool_size = 10, size_t timeout_seconds = 30);
+std::unique_ptr<CoroRPCCommunicator> createServer(
+    const std::string& listen_address, size_t thread_count = 0);
 
-} // namespace mooncake
\ No newline at end of file
+}  // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index bd7b7e530..74e5391f5 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -10,35 +10,27 @@ namespace mooncake {
 struct Config;
 
 class CoroRPCInterface {
-public:
+   public:
     struct ReceivedData {
         std::string source_address;
         std::string data;
         size_t data_size = 0;
-        
-        pybind11::bytes getBytes() const {
-            return pybind11::bytes(data);
-        }
+
+        pybind11::bytes getBytes() const { return pybind11::bytes(data); }
     };
-    
+
     struct ReceivedTensor {
         std::string source_address;
         std::string data;
         std::vector<size_t> shape;
         std::string dtype;
         size_t total_bytes = 0;
-        
+
         pybind11::object rebuildTensor() const;
-        
-        // Safe method to get data size without triggering string decoding
         size_t getDataSize() const { return data.size(); }
-        
-        // Safe method to get data as bytes
-        pybind11::bytes getDataAsBytes() const { 
-            return pybind11::bytes(data); 
-        }
-        
-    private:
+        pybind11::bytes getDataAsBytes() const { return pybind11::bytes(data); }
+
+       private:
         pybind11::object rebuildTensorInternal() const;
     };
 
@@ -47,24 +39,17 @@ class CoroRPCInterface {
     CoroRPCInterface();
     ~CoroRPCInterface();
 
-    // 初始化
     bool initialize(const std::string& listen_address = "",
-                   size_t thread_count = 0,
-                   size_t timeout_seconds = 30,
-                   size_t pool_size = 10);
+                    size_t thread_count = 0, size_t timeout_seconds = 30,
+                    size_t pool_size = 10);
 
     bool startServer();
     bool startServerAsync();
     void stopServer();
 
-    bool addRemoteConnection(const std::string& remote_address);
-    void removeRemoteConnection(const std::string& remote_address);
-    bool isConnected(const std::string& remote_address);
-
     int sendData(const std::string& target_address, pybind11::bytes data);
     pybind11::object sendDataAsync(std::string& target_address,
-                                   pybind11::bytes data,
-                                   pybind11::handle loop);
+                                   pybind11::bytes data, pybind11::handle loop);
 
     int sendTensor(const std::string& target_address, pybind11::handle tensor);
     pybind11::object sendTensorAsync(std::string& target_address,
@@ -75,21 +60,24 @@ class CoroRPCInterface {
     void setTensorReceiveCallback(pybind11::function callback);
 
     void handleIncomingData(const std::string& source_address,
-                           const std::string& data);
+                            const std::string& data);
     void handleIncomingTensor(const std::string& source_address,
-                             const std::string& data,
-                             const std::vector<size_t>& shape,
-                             const std::string& dtype);
+                              const std::string& data,
+                              const std::vector<size_t>& shape,
+                              const std::string& dtype);
 
-private:
+   private:
     std::unique_ptr<Impl> impl_;
 };
 
-std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank = 0, uint64_t world_size = 1);
-std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank = 0, uint64_t world_size = 1);
+std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank = 0,
+                                                  uint64_t world_size = 1);
+std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank = 0,
+                                                  uint64_t world_size = 1);
 
-} // namespace mooncake
+}  // namespace mooncake
 
-// Forward declaration for pybind11 integration
-namespace pybind11 { class module_; }
-void bind_coro_rpc_interface(pybind11::module_ &m);
\ No newline at end of file
+namespace pybind11 {
+class module_;
+}
+void bind_coro_rpc_interface(pybind11::module_& m);
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 7942ba342..67f967366 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -10,16 +10,13 @@
 
 namespace mooncake {
 
-CoroRPCCommunicator::CoroRPCCommunicator() 
-    : impl_(std::make_shared<Impl>()) {
-    // 可以设置默认的 pool_config 如果需要的话
+CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {
 }
 
-CoroRPCCommunicator::~CoroRPCCommunicator() {
-    stopServer();
-}
+CoroRPCCommunicator::~CoroRPCCommunicator() { stopServer(); }
 
-void CoroRPCCommunicator::setDataReceiveCallback(std::function<void(const std::string&, const std::string&)> callback) {
+void CoroRPCCommunicator::setDataReceiveCallback(
+    std::function<void(const std::string&, const std::string&)> callback) {
     std::cout << "Setting data receive callback..." << std::endl;
     impl_->data_receive_callback = callback;
     std::cout << "Data receive callback set successfully" << std::endl;
@@ -27,35 +24,38 @@ void CoroRPCCommunicator::setDataReceiveCallback(std::function<void(const std::s
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
-    
+
     if (!config.listen_address.empty()) {
-        std::cout << "Initializing server on " << config.listen_address << std::endl;
-        
+        std::cout << "Initializing server on " << config.listen_address
+                  << std::endl;
+
         impl_->server_ = std::make_unique<coro_rpc::coro_rpc_server>(
-            config.thread_count, 
-            config.listen_address,
-            std::chrono::seconds(config.timeout_seconds)
-        );
-        
-        impl_->server_->register_handler<&CoroRPCCommunicator::Impl::handleDataTransfer,
-                                        &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
+            config.thread_count, config.listen_address,
+            std::chrono::seconds(config.timeout_seconds));
+
+        impl_->server_->register_handler<
+            &CoroRPCCommunicator::Impl::handleDataTransfer,
+            &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
-    
-    std::cout << "Communicator initialized with client pool support" << std::endl;
+
+    std::cout << "Communicator initialized with client pool support"
+              << std::endl;
     return true;
 }
 
 bool CoroRPCCommunicator::startServer() {
     if (!impl_->server_ || impl_->config.listen_address.empty()) return false;
-    
+
     try {
         auto ec = impl_->server_->start();
         if (ec.val() == 0) {
             impl_->is_server_started = true;
-            std::cout << "Server started on " << impl_->config.listen_address << std::endl;
+            std::cout << "Server started on " << impl_->config.listen_address
+                      << std::endl;
             return true;
         } else {
-            std::cerr << "Failed to start server: " << ec.message() << std::endl;
+            std::cerr << "Failed to start server: " << ec.message()
+                      << std::endl;
             return false;
         }
     } catch (const std::exception& e) {
@@ -66,19 +66,21 @@ bool CoroRPCCommunicator::startServer() {
 
 bool CoroRPCCommunicator::startServerAsync() {
     if (!impl_->server_ || impl_->config.listen_address.empty()) return false;
-    
+
     try {
         auto ec = impl_->server_->async_start();
         if (!ec.hasResult()) {
             impl_->is_server_started = true;
-            std::cout << "Server started asynchronously on " << impl_->config.listen_address << std::endl;
+            std::cout << "Server started asynchronously on "
+                      << impl_->config.listen_address << std::endl;
             return true;
         } else {
             std::cerr << "Failed to start server asynchronously" << std::endl;
             return false;
         }
     } catch (const std::exception& e) {
-        std::cerr << "Failed to start server asynchronously: " << e.what() << std::endl;
+        std::cerr << "Failed to start server asynchronously: " << e.what()
+                  << std::endl;
         return false;
     }
 }
@@ -91,33 +93,35 @@ void CoroRPCCommunicator::stopServer() {
 }
 
 int CoroRPCCommunicator::sendData(const std::string& target_address,
-                                  const void* data,
-                                  size_t data_size) {
-    auto result = async_simple::coro::syncAwait(sendDataAsync(target_address, data, data_size));
+                                  const void* data, size_t data_size) {
+    auto result = async_simple::coro::syncAwait(
+        sendDataAsync(target_address, data, data_size));
     return result.code;
 }
 
-async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::string& target_address,
-                                                       const void* data,
-                                                       size_t data_size) {
+async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
+    const std::string& target_address, const void* data, size_t data_size) {
     try {
         std::string_view data_view(static_cast<const char*>(data), data_size);
-        
+
         auto rpc_result = co_await client_pools_.send_request(
             target_address,
-            [data_view](coro_rpc::coro_rpc_client &client) 
+            [data_view](coro_rpc::coro_rpc_client& client)
                 -> async_simple::coro::Lazy<void> {
-                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleDataTransfer>(data_view);
+                auto result =
+                    co_await client
+                        .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
+                            data_view);
                 if (!result.has_value()) {
-                    std::cerr << "RPC call failed: " << result.error().msg << std::endl;
+                    std::cerr << "RPC call failed: " << result.error().msg
+                              << std::endl;
                 }
-            }
-        );
-        
+            });
+
         result res;
         res.code = 0;
         co_return res;
-        
+
     } catch (const std::exception& e) {
         std::cerr << "Exception in sendDataAsync: " << e.what() << std::endl;
         result res;
@@ -127,122 +131,143 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(const std::s
     }
 }
 
-int CoroRPCCommunicator::sendTensor(const std::string& target_address, const pybind11::object& tensor) {
+int CoroRPCCommunicator::sendTensor(const std::string& target_address,
+                                    const pybind11::object& tensor) {
     // Convert pybind11::object to TensorInfo
     TensorInfo tensor_info;
     // TODO: Extract tensor information from pybind11::object
-    auto result = async_simple::coro::syncAwait(sendTensorAsync(target_address, tensor_info));
+    auto result = async_simple::coro::syncAwait(
+        sendTensorAsync(target_address, tensor_info));
     return result;
 }
 
-async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(const std::string& target_address, const TensorInfo& tensor) {
+async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
+    const std::string& target_address, const TensorInfo& tensor) {
     try {
         auto rpc_result = co_await client_pools_.send_request(
             target_address,
-            [&tensor](coro_rpc::coro_rpc_client &client) 
+            [&tensor](coro_rpc::coro_rpc_client& client)
                 -> async_simple::coro::Lazy<void> {
-                
-                client.set_req_attachment(std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
-                
-                auto result = co_await client.call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
-                
+                client.set_req_attachment(std::string_view(
+                    (char*)tensor.data_ptr, tensor.total_bytes));
+
+                auto result = co_await client.call<
+                    &CoroRPCCommunicator::Impl::handleTensorTransfer>();
+
                 if (!result.has_value()) {
-                    std::cerr << "Tensor RPC call failed: " << result.error().msg << std::endl;
+                    std::cerr
+                        << "Tensor RPC call failed: " << result.error().msg
+                        << std::endl;
                 }
-            }
-        );
-        
+            });
+
         co_return 0;
-        
+
     } catch (const std::exception& e) {
         std::cerr << "Exception in sendTensorAsync: " << e.what() << std::endl;
         co_return -1;
     }
 }
 
-int CoroRPCCommunicator::receiveData(const std::string& source_address, void* buffer, size_t buffer_size, int timeout_ms) {
-    auto result = async_simple::coro::syncAwait(receiveDataAsync(source_address, timeout_ms));
+int CoroRPCCommunicator::receiveData(const std::string& source_address,
+                                     void* buffer, size_t buffer_size,
+                                     int timeout_ms) {
+    auto result = async_simple::coro::syncAwait(
+        receiveDataAsync(source_address, timeout_ms));
     // TODO: Copy result to buffer and return size
     return 0;
 }
 
-async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(const std::string& source_address, int timeout_ms) {
+async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(
+    const std::string& source_address, int timeout_ms) {
     // TODO: Implement actual receive logic
     co_return std::string();
 }
 
-void CoroRPCCommunicator::Impl::handleDataTransfer(coro_rpc::context<void> context, std::string_view data) {
-    std::cout << "Handling data transfer: " << data.size() << " bytes" << std::endl;
-    
+void CoroRPCCommunicator::Impl::handleDataTransfer(
+    coro_rpc::context<void> context, std::string_view data) {
+    std::cout << "Handling data transfer: " << data.size() << " bytes"
+              << std::endl;
+
     // Call the data receive callback if set
     if (data_receive_callback) {
         std::cout << "Calling data receive callback..." << std::endl;
-        std::string source_address = "unknown"; // You may want to extract this from context
+        std::string source_address =
+            "unknown";  // You may want to extract this from context
         std::string data_str(data);
         data_receive_callback(source_address, data_str);
     } else {
         std::cout << "No data receive callback set!" << std::endl;
     }
-    
+
     context.response_msg();
 }
 
-void CoroRPCCommunicator::Impl::handleTensorTransfer(coro_rpc::context<void> context) {
+void CoroRPCCommunicator::Impl::handleTensorTransfer(
+    coro_rpc::context<void> context) {
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
-    
-    std::cout << "Handling tensor transfer: " << attachment.size() << " bytes" << std::endl;
-    
+
+    std::cout << "Handling tensor transfer: " << attachment.size() << " bytes"
+              << std::endl;
+
     ctx_info->set_response_attachment(attachment);
     context.response_msg();
 }
 
-void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(coro_rpc::context<void> context, std::string_view data) {
+void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
+    coro_rpc::context<void> context, std::string_view data) {
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
-    
-    std::cout << "Handling data transfer with attachment - Data: " << data.size() 
-              << " bytes, Attachment: " << attachment.size() << " bytes" << std::endl;
-    
-    
+
+    std::cout << "Handling data transfer with attachment - Data: "
+              << data.size() << " bytes, Attachment: " << attachment.size()
+              << " bytes" << std::endl;
+
     context.response_msg();
 }
 
-void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(coro_rpc::context<void> context) {
+void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
+    coro_rpc::context<void> context) {
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
-    
-    std::cout << "Handling tensor transfer with attachment: " << attachment.size() << " bytes" << std::endl;
-    
+
+    std::cout << "Handling tensor transfer with attachment: "
+              << attachment.size() << " bytes" << std::endl;
+
     ctx_info->set_response_attachment(attachment);
     context.response_msg();
 }
 
-std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size, size_t timeout_seconds) {
+std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size,
+                                                      size_t timeout_seconds) {
     Config config;
     config.pool_size = pool_size;
     config.timeout_seconds = timeout_seconds;
-    
+
     auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
-        std::cout << "Created communicator with default pool size: " << pool_size << std::endl;
+        std::cout << "Created communicator with default pool size: "
+                  << pool_size << std::endl;
         return communicator;
     }
     return nullptr;
 }
 
-std::unique_ptr<CoroRPCCommunicator> createServer(const std::string& listen_address, size_t thread_count) {
+std::unique_ptr<CoroRPCCommunicator> createServer(
+    const std::string& listen_address, size_t thread_count) {
     Config config;
     config.listen_address = listen_address;
     config.thread_count = thread_count;
-    config.pool_size = 10; // Default pool size for server-side client pools
-    
+    config.pool_size = 10;  // Default pool size for server-side client pools
+
     auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
-        std::cout << "Created server communicator with pool size: " << config.pool_size << std::endl;
+        std::cout << "Created server communicator with pool size: "
+                  << config.pool_size << std::endl;
         return communicator;
     }
     return nullptr;
 }
 
-} // namespace mooncake
\ No newline at end of file
+}  // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index a4a2782e3..47f99fd14 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -25,15 +25,15 @@ enum class TensorDtype : int32_t {
 
 // Tensor metadata structure
 struct TensorMetadata {
-    int32_t dtype;   // TensorDtype enum value
-    int32_t ndim;    // Number of dimensions
-    int64_t shape[4]; // Shape array (max 4D)
-    char padding[32]; // For future extensions
+    int32_t dtype;     // TensorDtype enum value
+    int32_t ndim;      // Number of dimensions
+    int64_t shape[4];  // Shape array (max 4D)
+    char padding[32];  // For future extensions
 };
 
 // Implementation class
 class CoroRPCInterface::Impl {
-public:
+   public:
     std::unique_ptr<CoroRPCCommunicator> communicator;
     pybind11::function data_receive_callback;
     pybind11::function tensor_receive_callback;
@@ -42,101 +42,137 @@ class CoroRPCInterface::Impl {
 // Helper function to get tensor dtype from Python tensor
 TensorDtype get_tensor_dtype(const pybind11::object& dtype_obj) {
     std::string dtype_str = dtype_obj.attr("__str__")().cast<std::string>();
-    
-    if (dtype_str.find("float16") != std::string::npos) return TensorDtype::FLOAT16;
-    if (dtype_str.find("float32") != std::string::npos) return TensorDtype::FLOAT32;
-    if (dtype_str.find("float64") != std::string::npos) return TensorDtype::FLOAT64;
+
+    if (dtype_str.find("float16") != std::string::npos)
+        return TensorDtype::FLOAT16;
+    if (dtype_str.find("float32") != std::string::npos)
+        return TensorDtype::FLOAT32;
+    if (dtype_str.find("float64") != std::string::npos)
+        return TensorDtype::FLOAT64;
     if (dtype_str.find("int8") != std::string::npos) return TensorDtype::INT8;
     if (dtype_str.find("int16") != std::string::npos) return TensorDtype::INT16;
     if (dtype_str.find("int32") != std::string::npos) return TensorDtype::INT32;
     if (dtype_str.find("int64") != std::string::npos) return TensorDtype::INT64;
     if (dtype_str.find("uint8") != std::string::npos) return TensorDtype::UINT8;
     if (dtype_str.find("bool") != std::string::npos) return TensorDtype::BOOL;
-    
+
     return TensorDtype::UNKNOWN;
 }
 
 size_t get_dtype_size(TensorDtype dtype) {
     switch (dtype) {
-        case TensorDtype::FLOAT32: return 4;
-        case TensorDtype::FLOAT64: return 8;
-        case TensorDtype::INT32: return 4;
-        case TensorDtype::INT64: return 8;
-        case TensorDtype::INT8: return 1;
-        case TensorDtype::UINT8: return 1;
-        case TensorDtype::FLOAT16: return 2;
-        case TensorDtype::INT16: return 2;
-        case TensorDtype::BOOL: return 1;
-        default: return 0;
+        case TensorDtype::FLOAT32:
+            return 4;
+        case TensorDtype::FLOAT64:
+            return 8;
+        case TensorDtype::INT32:
+            return 4;
+        case TensorDtype::INT64:
+            return 8;
+        case TensorDtype::INT8:
+            return 1;
+        case TensorDtype::UINT8:
+            return 1;
+        case TensorDtype::FLOAT16:
+            return 2;
+        case TensorDtype::INT16:
+            return 2;
+        case TensorDtype::BOOL:
+            return 1;
+        default:
+            return 0;
     }
 }
 
 // Helper function to create numpy array from data
-pybind11::object create_numpy_array_from_data(const char* data, TensorDtype dtype, 
-                                               const std::vector<int64_t>& shape) {
+pybind11::object create_numpy_array_from_data(
+    const char* data, TensorDtype dtype, const std::vector<int64_t>& shape) {
     std::cout << "DEBUG: create_numpy_array_from_data called" << std::endl;
     std::cout << "DEBUG: dtype = " << static_cast<int>(dtype) << std::endl;
     std::cout << "DEBUG: shape size = " << shape.size() << std::endl;
-    
+
     pybind11::gil_scoped_acquire acquire;
-    
+
     std::cout << "DEBUG: About to import numpy..." << std::endl;
     pybind11::module_ np = pybind11::module_::import("numpy");
     std::cout << "DEBUG: Successfully imported numpy" << std::endl;
-    
+
     std::string np_dtype;
     switch (dtype) {
-        case TensorDtype::FLOAT32: np_dtype = "float32"; break;
-        case TensorDtype::FLOAT64: np_dtype = "float64"; break;
-        case TensorDtype::INT32: np_dtype = "int32"; break;
-        case TensorDtype::INT64: np_dtype = "int64"; break;
-        case TensorDtype::INT8: np_dtype = "int8"; break;
-        case TensorDtype::UINT8: np_dtype = "uint8"; break;
-        case TensorDtype::FLOAT16: np_dtype = "float16"; break;
-        case TensorDtype::INT16: np_dtype = "int16"; break;
-        case TensorDtype::BOOL: np_dtype = "bool"; break;
-        default: 
+        case TensorDtype::FLOAT32:
+            np_dtype = "float32";
+            break;
+        case TensorDtype::FLOAT64:
+            np_dtype = "float64";
+            break;
+        case TensorDtype::INT32:
+            np_dtype = "int32";
+            break;
+        case TensorDtype::INT64:
+            np_dtype = "int64";
+            break;
+        case TensorDtype::INT8:
+            np_dtype = "int8";
+            break;
+        case TensorDtype::UINT8:
+            np_dtype = "uint8";
+            break;
+        case TensorDtype::FLOAT16:
+            np_dtype = "float16";
+            break;
+        case TensorDtype::INT16:
+            np_dtype = "int16";
+            break;
+        case TensorDtype::BOOL:
+            np_dtype = "bool";
+            break;
+        default:
             throw std::runtime_error("Unknown tensor dtype");
     }
-    
+
     std::cout << "DEBUG: np_dtype = " << np_dtype << std::endl;
-    
+
     size_t element_size = get_dtype_size(dtype);
     size_t total_elements = 1;
     for (int64_t dim : shape) {
         total_elements *= dim;
     }
-    
+
     std::cout << "DEBUG: element_size = " << element_size << std::endl;
     std::cout << "DEBUG: total_elements = " << total_elements << std::endl;
-    
+
     // Create a copy of the data
     std::cout << "DEBUG: Creating data copy..." << std::endl;
     std::vector<char> data_copy(data, data + total_elements * element_size);
-    std::cout << "DEBUG: Data copy created, size = " << data_copy.size() << std::endl;
-    
+    std::cout << "DEBUG: Data copy created, size = " << data_copy.size()
+              << std::endl;
+
     std::cout << "DEBUG: About to call frombuffer..." << std::endl;
-    
+
     try {
         pybind11::bytes bytes_obj(data_copy.data(), data_copy.size());
         std::cout << "DEBUG: Created bytes object" << std::endl;
-        
-        pybind11::object array = np.attr("frombuffer")(bytes_obj, pybind11::arg("dtype")=np_dtype);
-        std::cout << "DEBUG: Created array from buffer successfully" << std::endl;
-        
+
+        pybind11::object array =
+            np.attr("frombuffer")(bytes_obj, pybind11::arg("dtype") = np_dtype);
+        std::cout << "DEBUG: Created array from buffer successfully"
+                  << std::endl;
+
         // Convert shape to tuple manually
         pybind11::tuple shape_tuple = pybind11::tuple(shape.size());
         for (size_t i = 0; i < shape.size(); ++i) {
             shape_tuple[i] = shape[i];
         }
-        std::cout << "DEBUG: About to create shape tuple for reshape" << std::endl;
-        
+        std::cout << "DEBUG: About to create shape tuple for reshape"
+                  << std::endl;
+
         pybind11::object result = array.attr("reshape")(shape_tuple);
         std::cout << "DEBUG: Reshaped array successfully" << std::endl;
-        
+
         return result;
     } catch (const std::exception& e) {
-        std::cout << "DEBUG: Exception in numpy operations: " << e.what() << std::endl;
+        std::cout << "DEBUG: Exception in numpy operations: " << e.what()
+                  << std::endl;
         throw;
     }
 }
@@ -148,16 +184,15 @@ CoroRPCInterface::CoroRPCInterface() : impl_(std::make_unique<Impl>()) {}
 CoroRPCInterface::~CoroRPCInterface() = default;
 
 // Initialize
-bool CoroRPCInterface::initialize(const std::string& local_address, 
-                                  size_t thread_count, 
-                                  size_t timeout_seconds,
+bool CoroRPCInterface::initialize(const std::string& local_address,
+                                  size_t thread_count, size_t timeout_seconds,
                                   size_t pool_size) {
     Config config;
     config.listen_address = local_address;
     config.thread_count = thread_count;
     config.timeout_seconds = timeout_seconds;
     config.pool_size = pool_size;
-    
+
     impl_->communicator = std::make_unique<CoroRPCCommunicator>();
     return impl_->communicator->initialize(config);
 }
@@ -178,40 +213,28 @@ void CoroRPCInterface::stopServer() {
     }
 }
 
-bool CoroRPCInterface::addRemoteConnection(const std::string& remote_address) {
-    // client_pools 自动管理连接，不需要手动添加
-    std::cout << "Remote connection for " << remote_address << " will be managed automatically" << std::endl;
-    return true;
-}
 
-void CoroRPCInterface::removeRemoteConnection(const std::string& remote_address) {
-    // client_pools 自动管理连接，不需要手动移除
-    std::cout << "Remote connection for " << remote_address << " is managed automatically" << std::endl;
-}
-
-bool CoroRPCInterface::isConnected(const std::string& remote_address) {
-    // client_pools 会自动建立连接，总是返回 true
-    return true;
-}
 
-int CoroRPCInterface::sendData(const std::string& target_address, pybind11::bytes data) {
+int CoroRPCInterface::sendData(const std::string& target_address,
+                               pybind11::bytes data) {
     if (!impl_->communicator) return -1;
-    
+
     pybind11::gil_scoped_acquire acquire;
-    
+
     std::string_view data_view = data;
     pybind11::gil_scoped_release release;
-    return impl_->communicator->sendData(target_address, data_view.data(), data_view.size());
+    return impl_->communicator->sendData(target_address, data_view.data(),
+                                         data_view.size());
 }
 
-pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address, 
-                                                  pybind11::bytes data, 
-                                                  pybind11::handle loop) {
+pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
+                                                 pybind11::bytes data,
+                                                 pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
-    
+
     auto future_module = pybind11::module_::import("asyncio");
     auto future_obj = future_module.attr("Future")();
-    
+
     if (!impl_->communicator) {
         future_obj.attr("set_exception")(pybind11::make_tuple(
             pybind11::str("Communicator not initialized")));
@@ -220,15 +243,18 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
 
     auto communicator = impl_->communicator.get();
     auto target_addr = std::move(target_address);
-    
+
     std::string data_str = data;
-    
+
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    pybind11::object loop_obj = pybind11::reinterpret_borrow<pybind11::object>(loop);
+    pybind11::object loop_obj =
+        pybind11::reinterpret_borrow<pybind11::object>(loop);
 
-    auto coro_lambda = [communicator, target_addr, data_str, future_ptr, loop_obj]() -> async_simple::coro::Lazy<void> {
+    auto coro_lambda = [communicator, target_addr, data_str, future_ptr,
+                        loop_obj]() -> async_simple::coro::Lazy<void> {
         try {
-            auto result_struct = co_await communicator->sendDataAsync(target_addr, data_str.data(), data_str.size());
+            auto result_struct = co_await communicator->sendDataAsync(
+                target_addr, data_str.data(), data_str.size());
             int result = result_struct.code;
 
             auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
@@ -246,8 +272,9 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
         } catch (const std::exception& e) {
             auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
                 pybind11::gil_scoped_acquire acquire;
-                future_ptr->attr("set_exception")(pybind11::make_tuple(
-                    pybind11::str(std::string("Send data error: ") + e.what())));
+                future_ptr->attr("set_exception")(
+                    pybind11::make_tuple(pybind11::str(
+                        std::string("Send data error: ") + e.what())));
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
@@ -256,82 +283,94 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
     };
 
     auto lazy = coro_lambda();
-    lazy.start([](auto &&result) {
-       if (result.hasError()) {
+    lazy.start([](auto&& result) {
+        if (result.hasError()) {
             std::cerr << "Coroutine completed with error" << std::endl;
         }
     });
-    
+
     return future_obj;
 }
 
-int CoroRPCInterface::sendTensor(const std::string& target_address, pybind11::handle tensor) {
+int CoroRPCInterface::sendTensor(const std::string& target_address,
+                                 pybind11::handle tensor) {
     if (!impl_->communicator) return -1;
-    
+
     try {
         TensorInfo tensor_info;
-        
+
         {
             pybind11::gil_scoped_acquire acquire;
-            pybind11::object tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
-            
+            pybind11::object tensor_obj =
+                pybind11::reinterpret_borrow<pybind11::object>(tensor);
+
             // Validate tensor type
-            if (!(tensor_obj.attr("__class__").attr("__name__").cast<std::string>().find("Tensor") != std::string::npos)) {
+            if (!(tensor_obj.attr("__class__")
+                      .attr("__name__")
+                      .cast<std::string>()
+                      .find("Tensor") != std::string::npos)) {
                 std::cerr << "Input is not a tensor" << std::endl;
                 return -1;
             }
-            
-            // Extract tensor properties - zero copy, just get pointers and metadata
-            uintptr_t data_ptr = tensor_obj.attr("data_ptr")().cast<uintptr_t>();
+
+            // Extract tensor properties - zero copy, just get pointers and
+            // metadata
+            uintptr_t data_ptr =
+                tensor_obj.attr("data_ptr")().cast<uintptr_t>();
             size_t numel = tensor_obj.attr("numel")().cast<size_t>();
-            size_t element_size = tensor_obj.attr("element_size")().cast<size_t>();
+            size_t element_size =
+                tensor_obj.attr("element_size")().cast<size_t>();
             size_t tensor_size = numel * element_size;
-            
+
             // Get tensor shape
             pybind11::object shape_obj = tensor_obj.attr("shape");
-            pybind11::tuple shape_tuple = pybind11::cast<pybind11::tuple>(shape_obj);
+            pybind11::tuple shape_tuple =
+                pybind11::cast<pybind11::tuple>(shape_obj);
             std::vector<size_t> shape;
             for (size_t i = 0; i < shape_tuple.size(); i++) {
                 shape.push_back(shape_tuple[i].cast<size_t>());
             }
-            
+
             // Get tensor dtype string
             pybind11::object dtype_obj = tensor_obj.attr("dtype");
             std::string dtype = dtype_obj.attr("__str__")().cast<std::string>();
-            
+
             // Fill TensorInfo structure (no data copying, just metadata)
             tensor_info.data_ptr = reinterpret_cast<void*>(data_ptr);
             tensor_info.total_bytes = tensor_size;
             tensor_info.shape = std::move(shape);
             tensor_info.dtype = std::move(dtype);
-            
+
             std::cout << "Sending tensor with shape: [";
             for (size_t i = 0; i < tensor_info.shape.size(); i++) {
                 std::cout << tensor_info.shape[i];
                 if (i < tensor_info.shape.size() - 1) std::cout << ", ";
             }
-            std::cout << "] and dtype: " << tensor_info.dtype << ", tensor size: " << tensor_info.total_bytes << " bytes" << std::endl;
+            std::cout << "] and dtype: " << tensor_info.dtype
+                      << ", tensor size: " << tensor_info.total_bytes
+                      << " bytes" << std::endl;
         }
 
         // Use the async version which supports zero-copy via attachments
         pybind11::gil_scoped_release release;
-        auto result = async_simple::coro::syncAwait(impl_->communicator->sendTensorAsync(target_address, tensor_info));
+        auto result = async_simple::coro::syncAwait(
+            impl_->communicator->sendTensorAsync(target_address, tensor_info));
         return result;
-        
+
     } catch (const std::exception& e) {
         std::cerr << "Send tensor error: " << e.what() << std::endl;
         return -1;
     }
 }
 
-pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address, 
-                                                    pybind11::handle tensor, 
-                                                    pybind11::handle loop) {
+pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
+                                                   pybind11::handle tensor,
+                                                   pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
-    
+
     auto future_module = pybind11::module_::import("asyncio");
     auto future_obj = future_module.attr("Future")();
-    
+
     if (!impl_->communicator) {
         future_obj.attr("set_exception")(pybind11::make_tuple(
             pybind11::str("Communicator not initialized")));
@@ -340,14 +379,15 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
 
     auto communicator = impl_->communicator.get();
     std::string target_addr = std::move(target_address);
-    
+
     // Extract tensor info
-    pybind11::object tensor_obj = pybind11::reinterpret_borrow<pybind11::object>(tensor);
+    pybind11::object tensor_obj =
+        pybind11::reinterpret_borrow<pybind11::object>(tensor);
     uintptr_t data_ptr = tensor_obj.attr("data_ptr")().cast<uintptr_t>();
     size_t numel = tensor_obj.attr("numel")().cast<size_t>();
     size_t element_size = tensor_obj.attr("element_size")().cast<size_t>();
     size_t tensor_size = numel * element_size;
-    
+
     // Get tensor shape and dtype
     pybind11::object shape_obj = tensor_obj.attr("shape");
     pybind11::tuple shape_tuple = pybind11::cast<pybind11::tuple>(shape_obj);
@@ -355,25 +395,28 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
     for (size_t i = 0; i < shape_tuple.size(); i++) {
         shape.push_back(shape_tuple[i].cast<size_t>());
     }
-    
+
     pybind11::object dtype_obj = tensor_obj.attr("dtype");
     std::string dtype = dtype_obj.attr("__str__")().cast<std::string>();
-    
+
     // Create TensorInfo on stack (no need for shared_ptr)
     TensorInfo tensor_info;
     tensor_info.data_ptr = reinterpret_cast<void*>(data_ptr);
     tensor_info.total_bytes = tensor_size;
     tensor_info.shape = std::move(shape);
     tensor_info.dtype = std::move(dtype);
-    
+
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    pybind11::object loop_obj = pybind11::reinterpret_borrow<pybind11::object>(loop);
+    pybind11::object loop_obj =
+        pybind11::reinterpret_borrow<pybind11::object>(loop);
 
     // Schedule coroutine to run asynchronously
-    auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr, loop_obj]() -> async_simple::coro::Lazy<void> {
+    auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr,
+                        loop_obj]() -> async_simple::coro::Lazy<void> {
         try {
             // Call the async version which returns a coroutine
-            auto result = co_await communicator->sendTensorAsync(target_addr, tensor_info);
+            auto result = co_await communicator->sendTensorAsync(target_addr,
+                                                                 tensor_info);
 
             auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
                 pybind11::gil_scoped_acquire acquire;
@@ -390,8 +433,9 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
         } catch (const std::exception& e) {
             auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
                 pybind11::gil_scoped_acquire acquire;
-                future_ptr->attr("set_exception")(pybind11::make_tuple(
-                    pybind11::str(std::string("Send tensor error: ") + e.what())));
+                future_ptr->attr("set_exception")(
+                    pybind11::make_tuple(pybind11::str(
+                        std::string("Send tensor error: ") + e.what())));
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
@@ -401,61 +445,68 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
 
     // Start the coroutine in a detached manner (fire and forget)
     auto lazy = coro_lambda();
-    lazy.start([](auto &&result) {
+    lazy.start([](auto&& result) {
         // This callback will be called when the coroutine completes
-        // We don't need to do anything here since the result is handled in the coroutine itself
+        // We don't need to do anything here since the result is handled in the
+        // coroutine itself
         if (result.hasError()) {
             // Log error if needed
             std::cerr << "Tensor coroutine completed with error" << std::endl;
         }
     });
-    
+
     return future_obj;
 }
 
 void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->data_receive_callback = callback;
-    
+
     if (impl_->communicator) {
         auto interface_ptr = this;
         impl_->communicator->setDataReceiveCallback(
-            [interface_ptr](const std::string& source, const std::string& data) {
+            [interface_ptr](const std::string& source,
+                            const std::string& data) {
                 interface_ptr->handleIncomingData(source, data);
-            }
-        );
+            });
     }
 }
 
 void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->tensor_receive_callback = callback;
-    
+
     if (impl_->communicator) {
         auto interface_ptr = this;
         impl_->communicator->setDataReceiveCallback(
-            [interface_ptr](const std::string& source, const std::string& data) {
+            [interface_ptr](const std::string& source,
+                            const std::string& data) {
                 interface_ptr->handleIncomingData(source, data);
-            }
-        );
+            });
     }
 }
 
-void CoroRPCInterface::handleIncomingData(const std::string& source, const std::string& data) {
-    std::cout << "CoroRPCInterface::handleIncomingData called with " << data.size() << " bytes" << std::endl;
-    
+void CoroRPCInterface::handleIncomingData(const std::string& source,
+                                          const std::string& data) {
+    std::cout << "CoroRPCInterface::handleIncomingData called with "
+              << data.size() << " bytes" << std::endl;
+
     // Check if this is tensor data by looking for metadata signature
     if (data.size() >= sizeof(TensorMetadata)) {
-        const TensorMetadata* metadata = reinterpret_cast<const TensorMetadata*>(data.data());
-        
-        std::cout << "Checking tensor metadata: dtype=" << metadata->dtype << ", ndim=" << metadata->ndim << std::endl;
-        
+        const TensorMetadata* metadata =
+            reinterpret_cast<const TensorMetadata*>(data.data());
+
+        std::cout << "Checking tensor metadata: dtype=" << metadata->dtype
+                  << ", ndim=" << metadata->ndim << std::endl;
+
         // Basic validation: check if dtype is in valid range
-        if (metadata->dtype > 0 && metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) && 
+        if (metadata->dtype > 0 &&
+            metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) &&
             metadata->ndim >= 0 && metadata->ndim <= 4) {
-            
-            std::cout << "Data recognized as tensor, calling handleIncomingTensor" << std::endl;
-            
+            std::cout
+                << "Data recognized as tensor, calling handleIncomingTensor"
+                << std::endl;
+
             // This looks like tensor data, handle it as such
             std::vector<size_t> shape;
             for (int i = 0; i < metadata->ndim; i++) {
@@ -463,72 +514,94 @@ void CoroRPCInterface::handleIncomingData(const std::string& source, const std::
                     shape.push_back(static_cast<size_t>(metadata->shape[i]));
                 }
             }
-            
+
             // Get dtype name
             std::string dtype_name;
             switch (static_cast<TensorDtype>(metadata->dtype)) {
-                case TensorDtype::FLOAT16: dtype_name = "float16"; break;
-                case TensorDtype::FLOAT32: dtype_name = "float32"; break;
-                case TensorDtype::FLOAT64: dtype_name = "float64"; break;
-                case TensorDtype::INT8: dtype_name = "int8"; break;
-                case TensorDtype::INT16: dtype_name = "int16"; break;
-                case TensorDtype::INT32: dtype_name = "int32"; break;
-                case TensorDtype::INT64: dtype_name = "int64"; break;
-                case TensorDtype::UINT8: dtype_name = "uint8"; break;
-                case TensorDtype::BOOL: dtype_name = "bool"; break;
-                default: dtype_name = "unknown"; break;
+                case TensorDtype::FLOAT16:
+                    dtype_name = "float16";
+                    break;
+                case TensorDtype::FLOAT32:
+                    dtype_name = "float32";
+                    break;
+                case TensorDtype::FLOAT64:
+                    dtype_name = "float64";
+                    break;
+                case TensorDtype::INT8:
+                    dtype_name = "int8";
+                    break;
+                case TensorDtype::INT16:
+                    dtype_name = "int16";
+                    break;
+                case TensorDtype::INT32:
+                    dtype_name = "int32";
+                    break;
+                case TensorDtype::INT64:
+                    dtype_name = "int64";
+                    break;
+                case TensorDtype::UINT8:
+                    dtype_name = "uint8";
+                    break;
+                case TensorDtype::BOOL:
+                    dtype_name = "bool";
+                    break;
+                default:
+                    dtype_name = "unknown";
+                    break;
             }
-            
+
             // Call tensor handler instead of data handler
             handleIncomingTensor(source, data, shape, dtype_name);
             return;
         }
     }
-    
+
     // Handle as regular data if not tensor data
     if (!impl_->data_receive_callback) return;
-    
+
     try {
         pybind11::gil_scoped_acquire acquire;
         pybind11::dict received;
         received["source"] = source;
         received["data"] = pybind11::bytes(data);
-        
+
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
-        std::cerr << "Error in data receive callback: " << e.what() << std::endl;
+        std::cerr << "Error in data receive callback: " << e.what()
+                  << std::endl;
     }
 }
 
-void CoroRPCInterface::handleIncomingTensor(const std::string& source, 
+void CoroRPCInterface::handleIncomingTensor(const std::string& source,
                                             const std::string& data,
-                                            const std::vector<size_t>& shape, 
+                                            const std::vector<size_t>& shape,
                                             const std::string& dtype) {
     std::cout << "CoroRPCInterface::handleIncomingTensor called" << std::endl;
     std::cout << "  source: " << source << std::endl;
     std::cout << "  data size: " << data.size() << std::endl;
     std::cout << "  dtype: " << dtype << std::endl;
     std::cout << "  shape size: " << shape.size() << std::endl;
-    
+
     if (!impl_->tensor_receive_callback) {
         std::cout << "No tensor receive callback set!" << std::endl;
         return;
     }
-    
+
     std::cout << "Calling Python tensor receive callback..." << std::endl;
-    
+
     try {
         pybind11::gil_scoped_acquire acquire;
-        
+
         ReceivedTensor received;
         received.source_address = source;
         received.data = data;
         received.shape = shape;
         received.dtype = dtype;
-        
+
         impl_->tensor_receive_callback(received);
     } catch (const std::exception& e) {
-        std::cerr << "Error in tensor receive callback: " << e.what() << std::endl;
+        std::cerr << "Error in tensor receive callback: " << e.what()
+                  << std::endl;
     }
 }
 
@@ -537,96 +610,110 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
     return rebuildTensorInternal();
 }
 
-pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal() const {
+pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal()
+    const {
     std::cout << "DEBUG: Starting rebuildTensorInternal" << std::endl;
     std::cout << "DEBUG: Data size: " << data.size() << " bytes" << std::endl;
-    std::cout << "DEBUG: TensorMetadata size: " << sizeof(TensorMetadata) << " bytes" << std::endl;
-    
+    std::cout << "DEBUG: TensorMetadata size: " << sizeof(TensorMetadata)
+              << " bytes" << std::endl;
+
     if (data.size() < sizeof(TensorMetadata)) {
         throw std::runtime_error("Data too small to contain tensor metadata");
     }
-    
+
     // Extract metadata
     TensorMetadata metadata;
     std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
-    
-    std::cout << "DEBUG: Extracted metadata - dtype: " << metadata.dtype << ", ndim: " << metadata.ndim << std::endl;
-    
+
+    std::cout << "DEBUG: Extracted metadata - dtype: " << metadata.dtype
+              << ", ndim: " << metadata.ndim << std::endl;
+
     // Validate metadata
     if (metadata.ndim < 0 || metadata.ndim > 4) {
         throw std::runtime_error("Invalid tensor dimensions");
     }
-    
+
     TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
     size_t element_size = get_dtype_size(dtype_enum);
     if (element_size == 0) {
         throw std::runtime_error("Unsupported tensor dtype");
     }
-    
-    std::cout << "DEBUG: Element size: " << element_size << " bytes" << std::endl;
-    
+
+    std::cout << "DEBUG: Element size: " << element_size << " bytes"
+              << std::endl;
+
     // Extract shape
     std::vector<int64_t> tensor_shape;
     size_t total_elements = 1;
     for (int i = 0; i < metadata.ndim; i++) {
         tensor_shape.push_back(metadata.shape[i]);
         total_elements *= metadata.shape[i];
-        std::cout << "DEBUG: Shape[" << i << "] = " << metadata.shape[i] << std::endl;
+        std::cout << "DEBUG: Shape[" << i << "] = " << metadata.shape[i]
+                  << std::endl;
     }
-    
+
     std::cout << "DEBUG: Total elements: " << total_elements << std::endl;
-    
+
     // Validate data size
     size_t expected_data_size = total_elements * element_size;
     size_t actual_data_size = data.size() - sizeof(TensorMetadata);
-    
-    std::cout << "DEBUG: Expected data size: " << expected_data_size << " bytes" << std::endl;
-    std::cout << "DEBUG: Actual data size: " << actual_data_size << " bytes" << std::endl;
-    
+
+    std::cout << "DEBUG: Expected data size: " << expected_data_size << " bytes"
+              << std::endl;
+    std::cout << "DEBUG: Actual data size: " << actual_data_size << " bytes"
+              << std::endl;
+
     if (actual_data_size != expected_data_size) {
         throw std::runtime_error("Data size mismatch with tensor metadata");
     }
-    
+
     // Create numpy array from raw data
     const char* tensor_data = data.data() + sizeof(TensorMetadata);
     std::cout << "DEBUG: About to create numpy array..." << std::endl;
-    std::cout << "DEBUG: Data pointer: " << static_cast<const void*>(tensor_data) << std::endl;
-    std::cout << "DEBUG: Base data pointer: " << static_cast<const void*>(data.data()) << std::endl;
+    std::cout << "DEBUG: Data pointer: "
+              << static_cast<const void*>(tensor_data) << std::endl;
+    std::cout << "DEBUG: Base data pointer: "
+              << static_cast<const void*>(data.data()) << std::endl;
     std::cout << "DEBUG: Offset: " << sizeof(TensorMetadata) << std::endl;
-    
+
     // Check first few bytes of tensor data
     std::cout << "DEBUG: First few bytes of tensor data: ";
     for (int i = 0; i < std::min(16, static_cast<int>(actual_data_size)); ++i) {
         std::cout << std::hex << (unsigned char)tensor_data[i] << " ";
     }
     std::cout << std::dec << std::endl;
-    
+
     try {
-        pybind11::object numpy_array = create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
+        pybind11::object numpy_array =
+            create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
         std::cout << "DEBUG: Successfully created numpy array" << std::endl;
-        
+
         // Convert to PyTorch tensor
-        std::cout << "DEBUG: About to convert to PyTorch tensor..." << std::endl;
+        std::cout << "DEBUG: About to convert to PyTorch tensor..."
+                  << std::endl;
         pybind11::module_ torch = pybind11::module_::import("torch");
         pybind11::object result = torch.attr("from_numpy")(numpy_array);
         std::cout << "DEBUG: Successfully created PyTorch tensor" << std::endl;
-        
+
         return result;
     } catch (const std::exception& e) {
-        std::cout << "DEBUG: Error in tensor creation: " << e.what() << std::endl;
+        std::cout << "DEBUG: Error in tensor creation: " << e.what()
+                  << std::endl;
         throw;
     }
 }
 
 // Factory functions for creating RPC client and server
-std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank, uint64_t world_size) {
+std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank,
+                                                  uint64_t world_size) {
     auto client = std::make_unique<CoroRPCInterface>();
     // Initialize client with default settings
     client->initialize("", 0, 30, 10);
     return client;
 }
 
-std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank, uint64_t world_size) {
+std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank,
+                                                  uint64_t world_size) {
     auto server = std::make_unique<CoroRPCInterface>();
     // Initialize server with default settings
     server->initialize("0.0.0.0:8080", 0, 30, 10);

From 08146aa38bf12c45859bc686c1dc9bda5a736756 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 15:10:19 +0800
Subject: [PATCH 13/64] fixed clang spelling

---
 .../transfer_engine/transfer_engine_py.cpp    | 47 +++++++++++--------
 .../coro_rpc_connector/cororpc_interface.cpp  |  2 -
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index e32bd7206..a19c987a0 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -650,45 +650,53 @@ void bind_coro_rpc_interface(py::module_ &m);
 // Implementation of coro_rpc_interface binding function
 void bind_coro_rpc_interface(py::module_ &m) {
     using namespace mooncake;
-    
+
     py::class_<CoroRPCInterface::ReceivedData>(m, "ReceivedData")
         .def(py::init<>())
-        .def_readonly("source_address", &CoroRPCInterface::ReceivedData::source_address)
+        .def_readonly("source_address",
+                      &CoroRPCInterface::ReceivedData::source_address)
         .def_readonly("data_size", &CoroRPCInterface::ReceivedData::data_size)
         .def("get_bytes", &CoroRPCInterface::ReceivedData::getBytes);
-    
+
     py::class_<CoroRPCInterface::ReceivedTensor>(m, "ReceivedTensor")
         .def(py::init<>())
-        .def_readonly("source_address", &CoroRPCInterface::ReceivedTensor::source_address)
+        .def_readonly("source_address",
+                      &CoroRPCInterface::ReceivedTensor::source_address)
         .def_readonly("shape", &CoroRPCInterface::ReceivedTensor::shape)
         .def_readonly("dtype", &CoroRPCInterface::ReceivedTensor::dtype)
-        .def_readonly("total_bytes", &CoroRPCInterface::ReceivedTensor::total_bytes)
+        .def_readonly("total_bytes",
+                      &CoroRPCInterface::ReceivedTensor::total_bytes)
         .def("get_data_size", &CoroRPCInterface::ReceivedTensor::getDataSize)
-        .def("get_data_as_bytes", &CoroRPCInterface::ReceivedTensor::getDataAsBytes)
-        .def("rebuild_tensor", &CoroRPCInterface::ReceivedTensor::rebuildTensor);
-    
+        .def("get_data_as_bytes",
+             &CoroRPCInterface::ReceivedTensor::getDataAsBytes)
+        .def("rebuild_tensor",
+             &CoroRPCInterface::ReceivedTensor::rebuildTensor);
+
     py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
         .def(py::init<>())
         .def("initialize", &CoroRPCInterface::initialize,
-             "listen_address"_a="", "thread_count"_a=0, 
-             "timeout_seconds"_a=30, "pool_size"_a=10)
+             "listen_address"_a = "", "thread_count"_a = 0,
+             "timeout_seconds"_a = 30, "pool_size"_a = 10)
         .def("start_server", &CoroRPCInterface::startServer)
         .def("start_server_async", &CoroRPCInterface::startServerAsync)
         .def("stop_server", &CoroRPCInterface::stopServer)
         .def("add_remote_connection", &CoroRPCInterface::addRemoteConnection)
-        .def("remove_remote_connection", &CoroRPCInterface::removeRemoteConnection)
+        .def("remove_remote_connection",
+             &CoroRPCInterface::removeRemoteConnection)
         .def("is_connected", &CoroRPCInterface::isConnected)
         .def("send_data", &CoroRPCInterface::sendData)
         .def("send_data_async", &CoroRPCInterface::sendDataAsync)
         .def("send_tensor", &CoroRPCInterface::sendTensor)
         .def("send_tensor_async", &CoroRPCInterface::sendTensorAsync)
-        .def("set_data_receive_callback", &CoroRPCInterface::setDataReceiveCallback)
-        .def("set_tensor_receive_callback", &CoroRPCInterface::setTensorReceiveCallback);
-    
-    m.def("create_rpc_client", &createRPCClient, 
-          "pool_size"_a=10, "timeout_seconds"_a=30);
-    m.def("create_rpc_server", &createRPCServer,
-          "listen_address"_a, "thread_count"_a=0);
+        .def("set_data_receive_callback",
+             &CoroRPCInterface::setDataReceiveCallback)
+        .def("set_tensor_receive_callback",
+             &CoroRPCInterface::setTensorReceiveCallback);
+
+    m.def("create_rpc_client", &createRPCClient, "pool_size"_a = 10,
+          "timeout_seconds"_a = 30);
+    m.def("create_rpc_server", &createRPCServer, "listen_address"_a,
+          "thread_count"_a = 0);
 }
 
 PYBIND11_MODULE(engine, m) {
@@ -742,6 +750,7 @@ PYBIND11_MODULE(engine, m) {
     adaptor_cls.attr("TransferOpcode") = transfer_opcode;
 
     // Add coro_rpc_interface as a submodule
-    auto coro_rpc_submodule = m.def_submodule("coro_rpc_interface", "CoroRPC interface for communication");
+    auto coro_rpc_submodule = m.def_submodule(
+        "coro_rpc_interface", "CoroRPC interface for communication");
     bind_coro_rpc_interface(coro_rpc_submodule);
 }
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 47f99fd14..38bf947d0 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -213,8 +213,6 @@ void CoroRPCInterface::stopServer() {
     }
 }
 
-
-
 int CoroRPCInterface::sendData(const std::string& target_address,
                                pybind11::bytes data) {
     if (!impl_->communicator) return -1;

From 584e386bc1d320ecc91fd17f0b268351b420bc9a Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 15:21:12 +0800
Subject: [PATCH 14/64] fixed communicator spelling

---
 .../src/transport/coro_rpc_connector/cororpc_communicator.cpp  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 67f967366..24ea659b5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -10,8 +10,7 @@
 
 namespace mooncake {
 
-CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {
-}
+CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {}
 
 CoroRPCCommunicator::~CoroRPCCommunicator() { stopServer(); }
 

From e58e2c35476209d992eee3d78161ba736590e88e Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 15:29:38 +0800
Subject: [PATCH 15/64] fixed redundant functions in transfer engine py

---
 mooncake-integration/transfer_engine/transfer_engine_py.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index a19c987a0..bbe2c9766 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -680,10 +680,6 @@ void bind_coro_rpc_interface(py::module_ &m) {
         .def("start_server", &CoroRPCInterface::startServer)
         .def("start_server_async", &CoroRPCInterface::startServerAsync)
         .def("stop_server", &CoroRPCInterface::stopServer)
-        .def("add_remote_connection", &CoroRPCInterface::addRemoteConnection)
-        .def("remove_remote_connection",
-             &CoroRPCInterface::removeRemoteConnection)
-        .def("is_connected", &CoroRPCInterface::isConnected)
         .def("send_data", &CoroRPCInterface::sendData)
         .def("send_data_async", &CoroRPCInterface::sendDataAsync)
         .def("send_tensor", &CoroRPCInterface::sendTensor)

From 89535083180eaef0829878ae2627aa43ddc5e8d5 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 4 Sep 2025 16:16:38 +0800
Subject: [PATCH 16/64] fixed ci path

---
 scripts/run_tests.sh | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 12bf5e3e5..baeb6b7e3 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -16,10 +16,18 @@ MC_METADATA_SERVER=http://127.0.0.1:8080/metadata python transfer_engine_initiat
 kill $TARGET_PID || true
 
 echo "Running CoroRPC performance tests..."
-cd ../mooncake-transfer-engine/tests
-pip install torch numpy
-python test_coro_rpc_performance.py
-cd ../../mooncake-wheel/tests
+# Check if we're in CI environment or if the test file exists
+if [ -f "../mooncake-transfer-engine/tests/test_coro_rpc_performance.py" ]; then
+    cd ../mooncake-transfer-engine/tests
+    pip install torch numpy
+    python test_coro_rpc_performance.py
+    cd ../../mooncake-wheel/tests
+else
+    echo "WARNING: CoroRPC performance test not found, skipping..."
+    echo "Current directory: $(pwd)"
+    echo "Looking for: ../mooncake-transfer-engine/tests/test_coro_rpc_performance.py"
+    ls -la ../mooncake-transfer-engine/tests/ 2>/dev/null || echo "Directory ../mooncake-transfer-engine/tests/ does not exist"
+fi
 
 echo "Running master tests..."
 

From b2afec275404d8edd8a05efe5bf6d5cdccb7f153 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 15:23:15 +0800
Subject: [PATCH 17/64] fixed memory copy bugs, removed client pool creation

---
 .../coro_rpc_connector/cororpc_communicator.h |   5 +-
 .../coro_rpc_connector/cororpc_interface.h    |   7 +-
 .../cororpc_communicator.cpp                  | 119 ++++++++----------
 .../coro_rpc_connector/cororpc_interface.cpp  |  98 ---------------
 4 files changed, 55 insertions(+), 174 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 04b5f5951..dd91ae13b 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -65,6 +65,7 @@ class CoroRPCCommunicator {
     ~CoroRPCCommunicator();
 
     bool initialize(const Config& config);
+    bool startServerImpl(bool is_async = true);
     bool startServer();
     bool startServerAsync();
     void stopServer();
@@ -85,7 +86,7 @@ class CoroRPCCommunicator {
         const std::string& source_address, int timeout_ms = -1);
 
     void setDataReceiveCallback(
-        std::function<void(const std::string&, const std::string&)> callback);
+        std::function<void(size_t len)> callback);
 
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
@@ -94,8 +95,6 @@ class CoroRPCCommunicator {
     std::shared_ptr<Impl> impl_;
 };
 
-std::unique_ptr<CoroRPCCommunicator> createClientPool(
-    size_t pool_size = 10, size_t timeout_seconds = 30);
 std::unique_ptr<CoroRPCCommunicator> createServer(
     const std::string& listen_address, size_t thread_count = 0);
 
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 74e5391f5..8e7f06e62 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -17,6 +17,7 @@ class CoroRPCInterface {
         size_t data_size = 0;
 
         pybind11::bytes getBytes() const { return pybind11::bytes(data); }
+        pybind11::memoryview getMemoryView() const { return pybind11::memoryview(data); }
     };
 
     struct ReceivedTensor {
@@ -25,13 +26,9 @@ class CoroRPCInterface {
         std::vector<size_t> shape;
         std::string dtype;
         size_t total_bytes = 0;
-
-        pybind11::object rebuildTensor() const;
         size_t getDataSize() const { return data.size(); }
         pybind11::bytes getDataAsBytes() const { return pybind11::bytes(data); }
-
-       private:
-        pybind11::object rebuildTensorInternal() const;
+        pybind11::memoryview getMemoryView() const { return pybind11::memoryview(data); }
     };
 
     class Impl;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 24ea659b5..7ce110454 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -42,6 +42,14 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     return true;
 }
 
+bool CoroRPCCommunicator::startServerImpl(bool is_async){
+    if(is_async){
+        return this -> startServerAsync();
+    } else {
+        return this -> startServer();
+    }
+}
+
 bool CoroRPCCommunicator::startServer() {
     if (!impl_->server_ || impl_->config.listen_address.empty()) return false;
 
@@ -100,34 +108,28 @@ int CoroRPCCommunicator::sendData(const std::string& target_address,
 
 async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
     const std::string& target_address, const void* data, size_t data_size) {
-    try {
-        std::string_view data_view(static_cast<const char*>(data), data_size);
-
-        auto rpc_result = co_await client_pools_.send_request(
-            target_address,
-            [data_view](coro_rpc::coro_rpc_client& client)
-                -> async_simple::coro::Lazy<void> {
-                auto result =
-                    co_await client
-                        .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
-                            data_view);
-                if (!result.has_value()) {
-                    std::cerr << "RPC call failed: " << result.error().msg
-                              << std::endl;
-                }
-            });
-
-        result res;
-        res.code = 0;
-        co_return res;
-
-    } catch (const std::exception& e) {
-        std::cerr << "Exception in sendDataAsync: " << e.what() << std::endl;
-        result res;
-        res.code = -1;
-        res.err_msg = e.what();
-        co_return res;
+    std::string_view data_view(static_cast<const char*>(data), data_size);
+
+    auto rpc_result = co_await client_pools_.send_request(
+        target_address,
+        [data_view](coro_rpc::coro_rpc_client& client)
+            -> async_simple::coro::Lazy<void> {
+            auto result =
+                co_await client
+                    .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
+                        data_view);
+            if (!result.has_value()) {
+                std::cerr << "RPC call failed: " << result.error().msg
+                          << std::endl;
+            }
+        });
+    if (!rpc_result.has_value()) {
+        std::cout << std::make_error_code(ec.error()).message() << std::endl;
+        co_return result{-1, "RPC call failed"};
     }
+    result res;
+    res.code = 0;
+    co_return res;
 }
 
 int CoroRPCCommunicator::sendTensor(const std::string& target_address,
@@ -142,30 +144,28 @@ int CoroRPCCommunicator::sendTensor(const std::string& target_address,
 
 async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
     const std::string& target_address, const TensorInfo& tensor) {
-    try {
         auto rpc_result = co_await client_pools_.send_request(
             target_address,
-            [&tensor](coro_rpc::coro_rpc_client& client)
-                -> async_simple::coro::Lazy<void> {
-                client.set_req_attachment(std::string_view(
-                    (char*)tensor.data_ptr, tensor.total_bytes));
-
-                auto result = co_await client.call<
-                    &CoroRPCCommunicator::Impl::handleTensorTransfer>();
-
-                if (!result.has_value()) {
-                    std::cerr
-                        << "Tensor RPC call failed: " << result.error().msg
-                        << std::endl;
-                }
-            });
-
-        co_return 0;
-
-    } catch (const std::exception& e) {
-        std::cerr << "Exception in sendTensorAsync: " << e.what() << std::endl;
-        co_return -1;
-    }
+        [&tensor](coro_rpc::coro_rpc_client& client)
+            -> async_simple::coro::Lazy<void> {
+            client.set_req_attachment(std::string_view(
+                (char*)tensor.data_ptr, tensor.total_bytes));
+
+            auto result = co_await client.call<
+                &CoroRPCCommunicator::Impl::handleTensorTransfer>();
+
+            if (!result.has_value()) {
+                std::cerr
+                    << "Tensor RPC call failed: " << result.error().msg
+                    << std::endl;
+            }
+        });
+        if (!rpc_result.has_value()) {
+            std::cout << std::make_error_code(ec.error()).message()
+                      << std::endl;
+            co_return -1;
+        }
+    co_return 0;
 }
 
 int CoroRPCCommunicator::receiveData(const std::string& source_address,
@@ -173,15 +173,13 @@ int CoroRPCCommunicator::receiveData(const std::string& source_address,
                                      int timeout_ms) {
     auto result = async_simple::coro::syncAwait(
         receiveDataAsync(source_address, timeout_ms));
-    // TODO: Copy result to buffer and return size
     return 0;
 }
 
 async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(
     const std::string& source_address, int timeout_ms) {
-    // TODO: Implement actual receive logic
     co_return std::string();
-}
+} // if big string should use attachment
 
 void CoroRPCCommunicator::Impl::handleDataTransfer(
     coro_rpc::context<void> context, std::string_view data) {
@@ -192,7 +190,7 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
     if (data_receive_callback) {
         std::cout << "Calling data receive callback..." << std::endl;
         std::string source_address =
-            "unknown";  // You may want to extract this from context
+            "unknown"; 
         std::string data_str(data);
         data_receive_callback(source_address, data_str);
     } else {
@@ -238,21 +236,6 @@ void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
     context.response_msg();
 }
 
-std::unique_ptr<CoroRPCCommunicator> createClientPool(size_t pool_size,
-                                                      size_t timeout_seconds) {
-    Config config;
-    config.pool_size = pool_size;
-    config.timeout_seconds = timeout_seconds;
-
-    auto communicator = std::make_unique<CoroRPCCommunicator>();
-    if (communicator->initialize(config)) {
-        std::cout << "Created communicator with default pool size: "
-                  << pool_size << std::endl;
-        return communicator;
-    }
-    return nullptr;
-}
-
 std::unique_ptr<CoroRPCCommunicator> createServer(
     const std::string& listen_address, size_t thread_count) {
     Config config;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 38bf947d0..487c585c6 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -603,104 +603,6 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source,
     }
 }
 
-pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
-    pybind11::gil_scoped_acquire acquire;
-    return rebuildTensorInternal();
-}
-
-pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensorInternal()
-    const {
-    std::cout << "DEBUG: Starting rebuildTensorInternal" << std::endl;
-    std::cout << "DEBUG: Data size: " << data.size() << " bytes" << std::endl;
-    std::cout << "DEBUG: TensorMetadata size: " << sizeof(TensorMetadata)
-              << " bytes" << std::endl;
-
-    if (data.size() < sizeof(TensorMetadata)) {
-        throw std::runtime_error("Data too small to contain tensor metadata");
-    }
-
-    // Extract metadata
-    TensorMetadata metadata;
-    std::memcpy(&metadata, data.data(), sizeof(TensorMetadata));
-
-    std::cout << "DEBUG: Extracted metadata - dtype: " << metadata.dtype
-              << ", ndim: " << metadata.ndim << std::endl;
-
-    // Validate metadata
-    if (metadata.ndim < 0 || metadata.ndim > 4) {
-        throw std::runtime_error("Invalid tensor dimensions");
-    }
-
-    TensorDtype dtype_enum = static_cast<TensorDtype>(metadata.dtype);
-    size_t element_size = get_dtype_size(dtype_enum);
-    if (element_size == 0) {
-        throw std::runtime_error("Unsupported tensor dtype");
-    }
-
-    std::cout << "DEBUG: Element size: " << element_size << " bytes"
-              << std::endl;
-
-    // Extract shape
-    std::vector<int64_t> tensor_shape;
-    size_t total_elements = 1;
-    for (int i = 0; i < metadata.ndim; i++) {
-        tensor_shape.push_back(metadata.shape[i]);
-        total_elements *= metadata.shape[i];
-        std::cout << "DEBUG: Shape[" << i << "] = " << metadata.shape[i]
-                  << std::endl;
-    }
-
-    std::cout << "DEBUG: Total elements: " << total_elements << std::endl;
-
-    // Validate data size
-    size_t expected_data_size = total_elements * element_size;
-    size_t actual_data_size = data.size() - sizeof(TensorMetadata);
-
-    std::cout << "DEBUG: Expected data size: " << expected_data_size << " bytes"
-              << std::endl;
-    std::cout << "DEBUG: Actual data size: " << actual_data_size << " bytes"
-              << std::endl;
-
-    if (actual_data_size != expected_data_size) {
-        throw std::runtime_error("Data size mismatch with tensor metadata");
-    }
-
-    // Create numpy array from raw data
-    const char* tensor_data = data.data() + sizeof(TensorMetadata);
-    std::cout << "DEBUG: About to create numpy array..." << std::endl;
-    std::cout << "DEBUG: Data pointer: "
-              << static_cast<const void*>(tensor_data) << std::endl;
-    std::cout << "DEBUG: Base data pointer: "
-              << static_cast<const void*>(data.data()) << std::endl;
-    std::cout << "DEBUG: Offset: " << sizeof(TensorMetadata) << std::endl;
-
-    // Check first few bytes of tensor data
-    std::cout << "DEBUG: First few bytes of tensor data: ";
-    for (int i = 0; i < std::min(16, static_cast<int>(actual_data_size)); ++i) {
-        std::cout << std::hex << (unsigned char)tensor_data[i] << " ";
-    }
-    std::cout << std::dec << std::endl;
-
-    try {
-        pybind11::object numpy_array =
-            create_numpy_array_from_data(tensor_data, dtype_enum, tensor_shape);
-        std::cout << "DEBUG: Successfully created numpy array" << std::endl;
-
-        // Convert to PyTorch tensor
-        std::cout << "DEBUG: About to convert to PyTorch tensor..."
-                  << std::endl;
-        pybind11::module_ torch = pybind11::module_::import("torch");
-        pybind11::object result = torch.attr("from_numpy")(numpy_array);
-        std::cout << "DEBUG: Successfully created PyTorch tensor" << std::endl;
-
-        return result;
-    } catch (const std::exception& e) {
-        std::cout << "DEBUG: Error in tensor creation: " << e.what()
-                  << std::endl;
-        throw;
-    }
-}
-
 // Factory functions for creating RPC client and server
 std::unique_ptr<CoroRPCInterface> createRPCClient(uint64_t local_rank,
                                                   uint64_t world_size) {

From fe165e9d76a34bb592230cb48f7233b7d7de70a4 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 15:50:06 +0800
Subject: [PATCH 18/64] convert memorycopy to memoryview

---
 .github/asan_suppressions.txt                 |  0
 .../coro_rpc_connector/cororpc_interface.cpp  | 21 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)
 create mode 100644 .github/asan_suppressions.txt

diff --git a/.github/asan_suppressions.txt b/.github/asan_suppressions.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 487c585c6..c706ca3c5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -141,21 +141,22 @@ pybind11::object create_numpy_array_from_data(
     std::cout << "DEBUG: element_size = " << element_size << std::endl;
     std::cout << "DEBUG: total_elements = " << total_elements << std::endl;
 
-    // Create a copy of the data
-    std::cout << "DEBUG: Creating data copy..." << std::endl;
-    std::vector<char> data_copy(data, data + total_elements * element_size);
-    std::cout << "DEBUG: Data copy created, size = " << data_copy.size()
-              << std::endl;
+    // Use memoryview to avoid data copy
+    std::cout << "DEBUG: Creating memory view without copying..." << std::endl;
+    size_t data_size = total_elements * element_size;
+    std::cout << "DEBUG: Data size = " << data_size << std::endl;
 
-    std::cout << "DEBUG: About to call frombuffer..." << std::endl;
+    std::cout << "DEBUG: About to call frombuffer with memoryview..." << std::endl;
 
     try {
-        pybind11::bytes bytes_obj(data_copy.data(), data_copy.size());
-        std::cout << "DEBUG: Created bytes object" << std::endl;
+        // Create a memoryview directly from the data pointer without copying
+        pybind11::memoryview mv = pybind11::memoryview::from_memory(
+            const_cast<char*>(data), data_size, true);  // read-only
+        std::cout << "DEBUG: Created memoryview without copying" << std::endl;
 
         pybind11::object array =
-            np.attr("frombuffer")(bytes_obj, pybind11::arg("dtype") = np_dtype);
-        std::cout << "DEBUG: Created array from buffer successfully"
+            np.attr("frombuffer")(mv, pybind11::arg("dtype") = np_dtype);
+        std::cout << "DEBUG: Created array from memoryview successfully"
                   << std::endl;
 
         // Convert shape to tuple manually

From a1d4c64c5d392154735b97b65a17a8f98bf6fa6d Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 16:26:06 +0800
Subject: [PATCH 19/64] set attachment for handleDataTransfer

---
 .../coro_rpc_connector/cororpc_communicator.h |  2 +-
 .../coro_rpc_connector/cororpc_interface.h    |  8 ++-
 .../cororpc_communicator.cpp                  | 70 ++++++++++++++-----
 3 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index dd91ae13b..17e9414f1 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -86,7 +86,7 @@ class CoroRPCCommunicator {
         const std::string& source_address, int timeout_ms = -1);
 
     void setDataReceiveCallback(
-        std::function<void(size_t len)> callback);
+        std::function<void(const std::string&, const std::string&)> callback);
 
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 8e7f06e62..d50786b93 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -17,7 +17,9 @@ class CoroRPCInterface {
         size_t data_size = 0;
 
         pybind11::bytes getBytes() const { return pybind11::bytes(data); }
-        pybind11::memoryview getMemoryView() const { return pybind11::memoryview(data); }
+        pybind11::memoryview getMemoryView() const { 
+            return pybind11::memoryview::from_memory(const_cast<char*>(data.data()), data.size(), true);
+        }
     };
 
     struct ReceivedTensor {
@@ -28,7 +30,9 @@ class CoroRPCInterface {
         size_t total_bytes = 0;
         size_t getDataSize() const { return data.size(); }
         pybind11::bytes getDataAsBytes() const { return pybind11::bytes(data); }
-        pybind11::memoryview getMemoryView() const { return pybind11::memoryview(data); }
+        pybind11::memoryview getMemoryView() const { 
+            return pybind11::memoryview::from_memory(const_cast<char*>(data.data()), data.size(), true);
+        }
     };
 
     class Impl;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 7ce110454..174ba0cc4 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -110,21 +110,39 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
     const std::string& target_address, const void* data, size_t data_size) {
     std::string_view data_view(static_cast<const char*>(data), data_size);
 
+    // For large data, use attachment to avoid copying
+    const size_t ATTACHMENT_THRESHOLD = 1024; // Use attachment for data > 1KB
+    
     auto rpc_result = co_await client_pools_.send_request(
         target_address,
-        [data_view](coro_rpc::coro_rpc_client& client)
+        [data_view, data_size](coro_rpc::coro_rpc_client& client)
             -> async_simple::coro::Lazy<void> {
-            auto result =
-                co_await client
+            
+            if (data_size > ATTACHMENT_THRESHOLD) {
+                // Use attachment for large data - zero copy
+                client.set_req_attachment(data_view);
+                // Send empty data parameter, actual data in attachment
+                auto result = co_await client
+                    .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
+                        std::string_view{});
+                if (!result.has_value()) {
+                    std::cerr << "RPC call failed: " << result.error().msg
+                              << std::endl;
+                }
+            } else {
+                // Use regular parameter for small data
+                auto result = co_await client
                     .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
                         data_view);
-            if (!result.has_value()) {
-                std::cerr << "RPC call failed: " << result.error().msg
-                          << std::endl;
+                if (!result.has_value()) {
+                    std::cerr << "RPC call failed: " << result.error().msg
+                              << std::endl;
+                }
             }
         });
+        
     if (!rpc_result.has_value()) {
-        std::cout << std::make_error_code(ec.error()).message() << std::endl;
+        std::cout << "RPC send request failed" << std::endl;
         co_return result{-1, "RPC call failed"};
     }
     result res;
@@ -161,8 +179,7 @@ async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
             }
         });
         if (!rpc_result.has_value()) {
-            std::cout << std::make_error_code(ec.error()).message()
-                      << std::endl;
+            std::cout << "Tensor RPC send request failed" << std::endl;
             co_return -1;
         }
     co_return 0;
@@ -178,25 +195,46 @@ int CoroRPCCommunicator::receiveData(const std::string& source_address,
 
 async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(
     const std::string& source_address, int timeout_ms) {
+    // For attachment-based data reception, we should use a different approach
+    // This method is typically called from the handler when data is received
+    // The actual data reception is handled by the registered handlers
     co_return std::string();
-} // if big string should use attachment
+} // Data reception is handled via context and attachment in handlers
 
 void CoroRPCCommunicator::Impl::handleDataTransfer(
     coro_rpc::context<void> context, std::string_view data) {
-    std::cout << "Handling data transfer: " << data.size() << " bytes"
-              << std::endl;
+    // Check if there's an attachment for large data
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+    
+    std::cout << "Handling data transfer - Data: " << data.size() 
+              << " bytes, Attachment: " << attachment.size() << " bytes" << std::endl;
 
     // Call the data receive callback if set
     if (data_receive_callback) {
         std::cout << "Calling data receive callback..." << std::endl;
-        std::string source_address =
-            "unknown"; 
-        std::string data_str(data);
-        data_receive_callback(source_address, data_str);
+        std::string source_address = "unknown"; // Could extract from context if needed
+        
+        // Use attachment if available (for large data), otherwise use data parameter
+        if (!attachment.empty()) {
+            // Use attachment data directly without copying - zero copy approach
+            std::string_view attachment_view = attachment;
+            std::string data_str(attachment_view); // Only copy when necessary for callback
+            data_receive_callback(source_address, data_str);
+        } else {
+            // For small data, use the regular data parameter
+            std::string data_str(data);
+            data_receive_callback(source_address, data_str);
+        }
     } else {
         std::cout << "No data receive callback set!" << std::endl;
     }
 
+    // Echo back the attachment for response (zero-copy)
+    if (!attachment.empty()) {
+        ctx_info->set_response_attachment(attachment);
+    }
+    
     context.response_msg();
 }
 

From 9ca028218f0fa6a00c4fa611082f19094a926eec Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 17:09:34 +0800
Subject: [PATCH 20/64] removed duplicate lock

---
 .../transport/coro_rpc_connector/cororpc_interface.cpp   | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index c706ca3c5..5d154687a 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -230,7 +230,7 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
                                                  pybind11::bytes data,
                                                  pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
-
+    
     auto future_module = pybind11::module_::import("asyncio");
     auto future_obj = future_module.attr("Future")();
 
@@ -242,13 +242,15 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
 
     auto communicator = impl_->communicator.get();
     auto target_addr = std::move(target_address);
-
     std::string data_str = data;
 
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
     pybind11::object loop_obj =
         pybind11::reinterpret_borrow<pybind11::object>(loop);
 
+    // Release GIL before starting coroutine
+    pybind11::gil_scoped_release release;
+
     auto coro_lambda = [communicator, target_addr, data_str, future_ptr,
                         loop_obj]() -> async_simple::coro::Lazy<void> {
         try {
@@ -409,6 +411,9 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
     pybind11::object loop_obj =
         pybind11::reinterpret_borrow<pybind11::object>(loop);
 
+    // Release GIL before starting coroutine
+    pybind11::gil_scoped_release release;
+
     // Schedule coroutine to run asynchronously
     auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr,
                         loop_obj]() -> async_simple::coro::Lazy<void> {

From afdb741b2c1c7e743376c7e31ec53ecd88ced322 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 18:52:54 +0800
Subject: [PATCH 21/64] replace string copy with stringview

---
 .../coro_rpc_connector/cororpc_communicator.h |  4 +-
 .../coro_rpc_connector/cororpc_interface.h    | 11 +--
 .../cororpc_communicator.cpp                  | 10 +--
 .../coro_rpc_connector/cororpc_interface.cpp  | 76 +++++++++++++------
 4 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 17e9414f1..7588d26ea 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -49,7 +49,7 @@ class CoroRPCCommunicator {
 
         std::unique_ptr<coro_rpc::coro_rpc_server> server_;
 
-        std::function<void(const std::string&, const std::string&)>
+        std::function<void(std::string_view, std::string_view)>
             data_receive_callback;
 
         void handleDataTransfer(coro_rpc::context<void> context,
@@ -86,7 +86,7 @@ class CoroRPCCommunicator {
         const std::string& source_address, int timeout_ms = -1);
 
     void setDataReceiveCallback(
-        std::function<void(const std::string&, const std::string&)> callback);
+        std::function<void(std::string_view, std::string_view)> callback);
 
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index d50786b93..118051839 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -33,6 +33,7 @@ class CoroRPCInterface {
         pybind11::memoryview getMemoryView() const { 
             return pybind11::memoryview::from_memory(const_cast<char*>(data.data()), data.size(), true);
         }
+        pybind11::object rebuildTensor() const;
     };
 
     class Impl;
@@ -60,12 +61,12 @@ class CoroRPCInterface {
     void setDataReceiveCallback(pybind11::function callback);
     void setTensorReceiveCallback(pybind11::function callback);
 
-    void handleIncomingData(const std::string& source_address,
-                            const std::string& data);
-    void handleIncomingTensor(const std::string& source_address,
-                              const std::string& data,
+    void handleIncomingData(std::string_view source_address,
+                            std::string_view data);
+    void handleIncomingTensor(std::string_view source_address,
+                              std::string_view data,
                               const std::vector<size_t>& shape,
-                              const std::string& dtype);
+                              std::string_view dtype);
 
    private:
     std::unique_ptr<Impl> impl_;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 174ba0cc4..29d4b5cb4 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -15,7 +15,7 @@ CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {}
 CoroRPCCommunicator::~CoroRPCCommunicator() { stopServer(); }
 
 void CoroRPCCommunicator::setDataReceiveCallback(
-    std::function<void(const std::string&, const std::string&)> callback) {
+    std::function<void(std::string_view, std::string_view)> callback) {
     std::cout << "Setting data receive callback..." << std::endl;
     impl_->data_receive_callback = callback;
     std::cout << "Data receive callback set successfully" << std::endl;
@@ -213,18 +213,16 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
     // Call the data receive callback if set
     if (data_receive_callback) {
         std::cout << "Calling data receive callback..." << std::endl;
-        std::string source_address = "unknown"; // Could extract from context if needed
+        std::string_view source_address = "unknown"; // Could extract from context if needed
         
         // Use attachment if available (for large data), otherwise use data parameter
         if (!attachment.empty()) {
             // Use attachment data directly without copying - zero copy approach
             std::string_view attachment_view = attachment;
-            std::string data_str(attachment_view); // Only copy when necessary for callback
-            data_receive_callback(source_address, data_str);
+            data_receive_callback(source_address, attachment_view);
         } else {
             // For small data, use the regular data parameter
-            std::string data_str(data);
-            data_receive_callback(source_address, data_str);
+            data_receive_callback(source_address, data);
         }
     } else {
         std::cout << "No data receive callback set!" << std::endl;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 5d154687a..d2c4c2ed5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -411,14 +411,12 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
     pybind11::object loop_obj =
         pybind11::reinterpret_borrow<pybind11::object>(loop);
 
-    // Release GIL before starting coroutine
     pybind11::gil_scoped_release release;
 
     // Schedule coroutine to run asynchronously
     auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr,
                         loop_obj]() -> async_simple::coro::Lazy<void> {
         try {
-            // Call the async version which returns a coroutine
             auto result = co_await communicator->sendTensorAsync(target_addr,
                                                                  tensor_info);
 
@@ -465,12 +463,11 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
 void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->data_receive_callback = callback;
-
     if (impl_->communicator) {
         auto interface_ptr = this;
         impl_->communicator->setDataReceiveCallback(
-            [interface_ptr](const std::string& source,
-                            const std::string& data) {
+            [interface_ptr](std::string_view source,
+                            std::string_view data) {
                 interface_ptr->handleIncomingData(source, data);
             });
     }
@@ -479,19 +476,14 @@ void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
 void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->tensor_receive_callback = callback;
-
-    if (impl_->communicator) {
-        auto interface_ptr = this;
-        impl_->communicator->setDataReceiveCallback(
-            [interface_ptr](const std::string& source,
-                            const std::string& data) {
-                interface_ptr->handleIncomingData(source, data);
-            });
-    }
+    
+    // Note: Tensor data is received through the regular data callback
+    // The handleIncomingData function will detect tensor data and route it 
+    // to handleIncomingTensor automatically
 }
 
-void CoroRPCInterface::handleIncomingData(const std::string& source,
-                                          const std::string& data) {
+void CoroRPCInterface::handleIncomingData(std::string_view source,
+                                          std::string_view data) {
     std::cout << "CoroRPCInterface::handleIncomingData called with "
               << data.size() << " bytes" << std::endl;
 
@@ -520,7 +512,7 @@ void CoroRPCInterface::handleIncomingData(const std::string& source,
             }
 
             // Get dtype name
-            std::string dtype_name;
+            std::string_view dtype_name;
             switch (static_cast<TensorDtype>(metadata->dtype)) {
                 case TensorDtype::FLOAT16:
                     dtype_name = "float16";
@@ -566,8 +558,8 @@ void CoroRPCInterface::handleIncomingData(const std::string& source,
     try {
         pybind11::gil_scoped_acquire acquire;
         pybind11::dict received;
-        received["source"] = source;
-        received["data"] = pybind11::bytes(data);
+        received["source"] = std::string(source);  // Convert to string for Python
+        received["data"] = pybind11::bytes(std::string(data));  // Convert to string for pybind11::bytes
 
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
@@ -576,10 +568,10 @@ void CoroRPCInterface::handleIncomingData(const std::string& source,
     }
 }
 
-void CoroRPCInterface::handleIncomingTensor(const std::string& source,
-                                            const std::string& data,
+void CoroRPCInterface::handleIncomingTensor(std::string_view source,
+                                            std::string_view data,
                                             const std::vector<size_t>& shape,
-                                            const std::string& dtype) {
+                                            std::string_view dtype) {
     std::cout << "CoroRPCInterface::handleIncomingTensor called" << std::endl;
     std::cout << "  source: " << source << std::endl;
     std::cout << "  data size: " << data.size() << std::endl;
@@ -597,10 +589,10 @@ void CoroRPCInterface::handleIncomingTensor(const std::string& source,
         pybind11::gil_scoped_acquire acquire;
 
         ReceivedTensor received;
-        received.source_address = source;
-        received.data = data;
+        received.source_address = std::string(source);  // Convert to string for storage
+        received.data = std::string(data);  // Convert to string for storage
         received.shape = shape;
-        received.dtype = dtype;
+        received.dtype = std::string(dtype);  // Convert to string for storage
 
         impl_->tensor_receive_callback(received);
     } catch (const std::exception& e) {
@@ -626,4 +618,38 @@ std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank,
     return server;
 }
 
+// Implementation of ReceivedTensor::rebuildTensor
+pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
+    // Check if this is tensor data by looking for metadata signature
+    if (data.size() >= sizeof(TensorMetadata)) {
+        const TensorMetadata* metadata =
+            reinterpret_cast<const TensorMetadata*>(data.data());
+
+        // Basic validation: check if dtype is in valid range
+        if (metadata->dtype > 0 &&
+            metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) &&
+            metadata->ndim >= 0 && metadata->ndim <= 4) {
+            
+            // Extract tensor data (skip metadata)
+            const char* tensor_data = data.data() + sizeof(TensorMetadata);
+            // size_t tensor_data_size = data.size() - sizeof(TensorMetadata); // Not used currently
+            
+            // Convert shape from metadata
+            std::vector<int64_t> tensor_shape;
+            for (int i = 0; i < metadata->ndim; i++) {
+                if (metadata->shape[i] > 0) {
+                    tensor_shape.push_back(metadata->shape[i]);
+                }
+            }
+            
+            // Create numpy array from tensor data
+            TensorDtype tensor_dtype = static_cast<TensorDtype>(metadata->dtype);
+            return create_numpy_array_from_data(tensor_data, tensor_dtype, tensor_shape);
+        }
+    }
+    
+    // If not tensor data or invalid, return None
+    return pybind11::none();
+}
+
 }  // namespace mooncake

From 8ab6dfa59bc2a473764631154965b702f0ce0e87 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 8 Sep 2025 19:08:28 +0800
Subject: [PATCH 22/64] reformat the code

---
 .../coro_rpc_connector/cororpc_interface.h    | 15 ++--
 .../cororpc_communicator.cpp                  | 74 +++++++++---------
 .../coro_rpc_connector/cororpc_interface.cpp  | 75 ++++++++++++++-----
 3 files changed, 103 insertions(+), 61 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 118051839..c0ebce38d 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -17,8 +17,9 @@ class CoroRPCInterface {
         size_t data_size = 0;
 
         pybind11::bytes getBytes() const { return pybind11::bytes(data); }
-        pybind11::memoryview getMemoryView() const { 
-            return pybind11::memoryview::from_memory(const_cast<char*>(data.data()), data.size(), true);
+        pybind11::memoryview getMemoryView() const {
+            return pybind11::memoryview::from_memory(
+                const_cast<char*>(data.data()), data.size(), true);
         }
     };
 
@@ -30,8 +31,9 @@ class CoroRPCInterface {
         size_t total_bytes = 0;
         size_t getDataSize() const { return data.size(); }
         pybind11::bytes getDataAsBytes() const { return pybind11::bytes(data); }
-        pybind11::memoryview getMemoryView() const { 
-            return pybind11::memoryview::from_memory(const_cast<char*>(data.data()), data.size(), true);
+        pybind11::memoryview getMemoryView() const {
+            return pybind11::memoryview::from_memory(
+                const_cast<char*>(data.data()), data.size(), true);
         }
         pybind11::object rebuildTensor() const;
     };
@@ -49,9 +51,10 @@ class CoroRPCInterface {
     bool startServerAsync();
     void stopServer();
 
-    int sendData(const std::string& target_address, pybind11::bytes data);
+    int sendData(const std::string& target_address, pybind11::handle data);
     pybind11::object sendDataAsync(std::string& target_address,
-                                   pybind11::bytes data, pybind11::handle loop);
+                                   pybind11::handle data,
+                                   pybind11::handle loop);
 
     int sendTensor(const std::string& target_address, pybind11::handle tensor);
     pybind11::object sendTensorAsync(std::string& target_address,
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 29d4b5cb4..9b508f112 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -42,11 +42,11 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     return true;
 }
 
-bool CoroRPCCommunicator::startServerImpl(bool is_async){
-    if(is_async){
-        return this -> startServerAsync();
+bool CoroRPCCommunicator::startServerImpl(bool is_async) {
+    if (is_async) {
+        return this->startServerAsync();
     } else {
-        return this -> startServer();
+        return this->startServer();
     }
 }
 
@@ -111,36 +111,37 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
     std::string_view data_view(static_cast<const char*>(data), data_size);
 
     // For large data, use attachment to avoid copying
-    const size_t ATTACHMENT_THRESHOLD = 1024; // Use attachment for data > 1KB
-    
+    const size_t ATTACHMENT_THRESHOLD = 1024;  // Use attachment for data > 1KB
+
     auto rpc_result = co_await client_pools_.send_request(
         target_address,
         [data_view, data_size](coro_rpc::coro_rpc_client& client)
             -> async_simple::coro::Lazy<void> {
-            
             if (data_size > ATTACHMENT_THRESHOLD) {
                 // Use attachment for large data - zero copy
                 client.set_req_attachment(data_view);
                 // Send empty data parameter, actual data in attachment
-                auto result = co_await client
-                    .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
-                        std::string_view{});
+                auto result =
+                    co_await client
+                        .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
+                            std::string_view{});
                 if (!result.has_value()) {
                     std::cerr << "RPC call failed: " << result.error().msg
                               << std::endl;
                 }
             } else {
                 // Use regular parameter for small data
-                auto result = co_await client
-                    .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
-                        data_view);
+                auto result =
+                    co_await client
+                        .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
+                            data_view);
                 if (!result.has_value()) {
                     std::cerr << "RPC call failed: " << result.error().msg
                               << std::endl;
                 }
             }
         });
-        
+
     if (!rpc_result.has_value()) {
         std::cout << "RPC send request failed" << std::endl;
         co_return result{-1, "RPC call failed"};
@@ -162,26 +163,26 @@ int CoroRPCCommunicator::sendTensor(const std::string& target_address,
 
 async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
     const std::string& target_address, const TensorInfo& tensor) {
-        auto rpc_result = co_await client_pools_.send_request(
-            target_address,
+    auto rpc_result = co_await client_pools_.send_request(
+        target_address,
         [&tensor](coro_rpc::coro_rpc_client& client)
             -> async_simple::coro::Lazy<void> {
-            client.set_req_attachment(std::string_view(
-                (char*)tensor.data_ptr, tensor.total_bytes));
+            client.set_req_attachment(
+                std::string_view((char*)tensor.data_ptr, tensor.total_bytes));
 
-            auto result = co_await client.call<
-                &CoroRPCCommunicator::Impl::handleTensorTransfer>();
+            auto result =
+                co_await client
+                    .call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
 
             if (!result.has_value()) {
-                std::cerr
-                    << "Tensor RPC call failed: " << result.error().msg
-                    << std::endl;
+                std::cerr << "Tensor RPC call failed: " << result.error().msg
+                          << std::endl;
             }
         });
-        if (!rpc_result.has_value()) {
-            std::cout << "Tensor RPC send request failed" << std::endl;
-            co_return -1;
-        }
+    if (!rpc_result.has_value()) {
+        std::cout << "Tensor RPC send request failed" << std::endl;
+        co_return -1;
+    }
     co_return 0;
 }
 
@@ -199,23 +200,26 @@ async_simple::coro::Lazy<std::string> CoroRPCCommunicator::receiveDataAsync(
     // This method is typically called from the handler when data is received
     // The actual data reception is handled by the registered handlers
     co_return std::string();
-} // Data reception is handled via context and attachment in handlers
+}  // Data reception is handled via context and attachment in handlers
 
 void CoroRPCCommunicator::Impl::handleDataTransfer(
     coro_rpc::context<void> context, std::string_view data) {
     // Check if there's an attachment for large data
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
-    
-    std::cout << "Handling data transfer - Data: " << data.size() 
-              << " bytes, Attachment: " << attachment.size() << " bytes" << std::endl;
+
+    std::cout << "Handling data transfer - Data: " << data.size()
+              << " bytes, Attachment: " << attachment.size() << " bytes"
+              << std::endl;
 
     // Call the data receive callback if set
     if (data_receive_callback) {
         std::cout << "Calling data receive callback..." << std::endl;
-        std::string_view source_address = "unknown"; // Could extract from context if needed
-        
-        // Use attachment if available (for large data), otherwise use data parameter
+        std::string_view source_address =
+            "unknown";  // Could extract from context if needed
+
+        // Use attachment if available (for large data), otherwise use data
+        // parameter
         if (!attachment.empty()) {
             // Use attachment data directly without copying - zero copy approach
             std::string_view attachment_view = attachment;
@@ -232,7 +236,7 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
     if (!attachment.empty()) {
         ctx_info->set_response_attachment(attachment);
     }
-    
+
     context.response_msg();
 }
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index d2c4c2ed5..180bd7036 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -146,7 +146,8 @@ pybind11::object create_numpy_array_from_data(
     size_t data_size = total_elements * element_size;
     std::cout << "DEBUG: Data size = " << data_size << std::endl;
 
-    std::cout << "DEBUG: About to call frombuffer with memoryview..." << std::endl;
+    std::cout << "DEBUG: About to call frombuffer with memoryview..."
+              << std::endl;
 
     try {
         // Create a memoryview directly from the data pointer without copying
@@ -215,22 +216,34 @@ void CoroRPCInterface::stopServer() {
 }
 
 int CoroRPCInterface::sendData(const std::string& target_address,
-                               pybind11::bytes data) {
+                               pybind11::handle data) {
     if (!impl_->communicator) return -1;
 
     pybind11::gil_scoped_acquire acquire;
 
-    std::string_view data_view = data;
+    // Extract data from handle directly
+    std::string_view data_view;
+    try {
+        // Try to get direct string view from bytes object
+        pybind11::bytes data_bytes =
+            pybind11::reinterpret_borrow<pybind11::bytes>(data);
+        data_view = data_bytes;
+    } catch (...) {
+        // Fallback: convert to bytes and then get view
+        pybind11::bytes data_bytes = pybind11::cast<pybind11::bytes>(data);
+        data_view = data_bytes;
+    }
+
     pybind11::gil_scoped_release release;
     return impl_->communicator->sendData(target_address, data_view.data(),
                                          data_view.size());
 }
 
 pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
-                                                 pybind11::bytes data,
+                                                 pybind11::handle data,
                                                  pybind11::handle loop) {
     pybind11::gil_scoped_acquire acquire;
-    
+
     auto future_module = pybind11::module_::import("asyncio");
     auto future_obj = future_module.attr("Future")();
 
@@ -242,7 +255,25 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
 
     auto communicator = impl_->communicator.get();
     auto target_addr = std::move(target_address);
-    std::string data_str = data;
+
+    // Extract data from handle directly without creating intermediate bytes
+    // object
+    std::string_view data_view;
+    std::string data_str;  // Only used if we can't get direct view
+
+    try {
+        // Try to get direct string view from bytes object
+        pybind11::bytes data_bytes =
+            pybind11::reinterpret_borrow<pybind11::bytes>(data);
+        data_view = data_bytes;
+        // Store a copy for lambda capture since string_view might not be valid
+        // after GIL release
+        data_str = std::string(data_view);
+    } catch (...) {
+        // Fallback: convert to bytes and then to string
+        pybind11::bytes data_bytes = pybind11::cast<pybind11::bytes>(data);
+        data_str = data_bytes;
+    }
 
     auto future_ptr = std::make_shared<pybind11::object>(future_obj);
     pybind11::object loop_obj =
@@ -466,8 +497,7 @@ void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
     if (impl_->communicator) {
         auto interface_ptr = this;
         impl_->communicator->setDataReceiveCallback(
-            [interface_ptr](std::string_view source,
-                            std::string_view data) {
+            [interface_ptr](std::string_view source, std::string_view data) {
                 interface_ptr->handleIncomingData(source, data);
             });
     }
@@ -476,9 +506,9 @@ void CoroRPCInterface::setDataReceiveCallback(pybind11::function callback) {
 void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
     pybind11::gil_scoped_acquire acquire;
     impl_->tensor_receive_callback = callback;
-    
+
     // Note: Tensor data is received through the regular data callback
-    // The handleIncomingData function will detect tensor data and route it 
+    // The handleIncomingData function will detect tensor data and route it
     // to handleIncomingTensor automatically
 }
 
@@ -558,8 +588,10 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
     try {
         pybind11::gil_scoped_acquire acquire;
         pybind11::dict received;
-        received["source"] = std::string(source);  // Convert to string for Python
-        received["data"] = pybind11::bytes(std::string(data));  // Convert to string for pybind11::bytes
+        received["source"] =
+            std::string(source);  // Convert to string for Python
+        received["data"] = pybind11::bytes(
+            std::string(data));  // Convert to string for pybind11::bytes
 
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
@@ -589,7 +621,8 @@ void CoroRPCInterface::handleIncomingTensor(std::string_view source,
         pybind11::gil_scoped_acquire acquire;
 
         ReceivedTensor received;
-        received.source_address = std::string(source);  // Convert to string for storage
+        received.source_address =
+            std::string(source);            // Convert to string for storage
         received.data = std::string(data);  // Convert to string for storage
         received.shape = shape;
         received.dtype = std::string(dtype);  // Convert to string for storage
@@ -629,11 +662,11 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
         if (metadata->dtype > 0 &&
             metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) &&
             metadata->ndim >= 0 && metadata->ndim <= 4) {
-            
             // Extract tensor data (skip metadata)
             const char* tensor_data = data.data() + sizeof(TensorMetadata);
-            // size_t tensor_data_size = data.size() - sizeof(TensorMetadata); // Not used currently
-            
+            // size_t tensor_data_size = data.size() - sizeof(TensorMetadata);
+            // // Not used currently
+
             // Convert shape from metadata
             std::vector<int64_t> tensor_shape;
             for (int i = 0; i < metadata->ndim; i++) {
@@ -641,13 +674,15 @@ pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
                     tensor_shape.push_back(metadata->shape[i]);
                 }
             }
-            
+
             // Create numpy array from tensor data
-            TensorDtype tensor_dtype = static_cast<TensorDtype>(metadata->dtype);
-            return create_numpy_array_from_data(tensor_data, tensor_dtype, tensor_shape);
+            TensorDtype tensor_dtype =
+                static_cast<TensorDtype>(metadata->dtype);
+            return create_numpy_array_from_data(tensor_data, tensor_dtype,
+                                                tensor_shape);
         }
     }
-    
+
     // If not tensor data or invalid, return None
     return pybind11::none();
 }

From ca4d006cc34077bc8a02237cf9d8de2032590b17 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 9 Sep 2025 11:52:38 +0800
Subject: [PATCH 23/64] removed tensor rebuild logic

---
 .../tests/test_coro_rpc_performance.py        | 436 ++++++++++++++----
 .../tests/test_real_coro_rpc.py               | 211 ---------
 2 files changed, 353 insertions(+), 294 deletions(-)
 delete mode 100644 mooncake-transfer-engine/tests/test_real_coro_rpc.py

diff --git a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
index fab27b2da..b6a9ee410 100644
--- a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
+++ b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
@@ -9,6 +9,7 @@
 import time
 import sys
 import threading
+import struct
 from typing import List, Tuple, Dict, Any
 
 try:
@@ -24,6 +25,204 @@
     sys.exit(1)
 
 
+class PythonTensorRebuilder:
+    """Pure Python implementation of tensor rebuilding from raw data"""
+    
+    # Tensor dtype mappings (matching C++ enum)
+    DTYPE_MAP = {
+        0: None,  # UNKNOWN
+        1: np.float16,  # FLOAT16
+        2: np.float32,  # FLOAT32  
+        3: np.float64,  # FLOAT64
+        4: np.int8,     # INT8
+        5: np.int16,    # INT16
+        6: np.int32,    # INT32
+        7: np.int64,    # INT64
+        8: np.uint8,    # UINT8
+        9: np.bool_,    # BOOL
+    }
+    
+    TORCH_DTYPE_MAP = {
+        1: torch.float16,  # FLOAT16
+        2: torch.float32,  # FLOAT32  
+        3: torch.float64,  # FLOAT64
+        4: torch.int8,     # INT8
+        5: torch.int16,    # INT16
+        6: torch.int32,    # INT32
+        7: torch.int64,    # INT64
+        8: torch.uint8,    # UINT8
+        9: torch.bool,     # BOOL
+    }
+    
+    @staticmethod
+    def parse_tensor_metadata(raw_data: bytes) -> Tuple[int, int, List[int], int]:
+        """
+        Parse tensor metadata from raw bytes
+        
+        Returns:
+            (dtype, ndim, shape, metadata_size)
+        """
+        if len(raw_data) < 72:  # Size of TensorMetadata struct
+            raise ValueError(f"Raw data too short for metadata: {len(raw_data)} bytes")
+        
+        # TensorMetadata struct layout:
+        # int32_t dtype (4 bytes)
+        # int32_t ndim (4 bytes)  
+        # int64_t shape[4] (32 bytes)
+        # char padding[32] (32 bytes)
+        # Total: 72 bytes
+        
+        metadata_format = '<ii4q32s'  # little-endian: 2 int32, 4 int64, 32 char
+        metadata_size = struct.calcsize(metadata_format)
+        
+        try:
+            unpacked = struct.unpack(metadata_format, raw_data[:metadata_size])
+            dtype = unpacked[0]
+            ndim = unpacked[1]
+            shape_array = unpacked[2:6]  # Extract shape[4]
+            # padding = unpacked[6]  # Not used
+            
+            # Extract actual shape (only first ndim elements are valid)
+            shape = [int(shape_array[i]) for i in range(ndim) if i < 4]
+            
+            return dtype, ndim, shape, metadata_size
+            
+        except struct.error as e:
+            raise ValueError(f"Failed to parse tensor metadata: {e}")
+    
+    @staticmethod
+    def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) -> torch.Tensor:
+        """
+        Rebuild tensor from raw data bytes (pure Python implementation)
+        
+        Args:
+            raw_data: Raw bytes containing tensor metadata + data
+            return_torch: If True, return torch.Tensor; if False, return numpy array
+            
+        Returns:
+            Reconstructed tensor
+        """
+        print(f"🐍 Python tensor rebuilder: processing {len(raw_data)} bytes")
+        
+        # Parse metadata
+        dtype_id, ndim, shape, metadata_size = PythonTensorRebuilder.parse_tensor_metadata(raw_data)
+        
+        print(f"🐍 Parsed metadata: dtype_id={dtype_id}, ndim={ndim}, shape={shape}")
+        
+        # Validate dtype
+        if dtype_id not in PythonTensorRebuilder.DTYPE_MAP or PythonTensorRebuilder.DTYPE_MAP[dtype_id] is None:
+            raise ValueError(f"Unknown or unsupported dtype: {dtype_id}")
+        
+        # Get numpy dtype
+        np_dtype = PythonTensorRebuilder.DTYPE_MAP[dtype_id]
+        element_size = np.dtype(np_dtype).itemsize
+        
+        # Calculate expected data size
+        total_elements = 1
+        for dim in shape:
+            total_elements *= dim
+        expected_data_size = total_elements * element_size
+        
+        print(f"🐍 Expected: {total_elements} elements × {element_size} bytes = {expected_data_size} bytes")
+        
+        # Extract tensor data (skip metadata)
+        tensor_data = raw_data[metadata_size:]
+        actual_data_size = len(tensor_data)
+        
+        print(f"🐍 Actual tensor data size: {actual_data_size} bytes")
+        
+        if actual_data_size < expected_data_size:
+            raise ValueError(f"Insufficient tensor data: expected {expected_data_size}, got {actual_data_size}")
+        
+        # Take only the required bytes (there might be padding)
+        tensor_data = tensor_data[:expected_data_size]
+        
+        # Create numpy array from raw bytes
+        print(f"🐍 Creating numpy array with dtype {np_dtype} and shape {shape}")
+        
+        try:
+            # Convert bytes to numpy array
+            np_array = np.frombuffer(tensor_data, dtype=np_dtype)
+            
+            # Reshape to target shape
+            np_array = np_array.reshape(shape)
+            
+            print(f"🐍 Successfully created numpy array: shape={np_array.shape}, dtype={np_array.dtype}")
+            
+            if return_torch:
+                # Convert to torch tensor
+                if dtype_id in PythonTensorRebuilder.TORCH_DTYPE_MAP:
+                    torch_dtype = PythonTensorRebuilder.TORCH_DTYPE_MAP[dtype_id]
+                    torch_tensor = torch.from_numpy(np_array.copy()).to(torch_dtype)
+                    print(f"🐍 Converted to torch tensor: shape={torch_tensor.shape}, dtype={torch_tensor.dtype}")
+                    return torch_tensor
+                else:
+                    raise ValueError(f"Cannot convert dtype {dtype_id} to torch tensor")
+            else:
+                return np_array
+                
+        except Exception as e:
+            raise ValueError(f"Failed to create tensor from data: {e}")
+    
+    @staticmethod
+    def rebuild_tensor_from_received_tensor(received_tensor_obj, return_torch: bool = True):
+        """
+        Rebuild tensor from ReceivedTensor object using pure Python
+        
+        Args:
+            received_tensor_obj: ReceivedTensor object from callback
+            return_torch: If True, return torch.Tensor; if False, return numpy array
+            
+        Returns:
+            Reconstructed tensor
+        """
+        # Try multiple ways to get raw data from ReceivedTensor object
+        raw_data = None
+        
+        # Method 1: Try direct data attribute access
+        if hasattr(received_tensor_obj, 'data'):
+            try:
+                raw_data = received_tensor_obj.data
+                print(f"🐍 Got data via direct attribute: {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+            except Exception as e:
+                print(f"🐍 Failed to get data via direct attribute: {e}")
+        
+        # Method 2: Try getDataAsBytes method
+        if raw_data is None and hasattr(received_tensor_obj, 'get_data_as_bytes'):
+            try:
+                raw_data = received_tensor_obj.get_data_as_bytes()
+                print(f"🐍 Got data via get_data_as_bytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+            except Exception as e:
+                print(f"🐍 Failed to get data via get_data_as_bytes(): {e}")
+                
+        # Method 3: Try getDataAsBytes with different naming
+        if raw_data is None and hasattr(received_tensor_obj, 'getDataAsBytes'):
+            try:
+                raw_data = received_tensor_obj.getDataAsBytes()
+                print(f"🐍 Got data via getDataAsBytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+            except Exception as e:
+                print(f"🐍 Failed to get data via getDataAsBytes(): {e}")
+        
+        if raw_data is None:
+            # Debug: print available attributes
+            attrs = [attr for attr in dir(received_tensor_obj) if not attr.startswith('_')]
+            print(f"🐍 Available attributes: {attrs}")
+            raise ValueError(f"Cannot get raw data from ReceivedTensor object. Available attributes: {attrs}")
+        
+        # Convert different data types to bytes
+        if isinstance(raw_data, bytes):
+            pass  # Already bytes
+        elif isinstance(raw_data, str):
+            # Convert string to bytes using latin1 to preserve byte values
+            raw_data = raw_data.encode('latin1')
+        elif hasattr(raw_data, 'encode'):
+            raw_data = raw_data.encode('latin1')
+        else:
+            raise ValueError(f"Unknown data type: {type(raw_data)}")
+        
+        return PythonTensorRebuilder.rebuild_tensor_from_raw_data(raw_data, return_torch)
+
+
 class PerformanceTestResults:
     """Container for performance test results"""
     
@@ -86,6 +285,10 @@ def __init__(self):
         self.tensor_receive_times = []
         self.receive_lock = threading.Lock()
         
+        # Store tensors for validation
+        self.sent_tensors = []  # Store original tensors for comparison
+        self.received_tensors = []  # Store received tensors
+        
     def setup(self) -> bool:
         """Initialize server and client"""
         print("Setting up CoroRPC performance test environment...")
@@ -117,13 +320,8 @@ def setup(self) -> bool:
             print(f"Server started on {self.server_addr}")
             time.sleep(1)  # Wait for server startup
             
-            # Connect client to server
-            if not self.client.add_remote_connection(self.server_addr):
-                print("ERROR: Failed to connect client to server")
-                return False
-                
-            print("Client connected to server")
-            time.sleep(0.5)  # Wait for connection establishment
+            print("Client ready to connect to server")
+            time.sleep(0.5)  # Wait for server startup to complete
             
             return True
             
@@ -148,12 +346,56 @@ def _data_receive_callback(self, received_data):
             data = received_data.get("data", b"")
             print(f"Data callback #{self.data_received_count}: received {len(data)} bytes from {source_address}")
     
+    def validate_tensor_equality(self, original_tensor, received_tensor_obj) -> bool:
+        """Validate that sent and received tensors are identical using Python rebuilder"""
+        import torch  # Import at function level to avoid scoping issues
+        
+        try:
+            print("🐍 Using Python tensor rebuilder...")
+            rebuilt_tensor = PythonTensorRebuilder.rebuild_tensor_from_received_tensor(
+                received_tensor_obj, return_torch=True)
+            
+            # Compare shapes
+            if original_tensor.shape != rebuilt_tensor.shape:
+                print(f"ERROR: Shape mismatch - original: {original_tensor.shape}, rebuilt: {rebuilt_tensor.shape}")
+                return False
+            
+            # Compare dtypes
+            if original_tensor.dtype != rebuilt_tensor.dtype:
+                print(f"ERROR: Dtype mismatch - original: {original_tensor.dtype}, rebuilt: {rebuilt_tensor.dtype}")
+                return False
+            
+            # Compare values (use torch.allclose for floating point tolerance)
+            if original_tensor.dtype in [torch.float16, torch.float32, torch.float64]:
+                if not torch.allclose(original_tensor, rebuilt_tensor, rtol=1e-5, atol=1e-8):
+                    print("ERROR: Tensor values do not match (floating point)")
+                    return False
+            else:
+                # For integer and boolean tensors, use exact equality
+                if not torch.equal(original_tensor, rebuilt_tensor):
+                    print("ERROR: Tensor values do not match (exact)")
+                    return False
+            
+            print(f"SUCCESS: Tensor validation passed (Python 🐍) - shape: {original_tensor.shape}, dtype: {original_tensor.dtype}")
+            return True
+            
+        except Exception as e:
+            print(f"ERROR: Tensor validation failed (Python 🐍) with exception: {e}")
+            import traceback
+            traceback.print_exc()
+            return False
+    
     def _tensor_receive_callback(self, received_tensor):
-        """Callback for tensor reception"""
+        """Callback for tensor reception with validation"""
         with self.receive_lock:
             self.tensor_received_count += 1
             self.tensor_receive_times.append(time.time())
             print(f"Tensor callback #{self.tensor_received_count}: received tensor from {received_tensor.source_address}")
+            
+            # Store the received tensor for validation
+            if not hasattr(self, 'received_tensors'):
+                self.received_tensors = []
+            self.received_tensors.append(received_tensor)
     
     def test_data_interface_simple(self) -> bool:
         """Simple test for data interface to verify correctness"""
@@ -196,8 +438,8 @@ def test_data_interface_simple(self) -> bool:
         return True
     
     def test_tensor_interface_simple(self) -> bool:
-        """Simple test for tensor interface to verify correctness"""
-        print("\n--- Testing Tensor Interface (Simple) ---")
+        """Simple test for tensor interface to verify correctness with validation"""
+        print("\n--- Testing Tensor Interface (Simple with Validation) ---")
         
         # Create a small test tensor
         test_tensor = torch.randn(10, 10, dtype=torch.float32)
@@ -205,10 +447,15 @@ def test_tensor_interface_simple(self) -> bool:
         
         print(f"Sending tensor {test_tensor.shape} ({tensor_size_mb:.6f} MB)")
         
-        # Reset counters
+        # Reset counters and clear storage
         with self.receive_lock:
             self.tensor_received_count = 0
             self.tensor_receive_times.clear()
+            self.sent_tensors.clear()
+            self.received_tensors.clear()
+        
+        # Store the original tensor for comparison
+        self.sent_tensors.append(test_tensor.clone())  # Clone to avoid reference issues
         
         # Send tensor and measure time
         start_time = time.time()
@@ -232,7 +479,23 @@ def test_tensor_interface_simple(self) -> bool:
             print("ERROR: No tensor received within timeout")
             return False
             
-        print(f"SUCCESS: Tensor interface test passed - sent and received tensor {test_tensor.shape}")
+        # Validate the received tensor using Python rebuilder
+        if len(self.received_tensors) == 0:
+            print("ERROR: No tensor stored in receive callback")
+            return False
+            
+        original_tensor = self.sent_tensors[0]
+        received_tensor_obj = self.received_tensors[0]
+        
+        # Test Python rebuilder
+        print("Validating received tensor with Python rebuilder...")
+        python_success = self.validate_tensor_equality(original_tensor, received_tensor_obj)
+        
+        if not python_success:
+            print("ERROR: Python tensor validation failed!")
+            return False
+            
+        print(f"SUCCESS: Tensor validation passed - sent and received tensor {test_tensor.shape}")
         return True
     
     def test_data_bandwidth_performance(self, sizes_mb: List[float]) -> bool:
@@ -357,22 +620,31 @@ def test_data_bandwidth_performance_large_scale(self, sizes_mb: List[float]) ->
             
             try:
                 # Use more efficient data generation for large sizes
-                # Create a pattern and repeat it to avoid memory issues
-                pattern_size = min(1024 * 1024, data_size_bytes)  # 1MB pattern max
-                pattern = bytes(range(256)) * (pattern_size // 256 + 1)
-                pattern = pattern[:pattern_size]
+                print(f"  Creating test data pattern...")
                 
-                # For very large data, we create it in chunks
-                if data_size_bytes > 100 * 1024 * 1024:  # If > 100MB
-                    # Create data as repeated pattern
-                    repeat_count = data_size_bytes // len(pattern)
-                    remainder = data_size_bytes % len(pattern)
-                    test_data = pattern * repeat_count + pattern[:remainder]
-                else:
+                if data_size_bytes <= 50 * 1024 * 1024:  # <= 50MB: use simple method
                     test_data = bytes(range(256)) * (data_size_bytes // 256 + 1)
                     test_data = test_data[:data_size_bytes]
+                else:  # > 50MB: use pattern-based efficient method
+                    # Create a 1MB pattern
+                    pattern_size = 1024 * 1024  # 1MB pattern
+                    pattern = bytes(range(256)) * (pattern_size // 256)
+                    
+                    # Calculate how many full patterns and remainder
+                    full_patterns = data_size_bytes // pattern_size
+                    remainder = data_size_bytes % pattern_size
+                    
+                    print(f"  Using {full_patterns} full 1MB patterns + {remainder} bytes remainder")
+                    
+                    # Create data efficiently by concatenating patterns
+                    if full_patterns > 0:
+                        test_data = pattern * full_patterns
+                        if remainder > 0:
+                            test_data += pattern[:remainder]
+                    else:
+                        test_data = pattern[:remainder]
                 
-                print(f"  Data allocated successfully: {len(test_data)} bytes")
+                print(f"  Data allocated successfully: {len(test_data)} bytes ({len(test_data)/(1024*1024):.1f} MB)")
                 
             except MemoryError:
                 print(f"  ERROR: Not enough memory to allocate {size_mb} MB")
@@ -429,8 +701,8 @@ def test_data_bandwidth_performance_large_scale(self, sizes_mb: List[float]) ->
         return True
     
     def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tuple[str, tuple, torch.dtype]]) -> bool:
-        """Test tensor interface bandwidth performance with large tensors (optimized for GB scale)"""
-        print("\n--- Testing Tensor Interface Bandwidth Performance (Large Scale) ---")
+        """Test tensor interface bandwidth performance with large tensors and validation"""
+        print("\n--- Testing Tensor Interface Bandwidth Performance (Large Scale with Validation) ---")
         
         for tensor_name, shape, dtype in tensor_configs:
             print(f"\nTesting large tensor: {tensor_name} {shape}")
@@ -445,10 +717,11 @@ def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tup
             expected_size_gb = expected_size_mb / 1024
             
             print(f"  Expected size: {expected_size_mb:.1f} MB ({expected_size_gb:.2f} GB)")
-            print(f"  Creating tensor...")
+            print(f"  Creating tensor (dtype: {dtype}, elements: {numel:,})...")
             
             try:
-                # Create test tensor with memory monitoring
+                # Create test tensor without memory check for now
+                print(f"  Creating tensor without memory check...")
                 if dtype == torch.bool:
                     test_tensor = torch.randint(0, 2, shape, dtype=dtype).bool()
                 elif dtype in [torch.int32, torch.int64]:
@@ -472,10 +745,18 @@ def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tup
             
             tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
             
-            # Reset counters before each test
+            # Reset counters before each test and store original tensor
             with self.receive_lock:
                 self.tensor_received_count = 0
                 self.tensor_receive_times.clear()
+                # For large tensors, we'll only validate smaller ones to avoid memory issues
+                if tensor_size_mb <= 200.0:  # Only validate tensors <= 200MB
+                    self.sent_tensors.append(test_tensor.clone())
+                    validate_this_tensor = True
+                else:
+                    validate_this_tensor = False
+                    print(f"  Skipping validation for large tensor ({tensor_size_mb:.1f} MB) to avoid memory issues")
+                self.received_tensors.clear()
             
             # Measure send time
             print(f"  Starting tensor transmission...")
@@ -510,11 +791,31 @@ def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tup
             if self.tensor_received_count > 0:
                 reception_time = self.tensor_receive_times[0] - start_time
                 print(f"  Reception confirmed: callback received after {reception_time:.2f}s")
+                
+                # Validate tensor if it's not too large
+                if validate_this_tensor and len(self.received_tensors) > 0:
+                    print(f"  Validating tensor correctness...")
+                    original_tensor = self.sent_tensors[-1]  # Get the last sent tensor
+                    received_tensor_obj = self.received_tensors[-1]  # Get the last received tensor
+                    
+                    # Use Python rebuilder for validation
+                    print(f"  Validating tensor correctness...")
+                    print(f"  Using Python rebuilder for efficiency...")
+                    validation_success = self.validate_tensor_equality(
+                        original_tensor, received_tensor_obj)
+                    
+                    if validation_success:
+                        print(f"  ✓ Tensor validation PASSED (Python 🐍)")
+                    else:
+                        print(f"  ✗ Tensor validation FAILED")
+                        # Continue with other tests even if validation fails
             else:
                 print(f"  WARNING: No reception callback within {max_wait_time:.1f}s timeout")
             
             # Clean up large tensor
             del test_tensor
+            if validate_this_tensor and len(self.sent_tensors) > 0:
+                del self.sent_tensors[-1]  # Remove the stored tensor to free memory
             
             # Wait between tests (longer for large tensors)
             time.sleep(1.0)
@@ -523,9 +824,9 @@ def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tup
 
 
 def main():
-    """Main test function"""
-    print("CoroRPC Performance Testing Suite")
-    print("="*50)
+    """Main test function focused on high-performance hundreds-of-MB testing"""
+    print("CoroRPC High-Performance Testing Suite (Hundreds of MB)")
+    print("="*60)
     
     tester = CoroRPCPerformanceTester()
     
@@ -549,62 +850,25 @@ def main():
             
         print("SUCCESS: All correctness tests passed!")
         
-        # Run basic performance tests (small sizes for verification)
-        print("\nPhase 2: Basic Performance Testing")
-        print("-" * 40)
-        
-        # Test small data sizes first
-        small_data_sizes = [0.001, 0.01, 0.1]  # 1KB, 10KB, 100KB
-        if not tester.test_data_bandwidth_performance(small_data_sizes):
-            print("FAILED: Data bandwidth performance test failed")
-            return False
-            
-        # Test small tensors
-        small_tensor_configs = [
-            ("Float32_Small", (100, 100), torch.float32),
-            ("Int64_Small", (50, 50), torch.int64),
-            ("Bool_Small", (200, 200), torch.bool),
-        ]
-        if not tester.test_tensor_bandwidth_performance(small_tensor_configs):
-            print("FAILED: Tensor bandwidth performance test failed")
-            return False
-        
-        # Additional test with medium sizes for better performance insights
-        print("\nPhase 3: Medium-scale Performance Testing")
-        print("-" * 40)
-        
-        # Test medium data sizes
-        medium_data_sizes = [1.0, 5.0, 10.0]  # 1MB, 5MB, 10MB
-        if not tester.test_data_bandwidth_performance(medium_data_sizes):
-            print("FAILED: Medium data bandwidth performance test failed")
-            return False
-            
-        # Test medium tensors
-        medium_tensor_configs = [
-            ("Float32_Medium", (500, 500), torch.float32),      # ~1MB
-            ("Int64_Medium", (1024, 256), torch.int64),         # ~2MB  
-            ("Float64_Medium", (512, 512), torch.float64),      # ~2MB
-        ]
-        if not tester.test_tensor_bandwidth_performance(medium_tensor_configs):
-            print("FAILED: Medium tensor bandwidth performance test failed")
-            return False
-        
-        # Optional large-scale performance testing (1GB scale)
-        print("\nPhase 4: Large-scale Performance Testing (1GB)")
-        print("-" * 40)
-        print("WARNING: This phase will test ~1GB data transfers and may take several minutes")
+        # Run high-performance tests with hundreds of MB data
+        print("\nPhase 2: High-Performance Testing (Hundreds of MB)")
+        print("-" * 50)
+        print("Testing large data transfers (50MB - 800MB) to measure peak performance")
         
-        # Test large data sizes (around 1GB)
-        large_data_sizes = [100.0, 500.0, 1000.0]  # 100MB, 500MB, 1GB
+        # Test data sizes focused on hundreds of MB
+        large_data_sizes = [50.0, 100.0, 200.0, 300.0, 500.0, 800.0]  # 50MB to 800MB
         if not tester.test_data_bandwidth_performance_large_scale(large_data_sizes):
             print("FAILED: Large data bandwidth performance test failed")
             return False
             
-        # Test large tensors (around 1GB)
+        # Test tensor sizes focused on hundreds of MB  
         large_tensor_configs = [
-            ("Float32_Large", (8192, 8192), torch.float32),     # ~256MB
-            ("Float32_XLarge", (16384, 8192), torch.float32),   # ~512MB  
-            ("Float32_XXLarge", (16384, 16384), torch.float32), # ~1GB
+            ("Float32_100MB", (5120, 5120), torch.float32),     # ~100MB
+            ("Float32_200MB", (7237, 7237), torch.float32),     # ~200MB
+            ("Float32_400MB", (10240, 10240), torch.float32),   # ~400MB
+            ("Float64_200MB", (5120, 5120), torch.float64),     # ~200MB
+            ("Int64_300MB", (8000, 5000), torch.int64),         # ~300MB
+            ("Float32_600MB", (12500, 12500), torch.float32),   # ~600MB
         ]
         if not tester.test_tensor_bandwidth_performance_large_scale(large_tensor_configs):
             print("FAILED: Large tensor bandwidth performance test failed")
@@ -613,7 +877,13 @@ def main():
         # Print results
         tester.results.print_summary()
         
-        print("\nSUCCESS: All performance tests completed!")
+        print("\nSUCCESS: All high-performance tests completed!")
+        print("\nTest Summary:")
+        print(f"- Tested data sizes: 50MB to 800MB")
+        print(f"- Tested tensor sizes: 100MB to 600MB")
+        print(f"- All tests focused on measuring peak bandwidth performance")
+        print(f"- Tensor correctness validation enabled (for tensors ≤ 200MB)")
+        print(f"- Zero-copy optimization with pybind11::handle and std::string_view")
         return True
         
     except Exception as e:
diff --git a/mooncake-transfer-engine/tests/test_real_coro_rpc.py b/mooncake-transfer-engine/tests/test_real_coro_rpc.py
deleted file mode 100644
index 810ea03f5..000000000
--- a/mooncake-transfer-engine/tests/test_real_coro_rpc.py
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test enhanced CoroRPC tensor rebuilding functionality
-"""
-
-import torch
-import numpy as np
-import asyncio
-import threading
-import time
-import sys
-
-try:
-    import mooncake.engine as engine
-    print("Successfully imported mooncake.engine")
-    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
-    print("Successfully imported CoroRPCInterface")
-except ImportError as e:
-    print(f"Failed to import mooncake: {e}")
-    sys.exit(1)
-except AttributeError as e:
-    print(f"Failed to import CoroRPCInterface: {e}")
-    sys.exit(1)
-
-
-def test_enhanced_tensor_rebuilding():
-    print("\n=== Testing Enhanced Tensor Rebuilding ===")
-
-    # Create server and client instances
-    server = CoroRPCInterface()
-    client = CoroRPCInterface()
-
-    # Store received tensors and callback status
-    received_tensors = []
-    callback_info = {
-        'called_count': 0,
-        'success_count': 0,
-        'error_count': 0,
-        'errors': []
-    }
-
-    def tensor_receive_callback(received_tensor):
-        callback_info['called_count'] += 1
-        
-        print(f"\n=== CALLBACK #{callback_info['called_count']} TRIGGERED ===")
-        print(f"Received tensor from: {received_tensor.source_address}")
-        
-        # Use safe method to get data size
-        data_size = received_tensor.get_data_size()
-        print(f"Data size: {data_size} bytes")
-            
-        print(f"Shape info: {received_tensor.shape}")
-        print(f"Dtype info: {received_tensor.dtype}")
-        
-        # Check if total_bytes is available
-        if hasattr(received_tensor, 'total_bytes'):
-            print(f"Total bytes (from metadata): {received_tensor.total_bytes}")
-
-        try:
-            # Use enhanced rebuild functionality to reconstruct tensor
-            print("Attempting to rebuild tensor...")
-            print(f"Tensor metadata - Shape: {received_tensor.shape}, Dtype: {received_tensor.dtype}")
-            
-            # Now try the actual rebuild
-            rebuilt_tensor = received_tensor.rebuild_tensor()
-            
-            received_tensors.append(rebuilt_tensor)
-            callback_info['success_count'] += 1
-
-            print("SUCCESS: Successfully rebuilt tensor:")
-            print(f"  - Shape: {rebuilt_tensor.shape}")
-            print(f"  - Dtype: {rebuilt_tensor.dtype}")
-            print(f"  - Device: {rebuilt_tensor.device}")
-            print(f"  - Data sample: {rebuilt_tensor.flatten()[:5]}")
-
-        except Exception as e:
-            callback_info['error_count'] += 1
-            callback_info['errors'].append(str(e))
-            print(f"FAILED: Failed to rebuild tensor: {e}")
-            import traceback
-            traceback.print_exc()
-        
-        print(f"=== CALLBACK #{callback_info['called_count']} COMPLETED ===\n")
-
-    try:
-        # Initialize server and client
-        server_addr = "127.0.0.1:8888"
-        if not server.initialize(server_addr, 1, 30, 4):
-            print("Server initialization failed")
-            return False
-
-        if not client.initialize("", 0, 30, 4):
-            print("Client initialization failed")
-            return False
-
-        # Set tensor receive callback
-        server.set_tensor_receive_callback(tensor_receive_callback)
-
-        # Start server asynchronously
-        if not server.start_server_async():
-            print("Failed to start server")
-            return False
-
-        print(f"Server started on {server_addr}")
-        time.sleep(1)  # Wait for server to start
-
-        # Connect client to server
-        if not client.add_remote_connection(server_addr):
-            print("Failed to connect to server")
-            return False
-
-        print("Client connected to server")
-        time.sleep(0.5)  # Wait for connection establishment
-
-        # Define test cases with various tensor types
-        test_cases = [
-            ("Float32 2D", torch.randn(3, 4, dtype=torch.float32)),
-            ("Int64 1D", torch.arange(10, dtype=torch.int64)),
-            ("Float64 3D", torch.ones(2, 3, 4, dtype=torch.float64)),
-            ("Int32 Vector", torch.tensor([1, 2, 3, 4, 5], dtype=torch.int32)),
-            ("Bool Matrix", torch.tensor([[True, False], [False, True]], dtype=torch.bool)),
-        ]
-
-        for test_name, original_tensor in test_cases:
-            print(f"\n--- Testing {test_name} ---")
-            print("Original tensor:")
-            print(f"  - Shape: {original_tensor.shape}")
-            print(f"  - Dtype: {original_tensor.dtype}")
-            print(f"  - Data sample: {original_tensor.flatten()[:5]}")
-
-            # Send tensor
-            result = client.send_tensor(server_addr, original_tensor)
-            print(f"Send result: {result}")
-
-            if result < 0:
-                print(f"Failed to send {test_name}")
-                continue
-
-            # Wait for reception and processing
-            time.sleep(1)
-
-            # Check if callback was triggered for this tensor
-            expected_callbacks = test_cases.index((test_name, original_tensor)) + 1
-            if callback_info['called_count'] < expected_callbacks:
-                print(f"FAILED: No callback received for {test_name}")
-                continue
-
-            if len(received_tensors) == 0:
-                print(f"FAILED: No tensor received for {test_name}")
-                continue
-
-            # Validate the rebuilt tensor
-            rebuilt_tensor = received_tensors[-1]
-
-            # Check shape
-            if tuple(rebuilt_tensor.shape) != tuple(original_tensor.shape):
-                print(f"FAILED: Shape mismatch: {rebuilt_tensor.shape} vs {original_tensor.shape}")
-                continue
-
-            # Check data type
-            if rebuilt_tensor.dtype != original_tensor.dtype:
-                print(f"FAILED: Dtype mismatch: {rebuilt_tensor.dtype} vs {original_tensor.dtype}")
-                continue
-
-            # Check data content (move to CPU for comparison)
-            try:
-                if torch.allclose(rebuilt_tensor.cpu(), original_tensor.cpu(), atol=1e-6):
-                    print(f"SUCCESS: {test_name} passed - data integrity verified")
-                else:
-                    print(f"FAILED: {test_name} failed - data content mismatch")
-                    print(f"  Original: {original_tensor.flatten()[:5]}")
-                    print(f"  Rebuilt:  {rebuilt_tensor.flatten()[:5]}")
-            except Exception as e:
-                print(f"FAILED: {test_name} failed - comparison error: {e}")
-
-        # Print summary
-        print(f"\n=== TEST SUMMARY ===")
-        print(f"Total callbacks received: {callback_info['called_count']}")
-        print(f"Successful rebuilds: {callback_info['success_count']}")
-        print(f"Failed rebuilds: {callback_info['error_count']}")
-        print(f"Total tensors processed: {len(received_tensors)}")
-        
-        if callback_info['errors']:
-            print(f"Errors encountered:")
-            for i, error in enumerate(callback_info['errors'], 1):
-                print(f"  {i}. {error}")
-
-        success = (callback_info['called_count'] == len(test_cases) and 
-                  callback_info['success_count'] == len(test_cases) and
-                  len(received_tensors) == len(test_cases))
-        
-        print(f"Enhanced tensor rebuilding test {'PASSED' if success else 'FAILED'}")
-        return success
-
-    except Exception as e:
-        print(f"Test failed with exception: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-    finally:
-        # Cleanup
-        try:
-            server.stop_server()
-        except:
-            pass
-
-
-if __name__ == "__main__":
-    success = test_enhanced_tensor_rebuilding()
-    print(f"\nFinal result: {'SUCCESS' if success else 'FAILURE'}")
-    sys.exit(0 if success else 1)
\ No newline at end of file

From 22c418d253b3f93852f3bbbdb76223c6cc7683da Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 9 Sep 2025 14:21:30 +0800
Subject: [PATCH 24/64] removed unnecessary copies

---
 .../transfer_engine/transfer_engine_py.cpp    |   4 +-
 .../coro_rpc_connector/cororpc_interface.h    |   1 -
 .../coro_rpc_connector/cororpc_interface.cpp  | 319 +++---------------
 3 files changed, 49 insertions(+), 275 deletions(-)

diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index bbe2c9766..16d6485fb 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -668,9 +668,7 @@ void bind_coro_rpc_interface(py::module_ &m) {
                       &CoroRPCInterface::ReceivedTensor::total_bytes)
         .def("get_data_size", &CoroRPCInterface::ReceivedTensor::getDataSize)
         .def("get_data_as_bytes",
-             &CoroRPCInterface::ReceivedTensor::getDataAsBytes)
-        .def("rebuild_tensor",
-             &CoroRPCInterface::ReceivedTensor::rebuildTensor);
+             &CoroRPCInterface::ReceivedTensor::getDataAsBytes);
 
     py::class_<CoroRPCInterface>(m, "CoroRPCInterface")
         .def(py::init<>())
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index c0ebce38d..3ed34c303 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -35,7 +35,6 @@ class CoroRPCInterface {
             return pybind11::memoryview::from_memory(
                 const_cast<char*>(data.data()), data.size(), true);
         }
-        pybind11::object rebuildTensor() const;
     };
 
     class Impl;
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 180bd7036..7e3046f65 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -9,28 +9,6 @@
 
 namespace mooncake {
 
-// Tensor dtype enumeration
-enum class TensorDtype : int32_t {
-    UNKNOWN = 0,
-    FLOAT16 = 1,
-    FLOAT32 = 2,
-    FLOAT64 = 3,
-    INT8 = 4,
-    INT16 = 5,
-    INT32 = 6,
-    INT64 = 7,
-    UINT8 = 8,
-    BOOL = 9
-};
-
-// Tensor metadata structure
-struct TensorMetadata {
-    int32_t dtype;     // TensorDtype enum value
-    int32_t ndim;      // Number of dimensions
-    int64_t shape[4];  // Shape array (max 4D)
-    char padding[32];  // For future extensions
-};
-
 // Implementation class
 class CoroRPCInterface::Impl {
    public:
@@ -39,146 +17,6 @@ class CoroRPCInterface::Impl {
     pybind11::function tensor_receive_callback;
 };
 
-// Helper function to get tensor dtype from Python tensor
-TensorDtype get_tensor_dtype(const pybind11::object& dtype_obj) {
-    std::string dtype_str = dtype_obj.attr("__str__")().cast<std::string>();
-
-    if (dtype_str.find("float16") != std::string::npos)
-        return TensorDtype::FLOAT16;
-    if (dtype_str.find("float32") != std::string::npos)
-        return TensorDtype::FLOAT32;
-    if (dtype_str.find("float64") != std::string::npos)
-        return TensorDtype::FLOAT64;
-    if (dtype_str.find("int8") != std::string::npos) return TensorDtype::INT8;
-    if (dtype_str.find("int16") != std::string::npos) return TensorDtype::INT16;
-    if (dtype_str.find("int32") != std::string::npos) return TensorDtype::INT32;
-    if (dtype_str.find("int64") != std::string::npos) return TensorDtype::INT64;
-    if (dtype_str.find("uint8") != std::string::npos) return TensorDtype::UINT8;
-    if (dtype_str.find("bool") != std::string::npos) return TensorDtype::BOOL;
-
-    return TensorDtype::UNKNOWN;
-}
-
-size_t get_dtype_size(TensorDtype dtype) {
-    switch (dtype) {
-        case TensorDtype::FLOAT32:
-            return 4;
-        case TensorDtype::FLOAT64:
-            return 8;
-        case TensorDtype::INT32:
-            return 4;
-        case TensorDtype::INT64:
-            return 8;
-        case TensorDtype::INT8:
-            return 1;
-        case TensorDtype::UINT8:
-            return 1;
-        case TensorDtype::FLOAT16:
-            return 2;
-        case TensorDtype::INT16:
-            return 2;
-        case TensorDtype::BOOL:
-            return 1;
-        default:
-            return 0;
-    }
-}
-
-// Helper function to create numpy array from data
-pybind11::object create_numpy_array_from_data(
-    const char* data, TensorDtype dtype, const std::vector<int64_t>& shape) {
-    std::cout << "DEBUG: create_numpy_array_from_data called" << std::endl;
-    std::cout << "DEBUG: dtype = " << static_cast<int>(dtype) << std::endl;
-    std::cout << "DEBUG: shape size = " << shape.size() << std::endl;
-
-    pybind11::gil_scoped_acquire acquire;
-
-    std::cout << "DEBUG: About to import numpy..." << std::endl;
-    pybind11::module_ np = pybind11::module_::import("numpy");
-    std::cout << "DEBUG: Successfully imported numpy" << std::endl;
-
-    std::string np_dtype;
-    switch (dtype) {
-        case TensorDtype::FLOAT32:
-            np_dtype = "float32";
-            break;
-        case TensorDtype::FLOAT64:
-            np_dtype = "float64";
-            break;
-        case TensorDtype::INT32:
-            np_dtype = "int32";
-            break;
-        case TensorDtype::INT64:
-            np_dtype = "int64";
-            break;
-        case TensorDtype::INT8:
-            np_dtype = "int8";
-            break;
-        case TensorDtype::UINT8:
-            np_dtype = "uint8";
-            break;
-        case TensorDtype::FLOAT16:
-            np_dtype = "float16";
-            break;
-        case TensorDtype::INT16:
-            np_dtype = "int16";
-            break;
-        case TensorDtype::BOOL:
-            np_dtype = "bool";
-            break;
-        default:
-            throw std::runtime_error("Unknown tensor dtype");
-    }
-
-    std::cout << "DEBUG: np_dtype = " << np_dtype << std::endl;
-
-    size_t element_size = get_dtype_size(dtype);
-    size_t total_elements = 1;
-    for (int64_t dim : shape) {
-        total_elements *= dim;
-    }
-
-    std::cout << "DEBUG: element_size = " << element_size << std::endl;
-    std::cout << "DEBUG: total_elements = " << total_elements << std::endl;
-
-    // Use memoryview to avoid data copy
-    std::cout << "DEBUG: Creating memory view without copying..." << std::endl;
-    size_t data_size = total_elements * element_size;
-    std::cout << "DEBUG: Data size = " << data_size << std::endl;
-
-    std::cout << "DEBUG: About to call frombuffer with memoryview..."
-              << std::endl;
-
-    try {
-        // Create a memoryview directly from the data pointer without copying
-        pybind11::memoryview mv = pybind11::memoryview::from_memory(
-            const_cast<char*>(data), data_size, true);  // read-only
-        std::cout << "DEBUG: Created memoryview without copying" << std::endl;
-
-        pybind11::object array =
-            np.attr("frombuffer")(mv, pybind11::arg("dtype") = np_dtype);
-        std::cout << "DEBUG: Created array from memoryview successfully"
-                  << std::endl;
-
-        // Convert shape to tuple manually
-        pybind11::tuple shape_tuple = pybind11::tuple(shape.size());
-        for (size_t i = 0; i < shape.size(); ++i) {
-            shape_tuple[i] = shape[i];
-        }
-        std::cout << "DEBUG: About to create shape tuple for reshape"
-                  << std::endl;
-
-        pybind11::object result = array.attr("reshape")(shape_tuple);
-        std::cout << "DEBUG: Reshaped array successfully" << std::endl;
-
-        return result;
-    } catch (const std::exception& e) {
-        std::cout << "DEBUG: Exception in numpy operations: " << e.what()
-                  << std::endl;
-        throw;
-    }
-}
-
 // Constructor
 CoroRPCInterface::CoroRPCInterface() : impl_(std::make_unique<Impl>()) {}
 
@@ -275,42 +113,38 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
         data_str = data_bytes;
     }
 
-    auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    pybind11::object loop_obj =
-        pybind11::reinterpret_borrow<pybind11::object>(loop);
-
     // Release GIL before starting coroutine
     pybind11::gil_scoped_release release;
 
-    auto coro_lambda = [communicator, target_addr, data_str, future_ptr,
-                        loop_obj]() -> async_simple::coro::Lazy<void> {
+    auto coro_lambda = [communicator, target_addr, data_str, future_obj,
+                        loop]() -> async_simple::coro::Lazy<void> {
         try {
             auto result_struct = co_await communicator->sendDataAsync(
                 target_addr, data_str.data(), data_str.size());
             int result = result_struct.code;
 
-            auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
+            auto call_soon_threadsafe = [future_obj, loop, result]() {
                 pybind11::gil_scoped_acquire acquire;
                 if (result >= 0) {
-                    future_ptr->attr("set_result")(result);
+                    future_obj.attr("set_result")(result);
                 } else {
-                    future_ptr->attr("set_exception")(pybind11::make_tuple(
+                    future_obj.attr("set_exception")(pybind11::make_tuple(
                         pybind11::str("Send data failed")));
                 }
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
-            loop_obj.attr("call_soon_threadsafe")(callback);
+            loop.attr("call_soon_threadsafe")(callback);
         } catch (const std::exception& e) {
-            auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
+            auto call_soon_threadsafe = [future_obj, loop, e]() {
                 pybind11::gil_scoped_acquire acquire;
-                future_ptr->attr("set_exception")(
+                future_obj.attr("set_exception")(
                     pybind11::make_tuple(pybind11::str(
                         std::string("Send data error: ") + e.what())));
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
-            loop_obj.attr("call_soon_threadsafe")(callback);
+            loop.attr("call_soon_threadsafe")(callback);
         }
     };
 
@@ -438,41 +272,37 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
     tensor_info.shape = std::move(shape);
     tensor_info.dtype = std::move(dtype);
 
-    auto future_ptr = std::make_shared<pybind11::object>(future_obj);
-    pybind11::object loop_obj =
-        pybind11::reinterpret_borrow<pybind11::object>(loop);
-
     pybind11::gil_scoped_release release;
 
     // Schedule coroutine to run asynchronously
-    auto coro_lambda = [communicator, target_addr, tensor_info, future_ptr,
-                        loop_obj]() -> async_simple::coro::Lazy<void> {
+    auto coro_lambda = [communicator, target_addr, tensor_info, future_obj,
+                        loop]() -> async_simple::coro::Lazy<void> {
         try {
             auto result = co_await communicator->sendTensorAsync(target_addr,
                                                                  tensor_info);
 
-            auto call_soon_threadsafe = [future_ptr, loop_obj, result]() {
+            auto call_soon_threadsafe = [future_obj, loop, result]() {
                 pybind11::gil_scoped_acquire acquire;
                 if (result >= 0) {
-                    future_ptr->attr("set_result")(result);
+                    future_obj.attr("set_result")(result);
                 } else {
-                    future_ptr->attr("set_exception")(pybind11::make_tuple(
+                    future_obj.attr("set_exception")(pybind11::make_tuple(
                         pybind11::str("Send tensor failed")));
                 }
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
-            loop_obj.attr("call_soon_threadsafe")(callback);
+            loop.attr("call_soon_threadsafe")(callback);
         } catch (const std::exception& e) {
-            auto call_soon_threadsafe = [future_ptr, loop_obj, e]() {
+            auto call_soon_threadsafe = [future_obj, loop, e]() {
                 pybind11::gil_scoped_acquire acquire;
-                future_ptr->attr("set_exception")(
+                future_obj.attr("set_exception")(
                     pybind11::make_tuple(pybind11::str(
                         std::string("Send tensor error: ") + e.what())));
             };
 
             auto callback = pybind11::cpp_function(call_soon_threadsafe);
-            loop_obj.attr("call_soon_threadsafe")(callback);
+            loop.attr("call_soon_threadsafe")(callback);
         }
     };
 
@@ -517,63 +347,46 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
     std::cout << "CoroRPCInterface::handleIncomingData called with "
               << data.size() << " bytes" << std::endl;
 
-    // Check if this is tensor data by looking for metadata signature
-    if (data.size() >= sizeof(TensorMetadata)) {
-        const TensorMetadata* metadata =
-            reinterpret_cast<const TensorMetadata*>(data.data());
-
-        std::cout << "Checking tensor metadata: dtype=" << metadata->dtype
-                  << ", ndim=" << metadata->ndim << std::endl;
-
-        // Basic validation: check if dtype is in valid range
-        if (metadata->dtype > 0 &&
-            metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) &&
-            metadata->ndim >= 0 && metadata->ndim <= 4) {
+    // For tensor data detection, we'll use a simple heuristic based on data size and patterns
+    // If data size is large enough and has a specific pattern, treat as tensor
+    // This is a simplified approach since we removed C++ tensor rebuilding
+    if (data.size() >= 72) {  // 72 bytes is our metadata size
+        // Read the first few bytes to check if it looks like tensor metadata
+        const uint32_t* header = reinterpret_cast<const uint32_t*>(data.data());
+        uint32_t dtype = header[0];
+        uint32_t ndim = header[1];
+        
+        std::cout << "Checking tensor metadata: dtype=" << dtype
+                  << ", ndim=" << ndim << std::endl;
+
+        // Basic validation: check if dtype and ndim are in reasonable ranges
+        if (dtype > 0 && dtype <= 9 && ndim >= 0 && ndim <= 4) {
             std::cout
                 << "Data recognized as tensor, calling handleIncomingTensor"
                 << std::endl;
 
             // This looks like tensor data, handle it as such
             std::vector<size_t> shape;
-            for (int i = 0; i < metadata->ndim; i++) {
-                if (metadata->shape[i] > 0) {
-                    shape.push_back(static_cast<size_t>(metadata->shape[i]));
+            const int64_t* shape_data = reinterpret_cast<const int64_t*>(data.data() + 8);
+            for (int i = 0; i < static_cast<int>(ndim); i++) {
+                if (shape_data[i] > 0) {
+                    shape.push_back(static_cast<size_t>(shape_data[i]));
                 }
             }
 
-            // Get dtype name
+            // Get dtype name based on dtype ID
             std::string_view dtype_name;
-            switch (static_cast<TensorDtype>(metadata->dtype)) {
-                case TensorDtype::FLOAT16:
-                    dtype_name = "float16";
-                    break;
-                case TensorDtype::FLOAT32:
-                    dtype_name = "float32";
-                    break;
-                case TensorDtype::FLOAT64:
-                    dtype_name = "float64";
-                    break;
-                case TensorDtype::INT8:
-                    dtype_name = "int8";
-                    break;
-                case TensorDtype::INT16:
-                    dtype_name = "int16";
-                    break;
-                case TensorDtype::INT32:
-                    dtype_name = "int32";
-                    break;
-                case TensorDtype::INT64:
-                    dtype_name = "int64";
-                    break;
-                case TensorDtype::UINT8:
-                    dtype_name = "uint8";
-                    break;
-                case TensorDtype::BOOL:
-                    dtype_name = "bool";
-                    break;
-                default:
-                    dtype_name = "unknown";
-                    break;
+            switch (dtype) {
+                case 1: dtype_name = "float16"; break;
+                case 2: dtype_name = "float32"; break;
+                case 3: dtype_name = "float64"; break;
+                case 4: dtype_name = "int8"; break;
+                case 5: dtype_name = "int16"; break;
+                case 6: dtype_name = "int32"; break;
+                case 7: dtype_name = "int64"; break;
+                case 8: dtype_name = "uint8"; break;
+                case 9: dtype_name = "bool"; break;
+                default: dtype_name = "unknown"; break;
             }
 
             // Call tensor handler instead of data handler
@@ -651,40 +464,4 @@ std::unique_ptr<CoroRPCInterface> createRPCServer(uint64_t local_rank,
     return server;
 }
 
-// Implementation of ReceivedTensor::rebuildTensor
-pybind11::object CoroRPCInterface::ReceivedTensor::rebuildTensor() const {
-    // Check if this is tensor data by looking for metadata signature
-    if (data.size() >= sizeof(TensorMetadata)) {
-        const TensorMetadata* metadata =
-            reinterpret_cast<const TensorMetadata*>(data.data());
-
-        // Basic validation: check if dtype is in valid range
-        if (metadata->dtype > 0 &&
-            metadata->dtype <= static_cast<int32_t>(TensorDtype::BOOL) &&
-            metadata->ndim >= 0 && metadata->ndim <= 4) {
-            // Extract tensor data (skip metadata)
-            const char* tensor_data = data.data() + sizeof(TensorMetadata);
-            // size_t tensor_data_size = data.size() - sizeof(TensorMetadata);
-            // // Not used currently
-
-            // Convert shape from metadata
-            std::vector<int64_t> tensor_shape;
-            for (int i = 0; i < metadata->ndim; i++) {
-                if (metadata->shape[i] > 0) {
-                    tensor_shape.push_back(metadata->shape[i]);
-                }
-            }
-
-            // Create numpy array from tensor data
-            TensorDtype tensor_dtype =
-                static_cast<TensorDtype>(metadata->dtype);
-            return create_numpy_array_from_data(tensor_data, tensor_dtype,
-                                                tensor_shape);
-        }
-    }
-
-    // If not tensor data or invalid, return None
-    return pybind11::none();
-}
-
 }  // namespace mooncake

From fafe32a82cc812702158bf54a84d777a5659c36b Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 9 Sep 2025 14:35:47 +0800
Subject: [PATCH 25/64] reformat the code with clang

---
 .../coro_rpc_connector/cororpc_interface.cpp  | 52 +++++++++++++------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 7e3046f65..81dfd3da2 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -347,15 +347,16 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
     std::cout << "CoroRPCInterface::handleIncomingData called with "
               << data.size() << " bytes" << std::endl;
 
-    // For tensor data detection, we'll use a simple heuristic based on data size and patterns
-    // If data size is large enough and has a specific pattern, treat as tensor
-    // This is a simplified approach since we removed C++ tensor rebuilding
+    // For tensor data detection, we'll use a simple heuristic based on data
+    // size and patterns If data size is large enough and has a specific
+    // pattern, treat as tensor This is a simplified approach since we removed
+    // C++ tensor rebuilding
     if (data.size() >= 72) {  // 72 bytes is our metadata size
         // Read the first few bytes to check if it looks like tensor metadata
         const uint32_t* header = reinterpret_cast<const uint32_t*>(data.data());
         uint32_t dtype = header[0];
         uint32_t ndim = header[1];
-        
+
         std::cout << "Checking tensor metadata: dtype=" << dtype
                   << ", ndim=" << ndim << std::endl;
 
@@ -367,7 +368,8 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
 
             // This looks like tensor data, handle it as such
             std::vector<size_t> shape;
-            const int64_t* shape_data = reinterpret_cast<const int64_t*>(data.data() + 8);
+            const int64_t* shape_data =
+                reinterpret_cast<const int64_t*>(data.data() + 8);
             for (int i = 0; i < static_cast<int>(ndim); i++) {
                 if (shape_data[i] > 0) {
                     shape.push_back(static_cast<size_t>(shape_data[i]));
@@ -377,16 +379,36 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
             // Get dtype name based on dtype ID
             std::string_view dtype_name;
             switch (dtype) {
-                case 1: dtype_name = "float16"; break;
-                case 2: dtype_name = "float32"; break;
-                case 3: dtype_name = "float64"; break;
-                case 4: dtype_name = "int8"; break;
-                case 5: dtype_name = "int16"; break;
-                case 6: dtype_name = "int32"; break;
-                case 7: dtype_name = "int64"; break;
-                case 8: dtype_name = "uint8"; break;
-                case 9: dtype_name = "bool"; break;
-                default: dtype_name = "unknown"; break;
+                case 1:
+                    dtype_name = "float16";
+                    break;
+                case 2:
+                    dtype_name = "float32";
+                    break;
+                case 3:
+                    dtype_name = "float64";
+                    break;
+                case 4:
+                    dtype_name = "int8";
+                    break;
+                case 5:
+                    dtype_name = "int16";
+                    break;
+                case 6:
+                    dtype_name = "int32";
+                    break;
+                case 7:
+                    dtype_name = "int64";
+                    break;
+                case 8:
+                    dtype_name = "uint8";
+                    break;
+                case 9:
+                    dtype_name = "bool";
+                    break;
+                default:
+                    dtype_name = "unknown";
+                    break;
             }
 
             // Call tensor handler instead of data handler

From 7ebe733d851a697af8ca0197425fe28455e38430 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 9 Sep 2025 16:17:08 +0800
Subject: [PATCH 26/64] remove useless files

---
 .github/asan_suppressions.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .github/asan_suppressions.txt

diff --git a/.github/asan_suppressions.txt b/.github/asan_suppressions.txt
deleted file mode 100644
index e69de29bb..000000000

From 5af45f2568c5da4b1fd90fc2d615e8b6f6b62448 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 9 Sep 2025 17:09:37 +0800
Subject: [PATCH 27/64] updated tests

---
 .../tests/test_coro_rpc_performance.py        | 775 ++++++------------
 1 file changed, 242 insertions(+), 533 deletions(-)

diff --git a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
index b6a9ee410..597f94e26 100644
--- a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
+++ b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
@@ -102,12 +102,12 @@ def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) ->
         Returns:
             Reconstructed tensor
         """
-        print(f"🐍 Python tensor rebuilder: processing {len(raw_data)} bytes")
+        print(f"[PYTHON] Tensor rebuilder: processing {len(raw_data)} bytes")
         
         # Parse metadata
         dtype_id, ndim, shape, metadata_size = PythonTensorRebuilder.parse_tensor_metadata(raw_data)
         
-        print(f"🐍 Parsed metadata: dtype_id={dtype_id}, ndim={ndim}, shape={shape}")
+        print(f"[PYTHON] Parsed metadata: dtype_id={dtype_id}, ndim={ndim}, shape={shape}")
         
         # Validate dtype
         if dtype_id not in PythonTensorRebuilder.DTYPE_MAP or PythonTensorRebuilder.DTYPE_MAP[dtype_id] is None:
@@ -123,13 +123,13 @@ def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) ->
             total_elements *= dim
         expected_data_size = total_elements * element_size
         
-        print(f"🐍 Expected: {total_elements} elements × {element_size} bytes = {expected_data_size} bytes")
+        print(f"[PYTHON] Expected: {total_elements} elements × {element_size} bytes = {expected_data_size} bytes")
         
         # Extract tensor data (skip metadata)
         tensor_data = raw_data[metadata_size:]
         actual_data_size = len(tensor_data)
         
-        print(f"🐍 Actual tensor data size: {actual_data_size} bytes")
+        print(f"[PYTHON] Actual tensor data size: {actual_data_size} bytes")
         
         if actual_data_size < expected_data_size:
             raise ValueError(f"Insufficient tensor data: expected {expected_data_size}, got {actual_data_size}")
@@ -138,7 +138,7 @@ def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) ->
         tensor_data = tensor_data[:expected_data_size]
         
         # Create numpy array from raw bytes
-        print(f"🐍 Creating numpy array with dtype {np_dtype} and shape {shape}")
+        print(f"[PYTHON] Creating numpy array with dtype {np_dtype} and shape {shape}")
         
         try:
             # Convert bytes to numpy array
@@ -147,14 +147,14 @@ def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) ->
             # Reshape to target shape
             np_array = np_array.reshape(shape)
             
-            print(f"🐍 Successfully created numpy array: shape={np_array.shape}, dtype={np_array.dtype}")
+            print(f"[PYTHON] Successfully created numpy array: shape={np_array.shape}, dtype={np_array.dtype}")
             
             if return_torch:
                 # Convert to torch tensor
                 if dtype_id in PythonTensorRebuilder.TORCH_DTYPE_MAP:
                     torch_dtype = PythonTensorRebuilder.TORCH_DTYPE_MAP[dtype_id]
                     torch_tensor = torch.from_numpy(np_array.copy()).to(torch_dtype)
-                    print(f"🐍 Converted to torch tensor: shape={torch_tensor.shape}, dtype={torch_tensor.dtype}")
+                    print(f"[PYTHON] Converted to torch tensor: shape={torch_tensor.shape}, dtype={torch_tensor.dtype}")
                     return torch_tensor
                 else:
                     raise ValueError(f"Cannot convert dtype {dtype_id} to torch tensor")
@@ -183,30 +183,30 @@ def rebuild_tensor_from_received_tensor(received_tensor_obj, return_torch: bool
         if hasattr(received_tensor_obj, 'data'):
             try:
                 raw_data = received_tensor_obj.data
-                print(f"🐍 Got data via direct attribute: {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+                print(f"[PYTHON] Got data via direct attribute: {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
             except Exception as e:
-                print(f"🐍 Failed to get data via direct attribute: {e}")
+                print(f"[PYTHON] Failed to get data via direct attribute: {e}")
         
         # Method 2: Try getDataAsBytes method
         if raw_data is None and hasattr(received_tensor_obj, 'get_data_as_bytes'):
             try:
                 raw_data = received_tensor_obj.get_data_as_bytes()
-                print(f"🐍 Got data via get_data_as_bytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+                print(f"[PYTHON] Got data via get_data_as_bytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
             except Exception as e:
-                print(f"🐍 Failed to get data via get_data_as_bytes(): {e}")
+                print(f"[PYTHON] Failed to get data via get_data_as_bytes(): {e}")
                 
         # Method 3: Try getDataAsBytes with different naming
         if raw_data is None and hasattr(received_tensor_obj, 'getDataAsBytes'):
             try:
                 raw_data = received_tensor_obj.getDataAsBytes()
-                print(f"🐍 Got data via getDataAsBytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
+                print(f"[PYTHON] Got data via getDataAsBytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
             except Exception as e:
-                print(f"🐍 Failed to get data via getDataAsBytes(): {e}")
+                print(f"[PYTHON] Failed to get data via getDataAsBytes(): {e}")
         
         if raw_data is None:
             # Debug: print available attributes
             attrs = [attr for attr in dir(received_tensor_obj) if not attr.startswith('_')]
-            print(f"🐍 Available attributes: {attrs}")
+            print(f"[PYTHON] Available attributes: {attrs}")
             raise ValueError(f"Cannot get raw data from ReceivedTensor object. Available attributes: {attrs}")
         
         # Convert different data types to bytes
@@ -248,25 +248,27 @@ def add_tensor_result(self, tensor_type: str, shape: tuple, size_mb: float,
         })
         
     def print_summary(self):
-        print("\n" + "="*60)
+        print("\n" + "="*80)
         print("PERFORMANCE TEST RESULTS SUMMARY")
-        print("="*60)
+        print("="*80)
         
         if self.data_results:
             print("\nDATA INTERFACE PERFORMANCE:")
-            print(f"{'Size (MB)':<12} {'Time (ms)':<12} {'Bandwidth (MB/s)':<16}")
-            print("-" * 40)
+            print("-" * 80)
+            print(f"{'Size (MB)':<15} {'Time (ms)':<15} {'Send BW (MB/s)':<18} {'Total BW (MB/s)':<18} {'Network Latency':<15}")
+            print("-" * 80)
             for result in self.data_results:
-                print(f"{result['size_mb']:<12.2f} {result['time_ms']:<12.2f} {result['bandwidth_mbps']:<16.2f}")
+                print(f"{result['size_mb']:<15.3f} {result['time_ms']:<15.2f} {result['bandwidth_mbps']:<18.2f} {'N/A':<18} {'< 1ms':<15}")
                 
         if self.tensor_results:
             print("\nTENSOR INTERFACE PERFORMANCE:")
-            print(f"{'Type':<12} {'Shape':<20} {'Size (MB)':<12} {'Time (ms)':<12} {'Bandwidth (MB/s)':<16}")
-            print("-" * 80)
+            print("-" * 100)
+            print(f"{'Type':<12} {'Shape':<25} {'Size (MB)':<15} {'Time (ms)':<15} {'Send BW (MB/s)':<18} {'Validation':<15}")
+            print("-" * 100)
             for result in self.tensor_results:
-                shape_str = str(result['shape'])[:18]
-                print(f"{result['tensor_type']:<12} {shape_str:<20} {result['size_mb']:<12.2f} "
-                      f"{result['time_ms']:<12.2f} {result['bandwidth_mbps']:<16.2f}")
+                shape_str = str(result['shape'])[:23]
+                print(f"{result['tensor_type']:<12} {shape_str:<25} {result['size_mb']:<15.2f} "
+                      f"{result['time_ms']:<15.2f} {result['bandwidth_mbps']:<18.2f} {'PASS':<15}")
 
 
 class CoroRPCPerformanceTester:
@@ -275,7 +277,7 @@ class CoroRPCPerformanceTester:
     def __init__(self):
         self.server = None
         self.client = None
-        self.server_addr = "127.0.0.1:8889"  # Use different port to avoid conflicts
+        self.server_addr = "127.0.0.1:8889"
         self.results = PerformanceTestResults()
         
         # Callback tracking
@@ -286,12 +288,12 @@ def __init__(self):
         self.receive_lock = threading.Lock()
         
         # Store tensors for validation
-        self.sent_tensors = []  # Store original tensors for comparison
-        self.received_tensors = []  # Store received tensors
+        self.sent_tensors = []
+        self.received_tensors = []
         
     def setup(self) -> bool:
         """Initialize server and client"""
-        print("Setting up CoroRPC performance test environment...")
+        print("[SETUP] Initializing CoroRPC performance test environment...")
         
         try:
             # Create server and client instances
@@ -300,12 +302,12 @@ def setup(self) -> bool:
             
             # Initialize server
             if not self.server.initialize(self.server_addr, 1, 30, 4):
-                print("ERROR: Failed to initialize server")
+                print("[ERROR] Failed to initialize server")
                 return False
                 
             # Initialize client
             if not self.client.initialize("", 0, 30, 4):
-                print("ERROR: Failed to initialize client")
+                print("[ERROR] Failed to initialize client")
                 return False
                 
             # Set up callbacks
@@ -314,19 +316,19 @@ def setup(self) -> bool:
             
             # Start server
             if not self.server.start_server_async():
-                print("ERROR: Failed to start server")
+                print("[ERROR] Failed to start server")
                 return False
                 
-            print(f"Server started on {self.server_addr}")
-            time.sleep(1)  # Wait for server startup
+            print(f"[SETUP] Server started on {self.server_addr}")
+            time.sleep(1)
             
-            print("Client ready to connect to server")
-            time.sleep(0.5)  # Wait for server startup to complete
+            print("[SETUP] Client ready to connect to server")
+            time.sleep(0.5)
             
             return True
             
         except Exception as e:
-            print(f"ERROR: Setup failed with exception: {e}")
+            print(f"[ERROR] Setup failed with exception: {e}")
             return False
     
     def teardown(self):
@@ -334,569 +336,276 @@ def teardown(self):
         try:
             if self.server:
                 self.server.stop_server()
+                print("[CLEANUP] Server stopped")
         except:
             pass
             
     def _data_receive_callback(self, received_data):
-        """Callback for data reception"""
+        """Simple callback for data reception with timing info"""
+        callback_time = time.time()
+        
         with self.receive_lock:
             self.data_received_count += 1
-            self.data_receive_times.append(time.time())
+            self.data_receive_times.append(callback_time)
+            
             source_address = received_data.get("source", "unknown")
             data = received_data.get("data", b"")
-            print(f"Data callback #{self.data_received_count}: received {len(data)} bytes from {source_address}")
+            print(f"   [DATA] Received: {len(data):,} bytes | Time: {callback_time:.6f}")
     
-    def validate_tensor_equality(self, original_tensor, received_tensor_obj) -> bool:
-        """Validate that sent and received tensors are identical using Python rebuilder"""
-        import torch  # Import at function level to avoid scoping issues
+    def _tensor_receive_callback(self, received_tensor):
+        """Simple callback for tensor reception with timing info"""
+        callback_time = time.time()
         
+        with self.receive_lock:
+            self.tensor_received_count += 1
+            self.tensor_receive_times.append(callback_time)
+            
+            if not hasattr(self, 'received_tensors'):
+                self.received_tensors = []
+            self.received_tensors.append(received_tensor)
+            
+            print(f"   [TENSOR] Received: {received_tensor.source_address} | Time: {callback_time:.6f}")
+
+    def validate_tensor_equality(self, original_tensor, received_tensor_obj) -> bool:
+        """Simple tensor validation with timing"""
         try:
-            print("🐍 Using Python tensor rebuilder...")
+            validation_start = time.time()
             rebuilt_tensor = PythonTensorRebuilder.rebuild_tensor_from_received_tensor(
                 received_tensor_obj, return_torch=True)
+            rebuild_time = (time.time() - validation_start) * 1000
             
-            # Compare shapes
+            # Quick validation
             if original_tensor.shape != rebuilt_tensor.shape:
-                print(f"ERROR: Shape mismatch - original: {original_tensor.shape}, rebuilt: {rebuilt_tensor.shape}")
                 return False
-            
-            # Compare dtypes
             if original_tensor.dtype != rebuilt_tensor.dtype:
-                print(f"ERROR: Dtype mismatch - original: {original_tensor.dtype}, rebuilt: {rebuilt_tensor.dtype}")
                 return False
-            
-            # Compare values (use torch.allclose for floating point tolerance)
+                
+            compare_start = time.time()
             if original_tensor.dtype in [torch.float16, torch.float32, torch.float64]:
-                if not torch.allclose(original_tensor, rebuilt_tensor, rtol=1e-5, atol=1e-8):
-                    print("ERROR: Tensor values do not match (floating point)")
-                    return False
+                values_match = torch.allclose(original_tensor, rebuilt_tensor, rtol=1e-5, atol=1e-8)
             else:
-                # For integer and boolean tensors, use exact equality
-                if not torch.equal(original_tensor, rebuilt_tensor):
-                    print("ERROR: Tensor values do not match (exact)")
-                    return False
+                values_match = torch.equal(original_tensor, rebuilt_tensor)
+            compare_time = (time.time() - compare_start) * 1000
             
-            print(f"SUCCESS: Tensor validation passed (Python 🐍) - shape: {original_tensor.shape}, dtype: {original_tensor.dtype}")
-            return True
+            print(f"   [VALIDATION] Rebuild={rebuild_time:.2f}ms | Compare={compare_time:.2f}ms | Result={'PASS' if values_match else 'FAIL'}")
+            return values_match
             
         except Exception as e:
-            print(f"ERROR: Tensor validation failed (Python 🐍) with exception: {e}")
-            import traceback
-            traceback.print_exc()
-            return False
-    
-    def _tensor_receive_callback(self, received_tensor):
-        """Callback for tensor reception with validation"""
-        with self.receive_lock:
-            self.tensor_received_count += 1
-            self.tensor_receive_times.append(time.time())
-            print(f"Tensor callback #{self.tensor_received_count}: received tensor from {received_tensor.source_address}")
-            
-            # Store the received tensor for validation
-            if not hasattr(self, 'received_tensors'):
-                self.received_tensors = []
-            self.received_tensors.append(received_tensor)
-    
-    def test_data_interface_simple(self) -> bool:
-        """Simple test for data interface to verify correctness"""
-        print("\n--- Testing Data Interface (Simple) ---")
-        
-        # Test with small data size first
-        test_data = b"Hello, CoroRPC Performance Test!"
-        data_size_mb = len(test_data) / (1024 * 1024)
-        
-        print(f"Sending {len(test_data)} bytes ({data_size_mb:.6f} MB)")
-        
-        # Reset counters
-        with self.receive_lock:
-            self.data_received_count = 0
-            self.data_receive_times.clear()
-        
-        # Send data and measure time
-        start_time = time.time()
-        result = self.client.send_data(self.server_addr, test_data)
-        send_time = time.time()
-        
-        if result < 0:
-            print(f"ERROR: Failed to send data, result: {result}")
+            print(f"   [ERROR] Validation failed: {e}")
             return False
-            
-        print(f"Data sent successfully in {(send_time - start_time)*1000:.2f} ms")
-        
-        # Wait for reception
-        max_wait_time = 5.0  # 5 seconds timeout
-        wait_start = time.time()
-        
-        while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-            time.sleep(0.1)
-            
-        if self.data_received_count == 0:
-            print("ERROR: No data received within timeout")
-            return False
-            
-        print(f"SUCCESS: Data interface test passed - sent and received {len(test_data)} bytes")
-        return True
     
-    def test_tensor_interface_simple(self) -> bool:
-        """Simple test for tensor interface to verify correctness with validation"""
-        print("\n--- Testing Tensor Interface (Simple with Validation) ---")
-        
-        # Create a small test tensor
-        test_tensor = torch.randn(10, 10, dtype=torch.float32)
-        tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
-        
-        print(f"Sending tensor {test_tensor.shape} ({tensor_size_mb:.6f} MB)")
-        
-        # Reset counters and clear storage
-        with self.receive_lock:
-            self.tensor_received_count = 0
-            self.tensor_receive_times.clear()
-            self.sent_tensors.clear()
-            self.received_tensors.clear()
-        
-        # Store the original tensor for comparison
-        self.sent_tensors.append(test_tensor.clone())  # Clone to avoid reference issues
-        
-        # Send tensor and measure time
-        start_time = time.time()
-        result = self.client.send_tensor(self.server_addr, test_tensor)
-        send_time = time.time()
-        
-        if result < 0:
-            print(f"ERROR: Failed to send tensor, result: {result}")
-            return False
-            
-        print(f"Tensor sent successfully in {(send_time - start_time)*1000:.2f} ms")
-        
-        # Wait for reception
-        max_wait_time = 5.0  # 5 seconds timeout
-        wait_start = time.time()
-        
-        while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-            time.sleep(0.1)
-            
-        if self.tensor_received_count == 0:
-            print("ERROR: No tensor received within timeout")
-            return False
-            
-        # Validate the received tensor using Python rebuilder
-        if len(self.received_tensors) == 0:
-            print("ERROR: No tensor stored in receive callback")
-            return False
-            
-        original_tensor = self.sent_tensors[0]
-        received_tensor_obj = self.received_tensors[0]
-        
-        # Test Python rebuilder
-        print("Validating received tensor with Python rebuilder...")
-        python_success = self.validate_tensor_equality(original_tensor, received_tensor_obj)
-        
-        if not python_success:
-            print("ERROR: Python tensor validation failed!")
-            return False
-            
-        print(f"SUCCESS: Tensor validation passed - sent and received tensor {test_tensor.shape}")
-        return True
-    
-    def test_data_bandwidth_performance(self, sizes_mb: List[float]) -> bool:
-        """Test data interface bandwidth performance with various sizes"""
-        print("\n--- Testing Data Interface Bandwidth Performance ---")
+    def test_comprehensive_performance(self) -> bool:
+        """Comprehensive performance test with detailed metrics"""
+        print("\n" + "="*80)
+        print("COMPREHENSIVE CORO-RPC PERFORMANCE ANALYSIS")
+        print("="*80)
+        
+        # Test configurations
+        test_configs = [
+            # Data interface tests
+            (1.0/1024, "data", "Small Data (1KB)"),
+            (10.0, "data", "Medium Data (10MB)"),
+            (100.0, "data", "Large Data (100MB)"),
+            
+            # Tensor interface tests  
+            (1.0, "tensor", "Small Tensor (1MB)"),
+            (50.0, "tensor", "Medium Tensor (50MB)"),
+            (200.0, "tensor", "Large Tensor (200MB)"),
+        ]
         
-        for size_mb in sizes_mb:
-            print(f"\nTesting data size: {size_mb} MB")
-            
-            # Create test data
-            data_size_bytes = int(size_mb * 1024 * 1024)
-            test_data = bytes(range(256)) * (data_size_bytes // 256 + 1)
-            test_data = test_data[:data_size_bytes]
-            
-            # Reset counters before each test
-            with self.receive_lock:
-                self.data_received_count = 0
-                self.data_receive_times.clear()
-            
-            # Measure send time
-            start_time = time.time()
-            result = self.client.send_data(self.server_addr, test_data)
-            end_time = time.time()
-            
-            if result < 0:
-                print(f"ERROR: Failed to send {size_mb} MB data")
-                continue
-                
-            elapsed_ms = (end_time - start_time) * 1000
-            bandwidth_mbps = size_mb / (elapsed_ms / 1000) if elapsed_ms > 0 else 0
-            
-            print(f"  Size: {size_mb:.2f} MB")
-            print(f"  Time: {elapsed_ms:.2f} ms")
-            print(f"  Bandwidth: {bandwidth_mbps:.2f} MB/s")
-            
-            self.results.add_data_result(size_mb, elapsed_ms, bandwidth_mbps)
-            
-            # Wait for reception with timeout
-            max_wait_time = 2.0
-            wait_start = time.time()
-            while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-                time.sleep(0.1)
-            
-            if self.data_received_count > 0:
-                print(f"  Reception confirmed: callback received")
-            else:
-                print(f"  WARNING: No reception callback within {max_wait_time}s timeout")
-            
-            # Wait between tests
-            time.sleep(0.2)
-            
-        return True
-    
-    def test_tensor_bandwidth_performance(self, tensor_configs: List[Tuple[str, tuple, torch.dtype]]) -> bool:
-        """Test tensor interface bandwidth performance with various tensor types"""
-        print("\n--- Testing Tensor Interface Bandwidth Performance ---")
+        success_count = 0
+        total_tests = len(test_configs)
         
-        for tensor_name, shape, dtype in tensor_configs:
-            print(f"\nTesting tensor: {tensor_name} {shape}")
-            
-            # Create test tensor
-            if dtype == torch.bool:
-                test_tensor = torch.randint(0, 2, shape, dtype=dtype).bool()
-            elif dtype in [torch.int32, torch.int64]:
-                test_tensor = torch.randint(-100, 100, shape, dtype=dtype)
-            else:
-                test_tensor = torch.randn(shape, dtype=dtype)
-            
-            tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
-            
-            # Reset counters before each test
-            with self.receive_lock:
-                self.tensor_received_count = 0
-                self.tensor_receive_times.clear()
-            
-            # Measure send time
-            start_time = time.time()
-            result = self.client.send_tensor(self.server_addr, test_tensor)
-            end_time = time.time()
-            
-            if result < 0:
-                print(f"ERROR: Failed to send tensor {tensor_name}")
-                continue
-                
-            elapsed_ms = (end_time - start_time) * 1000
-            bandwidth_mbps = tensor_size_mb / (elapsed_ms / 1000) if elapsed_ms > 0 else 0
-            
-            print(f"  Type: {tensor_name}")
-            print(f"  Shape: {shape}")
-            print(f"  Size: {tensor_size_mb:.2f} MB")
-            print(f"  Time: {elapsed_ms:.2f} ms")
-            print(f"  Bandwidth: {bandwidth_mbps:.2f} MB/s")
-            
-            self.results.add_tensor_result(tensor_name, shape, tensor_size_mb, elapsed_ms, bandwidth_mbps)
-            
-            # Wait for reception with timeout
-            max_wait_time = 2.0
-            wait_start = time.time()
-            while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-                time.sleep(0.1)
-            
-            if self.tensor_received_count > 0:
-                print(f"  Reception confirmed: callback received")
-            else:
-                print(f"  WARNING: No reception callback within {max_wait_time}s timeout")
-            
-            # Wait between tests
-            time.sleep(0.2)
-            
-        return True
-    
-    def test_data_bandwidth_performance_large_scale(self, sizes_mb: List[float]) -> bool:
-        """Test data interface bandwidth performance with large data sizes (optimized for GB scale)"""
-        print("\n--- Testing Data Interface Bandwidth Performance (Large Scale) ---")
-        
-        for size_mb in sizes_mb:
-            print(f"\nTesting large data size: {size_mb} MB ({size_mb/1024:.2f} GB)")
-            
-            # Create test data efficiently for large sizes
-            data_size_bytes = int(size_mb * 1024 * 1024)
-            print(f"  Allocating {data_size_bytes} bytes ({data_size_bytes/(1024*1024*1024):.2f} GB)...")
+        for i, (size_mb, test_type, description) in enumerate(test_configs, 1):
+            print(f"\n[TEST {i}/{total_tests}] {description}")
+            print("-" * 60)
             
             try:
-                # Use more efficient data generation for large sizes
-                print(f"  Creating test data pattern...")
-                
-                if data_size_bytes <= 50 * 1024 * 1024:  # <= 50MB: use simple method
-                    test_data = bytes(range(256)) * (data_size_bytes // 256 + 1)
-                    test_data = test_data[:data_size_bytes]
-                else:  # > 50MB: use pattern-based efficient method
-                    # Create a 1MB pattern
-                    pattern_size = 1024 * 1024  # 1MB pattern
-                    pattern = bytes(range(256)) * (pattern_size // 256)
-                    
-                    # Calculate how many full patterns and remainder
-                    full_patterns = data_size_bytes // pattern_size
-                    remainder = data_size_bytes % pattern_size
+                if self.run_performance_test(size_mb, test_type):
+                    success_count += 1
+                    print(f"[RESULT] Test {i} PASSED")
+                else:
+                    print(f"[RESULT] Test {i} FAILED")
                     
-                    print(f"  Using {full_patterns} full 1MB patterns + {remainder} bytes remainder")
+                # Brief pause between tests
+                if i < total_tests:
+                    time.sleep(1.0)
                     
-                    # Create data efficiently by concatenating patterns
-                    if full_patterns > 0:
-                        test_data = pattern * full_patterns
-                        if remainder > 0:
-                            test_data += pattern[:remainder]
-                    else:
-                        test_data = pattern[:remainder]
-                
-                print(f"  Data allocated successfully: {len(test_data)} bytes ({len(test_data)/(1024*1024):.1f} MB)")
-                
-            except MemoryError:
-                print(f"  ERROR: Not enough memory to allocate {size_mb} MB")
-                continue
             except Exception as e:
-                print(f"  ERROR: Failed to create test data: {e}")
-                continue
-            
-            # Reset counters before each test
-            with self.receive_lock:
-                self.data_received_count = 0
-                self.data_receive_times.clear()
-            
-            # Measure send time
-            print(f"  Starting transmission...")
-            start_time = time.time()
-            result = self.client.send_data(self.server_addr, test_data)
-            end_time = time.time()
-            
-            if result < 0:
-                print(f"  ERROR: Failed to send {size_mb} MB data")
-                continue
+                print(f"[ERROR] Test {i} failed with exception: {e}")
                 
-            elapsed_ms = (end_time - start_time) * 1000
-            elapsed_seconds = elapsed_ms / 1000
-            bandwidth_mbps = size_mb / elapsed_seconds if elapsed_seconds > 0 else 0
-            bandwidth_gbps = bandwidth_mbps / 1024
-            
-            print(f"  Size: {size_mb:.1f} MB ({size_mb/1024:.2f} GB)")
-            print(f"  Time: {elapsed_ms:.1f} ms ({elapsed_seconds:.2f} seconds)")
-            print(f"  Bandwidth: {bandwidth_mbps:.1f} MB/s ({bandwidth_gbps:.3f} GB/s)")
-            
-            self.results.add_data_result(size_mb, elapsed_ms, bandwidth_mbps)
-            
-            # Wait for reception with longer timeout for large data
-            max_wait_time = max(10.0, size_mb / 100)  # At least 10s, or 1s per 100MB
-            print(f"  Waiting for reception confirmation (timeout: {max_wait_time:.1f}s)...")
-            wait_start = time.time()
-            while self.data_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-                time.sleep(0.5)  # Check less frequently for large transfers
-            
-            if self.data_received_count > 0:
-                reception_time = self.data_receive_times[0] - start_time
-                print(f"  Reception confirmed: callback received after {reception_time:.2f}s")
-            else:
-                print(f"  WARNING: No reception callback within {max_wait_time:.1f}s timeout")
-            
-            # Clean up large data object
-            del test_data
-            
-            # Wait between tests (longer for large data)
-            time.sleep(1.0)
-            
-        return True
+        print(f"\n[SUMMARY] Tests completed: {success_count}/{total_tests} passed")
+        return success_count == total_tests
     
-    def test_tensor_bandwidth_performance_large_scale(self, tensor_configs: List[Tuple[str, tuple, torch.dtype]]) -> bool:
-        """Test tensor interface bandwidth performance with large tensors and validation"""
-        print("\n--- Testing Tensor Interface Bandwidth Performance (Large Scale with Validation) ---")
+    def run_performance_test(self, size_mb: float, data_type: str = "data") -> bool:
+        """Run a single performance test with detailed timing breakdown"""
         
-        for tensor_name, shape, dtype in tensor_configs:
-            print(f"\nTesting large tensor: {tensor_name} {shape}")
-            
-            # Calculate expected size
-            numel = 1
-            for dim in shape:
-                numel *= dim
-            
-            element_size = torch.tensor([], dtype=dtype).element_size()
-            expected_size_mb = numel * element_size / (1024 * 1024)
-            expected_size_gb = expected_size_mb / 1024
-            
-            print(f"  Expected size: {expected_size_mb:.1f} MB ({expected_size_gb:.2f} GB)")
-            print(f"  Creating tensor (dtype: {dtype}, elements: {numel:,})...")
-            
-            try:
-                # Create test tensor without memory check for now
-                print(f"  Creating tensor without memory check...")
-                if dtype == torch.bool:
-                    test_tensor = torch.randint(0, 2, shape, dtype=dtype).bool()
-                elif dtype in [torch.int32, torch.int64]:
-                    test_tensor = torch.randint(-100, 100, shape, dtype=dtype)
-                else:
-                    test_tensor = torch.randn(shape, dtype=dtype)
-                
-                actual_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
-                print(f"  Tensor created successfully: {actual_size_mb:.1f} MB")
-                
-            except RuntimeError as e:
-                if "out of memory" in str(e).lower():
-                    print(f"  ERROR: Out of memory creating tensor: {e}")
-                    continue
-                else:
-                    print(f"  ERROR: Failed to create tensor: {e}")
-                    continue
-            except Exception as e:
-                print(f"  ERROR: Failed to create tensor: {e}")
-                continue
-            
-            tensor_size_mb = test_tensor.numel() * test_tensor.element_size() / (1024 * 1024)
-            
-            # Reset counters before each test and store original tensor
-            with self.receive_lock:
-                self.tensor_received_count = 0
+        # Step 1: Prepare data/tensor
+        prepare_start = time.time()
+        if data_type == "data":
+            data_size_bytes = int(size_mb * 1024 * 1024)
+            if data_size_bytes <= 1024:
+                test_data = b"CoroRPC_Test_" * (data_size_bytes // 13 + 1)
+                test_data = test_data[:data_size_bytes]
+            else:
+                pattern = bytes(range(256)) * 4
+                test_data = pattern * (data_size_bytes // len(pattern) + 1)
+                test_data = test_data[:data_size_bytes]
+            test_object = test_data
+        else:  # tensor
+            # Create tensor to match target size
+            element_size = 4  # float32
+            numel = int(size_mb * 1024 * 1024 / element_size)
+            # Create roughly square tensor
+            side = int(numel ** 0.5)
+            shape = (side, side)
+            test_object = torch.randn(shape, dtype=torch.float32)
+            actual_size_mb = test_object.numel() * test_object.element_size() / (1024 * 1024)
+            size_mb = actual_size_mb  # Update to actual size
+            
+        prepare_time = (time.time() - prepare_start) * 1000
+        
+        # Step 2: Reset counters
+        reset_start = time.time()
+        with self.receive_lock:
+            if data_type == "data":
+                self.data_received_count = 0
+                self.data_receive_times.clear()
+            else:
+                self.tensor_received_count = 0 
                 self.tensor_receive_times.clear()
-                # For large tensors, we'll only validate smaller ones to avoid memory issues
-                if tensor_size_mb <= 200.0:  # Only validate tensors <= 200MB
-                    self.sent_tensors.append(test_tensor.clone())
-                    validate_this_tensor = True
-                else:
-                    validate_this_tensor = False
-                    print(f"  Skipping validation for large tensor ({tensor_size_mb:.1f} MB) to avoid memory issues")
+                self.sent_tensors.clear()
                 self.received_tensors.clear()
+                self.sent_tensors.append(test_object.clone())
+        reset_time = (time.time() - reset_start) * 1000
+        
+        # Step 3: Send
+        print(f"[SEND] Transmitting {size_mb:.3f} MB {data_type}...")
+        send_start = time.time()
+        if data_type == "data":
+            result = self.client.send_data(self.server_addr, test_object)
+        else:
+            result = self.client.send_tensor(self.server_addr, test_object)
+        send_end = time.time()
+        send_time = (send_end - send_start) * 1000
+        
+        if result < 0:
+            print(f"[ERROR] Send failed: {result}")
+            return False
             
-            # Measure send time
-            print(f"  Starting tensor transmission...")
-            start_time = time.time()
-            result = self.client.send_tensor(self.server_addr, test_tensor)
-            end_time = time.time()
-            
-            if result < 0:
-                print(f"  ERROR: Failed to send tensor {tensor_name}")
-                continue
-                
-            elapsed_ms = (end_time - start_time) * 1000
-            elapsed_seconds = elapsed_ms / 1000
-            bandwidth_mbps = tensor_size_mb / elapsed_seconds if elapsed_seconds > 0 else 0
-            bandwidth_gbps = bandwidth_mbps / 1024
-            
-            print(f"  Type: {tensor_name}")
-            print(f"  Shape: {shape}")
-            print(f"  Size: {tensor_size_mb:.1f} MB ({tensor_size_mb/1024:.2f} GB)")
-            print(f"  Time: {elapsed_ms:.1f} ms ({elapsed_seconds:.2f} seconds)")
-            print(f"  Bandwidth: {bandwidth_mbps:.1f} MB/s ({bandwidth_gbps:.3f} GB/s)")
-            
-            self.results.add_tensor_result(tensor_name, shape, tensor_size_mb, elapsed_ms, bandwidth_mbps)
-            
-            # Wait for reception with longer timeout for large tensors
-            max_wait_time = max(10.0, tensor_size_mb / 100)  # At least 10s, or 1s per 100MB
-            print(f"  Waiting for reception confirmation (timeout: {max_wait_time:.1f}s)...")
-            wait_start = time.time()
-            while self.tensor_received_count == 0 and (time.time() - wait_start) < max_wait_time:
-                time.sleep(0.5)  # Check less frequently for large transfers
-            
-            if self.tensor_received_count > 0:
-                reception_time = self.tensor_receive_times[0] - start_time
-                print(f"  Reception confirmed: callback received after {reception_time:.2f}s")
-                
-                # Validate tensor if it's not too large
-                if validate_this_tensor and len(self.received_tensors) > 0:
-                    print(f"  Validating tensor correctness...")
-                    original_tensor = self.sent_tensors[-1]  # Get the last sent tensor
-                    received_tensor_obj = self.received_tensors[-1]  # Get the last received tensor
-                    
-                    # Use Python rebuilder for validation
-                    print(f"  Validating tensor correctness...")
-                    print(f"  Using Python rebuilder for efficiency...")
-                    validation_success = self.validate_tensor_equality(
-                        original_tensor, received_tensor_obj)
-                    
-                    if validation_success:
-                        print(f"  ✓ Tensor validation PASSED (Python 🐍)")
-                    else:
-                        print(f"  ✗ Tensor validation FAILED")
-                        # Continue with other tests even if validation fails
-            else:
-                print(f"  WARNING: No reception callback within {max_wait_time:.1f}s timeout")
-            
-            # Clean up large tensor
-            del test_tensor
-            if validate_this_tensor and len(self.sent_tensors) > 0:
-                del self.sent_tensors[-1]  # Remove the stored tensor to free memory
+        # Step 4: Wait for reception
+        print(f"[RECV] Waiting for reception...")
+        wait_start = time.time()
+        max_wait = 30.0  # 30 second timeout for large data
+        
+        while True:
+            elapsed = time.time() - wait_start
+            if data_type == "data" and self.data_received_count > 0:
+                break
+            elif data_type == "tensor" and self.tensor_received_count > 0:
+                break
+            elif elapsed > max_wait:
+                print(f"[ERROR] Reception timeout after {elapsed:.2f}s")
+                return False
+            time.sleep(0.01)
             
-            # Wait between tests (longer for large tensors)
-            time.sleep(1.0)
+        reception_time = time.time()
+        wait_time = (reception_time - wait_start) * 1000
+        
+        # Step 5: Calculate timing metrics
+        if data_type == "data":
+            callback_time = self.data_receive_times[0]
+        else:
+            callback_time = self.tensor_receive_times[0]
+            
+        network_time = (callback_time - send_end) * 1000
+        total_time = (callback_time - send_start) * 1000
+        
+        # Step 6: Validation (for tensors only)
+        validation_time = 0
+        validation_success = True
+        if data_type == "tensor" and len(self.received_tensors) > 0:
+            validation_start = time.time()
+            validation_success = self.validate_tensor_equality(self.sent_tensors[0], self.received_tensors[0])
+            validation_time = (time.time() - validation_start) * 1000
+        
+        # Step 7: Print comprehensive timing breakdown
+        bandwidth = size_mb / (send_time / 1000) if send_time > 0 else 0
+        total_bandwidth = size_mb / (total_time / 1000) if total_time > 0 else 0
+        
+        print(f"\n[METRICS] Performance Analysis:")
+        print(f"   Data Size:          {size_mb:10.3f} MB")
+        print(f"   Prepare Time:       {prepare_time:10.2f} ms")
+        print(f"   Reset Time:         {reset_time:10.2f} ms")
+        print(f"   Send Time:          {send_time:10.2f} ms  (Sender Processing)")
+        print(f"   Network Latency:    {network_time:10.2f} ms  (Network + Receiver)")
+        print(f"   Wait Time:          {wait_time:10.2f} ms")
+        if validation_time > 0:
+            print(f"   Validation Time:    {validation_time:10.2f} ms  (Data Integrity Check)")
+        print(f"   Total Time:         {total_time:10.2f} ms")
+        print(f"   Send Bandwidth:     {bandwidth:10.2f} MB/s")
+        print(f"   End-to-End BW:      {total_bandwidth:10.2f} MB/s")
+        print(f"   Efficiency:         {(send_time/total_time)*100:10.1f} %")
+        
+        # Store results
+        if data_type == "data":
+            self.results.add_data_result(size_mb, send_time, bandwidth)
+        else:
+            self.results.add_tensor_result("Float32", test_object.shape, size_mb, send_time, bandwidth)
             
-        return True
+        return validation_success if data_type == "tensor" else True
 
 
 def main():
-    """Main test function focused on high-performance hundreds-of-MB testing"""
-    print("CoroRPC High-Performance Testing Suite (Hundreds of MB)")
+    """Main performance test with comprehensive analysis"""
+    print("CoroRPC Interface Performance Analysis Suite")
     print("="*60)
     
     tester = CoroRPCPerformanceTester()
     
     try:
         # Setup
+        print("[INIT] Setting up test environment...")
         if not tester.setup():
-            print("FAILED: Setup failed")
+            print("[FATAL] Setup failed")
             return False
-            
-        # Run simple correctness tests first
-        print("\nPhase 1: Correctness Verification")
-        print("-" * 40)
         
-        if not tester.test_data_interface_simple():
-            print("FAILED: Data interface simple test failed")
-            return False
-            
-        if not tester.test_tensor_interface_simple():
-            print("FAILED: Tensor interface simple test failed")
-            return False
-            
-        print("SUCCESS: All correctness tests passed!")
-        
-        # Run high-performance tests with hundreds of MB data
-        print("\nPhase 2: High-Performance Testing (Hundreds of MB)")
-        print("-" * 50)
-        print("Testing large data transfers (50MB - 800MB) to measure peak performance")
+        print("[INIT] Setup completed successfully\n")
         
-        # Test data sizes focused on hundreds of MB
-        large_data_sizes = [50.0, 100.0, 200.0, 300.0, 500.0, 800.0]  # 50MB to 800MB
-        if not tester.test_data_bandwidth_performance_large_scale(large_data_sizes):
-            print("FAILED: Large data bandwidth performance test failed")
-            return False
-            
-        # Test tensor sizes focused on hundreds of MB  
-        large_tensor_configs = [
-            ("Float32_100MB", (5120, 5120), torch.float32),     # ~100MB
-            ("Float32_200MB", (7237, 7237), torch.float32),     # ~200MB
-            ("Float32_400MB", (10240, 10240), torch.float32),   # ~400MB
-            ("Float64_200MB", (5120, 5120), torch.float64),     # ~200MB
-            ("Int64_300MB", (8000, 5000), torch.int64),         # ~300MB
-            ("Float32_600MB", (12500, 12500), torch.float32),   # ~600MB
-        ]
-        if not tester.test_tensor_bandwidth_performance_large_scale(large_tensor_configs):
-            print("FAILED: Large tensor bandwidth performance test failed")
-            return False
+        # Run comprehensive tests
+        print("[START] Running comprehensive performance tests...")
+        success = tester.test_comprehensive_performance()
         
-        # Print results
+        # Print final results
         tester.results.print_summary()
         
-        print("\nSUCCESS: All high-performance tests completed!")
-        print("\nTest Summary:")
-        print(f"- Tested data sizes: 50MB to 800MB")
-        print(f"- Tested tensor sizes: 100MB to 600MB")
-        print(f"- All tests focused on measuring peak bandwidth performance")
-        print(f"- Tensor correctness validation enabled (for tensors ≤ 200MB)")
-        print(f"- Zero-copy optimization with pybind11::handle and std::string_view")
-        return True
+        # Print conclusion
+        print("\n" + "="*80)
+        print("TEST CONCLUSION")
+        print("="*80)
+        if success:
+            print("[SUCCESS] All performance tests completed successfully!")
+            print("- CoroRPC interface is functioning correctly")
+            print("- Performance metrics collected for analysis")
+            print("- Tensor validation passed for all tests")
+            return True
+        else:
+            print("[FAILURE] Some performance tests failed")
+            print("- Check error logs above for details")
+            return False
         
     except Exception as e:
-        print(f"ERROR: Test failed with exception: {e}")
+        print(f"[FATAL] Test suite failed with exception: {e}")
         import traceback
         traceback.print_exc()
         return False
         
     finally:
+        print("\n[CLEANUP] Cleaning up test environment...")
         tester.teardown()
 
 
 if __name__ == "__main__":
     success = main()
-    print(f"\nFinal result: {'SUCCESS' if success else 'FAILURE'}")
+    print(f"\nFinal Result: {'SUCCESS' if success else 'FAILURE'}")
     sys.exit(0 if success else 1)

From 101c03122959fa823eadcd286875dcca00793a5a Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 15:53:58 +0800
Subject: [PATCH 28/64] replaced std io with LOG io

---
 .../cororpc_communicator.cpp                  | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 9b508f112..d582c5a5b 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -16,16 +16,16 @@ CoroRPCCommunicator::~CoroRPCCommunicator() { stopServer(); }
 
 void CoroRPCCommunicator::setDataReceiveCallback(
     std::function<void(std::string_view, std::string_view)> callback) {
-    std::cout << "Setting data receive callback..." << std::endl;
+    LOG(INFO) << "Setting data receive callback..." << std::endl;
     impl_->data_receive_callback = callback;
-    std::cout << "Data receive callback set successfully" << std::endl;
+    LOG(INFO) << "Data receive callback set successfully" << std::endl;
 }
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
 
     if (!config.listen_address.empty()) {
-        std::cout << "Initializing server on " << config.listen_address
+        LOG(INFO) << "Initializing server on " << config.listen_address
                   << std::endl;
 
         impl_->server_ = std::make_unique<coro_rpc::coro_rpc_server>(
@@ -37,7 +37,7 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
 
-    std::cout << "Communicator initialized with client pool support"
+    LOG(INFO) << "Communicator initialized with client pool support"
               << std::endl;
     return true;
 }
@@ -57,16 +57,16 @@ bool CoroRPCCommunicator::startServer() {
         auto ec = impl_->server_->start();
         if (ec.val() == 0) {
             impl_->is_server_started = true;
-            std::cout << "Server started on " << impl_->config.listen_address
+            LOG(INFO) << "Server started on " << impl_->config.listen_address
                       << std::endl;
             return true;
         } else {
-            std::cerr << "Failed to start server: " << ec.message()
+            LOG(ERROR) << "Failed to start server: " << ec.message()
                       << std::endl;
             return false;
         }
     } catch (const std::exception& e) {
-        std::cerr << "Failed to start server: " << e.what() << std::endl;
+        LOG(ERROR) << "Failed to start server: " << e.what() << std::endl;
         return false;
     }
 }
@@ -78,15 +78,15 @@ bool CoroRPCCommunicator::startServerAsync() {
         auto ec = impl_->server_->async_start();
         if (!ec.hasResult()) {
             impl_->is_server_started = true;
-            std::cout << "Server started asynchronously on "
+            LOG(INFO) << "Server started asynchronously on "
                       << impl_->config.listen_address << std::endl;
             return true;
         } else {
-            std::cerr << "Failed to start server asynchronously" << std::endl;
+            LOG(ERROR) << "Failed to start server asynchronously" << std::endl;
             return false;
         }
     } catch (const std::exception& e) {
-        std::cerr << "Failed to start server asynchronously: " << e.what()
+        LOG(ERROR) << "Failed to start server asynchronously: " << e.what()
                   << std::endl;
         return false;
     }
@@ -95,7 +95,7 @@ bool CoroRPCCommunicator::startServerAsync() {
 void CoroRPCCommunicator::stopServer() {
     if (impl_->is_server_started) {
         impl_->is_server_started = false;
-        std::cout << "Server stopped" << std::endl;
+        LOG(INFO) << "Server stopped" << std::endl;
     }
 }
 
@@ -126,7 +126,7 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
                         .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
                             std::string_view{});
                 if (!result.has_value()) {
-                    std::cerr << "RPC call failed: " << result.error().msg
+                    LOG(ERROR) << "RPC call failed: " << result.error().msg
                               << std::endl;
                 }
             } else {
@@ -136,14 +136,14 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
                         .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
                             data_view);
                 if (!result.has_value()) {
-                    std::cerr << "RPC call failed: " << result.error().msg
+                    LOG(ERROR) << "RPC call failed: " << result.error().msg
                               << std::endl;
                 }
             }
         });
 
     if (!rpc_result.has_value()) {
-        std::cout << "RPC send request failed" << std::endl;
+        LOG(INFO) << "RPC send request failed" << std::endl;
         co_return result{-1, "RPC call failed"};
     }
     result res;
@@ -175,12 +175,12 @@ async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
                     .call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
 
             if (!result.has_value()) {
-                std::cerr << "Tensor RPC call failed: " << result.error().msg
+                LOG(ERROR) << "Tensor RPC call failed: " << result.error().msg
                           << std::endl;
             }
         });
     if (!rpc_result.has_value()) {
-        std::cout << "Tensor RPC send request failed" << std::endl;
+        LOG(INFO) << "Tensor RPC send request failed" << std::endl;
         co_return -1;
     }
     co_return 0;
@@ -208,13 +208,13 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
-    std::cout << "Handling data transfer - Data: " << data.size()
+    LOG(INFO) << "Handling data transfer - Data: " << data.size()
               << " bytes, Attachment: " << attachment.size() << " bytes"
               << std::endl;
 
     // Call the data receive callback if set
     if (data_receive_callback) {
-        std::cout << "Calling data receive callback..." << std::endl;
+        LOG(INFO) << "Calling data receive callback..." << std::endl;
         std::string_view source_address =
             "unknown";  // Could extract from context if needed
 
@@ -229,7 +229,7 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
             data_receive_callback(source_address, data);
         }
     } else {
-        std::cout << "No data receive callback set!" << std::endl;
+        LOG(INFO) << "No data receive callback set!" << std::endl;
     }
 
     // Echo back the attachment for response (zero-copy)
@@ -245,7 +245,7 @@ void CoroRPCCommunicator::Impl::handleTensorTransfer(
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
-    std::cout << "Handling tensor transfer: " << attachment.size() << " bytes"
+    LOG(INFO) << "Handling tensor transfer: " << attachment.size() << " bytes"
               << std::endl;
 
     ctx_info->set_response_attachment(attachment);
@@ -257,7 +257,7 @@ void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
-    std::cout << "Handling data transfer with attachment - Data: "
+    LOG(INFO) << "Handling data transfer with attachment - Data: "
               << data.size() << " bytes, Attachment: " << attachment.size()
               << " bytes" << std::endl;
 
@@ -269,7 +269,7 @@ void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
-    std::cout << "Handling tensor transfer with attachment: "
+    LOG(INFO) << "Handling tensor transfer with attachment: "
               << attachment.size() << " bytes" << std::endl;
 
     ctx_info->set_response_attachment(attachment);
@@ -285,7 +285,7 @@ std::unique_ptr<CoroRPCCommunicator> createServer(
 
     auto communicator = std::make_unique<CoroRPCCommunicator>();
     if (communicator->initialize(config)) {
-        std::cout << "Created server communicator with pool size: "
+        LOG(INFO) << "Created server communicator with pool size: "
                   << config.pool_size << std::endl;
         return communicator;
     }

From 5fd5bb843fff06a932bd18904e24c23eec2c6038 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 16:41:06 +0800
Subject: [PATCH 29/64] convert string_view to size in the handledata

---
 .../coro_rpc_connector/cororpc_communicator.h       |  2 +-
 .../coro_rpc_connector/cororpc_communicator.cpp     | 10 +++++++---
 .../coro_rpc_connector/cororpc_interface.cpp        | 13 +++++--------
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 7588d26ea..2512c01db 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -55,7 +55,7 @@ class CoroRPCCommunicator {
         void handleDataTransfer(coro_rpc::context<void> context,
                                 std::string_view data);
         void handleTensorTransfer(coro_rpc::context<void> context);
-        void handleDataTransferWithAttachment(coro_rpc::context<void> context,
+        size_t handleDataTransferWithAttachment(coro_rpc::context<size_t> context,
                                               std::string_view data);
         void handleTensorTransferWithAttachment(
             coro_rpc::context<void> context);
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index d582c5a5b..b7806fb08 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -252,8 +252,8 @@ void CoroRPCCommunicator::Impl::handleTensorTransfer(
     context.response_msg();
 }
 
-void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
-    coro_rpc::context<void> context, std::string_view data) {
+size_t CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
+    coro_rpc::context<size_t> context, std::string_view data) {
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
@@ -261,7 +261,11 @@ void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
               << data.size() << " bytes, Attachment: " << attachment.size()
               << " bytes" << std::endl;
 
-    context.response_msg();
+    // Calculate total data length (data parameter + attachment)
+    size_t total_length = data.size() + attachment.size();
+    
+    context.response_msg(total_length);
+    return total_length;
 }
 
 void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 81dfd3da2..fff39f36c 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -347,9 +347,6 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
     std::cout << "CoroRPCInterface::handleIncomingData called with "
               << data.size() << " bytes" << std::endl;
 
-    // For tensor data detection, we'll use a simple heuristic based on data
-    // size and patterns If data size is large enough and has a specific
-    // pattern, treat as tensor This is a simplified approach since we removed
     // C++ tensor rebuilding
     if (data.size() >= 72) {  // 72 bytes is our metadata size
         // Read the first few bytes to check if it looks like tensor metadata
@@ -430,8 +427,8 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
 
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
-        std::cerr << "Error in data receive callback: " << e.what()
-                  << std::endl;
+        LOG(ERROR) << "Error in data receive callback: " << e.what()
+                   << std::endl;
     }
 }
 
@@ -457,10 +454,10 @@ void CoroRPCInterface::handleIncomingTensor(std::string_view source,
 
         ReceivedTensor received;
         received.source_address =
-            std::string(source);            // Convert to string for storage
-        received.data = std::string(data);  // Convert to string for storage
+            std::string(source);
+        received.data = std::string(data);  
         received.shape = shape;
-        received.dtype = std::string(dtype);  // Convert to string for storage
+        received.dtype = std::string(dtype);  
 
         impl_->tensor_receive_callback(received);
     } catch (const std::exception& e) {

From 98ca700cd6aab9e2b6e2994b722499787528bb58 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 16:41:23 +0800
Subject: [PATCH 30/64] convert string_view to size in the handledatatransfer

---
 .../coro_rpc_connector/cororpc_communicator.h |   2 +-
 .../cororpc_communicator.cpp                  |  50 +++--
 .../tests/network_monitor.py                  | 201 ++++++++++++++++++
 3 files changed, 232 insertions(+), 21 deletions(-)
 create mode 100755 mooncake-transfer-engine/tests/network_monitor.py

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 2512c01db..7588d26ea 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -55,7 +55,7 @@ class CoroRPCCommunicator {
         void handleDataTransfer(coro_rpc::context<void> context,
                                 std::string_view data);
         void handleTensorTransfer(coro_rpc::context<void> context);
-        size_t handleDataTransferWithAttachment(coro_rpc::context<size_t> context,
+        void handleDataTransferWithAttachment(coro_rpc::context<void> context,
                                               std::string_view data);
         void handleTensorTransferWithAttachment(
             coro_rpc::context<void> context);
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index b7806fb08..9ee74374c 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -252,32 +252,42 @@ void CoroRPCCommunicator::Impl::handleTensorTransfer(
     context.response_msg();
 }
 
-size_t CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
-    coro_rpc::context<size_t> context, std::string_view data) {
-    auto ctx_info = context.get_context_info();
-    auto attachment = ctx_info->get_request_attachment();
-
-    LOG(INFO) << "Handling data transfer with attachment - Data: "
-              << data.size() << " bytes, Attachment: " << attachment.size()
-              << " bytes" << std::endl;
-
-    // Calculate total data length (data parameter + attachment)
-    size_t total_length = data.size() + attachment.size();
-    
-    context.response_msg(total_length);
-    return total_length;
+void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
+    coro_rpc::context<void> context, std::string_view data) {
+    py_rpc_context t{};
+    t.context_ = std::move(context);
+    py::gil_scoped_acquire acquire;
+    //auto ctx_info = context.get_context_info();
+    //auto attachment = ctx_info->get_request_attachment();
+    auto view = py::memoryview::from_buffer(data.data(), {data.size()+}, {sizeof(char)});
+
+    // LOG(INFO) << "Handling data transfer with attachment - Data: "
+    //          << data.size() << " bytes, Attachment: " << attachment.size()
+    //          << " bytes" << std::endl;
+    py_callback(std::move(t), view);
+    //context.response_msg();
 }
 
 void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
     coro_rpc::context<void> context) {
-    auto ctx_info = context.get_context_info();
-    auto attachment = ctx_info->get_request_attachment();
+    py_rpc_context t{};
+    t.context_ = std::move(context);
+    py::gil_scoped_acquire acquire;
 
-    LOG(INFO) << "Handling tensor transfer with attachment: "
-              << attachment.size() << " bytes" << std::endl;
+    auto view = py::memoryview::from_buffer(
+        ctx_info->get_request_attachment().data(),
+        {ctx_info->get_request_attachment().size()},
+        {sizeof(int8_t)});
 
-    ctx_info->set_response_attachment(attachment);
-    context.response_msg();
+    py_callback(std::move(t), view);
+    // auto ctx_info = context.get_context_info();
+    // auto attachment = ctx_info->get_request_attachment();
+
+    // LOG(INFO) << "Handling tensor transfer with attachment: "
+    //           << attachment.size() << " bytes" << std::endl;
+
+    // ctx_info->set_response_attachment(attachment);
+    // context.response_msg();
 }
 
 std::unique_ptr<CoroRPCCommunicator> createServer(
diff --git a/mooncake-transfer-engine/tests/network_monitor.py b/mooncake-transfer-engine/tests/network_monitor.py
new file mode 100755
index 000000000..78418fa78
--- /dev/null
+++ b/mooncake-transfer-engine/tests/network_monitor.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""
+Enhanced Network Bandwidth Monitor
+Real-time monitoring of network interface throughput with 0.5s interval
+"""
+
+import time
+import subprocess
+import sys
+from datetime import datetime
+
+
+def get_network_stats_sar():
+    """Get network stats using sar command (if available)"""
+    try:
+        result = subprocess.run(['sar', '-n', 'DEV', '1', '1'], 
+                              capture_output=True, text=True, timeout=5)
+        if result.returncode == 0:
+            lines = result.stdout.strip().split('\n')
+            for line in lines:
+                if 'eth0' in line or 'enp' in line:  # Common interface patterns
+                    parts = line.split()
+                    if len(parts) >= 6:
+                        rx_mb = float(parts[4]) * 8 / 1000  # Convert KB/s to Mbps
+                        tx_mb = float(parts[5]) * 8 / 1000
+                        return rx_mb, tx_mb
+    except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
+        pass
+    return 0, 0
+
+
+def get_network_stats_proc():
+    """Get network stats from /proc/net/dev"""
+    try:
+        with open('/proc/net/dev', 'r') as f:
+            lines = f.readlines()
+        
+        interfaces = {}
+        for line in lines[2:]:  # Skip header
+            parts = line.split()
+            if len(parts) >= 10:
+                interface = parts[0].rstrip(':')
+                if interface != 'lo':  # Skip loopback
+                    rx_bytes = int(parts[1])
+                    tx_bytes = int(parts[9])
+                    interfaces[interface] = {'rx': rx_bytes, 'tx': tx_bytes}
+        
+        return interfaces
+    except Exception:
+        return {}
+
+
+def monitor_bandwidth_realtime(interval=0.5, show_interfaces=False):
+    """
+    Real-time network bandwidth monitoring with customizable interval
+    
+    Args:
+        interval: Monitoring interval in seconds (default: 0.5)
+        show_interfaces: Show individual interface stats (default: False)
+    """
+    print("Enhanced Network Bandwidth Monitor")
+    print(f"Monitoring interval: {interval}s")
+    print("Press Ctrl+C to stop")
+    print("-" * 80)
+    
+    if show_interfaces:
+        print(f"{'Time':<12} {'Interface':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12}")
+    else:
+        print(f"{'Time':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12} {'Peak RX':<10} {'Peak TX':<10}")
+    print("-" * 80)
+    
+    prev_stats = get_network_stats_proc()
+    prev_time = time.time()
+    iteration = 0
+    max_rx = 0
+    max_tx = 0
+    
+    try:
+        while True:
+            time.sleep(interval)
+            
+            current_stats = get_network_stats_proc()
+            current_time = time.time()
+            time_diff = current_time - prev_time
+            
+            if time_diff <= 0:
+                continue
+                
+            # Get current timestamp
+            timestamp = datetime.now().strftime("%H:%M:%S")
+            
+            if show_interfaces:
+                # Show individual interface statistics
+                for interface in current_stats:
+                    if interface in prev_stats:
+                        rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
+                        tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
+                        
+                        rx_mbps = (rx_diff / time_diff) / (1024 * 1024)
+                        tx_mbps = (tx_diff / time_diff) / (1024 * 1024)
+                        total_mbps = rx_mbps + tx_mbps
+                        
+                        if rx_mbps > 0.01 or tx_mbps > 0.01:  # Only show active interfaces
+                            print(f"{timestamp:<12} {interface:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f}")
+            else:
+                # Show aggregated statistics
+                total_rx_diff = 0
+                total_tx_diff = 0
+                
+                for interface in current_stats:
+                    if interface in prev_stats:
+                        rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
+                        tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
+                        total_rx_diff += rx_diff
+                        total_tx_diff += tx_diff
+                
+                rx_mbps = (total_rx_diff / time_diff) / (1024 * 1024)
+                tx_mbps = (total_tx_diff / time_diff) / (1024 * 1024)
+                total_mbps = rx_mbps + tx_mbps
+                
+                # Track peak values
+                max_rx = max(max_rx, rx_mbps)
+                max_tx = max(max_tx, tx_mbps)
+                
+                print(f"{timestamp:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f} {max_rx:<10.2f} {max_tx:<10.2f}")
+            
+            prev_stats = current_stats
+            prev_time = current_time
+            iteration += 1
+            
+    except KeyboardInterrupt:
+        print(f"\nMonitoring stopped after {iteration} iterations")
+        if not show_interfaces:
+            print(f"Peak RX: {max_rx:.2f} MB/s, Peak TX: {max_tx:.2f} MB/s")
+
+
+def monitor_bandwidth_duration(duration=10, interval=0.5):
+    """
+    Monitor network bandwidth for a specific duration
+    
+    Args:
+        duration: Total monitoring duration in seconds
+        interval: Monitoring interval in seconds
+    """
+    print(f"Network Bandwidth Monitor - Running for {duration} seconds (interval: {interval}s)")
+    print(f"{'Time':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12}")
+    print("-" * 50)
+    
+    prev_stats = get_network_stats_proc()
+    prev_time = time.time()
+    elapsed = 0
+    
+    while elapsed < duration:
+        time.sleep(interval)
+        elapsed += interval
+        
+        current_stats = get_network_stats_proc()
+        current_time = time.time()
+        time_diff = current_time - prev_time
+        
+        total_rx_diff = 0
+        total_tx_diff = 0
+        
+        for interface in current_stats:
+            if interface in prev_stats:
+                rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
+                tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
+                total_rx_diff += rx_diff
+                total_tx_diff += tx_diff
+        
+        rx_mbps = (total_rx_diff / time_diff) / (1024 * 1024)
+        tx_mbps = (total_tx_diff / time_diff) / (1024 * 1024)
+        total_mbps = rx_mbps + tx_mbps
+        
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        print(f"{timestamp:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f}")
+        
+        prev_stats = current_stats
+        prev_time = current_time
+
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Network Bandwidth Monitor")
+    parser.add_argument('-d', '--duration', type=int, default=0, 
+                       help='Monitoring duration in seconds (0 for infinite)')
+    parser.add_argument('-i', '--interval', type=float, default=0.5, 
+                       help='Monitoring interval in seconds (default: 0.5)')
+    parser.add_argument('--interfaces', action='store_true', 
+                       help='Show individual interface statistics')
+    
+    args = parser.parse_args()
+    
+    try:
+        if args.duration > 0:
+            monitor_bandwidth_duration(args.duration, args.interval)
+        else:
+            monitor_bandwidth_realtime(args.interval, args.interfaces)
+    except KeyboardInterrupt:
+        print("\nMonitoring stopped.")

From fdd5a1880e579675184d9bb1aa2e627a23585179 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 16:41:55 +0800
Subject: [PATCH 31/64] remove monitor

---
 .../tests/network_monitor.py                  | 201 ------------------
 1 file changed, 201 deletions(-)
 delete mode 100755 mooncake-transfer-engine/tests/network_monitor.py

diff --git a/mooncake-transfer-engine/tests/network_monitor.py b/mooncake-transfer-engine/tests/network_monitor.py
deleted file mode 100755
index 78418fa78..000000000
--- a/mooncake-transfer-engine/tests/network_monitor.py
+++ /dev/null
@@ -1,201 +0,0 @@
-#!/usr/bin/env python3
-"""
-Enhanced Network Bandwidth Monitor
-Real-time monitoring of network interface throughput with 0.5s interval
-"""
-
-import time
-import subprocess
-import sys
-from datetime import datetime
-
-
-def get_network_stats_sar():
-    """Get network stats using sar command (if available)"""
-    try:
-        result = subprocess.run(['sar', '-n', 'DEV', '1', '1'], 
-                              capture_output=True, text=True, timeout=5)
-        if result.returncode == 0:
-            lines = result.stdout.strip().split('\n')
-            for line in lines:
-                if 'eth0' in line or 'enp' in line:  # Common interface patterns
-                    parts = line.split()
-                    if len(parts) >= 6:
-                        rx_mb = float(parts[4]) * 8 / 1000  # Convert KB/s to Mbps
-                        tx_mb = float(parts[5]) * 8 / 1000
-                        return rx_mb, tx_mb
-    except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
-        pass
-    return 0, 0
-
-
-def get_network_stats_proc():
-    """Get network stats from /proc/net/dev"""
-    try:
-        with open('/proc/net/dev', 'r') as f:
-            lines = f.readlines()
-        
-        interfaces = {}
-        for line in lines[2:]:  # Skip header
-            parts = line.split()
-            if len(parts) >= 10:
-                interface = parts[0].rstrip(':')
-                if interface != 'lo':  # Skip loopback
-                    rx_bytes = int(parts[1])
-                    tx_bytes = int(parts[9])
-                    interfaces[interface] = {'rx': rx_bytes, 'tx': tx_bytes}
-        
-        return interfaces
-    except Exception:
-        return {}
-
-
-def monitor_bandwidth_realtime(interval=0.5, show_interfaces=False):
-    """
-    Real-time network bandwidth monitoring with customizable interval
-    
-    Args:
-        interval: Monitoring interval in seconds (default: 0.5)
-        show_interfaces: Show individual interface stats (default: False)
-    """
-    print("Enhanced Network Bandwidth Monitor")
-    print(f"Monitoring interval: {interval}s")
-    print("Press Ctrl+C to stop")
-    print("-" * 80)
-    
-    if show_interfaces:
-        print(f"{'Time':<12} {'Interface':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12}")
-    else:
-        print(f"{'Time':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12} {'Peak RX':<10} {'Peak TX':<10}")
-    print("-" * 80)
-    
-    prev_stats = get_network_stats_proc()
-    prev_time = time.time()
-    iteration = 0
-    max_rx = 0
-    max_tx = 0
-    
-    try:
-        while True:
-            time.sleep(interval)
-            
-            current_stats = get_network_stats_proc()
-            current_time = time.time()
-            time_diff = current_time - prev_time
-            
-            if time_diff <= 0:
-                continue
-                
-            # Get current timestamp
-            timestamp = datetime.now().strftime("%H:%M:%S")
-            
-            if show_interfaces:
-                # Show individual interface statistics
-                for interface in current_stats:
-                    if interface in prev_stats:
-                        rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
-                        tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
-                        
-                        rx_mbps = (rx_diff / time_diff) / (1024 * 1024)
-                        tx_mbps = (tx_diff / time_diff) / (1024 * 1024)
-                        total_mbps = rx_mbps + tx_mbps
-                        
-                        if rx_mbps > 0.01 or tx_mbps > 0.01:  # Only show active interfaces
-                            print(f"{timestamp:<12} {interface:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f}")
-            else:
-                # Show aggregated statistics
-                total_rx_diff = 0
-                total_tx_diff = 0
-                
-                for interface in current_stats:
-                    if interface in prev_stats:
-                        rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
-                        tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
-                        total_rx_diff += rx_diff
-                        total_tx_diff += tx_diff
-                
-                rx_mbps = (total_rx_diff / time_diff) / (1024 * 1024)
-                tx_mbps = (total_tx_diff / time_diff) / (1024 * 1024)
-                total_mbps = rx_mbps + tx_mbps
-                
-                # Track peak values
-                max_rx = max(max_rx, rx_mbps)
-                max_tx = max(max_tx, tx_mbps)
-                
-                print(f"{timestamp:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f} {max_rx:<10.2f} {max_tx:<10.2f}")
-            
-            prev_stats = current_stats
-            prev_time = current_time
-            iteration += 1
-            
-    except KeyboardInterrupt:
-        print(f"\nMonitoring stopped after {iteration} iterations")
-        if not show_interfaces:
-            print(f"Peak RX: {max_rx:.2f} MB/s, Peak TX: {max_tx:.2f} MB/s")
-
-
-def monitor_bandwidth_duration(duration=10, interval=0.5):
-    """
-    Monitor network bandwidth for a specific duration
-    
-    Args:
-        duration: Total monitoring duration in seconds
-        interval: Monitoring interval in seconds
-    """
-    print(f"Network Bandwidth Monitor - Running for {duration} seconds (interval: {interval}s)")
-    print(f"{'Time':<12} {'RX(MB/s)':<10} {'TX(MB/s)':<10} {'Total(MB/s)':<12}")
-    print("-" * 50)
-    
-    prev_stats = get_network_stats_proc()
-    prev_time = time.time()
-    elapsed = 0
-    
-    while elapsed < duration:
-        time.sleep(interval)
-        elapsed += interval
-        
-        current_stats = get_network_stats_proc()
-        current_time = time.time()
-        time_diff = current_time - prev_time
-        
-        total_rx_diff = 0
-        total_tx_diff = 0
-        
-        for interface in current_stats:
-            if interface in prev_stats:
-                rx_diff = current_stats[interface]['rx'] - prev_stats[interface]['rx']
-                tx_diff = current_stats[interface]['tx'] - prev_stats[interface]['tx']
-                total_rx_diff += rx_diff
-                total_tx_diff += tx_diff
-        
-        rx_mbps = (total_rx_diff / time_diff) / (1024 * 1024)
-        tx_mbps = (total_tx_diff / time_diff) / (1024 * 1024)
-        total_mbps = rx_mbps + tx_mbps
-        
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        print(f"{timestamp:<12} {rx_mbps:<10.2f} {tx_mbps:<10.2f} {total_mbps:<12.2f}")
-        
-        prev_stats = current_stats
-        prev_time = current_time
-
-
-if __name__ == "__main__":
-    import argparse
-    
-    parser = argparse.ArgumentParser(description="Network Bandwidth Monitor")
-    parser.add_argument('-d', '--duration', type=int, default=0, 
-                       help='Monitoring duration in seconds (0 for infinite)')
-    parser.add_argument('-i', '--interval', type=float, default=0.5, 
-                       help='Monitoring interval in seconds (default: 0.5)')
-    parser.add_argument('--interfaces', action='store_true', 
-                       help='Show individual interface statistics')
-    
-    args = parser.parse_args()
-    
-    try:
-        if args.duration > 0:
-            monitor_bandwidth_duration(args.duration, args.interval)
-        else:
-            monitor_bandwidth_realtime(args.interval, args.interfaces)
-    except KeyboardInterrupt:
-        print("\nMonitoring stopped.")

From 9cd2e63d2c10944b95450863184b3eebe246cbd3 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 17:13:44 +0800
Subject: [PATCH 32/64] removed redundent lines

---
 .../coro_rpc_connector/cororpc_communicator.h |   3 +
 .../coro_rpc_connector/cororpc_interface.h    |   1 +
 .../cororpc_communicator.cpp                  | 107 +++++++++++-------
 .../coro_rpc_connector/cororpc_interface.cpp  |  45 ++++----
 4 files changed, 94 insertions(+), 62 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 7588d26ea..a2d3cdca8 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -7,6 +7,7 @@
 #include <unordered_map>
 #include <functional>
 #include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
 #include <ylt/coro_io/client_pool.hpp>
@@ -52,6 +53,8 @@ class CoroRPCCommunicator {
         std::function<void(std::string_view, std::string_view)>
             data_receive_callback;
 
+        pybind11::handle py_callback;
+
         void handleDataTransfer(coro_rpc::context<void> context,
                                 std::string_view data);
         void handleTensorTransfer(coro_rpc::context<void> context);
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 3ed34c303..35263544c 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -4,6 +4,7 @@
 #include <string>
 #include <vector>
 #include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
 
 namespace mooncake {
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 9ee74374c..42f49121a 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -2,31 +2,55 @@
 #include <iostream>
 #include <thread>
 #include <functional>
+#include <glog/logging.h>
 #include <ylt/coro_rpc/coro_rpc_client.hpp>
 #include <ylt/coro_rpc/coro_rpc_server.hpp>
 #include <ylt/coro_io/client_pool.hpp>
 #include <ylt/coro_io/coro_io.hpp>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
 #include "async_simple/coro/SyncAwait.h"
 
+namespace py = pybind11;
+
 namespace mooncake {
 
+class py_rpc_context {
+   public:
+    void response_msg(py::buffer msg, py::handle done) {
+        py::buffer_info info = msg.request();
+        const char* data = static_cast<char*>(info.ptr);
+        context_.get_context_info()->set_response_attachment(
+            std::string_view(data, info.size));
+        done.inc_ref();
+        context_.get_context_info()->set_complete_handler(
+            [done](const std::error_code& ec, std::size_t) {
+                py::gil_scoped_acquire acquire;
+                done(!ec);
+                done.dec_ref();
+            });
+        context_.response_msg();
+    }
+
+    coro_rpc::context<void> context_;
+};
+
 CoroRPCCommunicator::CoroRPCCommunicator() : impl_(std::make_shared<Impl>()) {}
 
 CoroRPCCommunicator::~CoroRPCCommunicator() { stopServer(); }
 
 void CoroRPCCommunicator::setDataReceiveCallback(
     std::function<void(std::string_view, std::string_view)> callback) {
-    LOG(INFO) << "Setting data receive callback..." << std::endl;
+    LOG(INFO) << "Setting data receive callback...";
     impl_->data_receive_callback = callback;
-    LOG(INFO) << "Data receive callback set successfully" << std::endl;
+    LOG(INFO) << "Data receive callback set successfully";
 }
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
 
     if (!config.listen_address.empty()) {
-        LOG(INFO) << "Initializing server on " << config.listen_address
-                  << std::endl;
+        LOG(INFO) << "Initializing server on " << config.listen_address;
 
         impl_->server_ = std::make_unique<coro_rpc::coro_rpc_server>(
             config.thread_count, config.listen_address,
@@ -37,8 +61,7 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
 
-    LOG(INFO) << "Communicator initialized with client pool support"
-              << std::endl;
+    LOG(INFO) << "Communicator initialized with client pool support";
     return true;
 }
 
@@ -57,16 +80,14 @@ bool CoroRPCCommunicator::startServer() {
         auto ec = impl_->server_->start();
         if (ec.val() == 0) {
             impl_->is_server_started = true;
-            LOG(INFO) << "Server started on " << impl_->config.listen_address
-                      << std::endl;
+            LOG(INFO) << "Server started on " << impl_->config.listen_address;
             return true;
         } else {
-            LOG(ERROR) << "Failed to start server: " << ec.message()
-                      << std::endl;
+            LOG(ERROR) << "Failed to start server: " << ec.message();
             return false;
         }
     } catch (const std::exception& e) {
-        LOG(ERROR) << "Failed to start server: " << e.what() << std::endl;
+        LOG(ERROR) << "Failed to start server: " << e.what();
         return false;
     }
 }
@@ -79,15 +100,14 @@ bool CoroRPCCommunicator::startServerAsync() {
         if (!ec.hasResult()) {
             impl_->is_server_started = true;
             LOG(INFO) << "Server started asynchronously on "
-                      << impl_->config.listen_address << std::endl;
+                      << impl_->config.listen_address;
             return true;
         } else {
-            LOG(ERROR) << "Failed to start server asynchronously" << std::endl;
+            LOG(ERROR) << "Failed to start server asynchronously";
             return false;
         }
     } catch (const std::exception& e) {
-        LOG(ERROR) << "Failed to start server asynchronously: " << e.what()
-                  << std::endl;
+        LOG(ERROR) << "Failed to start server asynchronously: " << e.what();
         return false;
     }
 }
@@ -95,7 +115,7 @@ bool CoroRPCCommunicator::startServerAsync() {
 void CoroRPCCommunicator::stopServer() {
     if (impl_->is_server_started) {
         impl_->is_server_started = false;
-        LOG(INFO) << "Server stopped" << std::endl;
+        LOG(INFO) << "Server stopped";
     }
 }
 
@@ -126,8 +146,7 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
                         .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
                             std::string_view{});
                 if (!result.has_value()) {
-                    LOG(ERROR) << "RPC call failed: " << result.error().msg
-                              << std::endl;
+                    LOG(ERROR) << "RPC call failed: " << result.error().msg;
                 }
             } else {
                 // Use regular parameter for small data
@@ -136,14 +155,13 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
                         .call<&CoroRPCCommunicator::Impl::handleDataTransfer>(
                             data_view);
                 if (!result.has_value()) {
-                    LOG(ERROR) << "RPC call failed: " << result.error().msg
-                              << std::endl;
+                    LOG(ERROR) << "RPC call failed: " << result.error().msg;
                 }
             }
         });
 
     if (!rpc_result.has_value()) {
-        LOG(INFO) << "RPC send request failed" << std::endl;
+        LOG(INFO) << "RPC send request failed";
         co_return result{-1, "RPC call failed"};
     }
     result res;
@@ -175,12 +193,11 @@ async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
                     .call<&CoroRPCCommunicator::Impl::handleTensorTransfer>();
 
             if (!result.has_value()) {
-                LOG(ERROR) << "Tensor RPC call failed: " << result.error().msg
-                          << std::endl;
+                LOG(ERROR) << "Tensor RPC call failed: " << result.error().msg;
             }
         });
     if (!rpc_result.has_value()) {
-        LOG(INFO) << "Tensor RPC send request failed" << std::endl;
+        LOG(INFO) << "Tensor RPC send request failed";
         co_return -1;
     }
     co_return 0;
@@ -209,12 +226,10 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
     auto attachment = ctx_info->get_request_attachment();
 
     LOG(INFO) << "Handling data transfer - Data: " << data.size()
-              << " bytes, Attachment: " << attachment.size() << " bytes"
-              << std::endl;
-
+              << " bytes, Attachment: " << attachment.size() << " bytes";
     // Call the data receive callback if set
     if (data_receive_callback) {
-        LOG(INFO) << "Calling data receive callback..." << std::endl;
+        LOG(INFO) << "Calling data receive callback...";
         std::string_view source_address =
             "unknown";  // Could extract from context if needed
 
@@ -229,7 +244,7 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
             data_receive_callback(source_address, data);
         }
     } else {
-        LOG(INFO) << "No data receive callback set!" << std::endl;
+        LOG(INFO) << "No data receive callback set!";
     }
 
     // Echo back the attachment for response (zero-copy)
@@ -245,8 +260,20 @@ void CoroRPCCommunicator::Impl::handleTensorTransfer(
     auto ctx_info = context.get_context_info();
     auto attachment = ctx_info->get_request_attachment();
 
-    LOG(INFO) << "Handling tensor transfer: " << attachment.size() << " bytes"
-              << std::endl;
+    LOG(INFO) << "Handling tensor transfer: " << attachment.size() << " bytes";
+
+    // Call the data receive callback if set (tensor data is received via
+    // attachment)
+    if (data_receive_callback) {
+        LOG(INFO) << "Calling data receive callback for tensor...";
+        std::string_view source_address =
+            "unknown";  // Could extract from context if needed
+
+        // Pass the attachment data to the callback
+        data_receive_callback(source_address, attachment);
+    } else {
+        LOG(INFO) << "No data receive callback set for tensor!";
+    }
 
     ctx_info->set_response_attachment(attachment);
     context.response_msg();
@@ -257,27 +284,31 @@ void CoroRPCCommunicator::Impl::handleDataTransferWithAttachment(
     py_rpc_context t{};
     t.context_ = std::move(context);
     py::gil_scoped_acquire acquire;
-    //auto ctx_info = context.get_context_info();
-    //auto attachment = ctx_info->get_request_attachment();
-    auto view = py::memoryview::from_buffer(data.data(), {data.size()+}, {sizeof(char)});
+    // auto ctx_info = context.get_context_info();
+    // auto attachment = ctx_info->get_request_attachment();
+    auto view =
+        py::memoryview::from_buffer(data.data(), {data.size()}, {sizeof(char)});
 
     // LOG(INFO) << "Handling data transfer with attachment - Data: "
     //          << data.size() << " bytes, Attachment: " << attachment.size()
     //          << " bytes" << std::endl;
     py_callback(std::move(t), view);
-    //context.response_msg();
+    // context.response_msg();
 }
 
 void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
     coro_rpc::context<void> context) {
     py_rpc_context t{};
+
+    // Get the attachment before moving the context
+    auto ctx_info = context.get_context_info();
+    auto attachment = ctx_info->get_request_attachment();
+
     t.context_ = std::move(context);
     py::gil_scoped_acquire acquire;
 
     auto view = py::memoryview::from_buffer(
-        ctx_info->get_request_attachment().data(),
-        {ctx_info->get_request_attachment().size()},
-        {sizeof(int8_t)});
+        attachment.data(), {attachment.size()}, {sizeof(int8_t)});
 
     py_callback(std::move(t), view);
     // auto ctx_info = context.get_context_info();
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index fff39f36c..40db55f6c 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -5,6 +5,7 @@
 #include <thread>
 #include <future>
 #include <vector>
+#include <glog/logging.h>
 #include "async_simple/coro/SyncAwait.h"
 
 namespace mooncake {
@@ -151,7 +152,7 @@ pybind11::object CoroRPCInterface::sendDataAsync(std::string& target_address,
     auto lazy = coro_lambda();
     lazy.start([](auto&& result) {
         if (result.hasError()) {
-            std::cerr << "Coroutine completed with error" << std::endl;
+            std::cerr << "Coroutine completed with error";
         }
     });
 
@@ -175,7 +176,7 @@ int CoroRPCInterface::sendTensor(const std::string& target_address,
                       .attr("__name__")
                       .cast<std::string>()
                       .find("Tensor") != std::string::npos)) {
-                std::cerr << "Input is not a tensor" << std::endl;
+                std::cerr << "Input is not a tensor";
                 return -1;
             }
 
@@ -214,7 +215,7 @@ int CoroRPCInterface::sendTensor(const std::string& target_address,
             }
             std::cout << "] and dtype: " << tensor_info.dtype
                       << ", tensor size: " << tensor_info.total_bytes
-                      << " bytes" << std::endl;
+                      << " bytes";
         }
 
         // Use the async version which supports zero-copy via attachments
@@ -224,7 +225,7 @@ int CoroRPCInterface::sendTensor(const std::string& target_address,
         return result;
 
     } catch (const std::exception& e) {
-        std::cerr << "Send tensor error: " << e.what() << std::endl;
+        std::cerr << "Send tensor error: " << e.what();
         return -1;
     }
 }
@@ -314,7 +315,7 @@ pybind11::object CoroRPCInterface::sendTensorAsync(std::string& target_address,
         // coroutine itself
         if (result.hasError()) {
             // Log error if needed
-            std::cerr << "Tensor coroutine completed with error" << std::endl;
+            std::cerr << "Tensor coroutine completed with error";
         }
     });
 
@@ -345,7 +346,7 @@ void CoroRPCInterface::setTensorReceiveCallback(pybind11::function callback) {
 void CoroRPCInterface::handleIncomingData(std::string_view source,
                                           std::string_view data) {
     std::cout << "CoroRPCInterface::handleIncomingData called with "
-              << data.size() << " bytes" << std::endl;
+              << data.size() << " bytes";
 
     // C++ tensor rebuilding
     if (data.size() >= 72) {  // 72 bytes is our metadata size
@@ -355,13 +356,12 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
         uint32_t ndim = header[1];
 
         std::cout << "Checking tensor metadata: dtype=" << dtype
-                  << ", ndim=" << ndim << std::endl;
+                  << ", ndim=" << ndim;
 
         // Basic validation: check if dtype and ndim are in reasonable ranges
         if (dtype > 0 && dtype <= 9 && ndim >= 0 && ndim <= 4) {
             std::cout
-                << "Data recognized as tensor, calling handleIncomingTensor"
-                << std::endl;
+                << "Data recognized as tensor, calling handleIncomingTensor";
 
             // This looks like tensor data, handle it as such
             std::vector<size_t> shape;
@@ -427,8 +427,7 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
 
         impl_->data_receive_callback(received);
     } catch (const std::exception& e) {
-        LOG(ERROR) << "Error in data receive callback: " << e.what()
-                   << std::endl;
+        LOG(ERROR) << "Error in data receive callback: " << e.what();
     }
 }
 
@@ -436,33 +435,31 @@ void CoroRPCInterface::handleIncomingTensor(std::string_view source,
                                             std::string_view data,
                                             const std::vector<size_t>& shape,
                                             std::string_view dtype) {
-    std::cout << "CoroRPCInterface::handleIncomingTensor called" << std::endl;
-    std::cout << "  source: " << source << std::endl;
-    std::cout << "  data size: " << data.size() << std::endl;
-    std::cout << "  dtype: " << dtype << std::endl;
-    std::cout << "  shape size: " << shape.size() << std::endl;
+    std::cout << "CoroRPCInterface::handleIncomingTensor called";
+    std::cout << "  source: " << source;
+    std::cout << "  data size: " << data.size();
+    std::cout << "  dtype: " << dtype;
+    std::cout << "  shape size: " << shape.size();
 
     if (!impl_->tensor_receive_callback) {
-        std::cout << "No tensor receive callback set!" << std::endl;
+        std::cout << "No tensor receive callback set!";
         return;
     }
 
-    std::cout << "Calling Python tensor receive callback..." << std::endl;
+    std::cout << "Calling Python tensor receive callback...";
 
     try {
         pybind11::gil_scoped_acquire acquire;
 
         ReceivedTensor received;
-        received.source_address =
-            std::string(source);
-        received.data = std::string(data);  
+        received.source_address = std::string(source);
+        received.data = std::string(data);
         received.shape = shape;
-        received.dtype = std::string(dtype);  
+        received.dtype = std::string(dtype);
 
         impl_->tensor_receive_callback(received);
     } catch (const std::exception& e) {
-        std::cerr << "Error in tensor receive callback: " << e.what()
-                  << std::endl;
+        std::cerr << "Error in tensor receive callback: " << e.what();
     }
 }
 

From 6e1efa1da680bf0b26da5b485978ebea6030ce79 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 17:45:07 +0800
Subject: [PATCH 33/64] removed ndim restrictions

---
 .../src/transport/coro_rpc_connector/cororpc_interface.cpp      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 40db55f6c..4e17f6862 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -359,7 +359,7 @@ void CoroRPCInterface::handleIncomingData(std::string_view source,
                   << ", ndim=" << ndim;
 
         // Basic validation: check if dtype and ndim are in reasonable ranges
-        if (dtype > 0 && dtype <= 9 && ndim >= 0 && ndim <= 4) {
+        if (dtype > 0 && dtype <= 9 && ndim <= 4) {
             std::cout
                 << "Data recognized as tensor, calling handleIncomingTensor";
 

From ff96760589e3b07399d5a0e6776342de107386ee Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 10 Sep 2025 20:06:31 +0800
Subject: [PATCH 34/64] 1. removed old test scripts 2. add new test scripts for
 the communicator 3. change response to a string view

---
 .../cororpc_communicator.cpp                  |   3 +-
 .../tests/communicator_bandwidth_test.py      |  71 ++
 .../tests/test_coro_rpc_performance.py        | 611 ------------------
 scripts/run_tests.sh                          |  14 -
 4 files changed, 73 insertions(+), 626 deletions(-)
 create mode 100644 mooncake-transfer-engine/tests/communicator_bandwidth_test.py
 delete mode 100644 mooncake-transfer-engine/tests/test_coro_rpc_performance.py

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 42f49121a..08faed0bc 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -48,6 +48,7 @@ void CoroRPCCommunicator::setDataReceiveCallback(
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
+    //`easylog::set_min_severity(easylog::Serverity::WARNING);  // Set log level to WARNING
 
     if (!config.listen_address.empty()) {
         LOG(INFO) << "Initializing server on " << config.listen_address;
@@ -249,7 +250,7 @@ void CoroRPCCommunicator::Impl::handleDataTransfer(
 
     // Echo back the attachment for response (zero-copy)
     if (!attachment.empty()) {
-        ctx_info->set_response_attachment(attachment);
+        ctx_info->set_response_attachment(std::string_view("ok"));
     }
 
     context.response_msg();
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
new file mode 100644
index 000000000..736a86f7f
--- /dev/null
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -0,0 +1,71 @@
+import torch
+import numpy as np
+import time
+import sys
+import threading
+import struct
+from typing import List, Tuple, Dict, Any
+
+import mooncake.engine as engine
+
+class AtomicCounter:
+    def __init__(self, initial=0):
+        self._value = initial
+        self._lock = threading.Lock()
+
+    def inc(self, num=1):
+        with self._lock:
+            self._value += num
+            return self._value
+
+    def dec(self, num=1):
+        with self._lock:
+            self._value -= num
+            return self._value
+
+    def get(self):
+        with self._lock:
+            r = self._value
+            self._value = 0
+            return r
+
+counter = AtomicCounter()
+
+size_1mb = 1024 * 1024
+test_data = b'\x00' * size_1mb
+url = "127.0.0.1:9004"
+
+def print_qps():
+    while(True):
+        time.sleep(1)
+        val = counter.get()
+        if(val == 0):   
+            continue
+
+        print("qps:", val)
+
+def send_data(client):
+    while True:
+        result = client.send_data(url, test_data)
+        counter.inc()
+
+CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+
+server = CoroRPCInterface()
+client = CoroRPCInterface()
+server.initialize("0.0.0.0:9004", 8, 30, 4)
+server.start_server()
+client.initialize("", 0, 30, 100)
+
+thread = threading.Thread(target=print_qps)
+thread.start()
+
+for i in range(64):
+thread1 = threading.Thread(target=send_data, args=(client,))
+thread1.start()
+# while True:
+# result = client.send_data(url, test_data)
+# counter.inc()
+
+thread.join()
+# print(f"Send result: {result}")
\ No newline at end of file
diff --git a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py b/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
deleted file mode 100644
index 597f94e26..000000000
--- a/mooncake-transfer-engine/tests/test_coro_rpc_performance.py
+++ /dev/null
@@ -1,611 +0,0 @@
-#!/usr/bin/env python3
-"""
-CoroRPC Performance Testing Suite
-Tests bandwidth performance for data and tensor interfaces
-"""
-
-import torch
-import numpy as np
-import time
-import sys
-import threading
-import struct
-from typing import List, Tuple, Dict, Any
-
-try:
-    import mooncake.engine as engine
-    print("Successfully imported mooncake.engine")
-    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
-    print("Successfully imported CoroRPCInterface")
-except ImportError as e:
-    print(f"Failed to import mooncake: {e}")
-    sys.exit(1)
-except AttributeError as e:
-    print(f"Failed to import CoroRPCInterface: {e}")
-    sys.exit(1)
-
-
-class PythonTensorRebuilder:
-    """Pure Python implementation of tensor rebuilding from raw data"""
-    
-    # Tensor dtype mappings (matching C++ enum)
-    DTYPE_MAP = {
-        0: None,  # UNKNOWN
-        1: np.float16,  # FLOAT16
-        2: np.float32,  # FLOAT32  
-        3: np.float64,  # FLOAT64
-        4: np.int8,     # INT8
-        5: np.int16,    # INT16
-        6: np.int32,    # INT32
-        7: np.int64,    # INT64
-        8: np.uint8,    # UINT8
-        9: np.bool_,    # BOOL
-    }
-    
-    TORCH_DTYPE_MAP = {
-        1: torch.float16,  # FLOAT16
-        2: torch.float32,  # FLOAT32  
-        3: torch.float64,  # FLOAT64
-        4: torch.int8,     # INT8
-        5: torch.int16,    # INT16
-        6: torch.int32,    # INT32
-        7: torch.int64,    # INT64
-        8: torch.uint8,    # UINT8
-        9: torch.bool,     # BOOL
-    }
-    
-    @staticmethod
-    def parse_tensor_metadata(raw_data: bytes) -> Tuple[int, int, List[int], int]:
-        """
-        Parse tensor metadata from raw bytes
-        
-        Returns:
-            (dtype, ndim, shape, metadata_size)
-        """
-        if len(raw_data) < 72:  # Size of TensorMetadata struct
-            raise ValueError(f"Raw data too short for metadata: {len(raw_data)} bytes")
-        
-        # TensorMetadata struct layout:
-        # int32_t dtype (4 bytes)
-        # int32_t ndim (4 bytes)  
-        # int64_t shape[4] (32 bytes)
-        # char padding[32] (32 bytes)
-        # Total: 72 bytes
-        
-        metadata_format = '<ii4q32s'  # little-endian: 2 int32, 4 int64, 32 char
-        metadata_size = struct.calcsize(metadata_format)
-        
-        try:
-            unpacked = struct.unpack(metadata_format, raw_data[:metadata_size])
-            dtype = unpacked[0]
-            ndim = unpacked[1]
-            shape_array = unpacked[2:6]  # Extract shape[4]
-            # padding = unpacked[6]  # Not used
-            
-            # Extract actual shape (only first ndim elements are valid)
-            shape = [int(shape_array[i]) for i in range(ndim) if i < 4]
-            
-            return dtype, ndim, shape, metadata_size
-            
-        except struct.error as e:
-            raise ValueError(f"Failed to parse tensor metadata: {e}")
-    
-    @staticmethod
-    def rebuild_tensor_from_raw_data(raw_data: bytes, return_torch: bool = True) -> torch.Tensor:
-        """
-        Rebuild tensor from raw data bytes (pure Python implementation)
-        
-        Args:
-            raw_data: Raw bytes containing tensor metadata + data
-            return_torch: If True, return torch.Tensor; if False, return numpy array
-            
-        Returns:
-            Reconstructed tensor
-        """
-        print(f"[PYTHON] Tensor rebuilder: processing {len(raw_data)} bytes")
-        
-        # Parse metadata
-        dtype_id, ndim, shape, metadata_size = PythonTensorRebuilder.parse_tensor_metadata(raw_data)
-        
-        print(f"[PYTHON] Parsed metadata: dtype_id={dtype_id}, ndim={ndim}, shape={shape}")
-        
-        # Validate dtype
-        if dtype_id not in PythonTensorRebuilder.DTYPE_MAP or PythonTensorRebuilder.DTYPE_MAP[dtype_id] is None:
-            raise ValueError(f"Unknown or unsupported dtype: {dtype_id}")
-        
-        # Get numpy dtype
-        np_dtype = PythonTensorRebuilder.DTYPE_MAP[dtype_id]
-        element_size = np.dtype(np_dtype).itemsize
-        
-        # Calculate expected data size
-        total_elements = 1
-        for dim in shape:
-            total_elements *= dim
-        expected_data_size = total_elements * element_size
-        
-        print(f"[PYTHON] Expected: {total_elements} elements × {element_size} bytes = {expected_data_size} bytes")
-        
-        # Extract tensor data (skip metadata)
-        tensor_data = raw_data[metadata_size:]
-        actual_data_size = len(tensor_data)
-        
-        print(f"[PYTHON] Actual tensor data size: {actual_data_size} bytes")
-        
-        if actual_data_size < expected_data_size:
-            raise ValueError(f"Insufficient tensor data: expected {expected_data_size}, got {actual_data_size}")
-        
-        # Take only the required bytes (there might be padding)
-        tensor_data = tensor_data[:expected_data_size]
-        
-        # Create numpy array from raw bytes
-        print(f"[PYTHON] Creating numpy array with dtype {np_dtype} and shape {shape}")
-        
-        try:
-            # Convert bytes to numpy array
-            np_array = np.frombuffer(tensor_data, dtype=np_dtype)
-            
-            # Reshape to target shape
-            np_array = np_array.reshape(shape)
-            
-            print(f"[PYTHON] Successfully created numpy array: shape={np_array.shape}, dtype={np_array.dtype}")
-            
-            if return_torch:
-                # Convert to torch tensor
-                if dtype_id in PythonTensorRebuilder.TORCH_DTYPE_MAP:
-                    torch_dtype = PythonTensorRebuilder.TORCH_DTYPE_MAP[dtype_id]
-                    torch_tensor = torch.from_numpy(np_array.copy()).to(torch_dtype)
-                    print(f"[PYTHON] Converted to torch tensor: shape={torch_tensor.shape}, dtype={torch_tensor.dtype}")
-                    return torch_tensor
-                else:
-                    raise ValueError(f"Cannot convert dtype {dtype_id} to torch tensor")
-            else:
-                return np_array
-                
-        except Exception as e:
-            raise ValueError(f"Failed to create tensor from data: {e}")
-    
-    @staticmethod
-    def rebuild_tensor_from_received_tensor(received_tensor_obj, return_torch: bool = True):
-        """
-        Rebuild tensor from ReceivedTensor object using pure Python
-        
-        Args:
-            received_tensor_obj: ReceivedTensor object from callback
-            return_torch: If True, return torch.Tensor; if False, return numpy array
-            
-        Returns:
-            Reconstructed tensor
-        """
-        # Try multiple ways to get raw data from ReceivedTensor object
-        raw_data = None
-        
-        # Method 1: Try direct data attribute access
-        if hasattr(received_tensor_obj, 'data'):
-            try:
-                raw_data = received_tensor_obj.data
-                print(f"[PYTHON] Got data via direct attribute: {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
-            except Exception as e:
-                print(f"[PYTHON] Failed to get data via direct attribute: {e}")
-        
-        # Method 2: Try getDataAsBytes method
-        if raw_data is None and hasattr(received_tensor_obj, 'get_data_as_bytes'):
-            try:
-                raw_data = received_tensor_obj.get_data_as_bytes()
-                print(f"[PYTHON] Got data via get_data_as_bytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
-            except Exception as e:
-                print(f"[PYTHON] Failed to get data via get_data_as_bytes(): {e}")
-                
-        # Method 3: Try getDataAsBytes with different naming
-        if raw_data is None and hasattr(received_tensor_obj, 'getDataAsBytes'):
-            try:
-                raw_data = received_tensor_obj.getDataAsBytes()
-                print(f"[PYTHON] Got data via getDataAsBytes(): {type(raw_data)}, length: {len(raw_data) if raw_data else 0}")
-            except Exception as e:
-                print(f"[PYTHON] Failed to get data via getDataAsBytes(): {e}")
-        
-        if raw_data is None:
-            # Debug: print available attributes
-            attrs = [attr for attr in dir(received_tensor_obj) if not attr.startswith('_')]
-            print(f"[PYTHON] Available attributes: {attrs}")
-            raise ValueError(f"Cannot get raw data from ReceivedTensor object. Available attributes: {attrs}")
-        
-        # Convert different data types to bytes
-        if isinstance(raw_data, bytes):
-            pass  # Already bytes
-        elif isinstance(raw_data, str):
-            # Convert string to bytes using latin1 to preserve byte values
-            raw_data = raw_data.encode('latin1')
-        elif hasattr(raw_data, 'encode'):
-            raw_data = raw_data.encode('latin1')
-        else:
-            raise ValueError(f"Unknown data type: {type(raw_data)}")
-        
-        return PythonTensorRebuilder.rebuild_tensor_from_raw_data(raw_data, return_torch)
-
-
-class PerformanceTestResults:
-    """Container for performance test results"""
-    
-    def __init__(self):
-        self.data_results: List[Dict[str, Any]] = []
-        self.tensor_results: List[Dict[str, Any]] = []
-        
-    def add_data_result(self, size_mb: float, time_ms: float, bandwidth_mbps: float):
-        self.data_results.append({
-            'size_mb': size_mb,
-            'time_ms': time_ms,
-            'bandwidth_mbps': bandwidth_mbps
-        })
-        
-    def add_tensor_result(self, tensor_type: str, shape: tuple, size_mb: float, 
-                         time_ms: float, bandwidth_mbps: float):
-        self.tensor_results.append({
-            'tensor_type': tensor_type,
-            'shape': shape,
-            'size_mb': size_mb,
-            'time_ms': time_ms,
-            'bandwidth_mbps': bandwidth_mbps
-        })
-        
-    def print_summary(self):
-        print("\n" + "="*80)
-        print("PERFORMANCE TEST RESULTS SUMMARY")
-        print("="*80)
-        
-        if self.data_results:
-            print("\nDATA INTERFACE PERFORMANCE:")
-            print("-" * 80)
-            print(f"{'Size (MB)':<15} {'Time (ms)':<15} {'Send BW (MB/s)':<18} {'Total BW (MB/s)':<18} {'Network Latency':<15}")
-            print("-" * 80)
-            for result in self.data_results:
-                print(f"{result['size_mb']:<15.3f} {result['time_ms']:<15.2f} {result['bandwidth_mbps']:<18.2f} {'N/A':<18} {'< 1ms':<15}")
-                
-        if self.tensor_results:
-            print("\nTENSOR INTERFACE PERFORMANCE:")
-            print("-" * 100)
-            print(f"{'Type':<12} {'Shape':<25} {'Size (MB)':<15} {'Time (ms)':<15} {'Send BW (MB/s)':<18} {'Validation':<15}")
-            print("-" * 100)
-            for result in self.tensor_results:
-                shape_str = str(result['shape'])[:23]
-                print(f"{result['tensor_type']:<12} {shape_str:<25} {result['size_mb']:<15.2f} "
-                      f"{result['time_ms']:<15.2f} {result['bandwidth_mbps']:<18.2f} {'PASS':<15}")
-
-
-class CoroRPCPerformanceTester:
-    """Main performance testing class"""
-    
-    def __init__(self):
-        self.server = None
-        self.client = None
-        self.server_addr = "127.0.0.1:8889"
-        self.results = PerformanceTestResults()
-        
-        # Callback tracking
-        self.data_received_count = 0
-        self.tensor_received_count = 0
-        self.data_receive_times = []
-        self.tensor_receive_times = []
-        self.receive_lock = threading.Lock()
-        
-        # Store tensors for validation
-        self.sent_tensors = []
-        self.received_tensors = []
-        
-    def setup(self) -> bool:
-        """Initialize server and client"""
-        print("[SETUP] Initializing CoroRPC performance test environment...")
-        
-        try:
-            # Create server and client instances
-            self.server = CoroRPCInterface()
-            self.client = CoroRPCInterface()
-            
-            # Initialize server
-            if not self.server.initialize(self.server_addr, 1, 30, 4):
-                print("[ERROR] Failed to initialize server")
-                return False
-                
-            # Initialize client
-            if not self.client.initialize("", 0, 30, 4):
-                print("[ERROR] Failed to initialize client")
-                return False
-                
-            # Set up callbacks
-            self.server.set_data_receive_callback(self._data_receive_callback)
-            self.server.set_tensor_receive_callback(self._tensor_receive_callback)
-            
-            # Start server
-            if not self.server.start_server_async():
-                print("[ERROR] Failed to start server")
-                return False
-                
-            print(f"[SETUP] Server started on {self.server_addr}")
-            time.sleep(1)
-            
-            print("[SETUP] Client ready to connect to server")
-            time.sleep(0.5)
-            
-            return True
-            
-        except Exception as e:
-            print(f"[ERROR] Setup failed with exception: {e}")
-            return False
-    
-    def teardown(self):
-        """Clean up resources"""
-        try:
-            if self.server:
-                self.server.stop_server()
-                print("[CLEANUP] Server stopped")
-        except:
-            pass
-            
-    def _data_receive_callback(self, received_data):
-        """Simple callback for data reception with timing info"""
-        callback_time = time.time()
-        
-        with self.receive_lock:
-            self.data_received_count += 1
-            self.data_receive_times.append(callback_time)
-            
-            source_address = received_data.get("source", "unknown")
-            data = received_data.get("data", b"")
-            print(f"   [DATA] Received: {len(data):,} bytes | Time: {callback_time:.6f}")
-    
-    def _tensor_receive_callback(self, received_tensor):
-        """Simple callback for tensor reception with timing info"""
-        callback_time = time.time()
-        
-        with self.receive_lock:
-            self.tensor_received_count += 1
-            self.tensor_receive_times.append(callback_time)
-            
-            if not hasattr(self, 'received_tensors'):
-                self.received_tensors = []
-            self.received_tensors.append(received_tensor)
-            
-            print(f"   [TENSOR] Received: {received_tensor.source_address} | Time: {callback_time:.6f}")
-
-    def validate_tensor_equality(self, original_tensor, received_tensor_obj) -> bool:
-        """Simple tensor validation with timing"""
-        try:
-            validation_start = time.time()
-            rebuilt_tensor = PythonTensorRebuilder.rebuild_tensor_from_received_tensor(
-                received_tensor_obj, return_torch=True)
-            rebuild_time = (time.time() - validation_start) * 1000
-            
-            # Quick validation
-            if original_tensor.shape != rebuilt_tensor.shape:
-                return False
-            if original_tensor.dtype != rebuilt_tensor.dtype:
-                return False
-                
-            compare_start = time.time()
-            if original_tensor.dtype in [torch.float16, torch.float32, torch.float64]:
-                values_match = torch.allclose(original_tensor, rebuilt_tensor, rtol=1e-5, atol=1e-8)
-            else:
-                values_match = torch.equal(original_tensor, rebuilt_tensor)
-            compare_time = (time.time() - compare_start) * 1000
-            
-            print(f"   [VALIDATION] Rebuild={rebuild_time:.2f}ms | Compare={compare_time:.2f}ms | Result={'PASS' if values_match else 'FAIL'}")
-            return values_match
-            
-        except Exception as e:
-            print(f"   [ERROR] Validation failed: {e}")
-            return False
-    
-    def test_comprehensive_performance(self) -> bool:
-        """Comprehensive performance test with detailed metrics"""
-        print("\n" + "="*80)
-        print("COMPREHENSIVE CORO-RPC PERFORMANCE ANALYSIS")
-        print("="*80)
-        
-        # Test configurations
-        test_configs = [
-            # Data interface tests
-            (1.0/1024, "data", "Small Data (1KB)"),
-            (10.0, "data", "Medium Data (10MB)"),
-            (100.0, "data", "Large Data (100MB)"),
-            
-            # Tensor interface tests  
-            (1.0, "tensor", "Small Tensor (1MB)"),
-            (50.0, "tensor", "Medium Tensor (50MB)"),
-            (200.0, "tensor", "Large Tensor (200MB)"),
-        ]
-        
-        success_count = 0
-        total_tests = len(test_configs)
-        
-        for i, (size_mb, test_type, description) in enumerate(test_configs, 1):
-            print(f"\n[TEST {i}/{total_tests}] {description}")
-            print("-" * 60)
-            
-            try:
-                if self.run_performance_test(size_mb, test_type):
-                    success_count += 1
-                    print(f"[RESULT] Test {i} PASSED")
-                else:
-                    print(f"[RESULT] Test {i} FAILED")
-                    
-                # Brief pause between tests
-                if i < total_tests:
-                    time.sleep(1.0)
-                    
-            except Exception as e:
-                print(f"[ERROR] Test {i} failed with exception: {e}")
-                
-        print(f"\n[SUMMARY] Tests completed: {success_count}/{total_tests} passed")
-        return success_count == total_tests
-    
-    def run_performance_test(self, size_mb: float, data_type: str = "data") -> bool:
-        """Run a single performance test with detailed timing breakdown"""
-        
-        # Step 1: Prepare data/tensor
-        prepare_start = time.time()
-        if data_type == "data":
-            data_size_bytes = int(size_mb * 1024 * 1024)
-            if data_size_bytes <= 1024:
-                test_data = b"CoroRPC_Test_" * (data_size_bytes // 13 + 1)
-                test_data = test_data[:data_size_bytes]
-            else:
-                pattern = bytes(range(256)) * 4
-                test_data = pattern * (data_size_bytes // len(pattern) + 1)
-                test_data = test_data[:data_size_bytes]
-            test_object = test_data
-        else:  # tensor
-            # Create tensor to match target size
-            element_size = 4  # float32
-            numel = int(size_mb * 1024 * 1024 / element_size)
-            # Create roughly square tensor
-            side = int(numel ** 0.5)
-            shape = (side, side)
-            test_object = torch.randn(shape, dtype=torch.float32)
-            actual_size_mb = test_object.numel() * test_object.element_size() / (1024 * 1024)
-            size_mb = actual_size_mb  # Update to actual size
-            
-        prepare_time = (time.time() - prepare_start) * 1000
-        
-        # Step 2: Reset counters
-        reset_start = time.time()
-        with self.receive_lock:
-            if data_type == "data":
-                self.data_received_count = 0
-                self.data_receive_times.clear()
-            else:
-                self.tensor_received_count = 0 
-                self.tensor_receive_times.clear()
-                self.sent_tensors.clear()
-                self.received_tensors.clear()
-                self.sent_tensors.append(test_object.clone())
-        reset_time = (time.time() - reset_start) * 1000
-        
-        # Step 3: Send
-        print(f"[SEND] Transmitting {size_mb:.3f} MB {data_type}...")
-        send_start = time.time()
-        if data_type == "data":
-            result = self.client.send_data(self.server_addr, test_object)
-        else:
-            result = self.client.send_tensor(self.server_addr, test_object)
-        send_end = time.time()
-        send_time = (send_end - send_start) * 1000
-        
-        if result < 0:
-            print(f"[ERROR] Send failed: {result}")
-            return False
-            
-        # Step 4: Wait for reception
-        print(f"[RECV] Waiting for reception...")
-        wait_start = time.time()
-        max_wait = 30.0  # 30 second timeout for large data
-        
-        while True:
-            elapsed = time.time() - wait_start
-            if data_type == "data" and self.data_received_count > 0:
-                break
-            elif data_type == "tensor" and self.tensor_received_count > 0:
-                break
-            elif elapsed > max_wait:
-                print(f"[ERROR] Reception timeout after {elapsed:.2f}s")
-                return False
-            time.sleep(0.01)
-            
-        reception_time = time.time()
-        wait_time = (reception_time - wait_start) * 1000
-        
-        # Step 5: Calculate timing metrics
-        if data_type == "data":
-            callback_time = self.data_receive_times[0]
-        else:
-            callback_time = self.tensor_receive_times[0]
-            
-        network_time = (callback_time - send_end) * 1000
-        total_time = (callback_time - send_start) * 1000
-        
-        # Step 6: Validation (for tensors only)
-        validation_time = 0
-        validation_success = True
-        if data_type == "tensor" and len(self.received_tensors) > 0:
-            validation_start = time.time()
-            validation_success = self.validate_tensor_equality(self.sent_tensors[0], self.received_tensors[0])
-            validation_time = (time.time() - validation_start) * 1000
-        
-        # Step 7: Print comprehensive timing breakdown
-        bandwidth = size_mb / (send_time / 1000) if send_time > 0 else 0
-        total_bandwidth = size_mb / (total_time / 1000) if total_time > 0 else 0
-        
-        print(f"\n[METRICS] Performance Analysis:")
-        print(f"   Data Size:          {size_mb:10.3f} MB")
-        print(f"   Prepare Time:       {prepare_time:10.2f} ms")
-        print(f"   Reset Time:         {reset_time:10.2f} ms")
-        print(f"   Send Time:          {send_time:10.2f} ms  (Sender Processing)")
-        print(f"   Network Latency:    {network_time:10.2f} ms  (Network + Receiver)")
-        print(f"   Wait Time:          {wait_time:10.2f} ms")
-        if validation_time > 0:
-            print(f"   Validation Time:    {validation_time:10.2f} ms  (Data Integrity Check)")
-        print(f"   Total Time:         {total_time:10.2f} ms")
-        print(f"   Send Bandwidth:     {bandwidth:10.2f} MB/s")
-        print(f"   End-to-End BW:      {total_bandwidth:10.2f} MB/s")
-        print(f"   Efficiency:         {(send_time/total_time)*100:10.1f} %")
-        
-        # Store results
-        if data_type == "data":
-            self.results.add_data_result(size_mb, send_time, bandwidth)
-        else:
-            self.results.add_tensor_result("Float32", test_object.shape, size_mb, send_time, bandwidth)
-            
-        return validation_success if data_type == "tensor" else True
-
-
-def main():
-    """Main performance test with comprehensive analysis"""
-    print("CoroRPC Interface Performance Analysis Suite")
-    print("="*60)
-    
-    tester = CoroRPCPerformanceTester()
-    
-    try:
-        # Setup
-        print("[INIT] Setting up test environment...")
-        if not tester.setup():
-            print("[FATAL] Setup failed")
-            return False
-        
-        print("[INIT] Setup completed successfully\n")
-        
-        # Run comprehensive tests
-        print("[START] Running comprehensive performance tests...")
-        success = tester.test_comprehensive_performance()
-        
-        # Print final results
-        tester.results.print_summary()
-        
-        # Print conclusion
-        print("\n" + "="*80)
-        print("TEST CONCLUSION")
-        print("="*80)
-        if success:
-            print("[SUCCESS] All performance tests completed successfully!")
-            print("- CoroRPC interface is functioning correctly")
-            print("- Performance metrics collected for analysis")
-            print("- Tensor validation passed for all tests")
-            return True
-        else:
-            print("[FAILURE] Some performance tests failed")
-            print("- Check error logs above for details")
-            return False
-        
-    except Exception as e:
-        print(f"[FATAL] Test suite failed with exception: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-        
-    finally:
-        print("\n[CLEANUP] Cleaning up test environment...")
-        tester.teardown()
-
-
-if __name__ == "__main__":
-    success = main()
-    print(f"\nFinal Result: {'SUCCESS' if success else 'FAILURE'}")
-    sys.exit(0 if success else 1)
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index baeb6b7e3..bda869e43 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -15,20 +15,6 @@ TARGET_PID=$!
 MC_METADATA_SERVER=http://127.0.0.1:8080/metadata python transfer_engine_initiator_test.py
 kill $TARGET_PID || true
 
-echo "Running CoroRPC performance tests..."
-# Check if we're in CI environment or if the test file exists
-if [ -f "../mooncake-transfer-engine/tests/test_coro_rpc_performance.py" ]; then
-    cd ../mooncake-transfer-engine/tests
-    pip install torch numpy
-    python test_coro_rpc_performance.py
-    cd ../../mooncake-wheel/tests
-else
-    echo "WARNING: CoroRPC performance test not found, skipping..."
-    echo "Current directory: $(pwd)"
-    echo "Looking for: ../mooncake-transfer-engine/tests/test_coro_rpc_performance.py"
-    ls -la ../mooncake-transfer-engine/tests/ 2>/dev/null || echo "Directory ../mooncake-transfer-engine/tests/ does not exist"
-fi
-
 echo "Running master tests..."
 
 which mooncake_master 2>/dev/null | grep -q '/usr/local/bin/mooncake_master' && \

From 2efd542c1b9bd8f633cba0a713cd344b0b8e9989 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 10:42:00 +0800
Subject: [PATCH 35/64] removed pybind11 in .github/workflows/ci.yml

---
 .github/workflows/ci.yml                               |  1 -
 .../tests/communicator_bandwidth_test.py               | 10 +++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e93f3d614..771efe13d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -227,7 +227,6 @@ jobs:
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
-        pip install pybind11
       shell: bash
 
     - name: Build transfer engine only
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index 736a86f7f..461d17037 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -61,11 +61,11 @@ def send_data(client):
 thread.start()
 
 for i in range(64):
-thread1 = threading.Thread(target=send_data, args=(client,))
-thread1.start()
-# while True:
-# result = client.send_data(url, test_data)
-# counter.inc()
+    thread1 = threading.Thread(target=send_data, args=(client,))
+    thread1.start()
+    # while True:
+    # result = client.send_data(url, test_data)
+    # counter.inc()
 
 thread.join()
 # print(f"Send result: {result}")
\ No newline at end of file

From cd85ad76b9bdcf2f86e531714468fcf801db87c2 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 11:46:48 +0800
Subject: [PATCH 36/64] fixed cmakelists.txt

---
 .../src/transport/coro_rpc_connector/CMakeLists.txt            | 1 -
 .../src/transport/coro_rpc_connector/cororpc_communicator.cpp  | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
index c433ba0ac..ef6ab6f02 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
@@ -34,5 +34,4 @@ target_include_directories(coro_rpc_connector PRIVATE
 target_link_libraries(coro_rpc_connector PRIVATE 
     yalantinglibs::yalantinglibs
     pthread
-    pybind11::module
 )
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 08faed0bc..223c17c8e 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -48,7 +48,8 @@ void CoroRPCCommunicator::setDataReceiveCallback(
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
-    //`easylog::set_min_severity(easylog::Serverity::WARNING);  // Set log level to WARNING
+    //`easylog::set_min_severity(easylog::Severity::WARNING);  // Set log level
+    // to WARNING
 
     if (!config.listen_address.empty()) {
         LOG(INFO) << "Initializing server on " << config.listen_address;

From 4158094b159fbbf3690a20e544ed4b87c66bada0 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 14:08:07 +0800
Subject: [PATCH 37/64] removed cross compilation problem

---
 mooncake-transfer-engine/src/transport/CMakeLists.txt         | 3 +--
 mooncake-transfer-engine/tests/communicator_bandwidth_test.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index 87a1ba023..a5ab0bb7b 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,9 +1,8 @@
 file(GLOB XPORT_SOURCES "*.cpp")
 
 add_subdirectory(rdma_transport)
-add_subdirectory(coro_rpc_connector)
 
-add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport> $<TARGET_OBJECTS:coro_rpc_connector>)
+add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
 target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread)
 
 if (USE_TCP)
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index 461d17037..a7af212c1 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -52,10 +52,10 @@ def send_data(client):
 CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
 
 server = CoroRPCInterface()
-client = CoroRPCInterface()
+# client = CoroRPCInterface()
 server.initialize("0.0.0.0:9004", 8, 30, 4)
 server.start_server()
-client.initialize("", 0, 30, 100)
+# client.initialize("", 0, 30, 100)
 
 thread = threading.Thread(target=print_qps)
 thread.start()

From dd83623bc36d78c255a20d85b86b54c13eb384df Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 14:25:36 +0800
Subject: [PATCH 38/64] removed cmakelists.txt

---
 .../coro_rpc_connector/CMakeLists.txt         | 37 -------------------
 1 file changed, 37 deletions(-)
 delete mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
deleted file mode 100644
index ef6ab6f02..000000000
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-# Find Python and pybind11 for the binding code
-find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
-find_package(pybind11 QUIET)
-if(NOT pybind11_FOUND)
-    execute_process(
-        COMMAND ${Python3_EXECUTABLE} -m pybind11 --cmakedir
-        OUTPUT_VARIABLE pybind11_DIR
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE pybind11_RESULT
-    )
-    if(pybind11_RESULT EQUAL 0)
-        find_package(pybind11 REQUIRED PATHS ${pybind11_DIR})
-    else()
-        message(FATAL_ERROR "pybind11 not found. Please install with: pip install pybind11")
-    endif()
-endif()
-
-# Create object library for coro_rpc_connector
-set(CORO_RPC_SOURCES
-    cororpc_interface.cpp
-    cororpc_communicator.cpp
-)
-
-add_library(coro_rpc_connector OBJECT ${CORO_RPC_SOURCES})
-
-target_compile_features(coro_rpc_connector PRIVATE cxx_std_20)
-target_compile_options(coro_rpc_connector PRIVATE -O3 -Wall)
-
-target_include_directories(coro_rpc_connector PRIVATE 
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include
-    ${Python3_INCLUDE_DIRS}
-)
-
-target_link_libraries(coro_rpc_connector PRIVATE 
-    yalantinglibs::yalantinglibs
-    pthread
-)
\ No newline at end of file

From 45987ba27e32ddd6cc94ad96f03824e6f6fdbcc1 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 15:07:29 +0800
Subject: [PATCH 39/64] refactor the test

---
 .../tests/communicator_bandwidth_test.py      | 95 ++++++++++++++-----
 1 file changed, 72 insertions(+), 23 deletions(-)

diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index a7af212c1..b076a5bf9 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -4,6 +4,7 @@
 import sys
 import threading
 import struct
+import argparse
 from typing import List, Tuple, Dict, Any
 
 import mooncake.engine as engine
@@ -33,39 +34,87 @@ def get(self):
 
 size_1mb = 1024 * 1024
 test_data = b'\x00' * size_1mb
-url = "127.0.0.1:9004"
 
 def print_qps():
-    while(True):
+    while True:
         time.sleep(1)
         val = counter.get()
-        if(val == 0):   
+        if val == 0:   
             continue
-
         print("qps:", val)
 
-def send_data(client):
+def send_data(client, target_url):
     while True:
-        result = client.send_data(url, test_data)
+        result = client.send_data(target_url, test_data)
         counter.inc()
 
-CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
-
-server = CoroRPCInterface()
-# client = CoroRPCInterface()
-server.initialize("0.0.0.0:9004", 8, 30, 4)
-server.start_server()
-# client.initialize("", 0, 30, 100)
+def run_server(bind_url):
+    """Run server mode"""
+    print(f"Starting server on {bind_url}")
+    
+    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    server = CoroRPCInterface()
+    server.initialize(bind_url, 8, 30, 4)
+    server.start_server()
+    
+    # Start QPS statistics thread
+    thread = threading.Thread(target=print_qps)
+    thread.daemon = True
+    thread.start()
+    
+    print(f"Server started on {bind_url}, press Ctrl+C to stop")
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("\nServer stopping...")
 
-thread = threading.Thread(target=print_qps)
-thread.start()
+def run_client(target_url, num_threads=64):
+    """Run client mode"""
+    print(f"Starting client, connecting to {target_url} with {num_threads} threads")
+    
+    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    client = CoroRPCInterface()
+    client.initialize("", 0, 30, 100)
+    
+    # Start QPS statistics thread
+    qps_thread = threading.Thread(target=print_qps)
+    qps_thread.daemon = True
+    qps_thread.start()
+    
+    # Start multiple sending threads
+    threads = []
+    for i in range(num_threads):
+        thread = threading.Thread(target=send_data, args=(client, target_url))
+        thread.daemon = True
+        thread.start()
+        threads.append(thread)
+    
+    print(f"Client started with {num_threads} threads, press Ctrl+C to stop")
+    try:
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("\nClient stopping...")
 
-for i in range(64):
-    thread1 = threading.Thread(target=send_data, args=(client,))
-    thread1.start()
-    # while True:
-    # result = client.send_data(url, test_data)
-    # counter.inc()
+def main():
+    parser = argparse.ArgumentParser(description='Mooncake Communication Bandwidth Test Tool')
+    parser.add_argument('mode', choices=['server', 'client'], 
+                       help='Run mode: server or client')
+    parser.add_argument('--url', default='127.0.0.1:9004',
+                       help='URL address (default: 127.0.0.1:9004)')
+    parser.add_argument('--threads', type=int, default=64,
+                       help='Number of client threads (default: 64)')
+    
+    args = parser.parse_args()
+    
+    if args.mode == 'server':
+        # Server mode, URL as bind address
+        bind_url = f"0.0.0.0:{args.url.split(':')[-1]}" if ':' in args.url else f"0.0.0.0:{args.url}"
+        run_server(bind_url)
+    else:
+        # Client mode, URL as target address
+        run_client(args.url, args.threads)
 
-thread.join()
-# print(f"Send result: {result}")
\ No newline at end of file
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 4d218a39a0733b6cc7f11823d26d5f7ab5012cc7 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 15:23:36 +0800
Subject: [PATCH 40/64] added packet size as parameters

---
 .../tests/communicator_bandwidth_test.py      | 35 ++++++++++++-------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index b076a5bf9..aa5030677 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -32,8 +32,9 @@ def get(self):
 
 counter = AtomicCounter()
 
-size_1mb = 1024 * 1024
-test_data = b'\x00' * size_1mb
+# Global variable to store data size
+data_size = 1024 * 1024  # Default 1MB
+test_data = None
 
 def print_qps():
     while True:
@@ -41,16 +42,20 @@ def print_qps():
         val = counter.get()
         if val == 0:   
             continue
-        print("qps:", val)
+        print("bandwidth:", 8 * val * data_size / (1000 * 1000 * 1000), "Gb/s")
 
 def send_data(client, target_url):
     while True:
         result = client.send_data(target_url, test_data)
         counter.inc()
 
-def run_server(bind_url):
+def run_server(bind_url, data_size_mb=1):
     """Run server mode"""
-    print(f"Starting server on {bind_url}")
+    global data_size, test_data
+    data_size = data_size_mb * 1024 * 1024
+    test_data = b'\x00' * data_size
+    
+    print(f"Starting server on {bind_url} with {data_size_mb}MB data packets")
     
     CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
     server = CoroRPCInterface()
@@ -69,9 +74,13 @@ def run_server(bind_url):
     except KeyboardInterrupt:
         print("\nServer stopping...")
 
-def run_client(target_url, num_threads=64):
+def run_client(target_url, num_threads=8, data_size_mb=1):
     """Run client mode"""
-    print(f"Starting client, connecting to {target_url} with {num_threads} threads")
+    global data_size, test_data
+    data_size = data_size_mb * 1024 * 1024
+    test_data = b'\x00' * data_size
+    
+    print(f"Starting client, connecting to {target_url} with {num_threads} threads, {data_size_mb}MB data packets")
     
     CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
     client = CoroRPCInterface()
@@ -103,18 +112,20 @@ def main():
                        help='Run mode: server or client')
     parser.add_argument('--url', default='127.0.0.1:9004',
                        help='URL address (default: 127.0.0.1:9004)')
-    parser.add_argument('--threads', type=int, default=64,
-                       help='Number of client threads (default: 64)')
-    
+    parser.add_argument('--threads', type=int, default=8,
+                       help='Number of client threads (default: 8)')
+    parser.add_argument('--data-size', type=int, default=1,
+                       help='Data packet size in MB (default: 1)')
+
     args = parser.parse_args()
     
     if args.mode == 'server':
         # Server mode, URL as bind address
         bind_url = f"0.0.0.0:{args.url.split(':')[-1]}" if ':' in args.url else f"0.0.0.0:{args.url}"
-        run_server(bind_url)
+        run_server(bind_url, args.data_size)
     else:
         # Client mode, URL as target address
-        run_client(args.url, args.threads)
+        run_client(args.url, args.threads, args.data_size)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file

From 393e5b8a6cd4186903c6b5b4886346c985ef4763 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 15:29:00 +0800
Subject: [PATCH 41/64] remove pybind dependency and integration

---
 .github/workflows/ci.yml            | 1 -
 mooncake-integration/CMakeLists.txt | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 771efe13d..049d0d60a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,7 +65,6 @@ jobs:
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
-        pip install pybind11
         mkdir build
         cd build
         cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DSTORE_USE_ETCD=ON -DENABLE_ASAN=ON -DENABLE_SCCACHE=ON
diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt
index 1eafbfd3b..27fdecf74 100644
--- a/mooncake-integration/CMakeLists.txt
+++ b/mooncake-integration/CMakeLists.txt
@@ -34,10 +34,7 @@ message("${PYTHON_SYS_PATH}")
 set(PYTHON_PACKAGE_NAME "mooncake")
 
 pybind11_add_module(engine ${SOURCES} ${CACHE_ALLOCATOR_SOURCES}
-    transfer_engine/transfer_engine_py.cpp 
-    ../mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
-    ../mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
-)
+    transfer_engine/transfer_engine_py.cpp )
 
 target_include_directories(engine PRIVATE 
     ${Python3_INCLUDE_DIRS}

From e925202bf7b41a60472bab63571623b604513390 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 17:26:52 +0800
Subject: [PATCH 42/64] readd cmakelists.txt to connector folder

---
 .../src/transport/CMakeLists.txt              | 21 +++++++++++++++++--
 .../coro_rpc_connector/CMakeLists.txt         |  0
 .../cororpc_communicator.cpp                  |  2 +-
 3 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt

diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index a5ab0bb7b..b609c6830 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,9 +1,26 @@
-file(GLOB XPORT_SOURCES "*.cpp")
+file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
+
+# Find Python and pybind11 for coro_rpc_connector
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+find_package(pybind11 QUIET)
+if(NOT pybind11_FOUND)
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -m pybind11 --cmakedir
+        OUTPUT_VARIABLE pybind11_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE pybind11_RESULT
+    )
+    if(pybind11_RESULT EQUAL 0)
+        find_package(pybind11 REQUIRED PATHS ${pybind11_DIR})
+    else()
+        message(FATAL_ERROR "pybind11 not found. Please install with: pip install pybind11")
+    endif()
+endif()
 
 add_subdirectory(rdma_transport)
 
 add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
-target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread)
+target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread pybind11::module)
 
 if (USE_TCP)
   add_subdirectory(tcp_transport)
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 223c17c8e..e4e019f8e 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -48,7 +48,7 @@ void CoroRPCCommunicator::setDataReceiveCallback(
 
 bool CoroRPCCommunicator::initialize(const Config& config) {
     impl_->config = config;
-    //`easylog::set_min_severity(easylog::Severity::WARNING);  // Set log level
+    easylog::set_min_severity(easylog::Severity::WARNING);  // Set log level
     // to WARNING
 
     if (!config.listen_address.empty()) {

From dc479af8367a098e13ac6f12cc0bce1751a93ee6 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 19:07:45 +0800
Subject: [PATCH 43/64] fixed CMakeLists.txt

---
 .../coro_rpc_connector/cororpc_communicator.h   |  2 +-
 mooncake-transfer-engine/src/CMakeLists.txt     | 17 ++++++++++++++++-
 .../src/transport/CMakeLists.txt                |  2 +-
 .../coro_rpc_connector/cororpc_communicator.cpp | 13 +++++++++++--
 4 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index a2d3cdca8..9c0d197ad 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -94,7 +94,7 @@ class CoroRPCCommunicator {
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
    private:
-    coro_io::client_pools<coro_rpc::coro_rpc_client> client_pools_;
+    std::shared_ptr<coro_io::client_pools<coro_rpc::coro_rpc_client>> client_pools_;
     std::shared_ptr<Impl> impl_;
 };
 
diff --git a/mooncake-transfer-engine/src/CMakeLists.txt b/mooncake-transfer-engine/src/CMakeLists.txt
index 73a1c4500..8c0918f3a 100644
--- a/mooncake-transfer-engine/src/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/CMakeLists.txt
@@ -2,6 +2,21 @@ file(GLOB ENGINE_SOURCES "*.cpp")
 add_subdirectory(common)
 add_subdirectory(transport)
 
+# Find Python for pybind11 support
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+find_package(pybind11 QUIET)
+if(NOT pybind11_FOUND)
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} -m pybind11 --cmakedir
+        OUTPUT_VARIABLE pybind11_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE pybind11_RESULT
+    )
+    if(pybind11_RESULT EQUAL 0)
+        find_package(pybind11 REQUIRED PATHS ${pybind11_DIR})
+    endif()
+endif()
+
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
 add_library(transfer_engine ${ENGINE_SOURCES} $<TARGET_OBJECTS:transport>)
@@ -36,7 +51,7 @@ endif()
 target_link_libraries(
     transfer_engine
     PUBLIC
-    base transport rdma_transport ibverbs glog::glog gflags::gflags pthread JsonCpp::JsonCpp numa yalantinglibs::yalantinglibs
+    base transport rdma_transport ibverbs glog::glog gflags::gflags pthread JsonCpp::JsonCpp numa yalantinglibs::yalantinglibs pybind11::module ${Python3_LIBRARIES}
     )
 
 if (USE_CUDA)
diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index b609c6830..d4606d217 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -20,7 +20,7 @@ endif()
 add_subdirectory(rdma_transport)
 
 add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
-target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread pybind11::module)
+target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread pybind11::module ${Python3_LIBRARIES})
 
 if (USE_TCP)
   add_subdirectory(tcp_transport)
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index e4e019f8e..829bea89a 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -51,6 +51,15 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     easylog::set_min_severity(easylog::Severity::WARNING);  // Set log level
     // to WARNING
 
+    // Initialize client pools with proper configuration
+    coro_io::client_pool<coro_rpc::coro_rpc_client>::pool_config pool_conf{};
+    const char* value = std::getenv("MC_RPC_PROTOCOL");
+    if (value && std::string_view(value) == "rdma") {
+        pool_conf.client_config.socket_config = coro_io::ib_socket_t::config_t{};
+        impl_->server_->init_ibv();
+    }
+    client_pools_ = std::make_shared<coro_io::client_pools<coro_rpc::coro_rpc_client>>(pool_conf);
+
     if (!config.listen_address.empty()) {
         LOG(INFO) << "Initializing server on " << config.listen_address;
 
@@ -135,7 +144,7 @@ async_simple::coro::Lazy<result> CoroRPCCommunicator::sendDataAsync(
     // For large data, use attachment to avoid copying
     const size_t ATTACHMENT_THRESHOLD = 1024;  // Use attachment for data > 1KB
 
-    auto rpc_result = co_await client_pools_.send_request(
+    auto rpc_result = co_await client_pools_->send_request(
         target_address,
         [data_view, data_size](coro_rpc::coro_rpc_client& client)
             -> async_simple::coro::Lazy<void> {
@@ -183,7 +192,7 @@ int CoroRPCCommunicator::sendTensor(const std::string& target_address,
 
 async_simple::coro::Lazy<int> CoroRPCCommunicator::sendTensorAsync(
     const std::string& target_address, const TensorInfo& tensor) {
-    auto rpc_result = co_await client_pools_.send_request(
+    auto rpc_result = co_await client_pools_->send_request(
         target_address,
         [&tensor](coro_rpc::coro_rpc_client& client)
             -> async_simple::coro::Lazy<void> {

From 638391002305faeab2a4841bcfbeee89f83e8b74 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Thu, 11 Sep 2025 21:18:14 +0800
Subject: [PATCH 44/64] fixed pybind modules

---
 .../src/transport/CMakeLists.txt              | 19 ++++---------------
 .../coro_rpc_connector/CMakeLists.txt         |  0
 .../cororpc_communicator.cpp                  |  7 +++++--
 3 files changed, 9 insertions(+), 17 deletions(-)
 delete mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt

diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index d4606d217..30f38cdf7 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,26 +1,15 @@
 file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
 
-# Find Python and pybind11 for coro_rpc_connector
+file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
+
+# Find Python - pybind11 is already configured at the root level
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
-find_package(pybind11 QUIET)
-if(NOT pybind11_FOUND)
-    execute_process(
-        COMMAND ${Python3_EXECUTABLE} -m pybind11 --cmakedir
-        OUTPUT_VARIABLE pybind11_DIR
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE pybind11_RESULT
-    )
-    if(pybind11_RESULT EQUAL 0)
-        find_package(pybind11 REQUIRED PATHS ${pybind11_DIR})
-    else()
-        message(FATAL_ERROR "pybind11 not found. Please install with: pip install pybind11")
-    endif()
-endif()
 
 add_subdirectory(rdma_transport)
 
 add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
 target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread pybind11::module ${Python3_LIBRARIES})
+target_include_directories(transport PRIVATE ${Python3_INCLUDE_DIRS})
 
 if (USE_TCP)
   add_subdirectory(tcp_transport)
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 829bea89a..959cd247f 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -55,10 +55,13 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     coro_io::client_pool<coro_rpc::coro_rpc_client>::pool_config pool_conf{};
     const char* value = std::getenv("MC_RPC_PROTOCOL");
     if (value && std::string_view(value) == "rdma") {
-        pool_conf.client_config.socket_config = coro_io::ib_socket_t::config_t{};
+        pool_conf.client_config.socket_config =
+            coro_io::ib_socket_t::config_t{};
         impl_->server_->init_ibv();
     }
-    client_pools_ = std::make_shared<coro_io::client_pools<coro_rpc::coro_rpc_client>>(pool_conf);
+    client_pools_ =
+        std::make_shared<coro_io::client_pools<coro_rpc::coro_rpc_client>>(
+            pool_conf);
 
     if (!config.listen_address.empty()) {
         LOG(INFO) << "Initializing server on " << config.listen_address;

From d4caef8b6474d8c01be1f3fefd9744a52d18f650 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 12 Sep 2025 10:06:42 +0800
Subject: [PATCH 45/64] reformat headers

---
 .../transport/coro_rpc_connector/cororpc_communicator.h      | 3 ++-
 .../transport/coro_rpc_connector/cororpc_communicator.cpp    | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 9c0d197ad..1a28472c1 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -94,7 +94,8 @@ class CoroRPCCommunicator {
     std::shared_ptr<Impl> getImpl() { return impl_; }
 
    private:
-    std::shared_ptr<coro_io::client_pools<coro_rpc::coro_rpc_client>> client_pools_;
+    std::shared_ptr<coro_io::client_pools<coro_rpc::coro_rpc_client>>
+        client_pools_;
     std::shared_ptr<Impl> impl_;
 };
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 959cd247f..8f450ecf4 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -57,7 +57,6 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
     if (value && std::string_view(value) == "rdma") {
         pool_conf.client_config.socket_config =
             coro_io::ib_socket_t::config_t{};
-        impl_->server_->init_ibv();
     }
     client_pools_ =
         std::make_shared<coro_io::client_pools<coro_rpc::coro_rpc_client>>(
@@ -70,6 +69,10 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             config.thread_count, config.listen_address,
             std::chrono::seconds(config.timeout_seconds));
 
+        if (value && std::string_view(value) == "rdma") {
+            impl_->server_->init_ibv();
+        }
+
         impl_->server_->register_handler<
             &CoroRPCCommunicator::Impl::handleDataTransfer,
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());

From e52aa02865840c216bd1461a73a313ed482c18a0 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 12 Sep 2025 10:28:40 +0800
Subject: [PATCH 46/64] fixed CMakeLists.txt

---
 mooncake-transfer-engine/src/CMakeLists.txt           | 7 ++++++-
 mooncake-transfer-engine/src/transport/CMakeLists.txt | 9 ++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/mooncake-transfer-engine/src/CMakeLists.txt b/mooncake-transfer-engine/src/CMakeLists.txt
index 8c0918f3a..0d51bf063 100644
--- a/mooncake-transfer-engine/src/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/CMakeLists.txt
@@ -51,9 +51,14 @@ endif()
 target_link_libraries(
     transfer_engine
     PUBLIC
-    base transport rdma_transport ibverbs glog::glog gflags::gflags pthread JsonCpp::JsonCpp numa yalantinglibs::yalantinglibs pybind11::module ${Python3_LIBRARIES}
+    base transport rdma_transport ibverbs glog::glog gflags::gflags pthread JsonCpp::JsonCpp numa yalantinglibs::yalantinglibs ${Python3_LIBRARIES}
     )
 
+# Add pybind11 headers if pybind11 is available
+if(TARGET pybind11::headers)
+    target_link_libraries(transfer_engine PUBLIC pybind11::headers)
+endif()
+
 if (USE_CUDA)
   target_include_directories(transfer_engine PRIVATE /usr/local/cuda/include)
   target_link_libraries(transfer_engine PUBLIC cuda cudart rt)
diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index 30f38cdf7..5dafb70f5 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,16 +1,19 @@
 file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
 
-file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
-
 # Find Python - pybind11 is already configured at the root level
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
 add_subdirectory(rdma_transport)
 
 add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
-target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread pybind11::module ${Python3_LIBRARIES})
+target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread ${Python3_LIBRARIES})
 target_include_directories(transport PRIVATE ${Python3_INCLUDE_DIRS})
 
+# Add pybind11 headers if available
+if(TARGET pybind11::headers)
+    target_link_libraries(transport PRIVATE pybind11::headers)
+endif()
+
 if (USE_TCP)
   add_subdirectory(tcp_transport)
   target_sources(transport PUBLIC $<TARGET_OBJECTS:tcp_transport>)

From fa1006285c7a04ed3ea74b415d8153fdeac2272a Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 12 Sep 2025 10:42:25 +0800
Subject: [PATCH 47/64] add indication of protocol

---
 .../transport/coro_rpc_connector/cororpc_communicator.cpp  | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 8f450ecf4..e367cf039 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -77,6 +77,13 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             &CoroRPCCommunicator::Impl::handleDataTransfer,
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
+    LOG(INFO) << "Environment variable MOONCAKE_TRANSFER_PROTOCOL is set to "
+              << (value ? value : "not set");
+    if (value && std::string_view(value) == "rdma") {
+        LOG(INFO) << "Using RDMA transport for RPC communication";
+    } else {
+        LOG(INFO) << "Using TCP transport for RPC communication";
+    }
 
     LOG(INFO) << "Communicator initialized with client pool support";
     return true;

From e72c7a0bb2fafb4e7464effd2b335a570db17951 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 12 Sep 2025 16:13:33 +0800
Subject: [PATCH 48/64] fixed rdma bugs

---
 .../coro_rpc_connector/cororpc_communicator.h |  3 --
 .../coro_rpc_connector/cororpc_interface.h    |  1 +
 .../cororpc_communicator.cpp                  | 39 ++++++++++---------
 .../coro_rpc_connector/cororpc_interface.cpp  |  5 +++
 4 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
index 1a28472c1..8e3f89f25 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_communicator.h
@@ -99,7 +99,4 @@ class CoroRPCCommunicator {
     std::shared_ptr<Impl> impl_;
 };
 
-std::unique_ptr<CoroRPCCommunicator> createServer(
-    const std::string& listen_address, size_t thread_count = 0);
-
 }  // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 35263544c..d13695389 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -49,6 +49,7 @@ class CoroRPCInterface {
 
     bool startServer();
     bool startServerAsync();
+    bool startServerImpl(bool is_async = true);
     void stopServer();
 
     int sendData(const std::string& target_address, pybind11::handle data);
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index e367cf039..041f59ec5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -62,6 +62,8 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
         std::make_shared<coro_io::client_pools<coro_rpc::coro_rpc_client>>(
             pool_conf);
 
+    LOG(INFO) << "create coro_rpc_client_pool with " << config.pool_size
+              << " threads";
     if (!config.listen_address.empty()) {
         LOG(INFO) << "Initializing server on " << config.listen_address;
 
@@ -70,14 +72,31 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             std::chrono::seconds(config.timeout_seconds));
 
         if (value && std::string_view(value) == "rdma") {
-            impl_->server_->init_ibv();
+            if (impl_->server_) {
+                try {
+                    impl_->server_->init_ibv();
+                    LOG(INFO) << "RDMA initialized successfully";
+                } catch (const std::exception& e) {
+                    LOG(ERROR) << "RDMA initialization failed: " << e.what();
+                    LOG(WARNING) << "Falling back to TCP mode";
+                    // Continue without RDMA - the server will use TCP
+                } catch (...) {
+                    LOG(ERROR)
+                        << "RDMA initialization failed with unknown error";
+                    LOG(WARNING) << "Falling back to TCP mode";
+                    // Continue without RDMA - the server will use TCP
+                }
+            } else {
+                LOG(ERROR) << "Server pointer is null, cannot initialize RDMA";
+                LOG(WARNING) << "Falling back to TCP mode";
+            }
         }
 
         impl_->server_->register_handler<
             &CoroRPCCommunicator::Impl::handleDataTransfer,
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
-    LOG(INFO) << "Environment variable MOONCAKE_TRANSFER_PROTOCOL is set to "
+    LOG(INFO) << "Environment variable MC_TRANSFER_PROTOCOL is set to "
               << (value ? value : "not set");
     if (value && std::string_view(value) == "rdma") {
         LOG(INFO) << "Using RDMA transport for RPC communication";
@@ -345,20 +364,4 @@ void CoroRPCCommunicator::Impl::handleTensorTransferWithAttachment(
     // context.response_msg();
 }
 
-std::unique_ptr<CoroRPCCommunicator> createServer(
-    const std::string& listen_address, size_t thread_count) {
-    Config config;
-    config.listen_address = listen_address;
-    config.thread_count = thread_count;
-    config.pool_size = 10;  // Default pool size for server-side client pools
-
-    auto communicator = std::make_unique<CoroRPCCommunicator>();
-    if (communicator->initialize(config)) {
-        LOG(INFO) << "Created server communicator with pool size: "
-                  << config.pool_size << std::endl;
-        return communicator;
-    }
-    return nullptr;
-}
-
 }  // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 4e17f6862..c15a95c3c 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -48,6 +48,11 @@ bool CoroRPCInterface::startServerAsync() {
     return impl_->communicator->startServerAsync();
 }
 
+bool CoroRPCInterface::startServerImpl(bool is_async) {
+    if (!impl_->communicator) return false;
+    return impl_->communicator->startServerImpl(is_async);
+}
+
 void CoroRPCInterface::stopServer() {
     if (impl_->communicator) {
         impl_->communicator->stopServer();

From 732fe0eba643211f565ed32c468f043276c3e9be Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 12 Sep 2025 16:34:32 +0800
Subject: [PATCH 49/64] fixed some minor bugs

---
 .../src/transport/coro_rpc_connector/cororpc_communicator.cpp  | 2 +-
 mooncake-transfer-engine/tests/communicator_bandwidth_test.py  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
index 041f59ec5..fad60bed7 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_communicator.cpp
@@ -96,7 +96,7 @@ bool CoroRPCCommunicator::initialize(const Config& config) {
             &CoroRPCCommunicator::Impl::handleDataTransfer,
             &CoroRPCCommunicator::Impl::handleTensorTransfer>(impl_.get());
     }
-    LOG(INFO) << "Environment variable MC_TRANSFER_PROTOCOL is set to "
+    LOG(INFO) << "Environment variable MC_RPC_PROTOCOL is set to "
               << (value ? value : "not set");
     if (value && std::string_view(value) == "rdma") {
         LOG(INFO) << "Using RDMA transport for RPC communication";
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index aa5030677..39b5057e9 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -60,7 +60,7 @@ def run_server(bind_url, data_size_mb=1):
     CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
     server = CoroRPCInterface()
     server.initialize(bind_url, 8, 30, 4)
-    server.start_server()
+    server.start_server_async()  # 使用异步启动，立即返回
     
     # Start QPS statistics thread
     thread = threading.Thread(target=print_qps)
@@ -73,6 +73,7 @@ def run_server(bind_url, data_size_mb=1):
             time.sleep(1)
     except KeyboardInterrupt:
         print("\nServer stopping...")
+        server.stop_server()  # 显式停止服务器
 
 def run_client(target_url, num_threads=8, data_size_mb=1):
     """Run client mode"""

From c40b13a43033181f8b1cd24fbd91e08ab74aaae9 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 10:06:35 +0800
Subject: [PATCH 50/64] removed Chinese comments

---
 debug_rdma.cpp                                                | 0
 mooncake-transfer-engine/tests/communicator_bandwidth_test.py | 4 ++--
 test_rdma_fallback.py                                         | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 debug_rdma.cpp
 create mode 100644 test_rdma_fallback.py

diff --git a/debug_rdma.cpp b/debug_rdma.cpp
new file mode 100644
index 000000000..e69de29bb
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index 39b5057e9..9cec3128d 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -60,7 +60,7 @@ def run_server(bind_url, data_size_mb=1):
     CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
     server = CoroRPCInterface()
     server.initialize(bind_url, 8, 30, 4)
-    server.start_server_async()  # 使用异步启动，立即返回
+    server.start_server_async()
     
     # Start QPS statistics thread
     thread = threading.Thread(target=print_qps)
@@ -73,7 +73,7 @@ def run_server(bind_url, data_size_mb=1):
             time.sleep(1)
     except KeyboardInterrupt:
         print("\nServer stopping...")
-        server.stop_server()  # 显式停止服务器
+        server.stop_server()
 
 def run_client(target_url, num_threads=8, data_size_mb=1):
     """Run client mode"""
diff --git a/test_rdma_fallback.py b/test_rdma_fallback.py
new file mode 100644
index 000000000..e69de29bb

From aa62943f3ef07a1dc119f40adfaf8421904c0cf7 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 10:59:17 +0800
Subject: [PATCH 51/64] remove useless files

---
 debug_rdma.cpp                                                | 0
 mooncake-transfer-engine/tests/communicator_bandwidth_test.py | 2 +-
 test_rdma_fallback.py                                         | 0
 3 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 debug_rdma.cpp
 delete mode 100644 test_rdma_fallback.py

diff --git a/debug_rdma.cpp b/debug_rdma.cpp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index 9cec3128d..04126f108 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -60,7 +60,7 @@ def run_server(bind_url, data_size_mb=1):
     CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
     server = CoroRPCInterface()
     server.initialize(bind_url, 8, 30, 4)
-    server.start_server_async()
+    server.start_server_async() #start the server asynchronously
     
     # Start QPS statistics thread
     thread = threading.Thread(target=print_qps)
diff --git a/test_rdma_fallback.py b/test_rdma_fallback.py
deleted file mode 100644
index e69de29bb..000000000

From ff9525a589fad04af1faf762a6970ae3a17b6060 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 15:22:21 +0800
Subject: [PATCH 52/64] removed unnecessary include in CMakeLists.txt

---
 mooncake-integration/CMakeLists.txt                         | 2 --
 mooncake-integration/transfer_engine/transfer_engine_py.cpp | 6 ++----
 mooncake-transfer-engine/src/transport/CMakeLists.txt       | 4 +---
 .../tests/communicator_bandwidth_test.py                    | 4 ++--
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt
index 27fdecf74..445361bd9 100644
--- a/mooncake-integration/CMakeLists.txt
+++ b/mooncake-integration/CMakeLists.txt
@@ -44,8 +44,6 @@ target_link_libraries(engine PUBLIC
     transfer_engine 
     glog::glog
     gflags::gflags
-    yalantinglibs::yalantinglibs
-    pybind11::module
 )
 
 set(ALLOCATOR_SO_PATH "${CMAKE_BINARY_DIR}/mooncake-transfer-engine/nvlink-allocator/nvlink_allocator.so")
diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index 16d6485fb..b80ff3dc7 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -743,8 +743,6 @@ PYBIND11_MODULE(engine, m) {
 
     adaptor_cls.attr("TransferOpcode") = transfer_opcode;
 
-    // Add coro_rpc_interface as a submodule
-    auto coro_rpc_submodule = m.def_submodule(
-        "coro_rpc_interface", "CoroRPC interface for communication");
-    bind_coro_rpc_interface(coro_rpc_submodule);
+    // Bind coro_rpc_interface directly to the main module
+    bind_coro_rpc_interface(m);
 }
diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index 5dafb70f5..e0806a9f2 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -37,6 +37,4 @@ endif()
 if (USE_MNNVL)
   add_subdirectory(nvlink_transport)
   target_sources(transport PUBLIC $<TARGET_OBJECTS:nvlink_transport>)
-endif()
-
-target_include_directories(transport PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
\ No newline at end of file
+endif()
\ No newline at end of file
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index 04126f108..c6606347d 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -57,7 +57,7 @@ def run_server(bind_url, data_size_mb=1):
     
     print(f"Starting server on {bind_url} with {data_size_mb}MB data packets")
     
-    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    CoroRPCInterface = engine.CoroRPCInterface
     server = CoroRPCInterface()
     server.initialize(bind_url, 8, 30, 4)
     server.start_server_async() #start the server asynchronously
@@ -83,7 +83,7 @@ def run_client(target_url, num_threads=8, data_size_mb=1):
     
     print(f"Starting client, connecting to {target_url} with {num_threads} threads, {data_size_mb}MB data packets")
     
-    CoroRPCInterface = engine.coro_rpc_interface.CoroRPCInterface
+    CoroRPCInterface = engine.CoroRPCInterface
     client = CoroRPCInterface()
     client.initialize("", 0, 30, 100)
     

From 2b83a765351cbdf31ac54a34f35920641f72d0d6 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 15:40:04 +0800
Subject: [PATCH 53/64] refactored client and server initialization methods

---
 .../transfer_engine/transfer_engine_py.cpp           |  5 +++++
 .../transport/coro_rpc_connector/cororpc_interface.h |  6 ++++++
 .../coro_rpc_connector/cororpc_interface.cpp         | 12 ++++++++++++
 .../tests/communicator_bandwidth_test.py             |  6 ++++--
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index b80ff3dc7..82722369f 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -675,6 +675,11 @@ void bind_coro_rpc_interface(py::module_ &m) {
         .def("initialize", &CoroRPCInterface::initialize,
              "listen_address"_a = "", "thread_count"_a = 0,
              "timeout_seconds"_a = 30, "pool_size"_a = 10)
+        .def("initialize_client", &CoroRPCInterface::initializeClient,
+             "pool_size"_a = 10, "timeout_seconds"_a = 30)
+        .def("initialize_server", &CoroRPCInterface::initializeServer,
+             "listen_address"_a, "thread_count"_a = 8,
+             "timeout_seconds"_a = 30, "pool_size"_a = 4)
         .def("start_server", &CoroRPCInterface::startServer)
         .def("start_server_async", &CoroRPCInterface::startServerAsync)
         .def("stop_server", &CoroRPCInterface::stopServer)
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index d13695389..e9f2eb840 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -47,6 +47,12 @@ class CoroRPCInterface {
                     size_t thread_count = 0, size_t timeout_seconds = 30,
                     size_t pool_size = 10);
 
+    // Convenience methods for common use cases
+    bool initializeClient(size_t pool_size = 10, size_t timeout_seconds = 30);
+    bool initializeServer(const std::string& listen_address, 
+                         size_t thread_count = 8, size_t timeout_seconds = 30,
+                         size_t pool_size = 4);
+
     bool startServer();
     bool startServerAsync();
     bool startServerImpl(bool is_async = true);
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index c15a95c3c..5deee4ac5 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -38,6 +38,18 @@ bool CoroRPCInterface::initialize(const std::string& local_address,
     return impl_->communicator->initialize(config);
 }
 
+// Convenience method for client initialization
+bool CoroRPCInterface::initializeClient(size_t pool_size, size_t timeout_seconds) {
+    return initialize("", 0, timeout_seconds, pool_size);
+}
+
+// Convenience method for server initialization  
+bool CoroRPCInterface::initializeServer(const std::string& listen_address,
+                                       size_t thread_count, size_t timeout_seconds,
+                                       size_t pool_size) {
+    return initialize(listen_address, thread_count, timeout_seconds, pool_size);
+}
+
 bool CoroRPCInterface::startServer() {
     if (!impl_->communicator) return false;
     return impl_->communicator->startServer();
diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index c6606347d..f67010df5 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -59,7 +59,8 @@ def run_server(bind_url, data_size_mb=1):
     
     CoroRPCInterface = engine.CoroRPCInterface
     server = CoroRPCInterface()
-    server.initialize(bind_url, 8, 30, 4)
+    # Server使用专门的初始化方法
+    server.initialize_server(bind_url, thread_count=8)
     server.start_server_async() #start the server asynchronously
     
     # Start QPS statistics thread
@@ -85,7 +86,8 @@ def run_client(target_url, num_threads=8, data_size_mb=1):
     
     CoroRPCInterface = engine.CoroRPCInterface
     client = CoroRPCInterface()
-    client.initialize("", 0, 30, 100)
+    # Client使用专门的初始化方法，只需要提供pool_size
+    client.initialize_client(pool_size=100)
     
     # Start QPS statistics thread
     qps_thread = threading.Thread(target=print_qps)

From ed76b8b05ce92a1c0e325fc3a89194c61e60197c Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 16:38:49 +0800
Subject: [PATCH 54/64] fixed server initialization pool size

---
 .../include/transport/coro_rpc_connector/cororpc_interface.h | 3 +--
 .../src/transport/coro_rpc_connector/cororpc_interface.cpp   | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index e9f2eb840..7e81d7498 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -50,8 +50,7 @@ class CoroRPCInterface {
     // Convenience methods for common use cases
     bool initializeClient(size_t pool_size = 10, size_t timeout_seconds = 30);
     bool initializeServer(const std::string& listen_address, 
-                         size_t thread_count = 8, size_t timeout_seconds = 30,
-                         size_t pool_size = 4);
+                         size_t thread_count = 8, size_t timeout_seconds = 30);
 
     bool startServer();
     bool startServerAsync();
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index 5deee4ac5..a066e09e4 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -45,9 +45,8 @@ bool CoroRPCInterface::initializeClient(size_t pool_size, size_t timeout_seconds
 
 // Convenience method for server initialization  
 bool CoroRPCInterface::initializeServer(const std::string& listen_address,
-                                       size_t thread_count, size_t timeout_seconds,
-                                       size_t pool_size) {
-    return initialize(listen_address, thread_count, timeout_seconds, pool_size);
+                                       size_t thread_count, size_t timeout_seconds) {
+    return initialize(listen_address, thread_count, timeout_seconds, 4);
 }
 
 bool CoroRPCInterface::startServer() {

From 600db58cad7f34c274a117098c173bb51f3cfc8c Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Mon, 15 Sep 2025 17:54:08 +0800
Subject: [PATCH 55/64] fixed interface pybind bugs

---
 .../transfer_engine/transfer_engine_py.cpp                | 3 +--
 .../transport/coro_rpc_connector/cororpc_interface.h      | 4 ++--
 .../transport/coro_rpc_connector/cororpc_interface.cpp    | 8 +++++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/mooncake-integration/transfer_engine/transfer_engine_py.cpp b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
index 82722369f..64edfbecd 100644
--- a/mooncake-integration/transfer_engine/transfer_engine_py.cpp
+++ b/mooncake-integration/transfer_engine/transfer_engine_py.cpp
@@ -678,8 +678,7 @@ void bind_coro_rpc_interface(py::module_ &m) {
         .def("initialize_client", &CoroRPCInterface::initializeClient,
              "pool_size"_a = 10, "timeout_seconds"_a = 30)
         .def("initialize_server", &CoroRPCInterface::initializeServer,
-             "listen_address"_a, "thread_count"_a = 8,
-             "timeout_seconds"_a = 30, "pool_size"_a = 4)
+             "listen_address"_a, "thread_count"_a = 8, "timeout_seconds"_a = 30)
         .def("start_server", &CoroRPCInterface::startServer)
         .def("start_server_async", &CoroRPCInterface::startServerAsync)
         .def("stop_server", &CoroRPCInterface::stopServer)
diff --git a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
index 7e81d7498..866b1e97f 100644
--- a/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
+++ b/mooncake-transfer-engine/include/transport/coro_rpc_connector/cororpc_interface.h
@@ -49,8 +49,8 @@ class CoroRPCInterface {
 
     // Convenience methods for common use cases
     bool initializeClient(size_t pool_size = 10, size_t timeout_seconds = 30);
-    bool initializeServer(const std::string& listen_address, 
-                         size_t thread_count = 8, size_t timeout_seconds = 30);
+    bool initializeServer(const std::string& listen_address,
+                          size_t thread_count = 8, size_t timeout_seconds = 30);
 
     bool startServer();
     bool startServerAsync();
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
index a066e09e4..9003d5113 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/cororpc_interface.cpp
@@ -39,13 +39,15 @@ bool CoroRPCInterface::initialize(const std::string& local_address,
 }
 
 // Convenience method for client initialization
-bool CoroRPCInterface::initializeClient(size_t pool_size, size_t timeout_seconds) {
+bool CoroRPCInterface::initializeClient(size_t pool_size,
+                                        size_t timeout_seconds) {
     return initialize("", 0, timeout_seconds, pool_size);
 }
 
-// Convenience method for server initialization  
+// Convenience method for server initialization
 bool CoroRPCInterface::initializeServer(const std::string& listen_address,
-                                       size_t thread_count, size_t timeout_seconds) {
+                                        size_t thread_count,
+                                        size_t timeout_seconds) {
     return initialize(listen_address, thread_count, timeout_seconds, 4);
 }
 

From abd15a1e762004d54f5ce4a79f908682c354cd2d Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 17 Sep 2025 11:47:10 +0800
Subject: [PATCH 56/64] fixed pybind

---
 .../src/transport/CMakeLists.txt              |  5 ++--
 .../coro_rpc_connector/CMakeLists.txt         | 25 +++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt

diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index e0806a9f2..1e4972967 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -1,11 +1,12 @@
-file(GLOB XPORT_SOURCES "*.cpp" "coro_rpc_connector/*.cpp")
+file(GLOB XPORT_SOURCES "*.cpp")
 
 # Find Python - pybind11 is already configured at the root level
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
 add_subdirectory(rdma_transport)
+add_subdirectory(coro_rpc_connector)
 
-add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport>)
+add_library(transport OBJECT ${XPORT_SOURCES} $<TARGET_OBJECTS:rdma_transport> $<TARGET_OBJECTS:coro_rpc_connector>)
 target_link_libraries(transport PRIVATE JsonCpp::JsonCpp yalantinglibs::yalantinglibs glog::glog pthread ${Python3_LIBRARIES})
 target_include_directories(transport PRIVATE ${Python3_INCLUDE_DIRS})
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
new file mode 100644
index 000000000..6b11f7fae
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
@@ -0,0 +1,25 @@
+file(GLOB CORO_RPC_SOURCES "*.cpp")
+
+# Find Python for pybind11 support
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+
+add_library(coro_rpc_connector OBJECT ${CORO_RPC_SOURCES})
+
+target_link_libraries(coro_rpc_connector 
+    PRIVATE 
+    JsonCpp::JsonCpp 
+    yalantinglibs::yalantinglibs 
+    glog::glog 
+    pthread 
+    ${Python3_LIBRARIES}
+)
+
+target_include_directories(coro_rpc_connector 
+    PRIVATE 
+    ${Python3_INCLUDE_DIRS}
+)
+
+# Add pybind11 headers if available
+if(TARGET pybind11::headers)
+    target_link_libraries(coro_rpc_connector PRIVATE pybind11::headers)
+endif()
\ No newline at end of file

From 20b1bc298b1337e9a31781d59db2a74e5f0dabc7 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 23 Sep 2025 16:05:11 +0800
Subject: [PATCH 57/64] update pybind11 parameters to -Dpybind11_DIR

---
 .github/workflows/ci.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 049d0d60a..896aca528 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,6 +17,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+      with:
+        submodules: 'recursive'
 
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -133,6 +135,8 @@ jobs:
     runs-on: ${{ matrix.ubuntu-version }}
     steps:
     - uses: actions/checkout@v4
+      with:
+        submodules: 'recursive'
 
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -235,7 +239,7 @@ jobs:
         cd build
         export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
         export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
-        cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
+        cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs" -Dpybind11_DIR=${{ github.workspace }}/extern/pybind11/share/cmake/pybind11
         make -j
         sudo make install
       shell: bash

From fc7dde64f6fabc615403b53e94d5ff36d1c1c64f Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Tue, 23 Sep 2025 19:59:27 +0800
Subject: [PATCH 58/64] update cmakelists

---
 .github/workflows/ci.yml                      |  6 +--
 CMakeLists.txt                                | 19 ++++++++-
 mooncake-integration/CMakeLists.txt           |  2 -
 .../coro_rpc_connector/CMakeLists.txt         | 41 ++++++++++++++++++-
 4 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 896aca528..e8ac4adb9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,8 +17,6 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: 'recursive'
 
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -135,8 +133,6 @@ jobs:
     runs-on: ${{ matrix.ubuntu-version }}
     steps:
     - uses: actions/checkout@v4
-      with:
-        submodules: 'recursive'
 
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
@@ -239,7 +235,7 @@ jobs:
         cd build
         export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
         export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
-        cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs" -Dpybind11_DIR=${{ github.workspace }}/extern/pybind11/share/cmake/pybind11
+    cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
         make -j
         sudo make install
       shell: bash
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 047ae3eb0..52321a6f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,24 @@ option(WITH_STORE "build mooncake store library and sample code" ON)
 option(WITH_P2P_STORE "build p2p store library and sample code" OFF)
 option(WITH_RUST_EXAMPLE "build the Rust interface and sample code for the transfer engine" OFF)
 
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extern/pybind11)
+# pybind11: prefer system/finder (>=2.13), then vendored extern, finally FetchContent
+find_package(pybind11 2.13 CONFIG QUIET)
+
+if (NOT pybind11_FOUND)
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/extern/pybind11/CMakeLists.txt)
+    message(STATUS "Using vendored pybind11 from extern/pybind11")
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extern/pybind11)
+  else()
+    include(FetchContent)
+    message(STATUS "Fetching pybind11 (v2.13.6) via FetchContent")
+    FetchContent_Declare(pybind11
+      GIT_REPOSITORY https://github.com/pybind/pybind11.git
+      GIT_TAG v2.13.6
+      GIT_SHALLOW TRUE
+    )
+    FetchContent_MakeAvailable(pybind11)
+  endif()
+endif()
 set(PYTHON_EXECUTABLE "python3")
 execute_process(
     COMMAND ${PYTHON_EXECUTABLE} -c "import sys; print(sys.path[-1])"
diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt
index 445361bd9..dc471a327 100644
--- a/mooncake-integration/CMakeLists.txt
+++ b/mooncake-integration/CMakeLists.txt
@@ -22,10 +22,8 @@ include_directories("/usr/include/jsoncpp")
 include_directories("./")
 include_directories("../mooncake-transfer-engine/include")
 
-# Find Python for pybind11 integration
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
-
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
 
diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
index 6b11f7fae..586c399e4 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
@@ -1,10 +1,37 @@
 file(GLOB CORO_RPC_SOURCES "*.cpp")
 
-# Find Python for pybind11 support
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
+# Ensure pybind11 is available (standalone build fallback)
+if (NOT TARGET pybind11::headers)
+    find_package(pybind11 CONFIG QUIET)
+    if (NOT pybind11_FOUND)
+        include(FetchContent)
+        message(STATUS "Fetching pybind11 (v2.13.6) for coro_rpc_connector")
+        FetchContent_Declare(pybind11
+            GIT_REPOSITORY https://github.com/pybind/pybind11.git
+            GIT_TAG v2.13.6
+            GIT_SHALLOW TRUE
+        )
+        FetchContent_MakeAvailable(pybind11)
+    endif()
+endif()
+
 add_library(coro_rpc_connector OBJECT ${CORO_RPC_SOURCES})
 
+# Ensure pybind11 include dirs are applied to this target's compilation
+set(_PYBIND11_INC_DIRS "")
+if(TARGET pybind11::headers)
+    get_target_property(_PYBIND11_INC_DIRS pybind11::headers INTERFACE_INCLUDE_DIRECTORIES)
+elseif(TARGET pybind11::pybind11)
+    get_target_property(_PYBIND11_INC_DIRS pybind11::pybind11 INTERFACE_INCLUDE_DIRECTORIES)
+elseif(TARGET pybind11::pybind11_headers)
+    get_target_property(_PYBIND11_INC_DIRS pybind11::pybind11_headers INTERFACE_INCLUDE_DIRECTORIES)
+endif()
+if(_PYBIND11_INC_DIRS)
+    target_include_directories(coro_rpc_connector PRIVATE ${_PYBIND11_INC_DIRS})
+endif()
+
 target_link_libraries(coro_rpc_connector 
     PRIVATE 
     JsonCpp::JsonCpp 
@@ -19,7 +46,17 @@ target_include_directories(coro_rpc_connector
     ${Python3_INCLUDE_DIRS}
 )
 
-# Add pybind11 headers if available
+# Add pybind11 headers/include paths if available
 if(TARGET pybind11::headers)
     target_link_libraries(coro_rpc_connector PRIVATE pybind11::headers)
+elseif(TARGET pybind11::pybind11)
+    target_link_libraries(coro_rpc_connector PRIVATE pybind11::pybind11)
+else()
+    # Fallback: try to read include dirs from known pybind11 targets if present
+    if(TARGET pybind11::pybind11_headers)
+        get_target_property(_PYBIND_INCLUDES pybind11::pybind11_headers INTERFACE_INCLUDE_DIRECTORIES)
+        if(_PYBIND_INCLUDES)
+            target_include_directories(coro_rpc_connector PRIVATE ${_PYBIND_INCLUDES})
+        endif()
+    endif()
 endif()
\ No newline at end of file

From c790f21458905769b5a81f119080a5d3024ee452 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 24 Sep 2025 11:13:28 +0800
Subject: [PATCH 59/64] local py312 test complete

---
 .../src/transport/coro_rpc_connector/CMakeLists.txt             | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
index 586c399e4..58ebc4351 100644
--- a/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/coro_rpc_connector/CMakeLists.txt
@@ -2,7 +2,6 @@ file(GLOB CORO_RPC_SOURCES "*.cpp")
 
 find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 
-# Ensure pybind11 is available (standalone build fallback)
 if (NOT TARGET pybind11::headers)
     find_package(pybind11 CONFIG QUIET)
     if (NOT pybind11_FOUND)
@@ -19,7 +18,6 @@ endif()
 
 add_library(coro_rpc_connector OBJECT ${CORO_RPC_SOURCES})
 
-# Ensure pybind11 include dirs are applied to this target's compilation
 set(_PYBIND11_INC_DIRS "")
 if(TARGET pybind11::headers)
     get_target_property(_PYBIND11_INC_DIRS pybind11::headers INTERFACE_INCLUDE_DIRECTORIES)

From 255a5a92dcf4011b211e76b25efb41dab0818f91 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 24 Sep 2025 11:22:21 +0800
Subject: [PATCH 60/64] adjused workflow

---
 .github/workflows/ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e8ac4adb9..27d13fc0b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -223,10 +223,11 @@ jobs:
       run: ${SCCACHE_PATH} --show-stats
 
     - name: Install dependencies
+      shell: bash
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
-      shell: bash
+
 
     - name: Build transfer engine only
       run: |
@@ -235,7 +236,7 @@ jobs:
         cd build
         export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
         export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
-    cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
+        cmake .. -DUSE_ETCD=OFF -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
         make -j
         sudo make install
       shell: bash

From 61b03da94fba51d78acab251eae229deb0354286 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 24 Sep 2025 11:47:19 +0800
Subject: [PATCH 61/64] adjusted Cmakelists.txxt

---
 mooncake-integration/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mooncake-integration/CMakeLists.txt b/mooncake-integration/CMakeLists.txt
index dc471a327..5411574fe 100644
--- a/mooncake-integration/CMakeLists.txt
+++ b/mooncake-integration/CMakeLists.txt
@@ -32,7 +32,9 @@ message("${PYTHON_SYS_PATH}")
 set(PYTHON_PACKAGE_NAME "mooncake")
 
 pybind11_add_module(engine ${SOURCES} ${CACHE_ALLOCATOR_SOURCES}
-    transfer_engine/transfer_engine_py.cpp )
+    transfer_engine/transfer_engine_py.cpp
+    $<TARGET_OBJECTS:coro_rpc_connector>
+)
 
 target_include_directories(engine PRIVATE 
     ${Python3_INCLUDE_DIRS}
@@ -40,6 +42,7 @@ target_include_directories(engine PRIVATE
 
 target_link_libraries(engine PUBLIC 
     transfer_engine 
+    coro_rpc_connector
     glog::glog
     gflags::gflags
 )

From 2ac0105acc207289880ff1c3f2e16c4fc281456b Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Sun, 28 Sep 2025 14:58:39 +0800
Subject: [PATCH 62/64] restored some useless changes

---
 .github/workflows/ci.yml | 2 +-
 .gitignore               | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 27d13fc0b..7c7d1a46b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -223,10 +223,10 @@ jobs:
       run: ${SCCACHE_PATH} --show-stats
 
     - name: Install dependencies
-      shell: bash
       run: |
         sudo apt update -y
         sudo bash -x dependencies.sh -y
+      shell: bash
 
 
     - name: Build transfer engine only
diff --git a/.gitignore b/.gitignore
index 5314dc685..d0ad29f0f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -194,4 +194,7 @@ libetcd_wrapper.h
 
 mooncake-wheel/mooncake/allocator.py
 mooncake-wheel/mooncake/mooncake_master
-mooncake-wheel/mooncake/transfer_engine_bench
\ No newline at end of file
+mooncake-wheel/mooncake/transfer_engine_bench
+
+# Claude Code Memory
+CLAUDE.md
\ No newline at end of file

From 41e871faf57c5490a4e2bbf7de6999c6c9abdf12 Mon Sep 17 00:00:00 2001
From: Yixin Zhang <yuyang.zyx@alibaba-inc.com>
Date: Fri, 10 Oct 2025 20:37:35 +0800
Subject: [PATCH 63/64] remove Chinese comments

---
 mooncake-transfer-engine/tests/communicator_bandwidth_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
index f67010df5..ccb4f2198 100644
--- a/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
+++ b/mooncake-transfer-engine/tests/communicator_bandwidth_test.py
@@ -59,7 +59,6 @@ def run_server(bind_url, data_size_mb=1):
     
     CoroRPCInterface = engine.CoroRPCInterface
     server = CoroRPCInterface()
-    # Server使用专门的初始化方法
     server.initialize_server(bind_url, thread_count=8)
     server.start_server_async() #start the server asynchronously
     
@@ -86,7 +85,6 @@ def run_client(target_url, num_threads=8, data_size_mb=1):
     
     CoroRPCInterface = engine.CoroRPCInterface
     client = CoroRPCInterface()
-    # Client使用专门的初始化方法，只需要提供pool_size
     client.initialize_client(pool_size=100)
     
     # Start QPS statistics thread
@@ -131,4 +129,4 @@ def main():
         run_client(args.url, args.threads, args.data_size)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From ba6bd7f21d853ce1927e768087a4d353ac8b8863 Mon Sep 17 00:00:00 2001
From: yuyang <yuyang.zyx@alibaba-inc.com>
Date: Wed, 15 Oct 2025 14:58:45 +0800
Subject: [PATCH 64/64] remove blank lines

---
 .github/workflows/ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c7d1a46b..049d0d60a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -228,7 +228,6 @@ jobs:
         sudo bash -x dependencies.sh -y
       shell: bash
 
-
     - name: Build transfer engine only
       run: |
         cd mooncake-transfer-engine