From 69ccf80e743fee64ef84c426b3907decdad25cce Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 26 Aug 2025 03:38:20 +0000 Subject: [PATCH 01/15] [TransferEngine]: Support specifying transport specific args when installing Signed-off-by: Jinlong Chen --- .../include/multi_transport.h | 4 +++ .../include/transport/transport.h | 3 +++ .../src/multi_transport.cpp | 26 +++++++++++++++++++ .../src/transfer_engine.cpp | 20 +++++++++----- .../src/transport/transport.cpp | 7 +++++ 5 files changed, 53 insertions(+), 7 deletions(-) diff --git a/mooncake-transfer-engine/include/multi_transport.h b/mooncake-transfer-engine/include/multi_transport.h index b5214b58c..91d484601 100644 --- a/mooncake-transfer-engine/include/multi_transport.h +++ b/mooncake-transfer-engine/include/multi_transport.h @@ -49,6 +49,10 @@ class MultiTransport { Transport *installTransport(const std::string &proto, std::shared_ptr topo); + bool transportNeedArgs(const std::string &proto); + + Transport *installTransport(const std::string &proto, void **args); + Transport *getTransport(const std::string &proto); std::vector listTransports(); diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h index 29c2aadb4..6614bd8e7 100644 --- a/mooncake-transfer-engine/include/transport/transport.h +++ b/mooncake-transfer-engine/include/transport/transport.h @@ -257,6 +257,9 @@ class Transport { std::shared_ptr meta, std::shared_ptr topo); + virtual int install(std::string &local_server_name, + std::shared_ptr meta, void **args); + std::string local_server_name_; std::shared_ptr metadata_; diff --git a/mooncake-transfer-engine/src/multi_transport.cpp b/mooncake-transfer-engine/src/multi_transport.cpp index c56bcfa0d..26a61997d 100644 --- a/mooncake-transfer-engine/src/multi_transport.cpp +++ b/mooncake-transfer-engine/src/multi_transport.cpp @@ -240,6 +240,32 @@ Transport *MultiTransport::installTransport(const std::string &proto, return 
transport; } +bool MultiTransport::transportNeedArgs(const std::string &proto) { + return false; +} + +Transport *MultiTransport::installTransport(const std::string &proto, + void **args) { + std::shared_ptr transport = nullptr; + + // Add transport creation logic here. + + if (!transport) { + LOG(ERROR) << "Unsupported transport " << proto + << ", please rebuild Mooncake"; + return nullptr; + } + + int rc = transport->install(local_server_name_, metadata_, args); + if (rc != 0) { + LOG(ERROR) << "Failed to install transport " << proto << ", rc=" << rc; + return nullptr; + } + + transport_map_[proto] = transport; + return transport.get(); +} + Status MultiTransport::selectTransport(const TransferRequest &entry, Transport *&transport) { auto target_segment_desc = metadata_->getSegmentDescByID(entry.target_id); diff --git a/mooncake-transfer-engine/src/transfer_engine.cpp b/mooncake-transfer-engine/src/transfer_engine.cpp index 384b7d35f..187e606a6 100644 --- a/mooncake-transfer-engine/src/transfer_engine.cpp +++ b/mooncake-transfer-engine/src/transfer_engine.cpp @@ -259,16 +259,22 @@ Transport *TransferEngine::installTransport(const std::string &proto, return transport; } - if (args != nullptr && args[0] != nullptr) { - const std::string nic_priority_matrix = static_cast(args[0]); - int ret = local_topology_->parse(nic_priority_matrix); - if (ret) { - LOG(ERROR) << "Failed to parse NIC priority matrix"; - return nullptr; + if (multi_transports_->transportNeedArgs(proto)) { + transport = multi_transports_->installTransport(proto, args); + } else { + if (args != nullptr && args[0] != nullptr) { + const std::string nic_priority_matrix = + static_cast(args[0]); + int ret = local_topology_->parse(nic_priority_matrix); + if (ret) { + LOG(ERROR) << "Failed to parse NIC priority matrix"; + return nullptr; + } } + + transport = multi_transports_->installTransport(proto, local_topology_); } - transport = multi_transports_->installTransport(proto, local_topology_); if 
(!transport) return nullptr; // Since installTransport() is only called once during initialization diff --git a/mooncake-transfer-engine/src/transport/transport.cpp b/mooncake-transfer-engine/src/transport/transport.cpp index 00e8bf96f..7c8fed42b 100644 --- a/mooncake-transfer-engine/src/transport/transport.cpp +++ b/mooncake-transfer-engine/src/transport/transport.cpp @@ -64,4 +64,11 @@ int Transport::install(std::string &local_server_name, metadata_ = meta; return 0; } + +int Transport::install(std::string &local_server_name, + std::shared_ptr meta, void **args) { + local_server_name_ = local_server_name; + metadata_ = meta; + return 0; +} } // namespace mooncake \ No newline at end of file From b66c445d7a464eb0c4fdaa5fa0f95dd159e950d3 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 22 Aug 2025 09:46:28 +0000 Subject: [PATCH 02/15] [TransferEngine]: Support registering files as shared buffers Signed-off-by: Jinlong Chen --- .../include/transfer_engine.h | 31 +++++++ .../include/transfer_engine_c.h | 9 +++ .../include/transfer_metadata.h | 16 ++++ .../include/transport/transport.h | 15 ++++ .../src/transfer_engine.cpp | 80 +++++++++++++++++++ .../src/transfer_engine_c.cpp | 17 ++++ .../src/transfer_metadata.cpp | 54 +++++++++++++ 7 files changed, 222 insertions(+) diff --git a/mooncake-transfer-engine/include/transfer_engine.h b/mooncake-transfer-engine/include/transfer_engine.h index 831d72436..df1f649ab 100644 --- a/mooncake-transfer-engine/include/transfer_engine.h +++ b/mooncake-transfer-engine/include/transfer_engine.h @@ -46,12 +46,14 @@ using SegmentHandle = Transport::SegmentHandle; using SegmentID = Transport::SegmentID; using BatchID = Transport::BatchID; using BufferEntry = Transport::BufferEntry; +using FileBufferID = TransferMetadata::FileBufferID; class TransferEngine { public: TransferEngine(bool auto_discover = false) : metadata_(nullptr), local_topology_(std::make_shared()), + next_file_id_(1), auto_discover_(auto_discover) { #ifdef 
WITH_METRICS InitializeMetricsConfig(); @@ -62,6 +64,7 @@ class TransferEngine { TransferEngine(bool auto_discover, const std::vector &filter) : metadata_(nullptr), local_topology_(std::make_shared()), + next_file_id_(1), auto_discover_(auto_discover), filter_(filter) { #ifdef WITH_METRICS @@ -111,6 +114,25 @@ class TransferEngine { int unregisterLocalMemoryBatch(const std::vector &addr_list); + bool supportFileBuffer(); + + /** + * @brief Register a local file as a shared buffer. + * @param[in] path Local path of the file. + * @param[in] size Available size of the file. + * @param[out] id The id of the registered file buffer. + * @return 0 on success, or error number on failure. + */ + int registerLocalFile(const std::string &path, size_t size, + FileBufferID &id); + + /** + * @brief Unregister a previously registered file. + * @param[in] path The path of the registered file buffer. + * @return 0 on success, or error number on failure. + */ + int unregisterLocalFile(const std::string &path); + BatchID allocateBatchID(size_t batch_size) { return multi_transports_->allocateBatchID(batch_size); } @@ -224,6 +246,12 @@ class TransferEngine { bool remote_accessible; }; + struct LocalFile { + FileBufferID id; + std::string path; + std::size_t size; + }; + std::shared_ptr metadata_; std::string local_server_name_; std::shared_ptr multi_transports_; @@ -231,6 +259,9 @@ class TransferEngine { std::vector local_memory_regions_; std::shared_ptr local_topology_; + std::atomic next_file_id_; + std::unordered_map local_files_; + RWSpinlock send_notifies_lock_; std::unordered_map> diff --git a/mooncake-transfer-engine/include/transfer_engine_c.h b/mooncake-transfer-engine/include/transfer_engine_c.h index 48de41e76..301eace3c 100644 --- a/mooncake-transfer-engine/include/transfer_engine_c.h +++ b/mooncake-transfer-engine/include/transfer_engine_c.h @@ -24,6 +24,7 @@ extern "C" { #define segment_handle_t int32_t #define segment_id_t int32_t +#define file_id_t uint32_t #define 
batch_id_t uint64_t #define LOCAL_SEGMENT (0) #define INVALID_BATCH UINT64_MAX @@ -35,6 +36,7 @@ struct transfer_request { int opcode; void *source; segment_id_t target_id; + file_id_t file_id; uint64_t target_offset; uint64_t length; }; @@ -135,6 +137,13 @@ int registerLocalMemoryBatch(transfer_engine_t engine, int unregisterLocalMemoryBatch(transfer_engine_t engine, void **addr_list, size_t addr_len); +bool supportFileBuffer(transfer_engine_t engine); + +int registerLocalFile(transfer_engine_t engine, const char *path, size_t size, + file_id_t *id); + +int unregisterLocalFile(transfer_engine_t engine, const char *path); + batch_id_t allocateBatchID(transfer_engine_t engine, size_t batch_size); int submitTransfer(transfer_engine_t engine, batch_id_t batch_id, diff --git a/mooncake-transfer-engine/include/transfer_metadata.h b/mooncake-transfer-engine/include/transfer_metadata.h index 70f15c8d4..5e4f8dc0f 100644 --- a/mooncake-transfer-engine/include/transfer_metadata.h +++ b/mooncake-transfer-engine/include/transfer_metadata.h @@ -64,6 +64,16 @@ class TransferMetadata { std::unordered_map local_path_map; }; + // Identify a single file in a segment's file buffers. + using FileBufferID = uint32_t; + + struct FileBufferDesc { + FileBufferID id; + std::string path; + std::size_t size; + std::size_t align; // For future usage. + }; + struct RankInfoDesc { uint64_t rankId = 0xFFFFFFFF; // rank id, user rank std::string hostIp; @@ -87,6 +97,8 @@ class TransferMetadata { std::vector buffers; // this is for nvmeof. std::vector nvmeof_buffers; + // Generic file buffers. + std::vector file_buffers; // this is for cxl. 
std::string cxl_name; uint64_t cxl_base_addr; @@ -148,6 +160,10 @@ class TransferMetadata { int removeLocalMemoryBuffer(void *addr, bool update_metadata); + int addFileBuffer(const FileBufferDesc &buffer_desc, bool update_metadata); + + int removeFileBuffer(FileBufferID id, bool update_metadata); + int addLocalSegment(SegmentID segment_id, const std::string &segment_name, std::shared_ptr &&desc); diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h index 6614bd8e7..56e809e30 100644 --- a/mooncake-transfer-engine/include/transport/transport.h +++ b/mooncake-transfer-engine/include/transport/transport.h @@ -43,6 +43,8 @@ class Transport { using SegmentID = uint64_t; using SegmentHandle = SegmentID; + using FileBufferID = TransferMetadata::FileBufferID; + using BatchID = uint64_t; const static BatchID INVALID_BATCH_ID = UINT64_MAX; @@ -60,6 +62,7 @@ class Transport { uint64_t target_offset; size_t length; int advise_retry_cnt = 0; + FileBufferID file_id; }; enum TransferStatusEnum { @@ -92,6 +95,7 @@ class Transport { SliceStatus status; TransferTask *task; bool from_cache; + FileBufferID file_id; union { struct { @@ -284,6 +288,17 @@ class Transport { virtual int unregisterLocalMemoryBatch( const std::vector &addr_list) = 0; + virtual bool supportFileBuffer() { return false; }; + + virtual int registerLocalFile(FileBufferID id, const std::string &path, + size_t size) { + return ERR_NOT_IMPLEMENTED; + } + + virtual int unregisterLocalFile(FileBufferID id) { + return ERR_NOT_IMPLEMENTED; + } + virtual const char *getName() const = 0; }; } // namespace mooncake diff --git a/mooncake-transfer-engine/src/transfer_engine.cpp b/mooncake-transfer-engine/src/transfer_engine.cpp index 187e606a6..972627f26 100644 --- a/mooncake-transfer-engine/src/transfer_engine.cpp +++ b/mooncake-transfer-engine/src/transfer_engine.cpp @@ -287,6 +287,15 @@ Transport *TransferEngine::installTransport(const std::string 
&proto, entry.addr, entry.length, entry.location, entry.remote_accessible); if (ret < 0) return nullptr; } + + if (transport->supportFileBuffer()) { + for (auto &file : local_files_) { + int ret = transport->registerLocalFile( + file.second.id, file.second.path, file.second.size); + if (ret < 0) return nullptr; + } + } + return transport; } @@ -436,6 +445,77 @@ int TransferEngine::unregisterLocalMemoryBatch( return 0; } +bool TransferEngine::supportFileBuffer() { + bool supported = false; + for (auto &transport : multi_transports_->listTransports()) { + supported = supported || transport->supportFileBuffer(); + } + return supported; +} + +int TransferEngine::registerLocalFile(const std::string &path, size_t size, + FileBufferID &id) { + if (!supportFileBuffer()) { + LOG(ERROR) << "File buffers not suppotred"; + return ERR_NOT_IMPLEMENTED; + } + + std::unique_lock lock(mutex_); + if (local_files_.count(path) > 0) { + LOG(ERROR) << "Registering an already registered file: " << path; + return ERR_ADDRESS_OVERLAPPED; + } + + const auto id_ = next_file_id_.fetch_add(1); + + for (auto &transport : multi_transports_->listTransports()) { + if (!transport->supportFileBuffer()) { + continue; + } + + int ret = transport->registerLocalFile(id_, path, size); + if (ret != 0) { + LOG(ERROR) << "Failed to register file " << path << " to transport " + << transport->getName() << ", ret=" << ret; + return ret; + } + } + + local_files_[path] = {id_, path, size}; + id = id_; + return 0; +} + +int TransferEngine::unregisterLocalFile(const std::string &path) { + if (!supportFileBuffer()) { + LOG(ERROR) << "File buffers not suppotred"; + return ERR_NOT_IMPLEMENTED; + } + + std::unique_lock lock(mutex_); + auto it = local_files_.find(path); + if (it == local_files_.end()) { + return ERR_ADDRESS_NOT_REGISTERED; + } + + for (auto &transport : multi_transports_->listTransports()) { + if (!transport->supportFileBuffer()) { + continue; + } + + int ret = 
transport->unregisterLocalFile(it->second.id); + if (ret != 0 && ret != ERR_ADDRESS_NOT_REGISTERED) { + LOG(ERROR) << "Failed to unregister file " << path + << " from transport " << transport->getName() + << ", ret=" << ret; + return ret; + } + } + + local_files_.erase(it); + return 0; +} + #ifdef WITH_METRICS // Helper function to convert string to lowercase for case-insensitive // comparison diff --git a/mooncake-transfer-engine/src/transfer_engine_c.cpp b/mooncake-transfer-engine/src/transfer_engine_c.cpp index 774cd37d6..87e760501 100644 --- a/mooncake-transfer-engine/src/transfer_engine_c.cpp +++ b/mooncake-transfer-engine/src/transfer_engine_c.cpp @@ -117,6 +117,22 @@ int unregisterLocalMemoryBatch(transfer_engine_t engine, void **addr_list, return native->unregisterLocalMemoryBatch(native_addr_list); } +bool supportFileBuffer(transfer_engine_t engine) { + TransferEngine *native = (TransferEngine *)engine; + return native->supportFileBuffer(); +} + +int registerLocalFile(transfer_engine_t engine, const char *path, size_t size, + file_id_t *id) { + TransferEngine *native = (TransferEngine *)engine; + return native->registerLocalFile(path, size, *id); +} + +int unregisterLocalFile(transfer_engine_t engine, const char *path) { + TransferEngine *native = (TransferEngine *)engine; + return native->unregisterLocalFile(path); +} + batch_id_t allocateBatchID(transfer_engine_t engine, size_t batch_size) { TransferEngine *native = (TransferEngine *)engine; return (batch_id_t)native->allocateBatchID(batch_size); @@ -132,6 +148,7 @@ int submitTransfer(transfer_engine_t engine, batch_id_t batch_id, (Transport::TransferRequest::OpCode)entries[index].opcode; native_entries[index].source = entries[index].source; native_entries[index].target_id = entries[index].target_id; + native_entries[index].file_id = entries[index].file_id; native_entries[index].target_offset = entries[index].target_offset; native_entries[index].length = entries[index].length; } diff --git 
a/mooncake-transfer-engine/src/transfer_metadata.cpp b/mooncake-transfer-engine/src/transfer_metadata.cpp index 52f5a228c..b03a80a93 100644 --- a/mooncake-transfer-engine/src/transfer_metadata.cpp +++ b/mooncake-transfer-engine/src/transfer_metadata.cpp @@ -231,6 +231,18 @@ int TransferMetadata::encodeSegmentDesc(const SegmentDesc &desc, << desc.name << " protocol " << desc.protocol; return ERR_METADATA; } + + Json::Value fileBuffersJson(Json::arrayValue); + for (const auto &fileBuffer : desc.file_buffers) { + Json::Value bufferJSON; + bufferJSON["id"] = fileBuffer.id; + bufferJSON["path"] = fileBuffer.path; + bufferJSON["size"] = fileBuffer.size; + bufferJSON["align"] = fileBuffer.align; + fileBuffersJson.append(bufferJSON); + } + segmentJSON["file_buffers"] = fileBuffersJson; + return 0; } @@ -415,6 +427,16 @@ TransferMetadata::decodeSegmentDesc(Json::Value &segmentJSON, << " protocol " << desc->protocol; return nullptr; } + + for (const auto &bufferJSON : segmentJSON["file_buffers"]) { + FileBufferDesc buffer; + buffer.id = bufferJSON["id"].asUInt(); + buffer.path = bufferJSON["path"].asString(); + buffer.size = bufferJSON["size"].asUInt64(); + buffer.align = bufferJSON["align"].asUInt64(); + desc->file_buffers.push_back(buffer); + } + return desc; } @@ -605,6 +627,38 @@ int TransferMetadata::removeLocalMemoryBuffer(void *addr, return ERR_ADDRESS_NOT_REGISTERED; } +int TransferMetadata::addFileBuffer(const FileBufferDesc &buffer_desc, + bool update_metadata) { + { + RWSpinlock::WriteGuard guard(segment_lock_); + auto &segment_desc = segment_id_to_desc_map_[LOCAL_SEGMENT_ID]; + segment_desc->file_buffers.push_back(buffer_desc); + } + if (update_metadata) return updateLocalSegmentDesc(); + return 0; +} + +int TransferMetadata::removeFileBuffer(FileBufferID id, bool update_metadata) { + bool buffer_exist = false; + { + RWSpinlock::WriteGuard guard(segment_lock_); + auto &segment_desc = segment_id_to_desc_map_[LOCAL_SEGMENT_ID]; + for (auto iter = 
segment_desc->file_buffers.begin(); + iter != segment_desc->file_buffers.end(); ++iter) { + if (iter->id == id) { + segment_desc->file_buffers.erase(iter); + buffer_exist = true; + break; + } + } + } + if (buffer_exist) { + if (update_metadata) return updateLocalSegmentDesc(); + return 0; + } + return ERR_ADDRESS_NOT_REGISTERED; +} + int TransferMetadata::addRpcMetaEntry(const std::string &server_name, RpcMetaDesc &desc) { local_rpc_meta_ = desc; From 57c3d9dd6357f435c606914ab91eadd8cf5b355c Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 22 Aug 2025 11:57:45 +0000 Subject: [PATCH 03/15] [TransferEngine]: Introduce generic NVMeoF transport Signed-off-by: Jinlong Chen --- dependencies.sh | 4 +- mooncake-common/common.cmake | 6 + .../example/CMakeLists.txt | 5 + .../transfer_engine_nvmeof_generic_bench.cpp | 396 ++++++++++++++++++ mooncake-transfer-engine/include/config.h | 4 + .../include/transfer_engine_c.h | 1 + .../include/transfer_metadata.h | 15 + .../nvmeof_initiator.h | 128 ++++++ .../nvmeof_generic_transport/nvmeof_target.h | 109 +++++ .../nvmeof_transport.h | 110 +++++ .../nvmeof_generic_transport/worker_pool.h | 83 ++++ .../include/transport/transport.h | 10 + mooncake-transfer-engine/src/CMakeLists.txt | 6 +- mooncake-transfer-engine/src/config.cpp | 28 ++ .../src/multi_transport.cpp | 14 +- .../src/transfer_metadata.cpp | 33 ++ .../src/transport/CMakeLists.txt | 5 + .../nvmeof_generic_transport/CMakeLists.txt | 4 + .../nvmeof_initiator.cpp | 320 ++++++++++++++ .../nvmeof_target.cpp | 333 +++++++++++++++ .../nvmeof_transport.cpp | 393 +++++++++++++++++ .../nvmeof_generic_transport/worker_pool.cpp | 226 ++++++++++ 22 files changed, 2230 insertions(+), 3 deletions(-) create mode 100644 mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp create mode 100644 mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h create mode 100644 
mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h create mode 100644 mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h create mode 100644 mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h create mode 100644 mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt create mode 100644 mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp create mode 100644 mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp create mode 100644 mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp create mode 100644 mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp diff --git a/dependencies.sh b/dependencies.sh index bed94ceca..d64e1f5b3 100755 --- a/dependencies.sh +++ b/dependencies.sh @@ -121,7 +121,9 @@ SYSTEM_PACKAGES="build-essential \ libcurl4-openssl-dev \ libhiredis-dev \ pkg-config \ - patchelf" + patchelf \ + libaio-dev \ + libnvme-dev" apt-get install -y $SYSTEM_PACKAGES check_success "Failed to install system packages" diff --git a/mooncake-common/common.cmake b/mooncake-common/common.cmake index 68ee6254d..2aa2779a8 100644 --- a/mooncake-common/common.cmake +++ b/mooncake-common/common.cmake @@ -58,6 +58,7 @@ option(BUILD_EXAMPLES "Build examples" ON) option(BUILD_UNIT_TESTS "Build unit tests" ON) option(USE_CUDA "option for enabling gpu features" OFF) option(USE_NVMEOF "option for using NVMe over Fabric" OFF) +option(USE_NVMEOF_GENERIC "option for using generic NVMe over Fabric transport" OFF) option(USE_TCP "option for using TCP transport" ON) option(USE_ASCEND "option for using npu with HCCL" OFF) option(USE_ASCEND_DIRECT "option for using ascend npu with adxl engine" OFF) @@ -87,6 +88,11 @@ if (USE_NVMEOF) message(STATUS "NVMe-oF support is enabled") endif() +if (USE_NVMEOF_GENERIC) + 
add_compile_definitions(USE_NVMEOF_GENERIC) + message(STATUS "Generic NVMe-oF support is enabled") +endif() + if (USE_MNNVL) set(USE_CUDA ON) add_compile_definitions(USE_MNNVL) diff --git a/mooncake-transfer-engine/example/CMakeLists.txt b/mooncake-transfer-engine/example/CMakeLists.txt index 9c6a51470..b74cae8a0 100644 --- a/mooncake-transfer-engine/example/CMakeLists.txt +++ b/mooncake-transfer-engine/example/CMakeLists.txt @@ -21,4 +21,9 @@ endif() if (USE_ASCEND_DIRECT) add_executable(transfer_engine_ascend_direct_perf transfer_engine_ascend_direct_perf.cpp) target_link_libraries(transfer_engine_ascend_direct_perf PUBLIC transfer_engine) +endif() + +if (USE_NVMEOF_GENERIC) + add_executable(transfer_engine_nvmeof_generic_bench transfer_engine_nvmeof_generic_bench.cpp) + target_link_libraries(transfer_engine_nvmeof_generic_bench PUBLIC transfer_engine) endif() \ No newline at end of file diff --git a/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp new file mode 100644 index 000000000..61633de37 --- /dev/null +++ b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp @@ -0,0 +1,396 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "transfer_engine.h" + +// Common arguments. +DEFINE_string(local_server_name, mooncake::getHostname(), + "Local server name for segment discovery"); +DEFINE_string(metadata_server, "192.168.3.77:2379", "etcd server host address"); +DEFINE_string( + mode, "loopback", + "Running mode: initiator, target, or loopback. Initiator node read/write " + "data blocks from target node"); + +// Initiator arguments. +DEFINE_string(operation, "read", "Operation type: read or write"); +DEFINE_string(segment_id, "192.168.3.76", "Segment ID to access data"); +DEFINE_int32(batch_size, 4096, "Batch size"); +DEFINE_uint64(block_size, 65536, "Block size for each transfer request"); +DEFINE_int32(duration, 30, "Test duration in seconds"); +DEFINE_int32(threads, 1, "Task submission threads"); +DEFINE_string(report_unit, "GB", "Report unit: GB|GiB|Gb|MB|MiB|Mb|KB|KiB|Kb"); +DEFINE_uint32(report_precision, 2, "Report precision"); + +// Target arguments. +DEFINE_string(trtype, "tcp", "TRTYPE of NVMeoF: tcp|rdma"); +DEFINE_string(adrfam, "ipv4", "ADRFAM of NVMeoF: ipv4|ipv6"); +DEFINE_string(traddr, "127.0.0.1", + "TRADDR of NVMeoF, i.e. service listen address"); +DEFINE_string(trsvcid, "4420", "TRSVCID of NVMeoF, i.e. 
service listen port"); +DEFINE_string(files, "", + "Files to register as buffers, separated by space, e.g.: " + "\"/dev/nvme0n1 /dev/nvme1n1\""); + +using namespace mooncake; + +const static std::unordered_map RATE_UNIT_MP = { + {"GB", 1000ull * 1000ull * 1000ull}, + {"GiB", 1ull << 30}, + {"Gb", 1000ull * 1000ull * 1000ull / 8}, + {"MB", 1000ull * 1000ull}, + {"MiB", 1ull << 20}, + {"Mb", 1000ull * 1000ull / 8}, + {"KB", 1000ull}, + {"KiB", 1ull << 10}, + {"Kb", 1000ull / 8}}; + +static inline std::string calculateRate(uint64_t data_bytes, double duration) { + if (std::fabs(duration) < 1e-10) { + LOG(ERROR) << "Invalid args: duration shouldn't be 0"; + return ""; + } + + if (!RATE_UNIT_MP.count(FLAGS_report_unit)) { + LOG(WARNING) << "Invalid flag: report_unit only support " + "GB|GiB|Gb|MB|MiB|Mb|KB|KiB|Kb, not support " + << FLAGS_report_unit + << " . Now use GB(default) as report_unit"; + FLAGS_report_unit = "GB"; + } + + std::ostringstream oss; + oss << std::fixed << std::setprecision(FLAGS_report_precision) + << 1.0 * data_bytes / duration / RATE_UNIT_MP.at(FLAGS_report_unit) + << " " << FLAGS_report_unit << "/s"; + return oss.str(); +} + +static std::unique_ptr initTransferEngine() { + // Disable topology auto discovery for testing. 
+ auto engine = std::make_unique(); + if (engine == nullptr) { + LOG(ERROR) << "Failed to create transfer engine"; + exit(EXIT_FAILURE); + } + + auto hostname_port = parseHostNameWithPort(FLAGS_local_server_name); + int rc = + engine->init(FLAGS_metadata_server, FLAGS_local_server_name.c_str(), + hostname_port.first.c_str(), hostname_port.second); + if (rc != 0) { + LOG(ERROR) << "Failed to init transfer engine, rc=" << rc; + exit(EXIT_FAILURE); + } + + const std::string trStr = + "trtype=" + FLAGS_trtype + " adrfam=" + FLAGS_adrfam + + " traddr=" + FLAGS_traddr + " trsvcid=" + FLAGS_trsvcid; + LOG(INFO) << "Using Trid: " << trStr; + + Transport *xport = nullptr; + void *args[2] = {(void *)trStr.c_str(), nullptr}; + xport = engine->installTransport("nvmeof_generic", args); + if (xport == nullptr) { + LOG(ERROR) << "Failed to install nvmeof_generic transport"; + exit(EXIT_FAILURE); + } + + return engine; +} + +static volatile bool initiator_running = true; +static std::atomic total_batch_count(0); + +static Status initiatorWorker(TransferEngine *engine, SegmentID segment_id, + int thread_id, void *addr) { + TransferRequest::OpCode opcode; + if (FLAGS_operation == "read") + opcode = TransferRequest::READ; + else if (FLAGS_operation == "write") + opcode = TransferRequest::WRITE; + else { + LOG(ERROR) << "Unsupported operation: must be 'read' or 'write'"; + exit(EXIT_FAILURE); + } + + auto segment_desc = engine->getMetadata()->getSegmentDescByID(segment_id); + if (!segment_desc) { + LOG(ERROR) << "Unable to get target segment ID, please recheck"; + exit(EXIT_FAILURE); + } + + auto &file_buffers = segment_desc->file_buffers; + if (file_buffers.size() <= 0) { + LOG(ERROR) << "No file buffer registered in segment, please check"; + exit(EXIT_FAILURE); + } + + size_t batch_count = 0; + while (initiator_running) { + std::vector requests; + for (int i = 0; i < FLAGS_batch_size; ++i) { + auto buffer_offset = + FLAGS_block_size * (i * FLAGS_threads + thread_id); + // Randomly 
pick a file. + auto file_index = std::rand() % file_buffers.size(); + // Randomly pick a file offset. + auto file_unit_cnt = file_buffers[file_index].size / + FLAGS_block_size / FLAGS_threads; + auto target_offset = + FLAGS_block_size * + ((std::rand() % file_unit_cnt) * FLAGS_threads + thread_id); + + TransferRequest entry; + entry.opcode = opcode; + entry.length = FLAGS_block_size; + entry.source = (void *)((uintptr_t)(addr) + buffer_offset); + entry.target_id = segment_id; + entry.file_id = file_buffers[file_index].id; + entry.target_offset = target_offset; + requests.emplace_back(entry); + } + + auto batch_id = engine->allocateBatchID(FLAGS_batch_size); + Status s = engine->submitTransfer(batch_id, requests); + if (!s.ok()) { + LOG(ERROR) << "Failed to submit request: " << s.ToString(); + } + + for (int task_id = 0; task_id < FLAGS_batch_size; ++task_id) { + bool completed = false; + TransferStatus status; + while (!completed) { + Status s = engine->getTransferStatus(batch_id, task_id, status); + LOG_ASSERT(s.ok()); + if (status.s == TransferStatusEnum::COMPLETED) + completed = true; + else if (status.s == TransferStatusEnum::FAILED) { + LOG(INFO) << "FAILED"; + completed = true; + exit(EXIT_FAILURE); + } + } + } + + s = engine->freeBatchID(batch_id); + LOG_ASSERT(s.ok()); + batch_count++; + } + + LOG(INFO) << "Worker " << thread_id << " stopped!"; + total_batch_count.fetch_add(batch_count); + return Status::OK(); +} + +static void startInitiator(TransferEngine *engine) { + auto buffer_size = FLAGS_block_size * FLAGS_batch_size * FLAGS_threads; + void *addr = std::aligned_alloc(4096, buffer_size); + if (addr == nullptr) { + LOG(ERROR) << "Failed to allocate buffer"; + exit(EXIT_FAILURE); + } + + int rc = engine->registerLocalMemory(addr, buffer_size); + if (rc != 0) { + LOG(ERROR) << "Failed to register buffer, rc=" << rc; + exit(EXIT_FAILURE); + } + + auto segment_id = engine->openSegment(FLAGS_segment_id.c_str()); + + struct timeval start_tv; + 
gettimeofday(&start_tv, nullptr);
+
+    std::vector<std::thread> workers(FLAGS_threads);
+    for (int i = 0; i < FLAGS_threads; ++i) {
+        workers[i] = std::thread(initiatorWorker, engine, segment_id, i, addr);
+    }
+
+    sleep(FLAGS_duration);
+    initiator_running = false;
+
+    for (int i = 0; i < FLAGS_threads; ++i) {
+        workers[i].join();
+    }
+
+    struct timeval stop_tv;
+    gettimeofday(&stop_tv, nullptr);
+
+    auto duration = (stop_tv.tv_sec - start_tv.tv_sec) +
+                    (stop_tv.tv_usec - start_tv.tv_usec) / 1000000.0;
+    auto batch_count = total_batch_count.load();
+    LOG(INFO) << "Test completed: duration " << std::fixed
+              << std::setprecision(2) << duration << ", batch count "
+              << batch_count << ", throughput "
+              << calculateRate(
+                     batch_count * FLAGS_batch_size * FLAGS_block_size,
+                     duration);
+
+    engine->unregisterLocalMemory(addr);
+    std::free(addr);
+}
+
+// State shared between the target loop, the signal handler and loopback().
+// These are written from one thread (or from a signal handler) and polled from
+// another, so they are atomics rather than plain `volatile` variables.
+static std::atomic<bool> target_started{false};
+static std::atomic<bool> target_running{true};
+// Number of the signal that requested shutdown; 0 if no signal was received.
+static std::atomic<int> target_stop_signal{0};
+
+// Returns the size of `file` in bytes, or 0 when it cannot be opened, cannot
+// be stat'ed, is neither a regular file nor a device node, or the device size
+// query fails.
+static size_t getFileSize(const std::string &file) {
+    int fd = open(file.c_str(), O_RDONLY);
+    if (fd < 0) {
+        return 0;
+    }
+
+    size_t size = 0;
+    struct stat st;
+    if (fstat(fd, &st) == 0) {
+        if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
+            // BLKGETSIZE64 stores a 64-bit value; only trust it on success.
+            unsigned long long dev_size = 0;
+            if (ioctl(fd, BLKGETSIZE64, &dev_size) == 0) {
+                size = static_cast<size_t>(dev_size);
+            }
+        } else if (S_ISREG(st.st_mode)) {
+            size = static_cast<size_t>(st.st_size);
+        }
+    }
+
+    close(fd);
+    return size;
+}
+
+// Registers every whitespace-separated path in FLAGS_files with the engine,
+// then blocks until target_running is cleared and unregisters the files.
+// Any invalid or unregistrable file terminates the process.
+static void startTarget(TransferEngine *engine) {
+    std::vector<std::string> files;
+    std::istringstream s(FLAGS_files);
+    std::string file;
+    while (s >> file) {
+        if (file.empty()) {
+            LOG(ERROR) << "Invalid file path " << file;
+            exit(EXIT_FAILURE);
+        }
+
+        auto size = getFileSize(file);
+        if (size == 0) {
+            LOG(ERROR) << "Invalid file " << file;
+            exit(EXIT_FAILURE);
+        }
+
+        FileBufferID id;
+        int rc = engine->registerLocalFile(file, size, id);
+        if (rc != 0) {
+            LOG(ERROR) << "Failed to register file " << file << ", rc=" << rc;
+            exit(EXIT_FAILURE);
+        }
+
+        files.push_back(file);
+    }
+
+    if (files.empty()) {
+        LOG(ERROR) << "No valid file in \"" << FLAGS_files << "\"";
+        exit(EXIT_FAILURE);
+    }
+
+    target_started = true;
+    while (target_running) sleep(1);
+
+    // The signal handler only records the signal number; report it here,
+    // outside of signal context.
+    int signum = target_stop_signal.load();
+    if (signum != 0) {
+        LOG(INFO) << "Received signal " << signum
+                  << ", stopping target server...";
+    }
+
+    for (auto &file : files) {
+        engine->unregisterLocalFile(file);
+    }
+}
+
+// Runs the benchmark in initiator mode.
+static int initiator() {
+    auto engine = initTransferEngine();
+    startInitiator(engine.get());
+    return 0;
+}
+
+// SIGINT/SIGTERM handler. It only touches lock-free atomics because logging
+// is not async-signal-safe; the message is emitted by startTarget().
+static void signalHandler(int signum) {
+    target_stop_signal = signum;
+    target_running = false;
+}
+
+// Runs the benchmark in target mode until SIGINT or SIGTERM is received.
+static int target() {
+    signal(SIGINT, signalHandler);
+    signal(SIGTERM, signalHandler);
+
+    auto engine = initTransferEngine();
+    startTarget(engine.get());
+
+    return 0;
+}
+
+// Runs target and initiator in the same process on one engine.
+static int loopback() {
+    auto engine = initTransferEngine();
+
+    // Start target thread.
+    auto target_thread = std::thread(startTarget, engine.get());
+    // Give the target up to 60 seconds to register its files.
+    size_t wait_cnt = 0;
+    while (!target_started && wait_cnt < 60) {
+        sleep(1);
+        wait_cnt++;
+    }
+
+    if (!target_started) {
+        LOG(ERROR) << "Target initialization timedout";
+        exit(EXIT_FAILURE);
+    }
+
+    // Start initiator thread.
+    auto initiator_thread = std::thread(startInitiator, engine.get());
+
+    // Wait initiator to complete.
+    initiator_thread.join();
+
+    // Terminate target.
+    target_running = false;
+    target_thread.join();
+
+    return 0;
+}
+
+// Dispatches to the role selected by --mode.
+int main(int argc, char **argv) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+
+    if (FLAGS_mode == "initiator")
+        return initiator();
+    else if (FLAGS_mode == "target")
+        return target();
+    else if (FLAGS_mode == "loopback")
+        return loopback();
+
+    LOG(ERROR)
+        << "Unsupported mode: must be 'initiator', 'target', or 'loopback'";
+    exit(EXIT_FAILURE);
+}
diff --git a/mooncake-transfer-engine/include/config.h b/mooncake-transfer-engine/include/config.h
index e4d849c25..b7152a060 100644
--- a/mooncake-transfer-engine/include/config.h
+++ b/mooncake-transfer-engine/include/config.h
@@ -48,6 +48,10 @@ struct GlobalConfig {
     bool use_ipv6 = false;
     size_t fragment_limit = 16384;
     bool enable_dest_device_affinity = false;
+#ifdef USE_NVMEOF_GENERIC
+    bool nvmeof_generic_direct_io = false;
+    uint32_t nvmeof_generic_num_workers = 8;
+#endif
 };
 
 void loadGlobalConfig(GlobalConfig &config);
diff --git a/mooncake-transfer-engine/include/transfer_engine_c.h b/mooncake-transfer-engine/include/transfer_engine_c.h
index 301eace3c..339db59b9 100644
--- a/mooncake-transfer-engine/include/transfer_engine_c.h
+++ b/mooncake-transfer-engine/include/transfer_engine_c.h
@@ -17,6 +17,7 @@
 
 #include
 #include
+#include
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/mooncake-transfer-engine/include/transfer_metadata.h b/mooncake-transfer-engine/include/transfer_metadata.h
index 5e4f8dc0f..13d9b55ce 100644
--- a/mooncake-transfer-engine/include/transfer_metadata.h
+++ b/mooncake-transfer-engine/include/transfer_metadata.h
@@ -86,6 +86,17 @@ class TransferMetadata {
         uint64_t pid;
     };
 
+#ifdef USE_NVMEOF_GENERIC
+    // NVMeoF transport id.
+    // The five strings below are written to and read back from the
+    // "nvmeof_generic_trid" JSON object of a segment descriptor.
+    struct NVMeoFGenericTrid {
+        // Transport type; matched case-insensitively against a controller's
+        // transport when looking for an existing connection.
+        std::string trtype;
+        // Address family (presumably as understood by the nvmet port
+        // configuration -- confirm against the target setup code).
+        std::string adrfam;
+        // Transport address of the target.
+        std::string traddr;
+        // Transport service id of the target.
+        std::string trsvcid;
+        // NQN of the exported subsystem.
+        std::string subnqn;
+    };
+#endif
+
     using SegmentID = uint64_t;
 
     struct SegmentDesc {
@@ -106,6 +117,10 @@ class TransferMetadata {
         std::string timestamp;
         // this is for ascend
         RankInfoDesc rank_info;
+#ifdef USE_NVMEOF_GENERIC
+        // this is for nvmeof_generic
+        NVMeoFGenericTrid nvmeof_generic_trid;
+#endif
 
         int tcp_data_port;
 
diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h
new file mode 100644
index 000000000..8c1d8608c
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h
@@ -0,0 +1,128 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMEOF_GENERIC_INITIATOR_H_
+#define NVMEOF_GENERIC_INITIATOR_H_
+
+#include <libaio.h>
+#include <libnvme.h>
+#include <unistd.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "transport/transport.h"
+
+namespace mooncake {
+
+using Slice = Transport::Slice;
+using NamespaceID = Transport::FileBufferID;
+
+class NVMeoFQueue;
+class NVMeoFController;
+
+// Owns the libnvme root/host objects and the fabrics configuration used to
+// connect controllers. Instances are created through create() only.
+class NVMeoFInitiator : public std::enable_shared_from_this<NVMeoFInitiator> {
+    friend class NVMeoFController;
+
+   public:
+    // Builds and sets up an initiator. Returns nullptr if setup fails.
+    // `direct_io` makes controllers open their namespaces with O_DIRECT.
+    static std::shared_ptr<NVMeoFInitiator> create(bool direct_io = false);
+
+    ~NVMeoFInitiator();
+
+    // Connects to (or reuses an existing connection to) the given target.
+    // Returns nullptr if the connection cannot be established.
+    std::shared_ptr<NVMeoFController> attachController(
+        const std::string &trtype, const std::string &adrfam,
+        const std::string &traddr, const std::string &trsvcid,
+        const std::string &subnqn);
+
+    // Closes the controller's namespaces and releases its connection.
+    void detachController(std::shared_ptr<NVMeoFController> ctrlr);
+
+   private:
+    NVMeoFInitiator(bool direct_io);
+
+    int setup();
+
+    const bool direct_io;
+    struct nvme_fabrics_config cfg;
+    nvme_root_t root;
+    nvme_host_t host;
+};
+
+// One connection to a remote subsystem, plus the file descriptors of the
+// namespace block devices it exposes locally.
+class NVMeoFController
+    : public std::enable_shared_from_this<NVMeoFController> {
+    friend class NVMeoFInitiator;
+
+   public:
+    ~NVMeoFController();
+
+    // Re-scans the topology and opens any namespace that is not open yet.
+    void rescan();
+
+    // Creates an I/O queue bound to this controller; nullptr on failure.
+    std::unique_ptr<NVMeoFQueue> createQueue(size_t queueDepth);
+
+    // Returns the descriptor of namespace `nsid`, or -1 if it is unknown.
+    int getNsFd(NamespaceID nsid);
+
+   private:
+    // Owning wrapper around an open namespace descriptor.
+    //
+    // The descriptor is closed exactly once, by whichever object currently
+    // owns it. The type is move-only: a copyable version would close the
+    // descriptor as soon as any temporary copy was destroyed (for example
+    // the temporary created by `namespaces[nsid] = {nsid, fd}`).
+    struct NVMeoFNamespace {
+        NamespaceID nsid = 0;
+        int fd = -1;
+
+        NVMeoFNamespace() = default;
+        NVMeoFNamespace(NamespaceID nsid, int fd) : nsid(nsid), fd(fd) {}
+
+        NVMeoFNamespace(const NVMeoFNamespace &) = delete;
+        NVMeoFNamespace &operator=(const NVMeoFNamespace &) = delete;
+
+        NVMeoFNamespace(NVMeoFNamespace &&other) noexcept
+            : nsid(other.nsid), fd(other.fd) {
+            other.fd = -1;
+        }
+
+        NVMeoFNamespace &operator=(NVMeoFNamespace &&other) noexcept {
+            if (this != &other) {
+                release();
+                nsid = other.nsid;
+                fd = other.fd;
+                other.fd = -1;
+            }
+            return *this;
+        }
+
+        ~NVMeoFNamespace() { release(); }
+
+       private:
+        // Closes the descriptor if one is held.
+        void release() {
+            if (fd >= 0) {
+                close(fd);
+                fd = -1;
+            }
+        }
+    };
+
+    NVMeoFController(std::shared_ptr<NVMeoFInitiator> initiator,
+                     const std::string &trtype, const std::string &adrfam,
+                     const std::string &traddr, const std::string &trsvcid,
+                     const std::string &subnqn);
+
+    nvme_ctrl_t findCtrl();
+
+    int connect();
+
+    int disconnect();
+
+    const std::shared_ptr<NVMeoFInitiator> initiator;
+    const std::string trtype;
+    const std::string adrfam;
+    const std::string traddr;
+    const std::string trsvcid;
+    const std::string subnqn;
+
+    nvme_ctrl_t ctrl;
+    // True only when this object established the connection itself.
+    bool should_disconnect_ctrl;
+
+    RWSpinlock ns_lock;
+    std::unordered_map<NamespaceID, NVMeoFNamespace> namespaces;
+};
+
+// libaio submission/completion context for one controller.
+class NVMeoFQueue {
+    friend class NVMeoFController;
+
+   public:
+    ~NVMeoFQueue();
+
+    // Submits one slice as an asynchronous read or write.
+    // Returns 0 on success and a negative errno value on failure.
+    int submitRequest(Slice *slice);
+
+    // Polls for finished requests without blocking and marks their slices
+    // as succeeded or failed.
+    void reapCompletions();
+
+    std::shared_ptr<NVMeoFController> getCtrlr() { return this->ctrlr; }
+
+   private:
+    NVMeoFQueue(std::shared_ptr<NVMeoFController> ctrlr, size_t queueDepth);
+
+    int setup();
+
+    std::shared_ptr<NVMeoFController> ctrlr;
+    size_t depth;
+    io_context_t io_ctx;
+    std::vector<struct io_event> events;
+};
+
+}  // namespace mooncake
+#endif
\ No newline at end of file
diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h
new file mode 100644
index 000000000..3a664ff1a
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h
@@ -0,0 +1,109 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMEOF_GENERIC_TARGET_H_
+#define NVMEOF_GENERIC_TARGET_H_
+
+#include
+#include
+#include
+#include
+#include
+
+namespace mooncake {
+using FileBufferID = uint32_t;
+using NamespaceID = FileBufferID;
+
+namespace nvmeof_target {
+// One namespace of a kernel nvmet subsystem, backed by `file`.
+// setup() creates and enables the namespace's configfs directory; the
+// destructor disables and removes it again if it exists.
+class NVMeoFNamespace {
+   public:
+    const std::string subnqn;
+    const NamespaceID nsid;
+    const std::string file;
+
+    NVMeoFNamespace(const std::string &subnqn, NamespaceID nsid,
+                    const std::string &file);
+    ~NVMeoFNamespace();
+
+    // Returns 0 on success, a negative errno value otherwise.
+    int setup();
+};
+
+// A kernel nvmet subsystem identified by `subnqn`, together with the
+// namespaces added to it. The destructor drops all namespaces before
+// removing the subsystem directory.
+class NVMeoFSubsystem {
+   public:
+    const std::string subnqn;
+
+    NVMeoFSubsystem(const std::string &subnqn);
+    ~NVMeoFSubsystem();
+
+    // Creates the subsystem and allows any host to connect to it.
+    int setup();
+
+    // Fails with -EEXIST if `nsid` or `file` is already registered.
+    int addNamespace(NamespaceID nsid, const std::string &file);
+
+    // Always returns 0, whether or not `nsid` was registered.
+    int removeNamespace(NamespaceID nsid);
+
+   private:
+    std::unordered_map>
+        namespaces;
+};
+
+// A kernel nvmet port. Each listener takes its port id from the
+// process-wide `next_id` counter.
+class NVMeoFListener {
+   public:
+    const std::string trtype;
+    const std::string adrfam;
+    const std::string traddr;
+    const std::string trsvcid;
+
+    NVMeoFListener(const std::string &trtype, const std::string &adrfam,
+                   const std::string &traddr, const std::string &trsvcid);
+    ~NVMeoFListener();
+
+    int setup();
+
+    int addSubsystem(std::shared_ptr subsys);
+
+    int removeSubsystem(std::shared_ptr subsys);
+
+   private:
+    static std::atomic next_id;
+
+    const unsigned int id;
+    std::vector> subsystems;
+};
+}  // namespace nvmeof_target
+
+// Facade combining one listener and one subsystem.
+// NOTE(review): the implementation is not part of this excerpt; the
+// per-method behaviour should be confirmed against nvmeof_target.cpp.
+class NVMeoFTarget {
+   public:
+    NVMeoFTarget(const std::string &hostname);
+    ~NVMeoFTarget();
+
+    int setup(const std::string &trtype, const std::string &adrfam,
+              const std::string &traddr, const std::string &trsvcid);
+
+    int addFile(FileBufferID file_id, const std::string &file);
+
+    int removeFile(FileBufferID file_id);
+
+    // Dereferences `subsystem` unconditionally.
+    const std::string &getSubNQN() { return subsystem->subnqn; }
+
+   private:
+    const std::string hostname;
+
+    std::mutex mutex;
+    std::unique_ptr listener;
+    std::shared_ptr subsystem;
+};
+
+}  // namespace mooncake
+
+#endif
\ No newline at end of file
diff --git
a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h
new file mode 100644
index 000000000..fd15ace1d
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h
@@ -0,0 +1,110 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMEOF_GENERIC_TRANSPORT_H_
+#define NVMEOF_GENERIC_TRANSPORT_H_
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "nvmeof_target.h"
+#include "nvmeof_initiator.h"
+#include "worker_pool.h"
+#include "transfer_metadata.h"
+#include "transport/transport.h"
+
+namespace mooncake {
+using FileBufferID = TransferMetadata::FileBufferID;
+using FileBufferDesc = TransferMetadata::FileBufferDesc;
+using NVMeoFTrid = TransferMetadata::NVMeoFGenericTrid;
+
+// Transport registered under the protocol name "nvmeof_generic".
+// It holds an initiator with a worker pool, and a target that publishes
+// `local_trid`.
+// NOTE(review): method behaviour is defined in nvmeof_transport.cpp, which
+// is not part of this excerpt.
+class NVMeoFGenericTransport : public Transport {
+   public:
+    NVMeoFGenericTransport();
+
+    ~NVMeoFGenericTransport();
+
+    BatchID allocateBatchID(size_t batch_size) override;
+
+    Status freeBatchID(BatchID batch_id) override;
+
+    Status submitTransferTask(
+        const std::vector &task_list) override;
+
+    Status submitTransfer(BatchID batch_id,
+                          const std::vector &entries) override;
+
+    Status getTransferStatus(BatchID batch_id, size_t task_id,
+                             TransferStatus &status) override;
+
+   private:
+    // Installation entry point that receives transport-specific `args`.
+    int install(std::string &local_server_name,
+                std::shared_ptr meta, void **args) override;
+
+    int registerLocalMemory(void *addr, size_t length,
+                            const std::string &location, bool remote_accessible,
+                            bool update_metadata) override;
+
+    int unregisterLocalMemory(void *addr,
+                              bool update_metadata = false) override;
+
+    // Batch (un)registration of memory is a no-op that reports success.
+    int registerLocalMemoryBatch(
+        const std::vector &buffer_list,
+        const std::string &location) override {
+        return 0;
+    }
+
+    int unregisterLocalMemoryBatch(
+        const std::vector &addr_list) override {
+        return 0;
+    }
+
+    int setupLocalSegment();
+
+    // This transport accepts file buffers.
+    bool supportFileBuffer() override { return true; }
+
+    int registerLocalFile(FileBufferID id, const std::string &path,
+                          size_t size) override;
+
+    int unregisterLocalFile(FileBufferID id) override;
+
+    const char *getName() const override { return "nvmeof_generic"; }
+
+    int parseTrid(const std::string &trStr);
+
+    bool validateTrid(const NVMeoFTrid &local_trid);
+
+    int setupInitiator();
+
+    std::shared_ptr getOrCreateController(
+        SegmentHandle handle);
+
+    std::shared_ptr initiator;
+    std::unique_ptr worker_pool;
+
+    NVMeoFTrid local_trid;
+    std::unique_ptr target;
+
+    // Guards segment_to_controller_ (presumably -- confirm in the .cpp).
+    RWSpinlock controller_lock_;
+    std::unordered_map>
+        segment_to_controller_;
+};
+}  // namespace mooncake
+
+#endif
diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h
new file mode 100644
index 000000000..1c7c99ec3
--- /dev/null
+++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h
@@ -0,0 +1,83 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef NVMEOF_GENERIC_WORKER_POOL_H_
+#define NVMEOF_GENERIC_WORKER_POOL_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "nvmeof_initiator.h"
+
+namespace mooncake {
+// One unit of work handed to a worker: the slice to transfer and the
+// controller it targets. Both pointers are non-owning.
+struct NVMeoFWorkerTask {
+    NVMeoFController *ctrlr;
+    Slice *slice;
+    uint64_t timestamp;
+};
+
+// A worker that owns a thread, a set of per-controller queues and lock-free
+// queues for messages and tasks.
+// NOTE(review): the implementation is not part of this excerpt. In
+// particular, `stopping` is a plain bool; confirm how it is synchronised with
+// `thread`.
+class NVMeoFWorker {
+    friend class NVMeoFWorkerPool;
+
+   public:
+    NVMeoFWorker(size_t id);
+    ~NVMeoFWorker();
+
+    void addController(std::shared_ptr ctrlr);
+    void removeController(NVMeoFController *ctrlr);
+
+   private:
+    void sendMsg(const std::function &func);
+    int submitTask(NVMeoFController *ctrlr, Slice *slice);
+    void dispatchTasks();
+    void poll();
+
+    const size_t id;
+    std::thread thread;
+    bool stopping;
+    std::atomic clock;
+
+    boost::lockfree::queue *>
+        msg_queue;
+    std::unordered_map> queues;
+
+    NVMeoFWorkerTask *tasks;
+    NVMeoFWorkerTask *curr_task;
+    boost::lockfree::queue free_tasks;
+    boost::lockfree::queue task_queue;
+};
+
+// Fixed-size collection of workers; `next_worker` is the selection cursor
+// (presumably round-robin -- confirm in worker_pool.cpp).
+class NVMeoFWorkerPool {
+   public:
+    NVMeoFWorkerPool(size_t num_workers);
+    ~NVMeoFWorkerPool();
+
+    int addController(std::shared_ptr ctrlr);
+    int removeController(NVMeoFController *ctrlr);
+
+    int submitTask(NVMeoFController *ctrlr, Slice *slice);
+
+   private:
+    const size_t num_workers;
+    std::atomic next_worker;
+    std::vector> workers;
+};
+}  // namespace mooncake
+
+#endif
\ No newline at end of file
diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h
index 56e809e30..bd6f3d553 100644
---
a/mooncake-transfer-engine/include/transport/transport.h
+++ b/mooncake-transfer-engine/include/transport/transport.h
@@ -27,6 +27,10 @@
 #include
 #include
 
+#ifdef USE_NVMEOF_GENERIC
+#include
+#endif
+
 #include "common/base/status.h"
 #include "transfer_metadata.h"
 
@@ -128,6 +132,12 @@ class Transport {
             struct {
                 uint64_t dest_addr;
             } ascend_direct;
+#ifdef USE_NVMEOF_GENERIC
+            // Per-slice state of the nvmeof_generic transport. `iocb` is the
+            // libaio control block that NVMeoFQueue::submitRequest() prepares
+            // and submits for this slice.
+            struct {
+                uint64_t offset;
+                struct iocb iocb;
+            } nvmeof_generic;
+#endif
         };
 
        public:
diff --git a/mooncake-transfer-engine/src/CMakeLists.txt b/mooncake-transfer-engine/src/CMakeLists.txt
index 5d88fdb32..881c5429e 100644
--- a/mooncake-transfer-engine/src/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/CMakeLists.txt
@@ -53,4 +53,8 @@ if (USE_ASCEND OR USE_ASCEND_DIRECT)
     else()
         target_link_libraries(transfer_engine PUBLIC ascend_transport ascendcl adxl metadef)
     endif()
-endif()
\ No newline at end of file
+endif()
+
+# nvmeof_generic links against libaio and libnvme.
+if (USE_NVMEOF_GENERIC)
+    target_link_libraries(transfer_engine PUBLIC nvmeof_generic_transport aio nvme)
+endif()
diff --git a/mooncake-transfer-engine/src/config.cpp b/mooncake-transfer-engine/src/config.cpp
index 61fcc835e..20e956277 100644
--- a/mooncake-transfer-engine/src/config.cpp
+++ b/mooncake-transfer-engine/src/config.cpp
@@ -258,6 +258,28 @@ void loadGlobalConfig(GlobalConfig &config) {
     if (std::getenv("MC_ENABLE_DEST_DEVICE_AFFINITY")) {
         config.enable_dest_device_affinity = true;
     }
+
+#ifdef USE_NVMEOF_GENERIC
+    // Any non-empty value of MC_NVMEOF_GENERIC_DIRECT_IO enables direct I/O;
+    // the content of the value is not inspected.
+    const char *nvmeof_generic_direct_io =
+        std::getenv("MC_NVMEOF_GENERIC_DIRECT_IO");
+    if (nvmeof_generic_direct_io != nullptr &&
+        strlen(nvmeof_generic_direct_io) > 0) {
+        LOG(INFO) << "Enabling direct I/O for nvmeof_generic transport";
+        config.nvmeof_generic_direct_io = true;
+    }
+
+    // MC_NVMEOF_GENERIC_NUM_WORKERS must convert (via atoi) to a positive
+    // integer; otherwise the configured value is left untouched and an error
+    // is logged.
+    const char *nvmeof_generic_num_workers =
+        std::getenv("MC_NVMEOF_GENERIC_NUM_WORKERS");
+    if (nvmeof_generic_num_workers != NULL) {
+        int val = atoi(nvmeof_generic_num_workers);
+        if (val > 0) {
+            config.nvmeof_generic_num_workers = val;
+        } else {
+            LOG(ERROR) << "Invalid value for MC_NVMEOF_GENERIC_NUM_WORKERS: "
+                       << nvmeof_generic_num_workers;
+        }
+    }
+#endif
 }
 
 std::string mtuLengthToString(ibv_mtu mtu) {
@@ -306,6 +328,12 @@ void dumpGlobalConfig() {
     LOG(INFO) << "max_wr = " << config.max_wr;
     LOG(INFO) << "max_inline = " << config.max_inline;
     LOG(INFO) << "mtu_length = " << mtuLengthToString(config.mtu_length);
+#ifdef USE_NVMEOF_GENERIC
+    LOG(INFO) << "nvmeof_generic_direct_io = "
+              << config.nvmeof_generic_direct_io;
+    LOG(INFO) << "nvmeof_generic_num_workers = "
+              << config.nvmeof_generic_num_workers;
+#endif
 }
 
 GlobalConfig &globalConfig() {
diff --git a/mooncake-transfer-engine/src/multi_transport.cpp b/mooncake-transfer-engine/src/multi_transport.cpp
index 26a61997d..6ec55e26f 100644
--- a/mooncake-transfer-engine/src/multi_transport.cpp
+++ b/mooncake-transfer-engine/src/multi_transport.cpp
@@ -36,6 +36,9 @@
 #ifdef USE_CXL
 #include "transport/cxl_transport/cxl_transport.h"
 #endif
+#ifdef USE_NVMEOF_GENERIC
+#include "transport/nvmeof_generic_transport/nvmeof_transport.h"
+#endif
 
 #include
 
@@ -241,6 +244,11 @@ Transport *MultiTransport::installTransport(const std::string &proto,
 }
 
 bool MultiTransport::transportNeedArgs(const std::string &proto) {
+#ifdef USE_NVMEOF_GENERIC
+    // nvmeof_generic is installed through the overload that takes `args`.
+    if (proto == "nvmeof_generic") {
+        return true;
+    }
+#endif
     return false;
 }
 
@@ -248,7 +256,11 @@ Transport *MultiTransport::installTransport(const std::string &proto,
                                             void **args) {
     std::shared_ptr transport = nullptr;
 
-    // Add transport creation logic here.
+#ifdef USE_NVMEOF_GENERIC
+    if (proto == "nvmeof_generic") {
+        transport = std::make_shared();
+    }
+#endif
 
     if (!transport) {
         LOG(ERROR) << "Unsupported transport " << proto
diff --git a/mooncake-transfer-engine/src/transfer_metadata.cpp b/mooncake-transfer-engine/src/transfer_metadata.cpp
index b03a80a93..c647f4357 100644
--- a/mooncake-transfer-engine/src/transfer_metadata.cpp
+++ b/mooncake-transfer-engine/src/transfer_metadata.cpp
@@ -226,6 +226,16 @@ int TransferMetadata::encodeSegmentDesc(const SegmentDesc &desc,
             buffersJSON.append(bufferJSON);
         }
         segmentJSON["buffers"] = buffersJSON;
+#ifdef USE_NVMEOF_GENERIC
+    } else if (segmentJSON["protocol"] == "nvmeof_generic") {
+        // This branch writes only the transport id, as one nested JSON
+        // object.
+        Json::Value tridJSON;
+        tridJSON["trtype"] = desc.nvmeof_generic_trid.trtype;
+        tridJSON["adrfam"] = desc.nvmeof_generic_trid.adrfam;
+        tridJSON["traddr"] = desc.nvmeof_generic_trid.traddr;
+        tridJSON["trsvcid"] = desc.nvmeof_generic_trid.trsvcid;
+        tridJSON["subnqn"] = desc.nvmeof_generic_trid.subnqn;
+        segmentJSON["nvmeof_generic_trid"] = tridJSON;
+#endif
     } else {
         LOG(ERROR) << "Unsupported segment descriptor for register, name "
                    << desc.name << " protocol " << desc.protocol;
@@ -422,6 +432,29 @@ TransferMetadata::decodeSegmentDesc(Json::Value &segmentJSON,
             }
             desc->buffers.push_back(buffer);
         }
+#ifdef USE_NVMEOF_GENERIC
+    } else if (desc->protocol == "nvmeof_generic") {
+        // The transport id object and all five of its members are mandatory;
+        // a descriptor missing any of them is rejected as corrupted.
+        if (!segmentJSON.isMember("nvmeof_generic_trid")) {
+            LOG(WARNING) << "Corrupted segment descriptor, name "
+                         << segment_name << " protocol " << desc->protocol;
+            return nullptr;
+        }
+
+        Json::Value tridJson = segmentJSON["nvmeof_generic_trid"];
+        if (!tridJson.isMember("trtype") || !tridJson.isMember("adrfam") ||
+            !tridJson.isMember("traddr") || !tridJson.isMember("trsvcid") ||
+            !tridJson.isMember("subnqn")) {
+            LOG(WARNING) << "Corrupted segment descriptor, name "
+                         << segment_name << " protocol " << desc->protocol;
+            return nullptr;
+        }
+
+        desc->nvmeof_generic_trid.trtype = tridJson["trtype"].asString();
+        desc->nvmeof_generic_trid.adrfam = tridJson["adrfam"].asString();
+        desc->nvmeof_generic_trid.traddr = tridJson["traddr"].asString();
+        desc->nvmeof_generic_trid.trsvcid = tridJson["trsvcid"].asString();
+        desc->nvmeof_generic_trid.subnqn = tridJson["subnqn"].asString();
+#endif
     } else {
         LOG(ERROR) << "Unsupported segment descriptor, name " << segment_name
                    << " protocol " << desc->protocol;
diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt
index 8f24ff1d8..67aa33301 100644
--- a/mooncake-transfer-engine/src/transport/CMakeLists.txt
+++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt
@@ -14,6 +14,11 @@ if (USE_NVMEOF)
     target_sources(transport PUBLIC $)
 endif()
 
+if (USE_NVMEOF_GENERIC)
+    add_subdirectory(nvmeof_generic_transport)
+    target_sources(transport PUBLIC $)
+endif()
+
 if (USE_CXL)
     add_subdirectory(cxl_transport)
     target_sources(transport PUBLIC $)
diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt
new file mode 100644
index 000000000..10b03bca3
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Every .cpp file in this directory is compiled into one object library.
+file(GLOB NVMEOF_GENERIC_SOURCES "*.cpp")
+
+add_library(nvmeof_generic_transport OBJECT ${NVMEOF_GENERIC_SOURCES})
+target_include_directories(nvmeof_generic_transport PUBLIC)
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp
new file mode 100644
index 000000000..2f4541999
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp
@@ -0,0 +1,320 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file
except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "transport/nvmeof_generic_transport/nvmeof_initiator.h"
+
+#include <fcntl.h>
+#include <glog/logging.h>
+#include <strings.h>
+
+#include <cerrno>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+#include <thread>
+
+namespace mooncake {
+namespace {
+// Compares a libnvme string attribute with `expected`. A missing (null)
+// attribute never matches, so it is never handed to strcmp().
+bool attrEquals(const char *attr, const std::string &expected) {
+    return attr != nullptr && strcmp(attr, expected.c_str()) == 0;
+}
+}  // namespace
+
+// Allocates an initiator and runs setup(); returns nullptr if setup fails.
+std::shared_ptr<NVMeoFInitiator> NVMeoFInitiator::create(bool direct_io) {
+    auto initiator =
+        std::shared_ptr<NVMeoFInitiator>(new NVMeoFInitiator(direct_io));
+    int rc = initiator->setup();
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to create nvmeof initiator, rc=" << rc;
+        return nullptr;
+    }
+
+    return initiator;
+}
+
+NVMeoFInitiator::NVMeoFInitiator(bool direct_io)
+    : direct_io(direct_io), root(nullptr), host(nullptr) {}
+
+NVMeoFInitiator::~NVMeoFInitiator() {
+    if (root != nullptr) {
+        nvme_free_tree(root);
+    }
+}
+
+// Loads the default fabrics configuration and scans the local NVMe topology.
+// Returns 0 on success and -ENOMEM if libnvme cannot provide root or host.
+int NVMeoFInitiator::setup() {
+    nvmf_default_config(&cfg);
+
+    // Disconnect the controller immediately on error.
+    cfg.ctrl_loss_tmo = 0;
+
+    root = nvme_scan(nullptr);
+    if (root == nullptr) {
+        LOG(ERROR) << "Failed to create NVMe root";
+        return -ENOMEM;
+    }
+
+    host = nvme_default_host(root);
+    if (host == nullptr) {
+        LOG(ERROR) << "Failed to create default NVMe host";
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
+// Creates a controller object for the given target and connects it.
+// Returns nullptr if the connection fails.
+std::shared_ptr<NVMeoFController> NVMeoFInitiator::attachController(
+    const std::string &trtype, const std::string &adrfam,
+    const std::string &traddr, const std::string &trsvcid,
+    const std::string &subnqn) {
+    auto ctrlr = std::shared_ptr<NVMeoFController>(new NVMeoFController(
+        shared_from_this(), trtype, adrfam, traddr, trsvcid, subnqn));
+    int rc = ctrlr->connect();
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to connect controller " << subnqn
+                   << ", rc=" << rc;
+        return nullptr;
+    }
+
+    return ctrlr;
+}
+
+void NVMeoFInitiator::detachController(
+    std::shared_ptr<NVMeoFController> ctrlr) {
+    ctrlr->disconnect();
+}
+
+NVMeoFController::NVMeoFController(std::shared_ptr<NVMeoFInitiator> initiator,
+                                   const std::string &trtype,
+                                   const std::string &adrfam,
+                                   const std::string &traddr,
+                                   const std::string &trsvcid,
+                                   const std::string &subnqn)
+    : initiator(initiator),
+      trtype(trtype),
+      adrfam(adrfam),
+      traddr(traddr),
+      trsvcid(trsvcid),
+      subnqn(subnqn),
+      ctrl(nullptr),
+      should_disconnect_ctrl(false) {}
+
+NVMeoFController::~NVMeoFController() {
+    if (ctrl != nullptr) {
+        if (should_disconnect_ctrl) {
+            nvme_disconnect_ctrl(ctrl);
+        }
+        nvme_free_ctrl(ctrl);
+    }
+}
+
+// Looks for an already connected controller whose transport, address,
+// service id and subsystem NQN all match this object. Returns nullptr if
+// none is found.
+nvme_ctrl_t NVMeoFController::findCtrl() {
+    // Scan the topology first.
+    nvme_scan_topology(initiator->root, nullptr, nullptr);
+
+    nvme_subsystem_t subsys;
+    nvme_ctrl_t candidate;
+    nvme_for_each_subsystem(initiator->host, subsys) {
+        nvme_subsystem_for_each_ctrl(subsys, candidate) {
+            const char *transport = nvme_ctrl_get_transport(candidate);
+            if (transport == nullptr ||
+                strcasecmp(transport, trtype.c_str()) != 0) {
+                continue;
+            }
+
+            if (attrEquals(nvme_ctrl_get_traddr(candidate), traddr) &&
+                attrEquals(nvme_ctrl_get_trsvcid(candidate), trsvcid) &&
+                attrEquals(nvme_ctrl_get_subsysnqn(candidate), subnqn)) {
+                return candidate;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+// Reuses an existing connection if there is one, otherwise creates and
+// connects a new controller. Returns 0 on success.
+int NVMeoFController::connect() {
+    ctrl = findCtrl();
+    if (ctrl != nullptr) {
+        // The controller has been connected.
+        rescan();
+        return 0;
+    }
+
+    ctrl = nvme_create_ctrl(initiator->root, subnqn.c_str(), trtype.c_str(),
+                            traddr.c_str(), nullptr, nullptr, trsvcid.c_str());
+    if (ctrl == nullptr) {
+        LOG(ERROR) << "Failed to create nvme controller " << subnqn;
+        return -ENOMEM;
+    }
+
+    int rc = nvmf_add_ctrl(initiator->host, ctrl, &initiator->cfg);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to connect to controller, " << subnqn
+                   << " rc=" << rc;
+        return rc;
+    }
+
+    // We connected the controller, so we are responsible for disconnecting it.
+    should_disconnect_ctrl = true;
+
+    // Wait a moment to ensure all namespaces are attached.
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+    // Trigger rescan to open namespaces.
+    rescan();
+
+    return 0;
+}
+
+// Opens the block device of every namespace of this controller that is not
+// open yet. Namespaces that cannot be opened are logged and skipped.
+void NVMeoFController::rescan() {
+    if (ctrl == nullptr) {
+        // Do not scan disconnected controller.
+        return;
+    }
+
+    // Rescan the topology.
+    nvme_scan_topology(initiator->root, nullptr, nullptr);
+
+    RWSpinlock::WriteGuard guard(ns_lock);
+    nvme_ns_t ns;
+    char ns_dev[64];
+
+    nvme_ctrl_for_each_ns(ctrl, ns) {
+        auto nsid = static_cast<NamespaceID>(nvme_ns_get_nsid(ns));
+        auto it = namespaces.find(nsid);
+        if (it != namespaces.end() && it->second.fd >= 0) {
+            // Namespace has been open.
+            continue;
+        }
+
+        const char *name = nvme_ns_get_name(ns);
+        int rc = snprintf(ns_dev, sizeof(ns_dev), "/dev/%s", name);
+        // A return value >= the buffer size means the path was truncated.
+        if (rc <= 0 || static_cast<size_t>(rc) >= sizeof(ns_dev)) {
+            LOG(ERROR) << "Invalid namespace device name " << name;
+            continue;
+        }
+
+        int flags = O_RDWR;
+        if (initiator->direct_io) flags |= O_DIRECT;
+
+        int fd = open(ns_dev, flags);
+        if (fd < 0) {
+            LOG(ERROR) << "Failed to open nvme namespace " << ns_dev
+                       << ", errno=" << errno;
+            continue;
+        }
+
+        LOG(INFO) << "Added namespace " << nsid << " to controller "
+                  << nvme_ctrl_get_name(ctrl);
+
+        // Fill the map entry in place. Assigning from a temporary
+        // NVMeoFNamespace would run the temporary's destructor, which closes
+        // the descriptor that was just stored.
+        auto &entry = namespaces[nsid];
+        entry.nsid = nsid;
+        entry.fd = fd;
+    }
+}
+
+// Closes all namespaces and releases the controller, disconnecting it only
+// if this object established the connection. Always returns 0.
+int NVMeoFController::disconnect() {
+    {
+        RWSpinlock::WriteGuard guard(ns_lock);
+        namespaces.clear();
+    }
+
+    if (ctrl != nullptr) {
+        if (should_disconnect_ctrl) {
+            should_disconnect_ctrl = false;
+            nvme_disconnect_ctrl(ctrl);
+        }
+        nvme_free_ctrl(ctrl);
+        ctrl = nullptr;
+    }
+
+    return 0;
+}
+
+std::unique_ptr<NVMeoFQueue> NVMeoFController::createQueue(size_t queueDepth) {
+    auto queue = std::unique_ptr<NVMeoFQueue>(
+        new NVMeoFQueue(shared_from_this(), queueDepth));
+    int rc = queue->setup();
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to create queue, rc=" << rc;
+        return nullptr;
+    }
+
+    return queue;
+}
+
+int NVMeoFController::getNsFd(NamespaceID nsid) {
+    RWSpinlock::ReadGuard guard(ns_lock);
+    auto it = namespaces.find(nsid);
+    if (it == namespaces.end()) {
+        return -1;
+    }
+    return it->second.fd;
+}
+
+NVMeoFQueue::NVMeoFQueue(std::shared_ptr<NVMeoFController> ctrlr,
+                         size_t queueDepth)
+    : ctrlr(ctrlr), depth(queueDepth), io_ctx(nullptr), events(depth) {}
+
+NVMeoFQueue::~NVMeoFQueue() {
+    if (io_ctx != nullptr) {
+        io_destroy(this->io_ctx);
+    }
+}
+
+int NVMeoFQueue::setup() {
+    int rc = io_setup(this->depth, &this->io_ctx);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to setup aio context, rc=" << rc;
+        return rc;
+    }
+    return 0;
+}
+
+// Submits `slice` as one asynchronous read or write on the namespace selected
+// by slice->file_id. Returns 0 on success, -ENOENT if the namespace is not
+// open, or the negative error reported by io_submit().
+int NVMeoFQueue::submitRequest(Slice *slice) {
+    int fd = ctrlr->getNsFd(slice->file_id);
+    if (fd < 0) {
+        LOG(ERROR) << "No namespace " << slice->file_id
+                   << " in nvme controller";
+        return -ENOENT;
+    }
+
+    // Use this transport's own per-slice state for both the iocb and the
+    // offset, not the state of the unrelated "nvmeof" transport.
+    struct iocb *iocb = &slice->nvmeof_generic.iocb;
+    const auto offset = slice->nvmeof_generic.offset;
+    if (slice->opcode == Transport::TransferRequest::READ) {
+        io_prep_pread(iocb, fd, slice->source_addr, slice->length, offset);
+    } else {
+        io_prep_pwrite(iocb, fd, slice->source_addr, slice->length, offset);
+    }
+    // Lets reapCompletions() map the completion event back to the slice.
+    iocb->data = slice;
+
+    int rc = io_submit(this->io_ctx, 1, &iocb);
+    if (rc == 1) {
+        return 0;
+    }
+    // A return of 0 means nothing was queued; report it as a failure too.
+    return rc < 0 ? rc : -EAGAIN;
+}
+
+// Collects up to `depth` finished requests without blocking. A slice is
+// marked successful only if the whole length was transferred.
+void NVMeoFQueue::reapCompletions() {
+    struct timespec timeout = {
+        .tv_sec = 0,
+        .tv_nsec = 0,
+    };
+
+    int rc = io_getevents(this->io_ctx, 0, this->depth, this->events.data(),
+                          &timeout);
+    if (rc < 0) {
+        LOG(ERROR) << "Failed to poll aio events, rc = " << rc;
+        return;
+    }
+
+    for (int i = 0; i < rc; i++) {
+        Slice *slice = static_cast<Slice *>(events[i].data);
+        if (events[i].res == slice->length) {
+            slice->markSuccess();
+        } else {
+            slice->markFailed();
+        }
+    }
+}
+}  // namespace mooncake
\ No newline at end of file
diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp
new file mode 100644
index 000000000..d41c98fc9
--- /dev/null
+++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp
@@ -0,0 +1,333 @@
+// Copyright 2025 Alibaba Cloud and its affiliates
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "transport/nvmeof_generic_transport/nvmeof_target.h"
+
+#include <fcntl.h>
+#include <glog/logging.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <filesystem>
+#include <string>
+
+namespace mooncake {
+namespace nvmeof_target {
+// Root of the kernel nvmet configfs tree.
+static const std::filesystem::path kNVMeTConfigPath =
+    "/sys/kernel/config/nvmet";
+
+static inline std::filesystem::path makePortPath(uint32_t portid) {
+    return kNVMeTConfigPath / "ports" / std::to_string(portid);
+}
+
+static inline std::filesystem::path makeSubsysPath(const std::string &subnqn) {
+    return kNVMeTConfigPath / "subsystems" / subnqn;
+}
+
+static inline std::filesystem::path makeSubsysLinkPath(
+    const std::string &subnqn, uint32_t portid) {
+    return makePortPath(portid) / "subsystems" / subnqn;
+}
+
+static inline std::filesystem::path makeNamespacePath(const std::string &subnqn,
+                                                      NamespaceID nsid) {
+    return makeSubsysPath(subnqn) / "namespaces" / std::to_string(nsid);
+}
+
+static inline bool pathExists(const std::filesystem::path &path) {
+    return access(path.c_str(), F_OK) == 0;
+}
+
+// The helpers below return 0 on success and a negative errno value on
+// failure. errno is copied before logging because the logging call may
+// overwrite it.
+
+static inline int mkDir(const std::filesystem::path &path) {
+    // Directories need the search (x) bits to be traversable.
+    if (mkdir(path.c_str(), 0755) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to make directory " << path
+                   << ", errno=" << err;
+        return -err;
+    }
+    return 0;
+}
+
+static inline int rmDir(const std::filesystem::path &path) {
+    if (rmdir(path.c_str()) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to remove directory " << path
+                   << ", errno=" << err;
+        return -err;
+    }
+    return 0;
+}
+
+// Writes `value` to the attribute file at `path`.
+static inline int setAttr(const std::filesystem::path &path,
+                          const std::string &value) {
+    int fd = open(path.c_str(), O_RDWR);
+    if (fd < 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to open file " << path << ", errno=" << err;
+        return -err;
+    }
+
+    int result = 0;
+    if (write(fd, value.c_str(), value.size()) < 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to write \"" << value << "\" to file " << path
+                   << ", errno=" << err;
+        result = -err;
+    }
+
+    close(fd);
+
+    return result;
+}
+
+static inline int symLink(const std::filesystem::path &dest,
+                          const std::filesystem::path &name) {
+    if (symlink(dest.c_str(), name.c_str()) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to create symlink to " << dest << " at " << name
+                   << ", errno=" << err;
+        return -err;
+    }
+    return 0;
+}
+
+static inline int unLink(const std::filesystem::path &name) {
+    if (unlink(name.c_str()) != 0) {
+        const int err = errno;
+        LOG(ERROR) << "Failed to unlink " << name << ", errno=" << err;
+        return -err;
+    }
+    return 0;
+}
+
+// Port ids are handed out from 1 upwards, once per listener.
+std::atomic<unsigned int> NVMeoFListener::next_id{1};
+
+NVMeoFNamespace::NVMeoFNamespace(const std::string &subnqn, NamespaceID nsid,
+                                 const std::string &file)
+    : subnqn(subnqn), nsid(nsid), file(file) {}
+
+// Disables the namespace (when it has an "enable" attribute) and removes
+// its configfs directory. Failures are logged by the helpers and otherwise
+// ignored.
+NVMeoFNamespace::~NVMeoFNamespace() {
+    auto path = makeNamespacePath(subnqn, nsid);
+    if (pathExists(path)) {
+        if (pathExists(path / "enable")) {
+            setAttr(path / "enable", "0");
+        }
+        rmDir(path);
+    }
+}
+
+// Creates the namespace directory, points it at `file` and enables it.
+int NVMeoFNamespace::setup() {
+    auto path = makeNamespacePath(subnqn, nsid);
+
+    int rc = mkDir(path);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to create ns " << std::to_string(nsid)
+                   << " of subsys " << subnqn;
+        return rc;
+    }
+
+    rc = setAttr(path / "device_path", file);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to set device_path for ns "
+                   << std::to_string(nsid) << " of subsys " << subnqn;
+        return rc;
+    }
+
+    rc = setAttr(path / "enable", "1");
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to enable ns " << std::to_string(nsid)
+                   << " of subsys " << subnqn;
+        return rc;
+    }
+
+    return 0;
+}
+
+NVMeoFSubsystem::NVMeoFSubsystem(const std::string &subnqn) : subnqn(subnqn) {}
+
+NVMeoFSubsystem::~NVMeoFSubsystem() {
+    // Remove namespaces before removing the subsystem.
+    namespaces.clear();
+
+    auto path = makeSubsysPath(subnqn);
+    if (pathExists(path)) {
+        rmDir(path);
+    }
+}
+
+// Creates the subsystem directory and allows any host to connect.
+int NVMeoFSubsystem::setup() {
+    auto path = makeSubsysPath(subnqn);
+
+    int rc = mkDir(path);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to create subsystem " << subnqn;
+        return rc;
+    }
+
+    rc = setAttr(path / "attr_allow_any_host", "1");
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to set allow_any_host for subsystem " << subnqn;
+        return rc;
+    }
+
+    return 0;
+}
+
+// Exports `file` as namespace `nsid`. Returns -EEXIST if either the id or
+// the file is already exported by this subsystem.
+int NVMeoFSubsystem::addNamespace(NamespaceID nsid, const std::string &file) {
+    for (const auto &entry : namespaces) {
+        if (entry.first == nsid || entry.second->file == file) {
+            LOG(ERROR) << "Duplicated namespace " << nsid << ", file=" << file;
+            return -EEXIST;
+        }
+    }
+
+    auto ns = std::make_unique<NVMeoFNamespace>(subnqn, nsid, file);
+    int rc = ns->setup();
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to add namespace " << nsid << " to subsystem "
+                   << subnqn << ", rc=" << rc;
+        return rc;
+    }
+
+    namespaces[nsid] = std::move(ns);
+    return 0;
+}
+
+// Destroying the map entry tears the namespace down. Removing an unknown id
+// is not treated as an error.
+int NVMeoFSubsystem::removeNamespace(NamespaceID nsid) {
+    namespaces.erase(nsid);
+    return 0;
+}
+
+NVMeoFListener::NVMeoFListener(const std::string &trtype,
+                               const std::string &adrfam,
+                               const std::string &traddr,
+                               const std::string &trsvcid)
+    : trtype(trtype),
+      adrfam(adrfam),
+      traddr(traddr),
+      trsvcid(trsvcid),
+      id(next_id++) {}
+
+// Unlinks every attached subsystem from the port, then removes the port.
+NVMeoFListener::~NVMeoFListener() {
+    for (auto &subsys : subsystems) {
+        auto path = makeSubsysLinkPath(subsys->subnqn, id);
+        if (pathExists(path)) {
+            unLink(path);
+        }
+    }
+    subsystems.clear();
+
+    auto path = makePortPath(id);
+    if (pathExists(path)) {
+        rmDir(path);
+    }
+}
+
+int NVMeoFListener::setup() {
+    auto path = makePortPath(id);
+
+    int rc = mkDir(path);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to create port " << std::to_string(id);
+        return rc;
+    }
+
+    rc = setAttr(path / "addr_trtype", trtype);
+    if (rc != 0) {
+        LOG(ERROR) << "Failed to set trtype " << trtype << " for port "
+                   << std::to_string(id);
+        return rc;
+    }
+
+    rc
= setAttr(path / "addr_adrfam", adrfam); + if (rc != 0) { + LOG(ERROR) << "Failed to set adrfam " << adrfam << " for port " + << std::to_string(id); + return rc; + } + + rc = setAttr(path / "addr_traddr", traddr); + if (rc != 0) { + LOG(ERROR) << "Failed to set traddr " << traddr << " for port " + << std::to_string(id); + return rc; + } + + rc = setAttr(path / "addr_trsvcid", trsvcid); + if (rc != 0) { + LOG(ERROR) << "Failed to set trsvcid " << trsvcid << " for port " + << std::to_string(id); + return rc; + } + + return 0; +} + +int NVMeoFListener::addSubsystem(std::shared_ptr subsys) { + auto dest = makeSubsysPath(subsys->subnqn); + auto name = makeSubsysLinkPath(subsys->subnqn, id); + return symLink(dest, name); +} + +int NVMeoFListener::removeSubsystem(std::shared_ptr subsys) { + auto name = makeSubsysLinkPath(subsys->subnqn, id); + return unLink(name); +} +} // namespace nvmeof_target + +NVMeoFTarget::NVMeoFTarget(const std::string &hostname) + : hostname(hostname), listener(nullptr), subsystem(nullptr) {} + +NVMeoFTarget::~NVMeoFTarget() { + if (listener != nullptr && subsystem != nullptr) { + listener->removeSubsystem(subsystem); + } +} + +int NVMeoFTarget::setup(const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid) { + listener = std::make_unique(trtype, adrfam, + traddr, trsvcid); + int rc = listener->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup nvmeof target listener, trtype=" + << trtype << " adrfam=" << adrfam << " traddr=" << traddr + << " trsvcid=" << trsvcid << ", rc=" << rc; + return rc; + } + + auto subnqn = "nqn.2016-06.io.mc:" + hostname; + subsystem = std::make_shared(subnqn); + rc = subsystem->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup nvmeof subsystem, subnqn=" << subnqn + << ", rc=" << rc; + return rc; + } + + rc = listener->addSubsystem(subsystem); + if (rc != 0) { + LOG(ERROR) << "Failed to add subsystem " << subsystem->subnqn + << " to listener, rc=" << 
rc; + return rc; + } + + return 0; +} + +int NVMeoFTarget::addFile(FileBufferID file_id, const std::string &file) { + std::lock_guard guard(this->mutex); + return subsystem->addNamespace(file_id, file); +} + +int NVMeoFTarget::removeFile(FileBufferID file_id) { + std::lock_guard guard(this->mutex); + return subsystem->removeNamespace(file_id); +} +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp new file mode 100644 index 000000000..4ec88baa4 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp @@ -0,0 +1,393 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "transport/nvmeof_generic_transport/nvmeof_transport.h" + +#include + +#include "common.h" +#include "config.h" +#include "transfer_engine.h" +#include "transfer_metadata.h" +#include "transport/transport.h" + +namespace mooncake { +NVMeoFGenericTransport::NVMeoFGenericTransport() + : initiator(nullptr), worker_pool(nullptr), target(nullptr) {} + +NVMeoFGenericTransport::~NVMeoFGenericTransport() { + for (auto &it : segment_to_controller_) { + worker_pool->removeController(it.second.get()); + initiator->detachController(it.second); + } + + segment_to_controller_.clear(); + worker_pool.reset(); + initiator.reset(); + target.reset(); +} + +BatchID NVMeoFGenericTransport::allocateBatchID(size_t batch_size) { + auto batch_id = Transport::allocateBatchID(batch_size); + return batch_id; +} + +Status NVMeoFGenericTransport::freeBatchID(BatchID batch_id) { + Status rc = Transport::freeBatchID(batch_id); + return rc; +} + +Status NVMeoFGenericTransport::getTransferStatus(BatchID batch_id, + size_t task_id, + TransferStatus &status) { + auto &batch_desc = *((BatchDesc *)(batch_id)); + const size_t task_count = batch_desc.task_list.size(); + if (task_id >= task_count) { + return Status::InvalidArgument("Task ID " + std::to_string(task_id) + + " out of range " + + std::to_string(task_count)); + } + + auto &task = batch_desc.task_list[task_id]; + status.transferred_bytes = task.transferred_bytes; + + auto success_slice_count = task.success_slice_count; + auto failed_slice_count = task.failed_slice_count; + if (success_slice_count + failed_slice_count < task.slice_count) { + status.s = Transport::TransferStatusEnum::WAITING; + } else { + task.is_finished = true; + if (failed_slice_count) { + status.s = Transport::TransferStatusEnum::FAILED; + } else { + status.s = Transport::TransferStatusEnum::COMPLETED; + } + } + + return Status::OK(); +} + +Status NVMeoFGenericTransport::submitTransferTask( + const std::vector &task_list) { + std::unordered_map> slice_to_submit; + 
+ for (auto task : task_list) { + auto request = task->request; + Slice *slice = getSliceCache().allocate(); + slice->task = task; + slice->source_addr = request->source; + slice->length = request->length; + slice->opcode = request->opcode; + slice->target_id = request->target_id; + slice->file_id = request->file_id; + slice->nvmeof_generic.offset = request->target_offset; + slice->status = Slice::PENDING; + slice->ts = 0; + + task->slice_list.push_back(slice); + task->total_bytes += request->length; + __sync_fetch_and_add(&task->slice_count, 1); + + slice_to_submit[request->target_id].push_back(slice); + } + + for (auto it : slice_to_submit) { + auto ctrlr = getOrCreateController(it.first); + if (ctrlr != nullptr) { + for (auto &slice : it.second) { + worker_pool->submitTask(ctrlr.get(), slice); + } + } else { + for (auto slice : it.second) { + slice->markFailed(); + } + } + it.second.clear(); + } + + return Status::OK(); +} + +Status NVMeoFGenericTransport::submitTransfer( + BatchID batch_id, const std::vector &entries) { + auto &batch_desc = *((BatchDesc *)(batch_id)); + if (batch_desc.task_list.size() + entries.size() > batch_desc.batch_size) { + LOG(ERROR) << "NVMeoFGenericTransport: Exceed the limitation of " + "current batch's " + "capacity"; + return Status::InvalidArgument( + "NVMeoFGenericTransport: Exceed the limitation of capacity, batch " + "id: " + + std::to_string(batch_id)); + } + + size_t task_id = batch_desc.task_list.size(); + batch_desc.task_list.resize(task_id + entries.size()); + + std::vector task_list; + for (auto &request : entries) { + auto &task = batch_desc.task_list[task_id++]; + task.batch_id = batch_id; + task.request = &request; + task_list.push_back(&task); + } + + return this->submitTransferTask(task_list); +} + +int NVMeoFGenericTransport::install(std::string &local_server_name, + std::shared_ptr meta, + void **args) { + int rc = Transport::install(local_server_name, meta, args); + if (rc != 0) { + LOG(ERROR) << 
"Transport::install failed, rc=" << rc; + return rc; + } + + if (args != nullptr && args[0] != nullptr) { + std::string trStr = static_cast(args[0]); + rc = parseTrid(trStr); + if (rc != 0) { + LOG(ERROR) << "Failed to parse nvmeof trid \"" << trStr + << "\", rc=" << rc; + return rc; + } + } + + return 0; +} + +int NVMeoFGenericTransport::setupLocalSegment() { + if (this->target != nullptr) { + return 0; + } + + if (!validateTrid(local_trid)) { + LOG(ERROR) << "NVMeoF trid not specified"; + return ERR_INVALID_ARGUMENT; + } + + this->target = std::make_unique(local_server_name_); + int rc = this->target->setup(local_trid.trtype, local_trid.adrfam, + local_trid.traddr, local_trid.trsvcid); + if (rc != 0) { + LOG(ERROR) << "Failed to create nvmeof target, rc=" << rc; + return ERR_INVALID_ARGUMENT; + } + + auto desc = std::make_shared(); + if (!desc) { + LOG(ERROR) << "Failed to create local segment"; + this->target.reset(); + return ERR_MEMORY; + } + + desc->name = local_server_name_; + desc->protocol = "nvmeof_generic"; + desc->nvmeof_generic_trid = local_trid; + desc->nvmeof_generic_trid.subnqn = this->target->getSubNQN(); + + metadata_->addLocalSegment(LOCAL_SEGMENT_ID, local_server_name_, + std::move(desc)); + return 0; +} + +int NVMeoFGenericTransport::registerLocalMemory(void *addr, size_t length, + const std::string &location, + bool remote_accessible, + bool update_metadata) { + return 0; +} + +int NVMeoFGenericTransport::unregisterLocalMemory(void *addr, + bool update_metadata) { + return 0; +} + +int NVMeoFGenericTransport::registerLocalFile(FileBufferID id, + const std::string &path, + size_t size) { + int rc = setupLocalSegment(); + if (rc != 0) { + LOG(ERROR) << "Failed to allocate local segment, rc=" << rc; + return ERR_MEMORY; + } + + rc = this->target->addFile(id, path); + if (rc != 0) { + LOG(ERROR) << "Failed to add file " << path << ", rc=" << rc; + return rc; + } + + FileBufferDesc buffer_desc; + buffer_desc.id = id; + buffer_desc.path = path; + 
buffer_desc.size = size; + /// TODO: Set align according to file type. + buffer_desc.align = 0; + + rc = this->metadata_->addFileBuffer(buffer_desc, true); + if (rc != 0) { + LOG(ERROR) << "Failed to add file buffer " << path << ", rc=" << rc; + this->target->removeFile(id); + return rc; + } + + return 0; +} + +int NVMeoFGenericTransport::unregisterLocalFile(FileBufferID id) { + if (this->target == nullptr) { + LOG(ERROR) << "NVMeoFGenericTransport::target has not been initialized"; + return ERR_ADDRESS_NOT_REGISTERED; + } + + int rc = this->metadata_->removeFileBuffer(id, true); + if (rc != 0) { + LOG(ERROR) << "Failed to remove file buffer " << id << ", rc=" << rc; + return rc; + } + + this->target->removeFile(id); + return 0; +} + +int NVMeoFGenericTransport::parseTrid(const std::string &trStr) { + std::istringstream stream(trStr); + std::string option; + + while (stream >> option) { + auto sep = option.find('='); + if (sep == option.npos) { + sep = option.find(':'); + if (sep == option.npos) { + LOG(ERROR) << "No separator '=' or ':' found in trid string \"" + << trStr << "\""; + return ERR_INVALID_ARGUMENT; + } + } + + auto key = option.substr(0, sep); + auto value = option.substr(sep + 1); + if (key.empty() || value.empty()) { + LOG(ERROR) << "Invalid trid option: key=" << key + << " value=" << value; + return ERR_INVALID_ARGUMENT; + } + + if (key == "trtype") { + local_trid.trtype = value; + } else if (key == "adrfam") { + local_trid.adrfam = value; + } else if (key == "traddr") { + local_trid.traddr = value; + } else if (key == "trsvcid") { + local_trid.trsvcid = value; + } else { + LOG(ERROR) << "Invalid trid string operation: key=" << key + << ", value=" << value; + return ERR_INVALID_ARGUMENT; + } + } + + if (!validateTrid(local_trid)) { + LOG(ERROR) << "Invalid trid: trtype=" << local_trid.trtype + << ", adrfam=" << local_trid.adrfam + << ", traddr=" << local_trid.traddr + << ", trsvcid=" << local_trid.trsvcid; + return ERR_INVALID_ARGUMENT; + } + + 
return 0; +} + +bool NVMeoFGenericTransport::validateTrid(const NVMeoFTrid &local_trid) { + return !(local_trid.trtype.empty() || local_trid.adrfam.empty() || + local_trid.traddr.empty() || local_trid.trsvcid.empty()); +} + +int NVMeoFGenericTransport::setupInitiator() { + if (this->initiator == nullptr) { + this->initiator = + NVMeoFInitiator::create(globalConfig().nvmeof_generic_direct_io); + if (this->initiator == nullptr) { + LOG(ERROR) << "Failed to create nvmeof initiator"; + return ERR_MEMORY; + } + } + + if (this->worker_pool == nullptr) { + this->worker_pool = std::make_unique( + globalConfig().nvmeof_generic_num_workers); + if (this->worker_pool == nullptr) { + LOG(ERROR) << "Failed to create nvmeof worker pool"; + return ERR_MEMORY; + } + } + + return 0; +} + +std::shared_ptr NVMeoFGenericTransport::getOrCreateController( + SegmentHandle handle) { + { + RWSpinlock::ReadGuard guard(controller_lock_); + auto it = segment_to_controller_.find(handle); + if (it != segment_to_controller_.end()) { + return it->second; + } + } + + auto desc = metadata_->getSegmentDescByID(handle); + if (desc == nullptr || desc->protocol != "nvmeof_generic" || + desc->file_buffers.size() <= 0) { + LOG(ERROR) << "Invalid segment " << desc; + return nullptr; + } + + RWSpinlock::WriteGuard guard(controller_lock_); + auto it = segment_to_controller_.find(handle); + if (it != segment_to_controller_.end()) { + // Someone else attached the controller. 
+ return it->second; + } + + int rc = setupInitiator(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup initiator, rc=" << rc; + return nullptr; + } + + auto &trid = desc->nvmeof_generic_trid; + auto controller = initiator->attachController( + trid.trtype, trid.adrfam, trid.traddr, trid.trsvcid, trid.subnqn); + if (controller == nullptr) { + LOG(ERROR) << "Failed to attach controller trtype=" << trid.trtype + << " adrfam=" << trid.adrfam << " traddr=" << trid.traddr + << " trsvcid=" << trid.trsvcid << " subnqn=" << trid.subnqn; + return nullptr; + } + + rc = this->worker_pool->addController(controller); + if (rc != 0) { + LOG(ERROR) << "Failed to add controller to worker pool, rc=" << rc; + initiator->detachController(controller); + return nullptr; + } + + segment_to_controller_[handle] = controller; + return controller; +} + +} // namespace mooncake diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp new file mode 100644 index 000000000..47ed19f98 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp @@ -0,0 +1,226 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "transport/nvmeof_generic_transport/worker_pool.h" + +#include + +#include + +#define WORKER_QUEUE_DEPTH 256 +#define WORKER_MAX_NUM_TASKS 4096 + +namespace mooncake { +NVMeoFWorker::NVMeoFWorker(size_t id) + : id(id), stopping(false), clock(0), tasks(nullptr), curr_task(nullptr) { + this->tasks = new NVMeoFWorkerTask[WORKER_MAX_NUM_TASKS]; + for (size_t i = 0; i < WORKER_MAX_NUM_TASKS; i++) { + this->free_tasks.push(&this->tasks[i]); + } + + this->thread = std::thread(std::bind(&NVMeoFWorker::poll, this)); +} + +NVMeoFWorker::~NVMeoFWorker() { + this->stopping = true; + if (this->thread.joinable()) { + this->thread.join(); + } + + delete[] this->tasks; +} + +void NVMeoFWorker::addController(std::shared_ptr ctrlr) { + auto it = this->queues.find(ctrlr.get()); + if (it != this->queues.end()) { + LOG(WARNING) << "Controller exists: " << ctrlr.get(); + return; + } + + auto queue = ctrlr->createQueue(WORKER_QUEUE_DEPTH); + if (queue == nullptr) { + LOG(ERROR) << "Failed to create nvmeof queue"; + return; + } + + this->queues[ctrlr.get()] = std::move(queue); +} + +void NVMeoFWorker::removeController(NVMeoFController *ctrlr) { + auto it = this->queues.find(ctrlr); + if (it == this->queues.end()) { + return; + } + + this->queues.erase(it); +} + +void NVMeoFWorker::sendMsg(const std::function &func) { + this->msg_queue.push(&func); +} + +int NVMeoFWorker::submitTask(NVMeoFController *ctrlr, Slice *slice) { + NVMeoFWorkerTask *task; + + if (!this->free_tasks.pop(task)) { + return -ENOMEM; + } + + task->ctrlr = ctrlr; + task->slice = slice; + task->timestamp = this->clock.load(); + this->task_queue.push(task); + + return 0; +} + +void NVMeoFWorker::dispatchTasks() { + uint64_t prev = this->clock.fetch_add(1); + + if (this->curr_task == nullptr && !this->task_queue.pop(this->curr_task)) { + return; + } + + do { + auto task = std::exchange(this->curr_task, nullptr); + + auto queue = this->queues.find(task->ctrlr); + if (queue == this->queues.end()) { + 
task->slice->markFailed(); + this->free_tasks.push(task); + } else { + int rc = queue->second->submitRequest(task->slice); + if (rc == 0) { + this->free_tasks.push(task); + } else if (rc == -EAGAIN || rc == -EWOULDBLOCK) { + task->timestamp = this->clock.load(); + this->task_queue.push(task); + } else { + LOG(ERROR) << "Failed to submit request, rc = " << rc; + task->slice->markFailed(); + this->free_tasks.push(task); + } + } + } while (this->task_queue.pop(this->curr_task) && + this->curr_task->timestamp == prev); +} + +void NVMeoFWorker::poll() { + // Allow thread to be scheduled to different CPU cores. + cpu_set_t cpuset; + memset(&cpuset, -1, sizeof(cpuset)); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + std::function *func; + while (!this->stopping) { + while (this->msg_queue.pop(func)) { + (*func)(this); + } + + this->dispatchTasks(); + + // Take a break before reaping completions. + std::this_thread::yield(); + + for (auto &it : this->queues) { + it.second->reapCompletions(); + } + } + + while (this->msg_queue.pop(func)) { + (*func)(this); + } + + if (this->curr_task != nullptr) { + this->curr_task->slice->markFailed(); + this->free_tasks.push(this->curr_task); + this->curr_task = nullptr; + } + + while (this->task_queue.pop(this->curr_task)) { + this->curr_task->slice->markFailed(); + this->free_tasks.push(this->curr_task); + this->curr_task = nullptr; + } +} + +NVMeoFWorkerPool::NVMeoFWorkerPool(size_t num_workers) + : num_workers(num_workers), next_worker(0) { + for (size_t i = 0; i < num_workers; i++) { + auto worker = std::make_unique(i); + this->workers.push_back(std::move(worker)); + } +} + +NVMeoFWorkerPool::~NVMeoFWorkerPool() { this->workers.clear(); } + +int NVMeoFWorkerPool::addController(std::shared_ptr ctrlr) { + std::latch latch(this->num_workers); + auto msg_fn = [&ctrlr, &latch](NVMeoFWorker *worker) { + worker->addController(ctrlr); + latch.count_down(); + }; + + for (size_t i = 0; i < this->num_workers; i++) { + 
auto worker = this->workers[i].get(); + worker->sendMsg(msg_fn); + } + + latch.wait(); + return 0; +} + +int NVMeoFWorkerPool::removeController(NVMeoFController *ctrlr) { + std::latch latch(this->num_workers); + auto msg_fn = [&ctrlr, &latch](NVMeoFWorker *worker) { + worker->removeController(ctrlr); + latch.count_down(); + }; + + for (size_t i = 0; i < this->num_workers; i++) { + auto worker = this->workers[i].get(); + worker->sendMsg(msg_fn); + } + + latch.wait(); + return 0; +} + +int NVMeoFWorkerPool::submitTask(NVMeoFController *ctrlr, Slice *slice) { + uint32_t worker_idx; + int rc; + + /// Randomly pick a worker. + worker_idx = std::rand() % this->num_workers; + rc = this->workers[worker_idx]->submitTask(ctrlr, slice); + if (rc == 0) { + return 0; + } + + /// Try all workers. + uint32_t failed_worker_idx = worker_idx; + worker_idx = (worker_idx + 1) % this->num_workers; + while (rc != 0 && worker_idx != failed_worker_idx) { + rc = this->workers[worker_idx]->submitTask(ctrlr, slice); + worker_idx = (worker_idx + 1) % this->num_workers; + } + + if (rc != 0) { + LOG(ERROR) << "Failed to submit transfer task"; + } + + return rc; +} + +} // namespace mooncake From 5dfcc0187bbc82ba38279108c4e4e85f7be9d6a0 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Sun, 24 Aug 2025 07:54:16 +0000 Subject: [PATCH 04/15] [Store]: Support zero based buffer in Cachelib Allocator Signed-off-by: Jinlong Chen --- mooncake-store/src/allocator.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mooncake-store/src/allocator.cpp b/mooncake-store/src/allocator.cpp index c97a85355..ad32a393d 100644 --- a/mooncake-store/src/allocator.cpp +++ b/mooncake-store/src/allocator.cpp @@ -56,12 +56,15 @@ CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, LOG_ASSERT(header_region_start_); + // Add a padding to base to support zero-based buffers. 
+ auto padded_base = base + facebook::cachelib::Slab::kSize; + // Initialize the CacheLib MemoryAllocator. memory_allocator_ = std::make_unique( facebook::cachelib::MemoryAllocator::Config( facebook::cachelib::MemoryAllocator::generateAllocSizes()), reinterpret_cast(header_region_start_.get()), - header_region_size_, reinterpret_cast(base), size); + header_region_size_, reinterpret_cast(padded_base), size); if (!memory_allocator_) { LOG(ERROR) << "status=failed_to_init_facebook_memory_allocator"; @@ -88,6 +91,10 @@ std::unique_ptr CachelibBufferAllocator::allocate( << " current_size=" << cur_size_; return nullptr; } + + // Un-padding the buffer. + buffer = reinterpret_cast(reinterpret_cast(buffer) - + facebook::cachelib::Slab::kSize); } catch (const std::exception& e) { LOG(ERROR) << "allocation_exception error=" << e.what(); return nullptr; @@ -106,7 +113,10 @@ std::unique_ptr CachelibBufferAllocator::allocate( void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { try { // Deallocate memory using CacheLib. 
- memory_allocator_->free(handle->buffer_ptr_); + auto buffer = reinterpret_cast( + reinterpret_cast(handle->buffer_ptr_) + + facebook::cachelib::Slab::kSize); + memory_allocator_->free(buffer); handle->status = BufStatus::UNREGISTERED; size_t freed_size = handle->size_; // Store size before handle might become invalid From a116150e621ab42aaa7f5861baa85f92e5b5eb57 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 22 Aug 2025 08:09:52 +0000 Subject: [PATCH 05/15] [Store]: Support mounting file segments Signed-off-by: Jinlong Chen --- mooncake-store/include/allocator.h | 18 ++- mooncake-store/include/client.h | 14 +++ mooncake-store/include/types.h | 48 +++++++- mooncake-store/src/allocator.cpp | 23 ++-- mooncake-store/src/client.cpp | 157 ++++++++++++++++++++++++++- mooncake-store/src/segment.cpp | 6 +- mooncake-store/src/transfer_task.cpp | 1 + 7 files changed, 244 insertions(+), 23 deletions(-) diff --git a/mooncake-store/include/allocator.h b/mooncake-store/include/allocator.h index f2898cd3a..2a4f7f878 100644 --- a/mooncake-store/include/allocator.h +++ b/mooncake-store/include/allocator.h @@ -53,12 +53,13 @@ class AllocatedBuffer { struct Descriptor; AllocatedBuffer(std::shared_ptr allocator, - std::string segment_name, void* buffer_ptr, - std::size_t size, + std::string segment_name, FileBufferID file_id, + void* buffer_ptr, std::size_t size, std::optional&& offset_handle = std::nullopt) : allocator_(std::move(allocator)), segment_name_(std::move(segment_name)), + file_id_(file_id), buffer_ptr_(buffer_ptr), size_(size), offset_handle_(std::move(offset_handle)) {} @@ -92,10 +93,12 @@ class AllocatedBuffer { // Represents the serializable state struct Descriptor { std::string segment_name_; + FileBufferID file_id_; uint64_t size_; uintptr_t buffer_address_; BufStatus status_; - YLT_REFL(Descriptor, segment_name_, size_, buffer_address_, status_); + YLT_REFL(Descriptor, segment_name_, file_id_, size_, buffer_address_, + status_); }; void mark_complete() { 
status = BufStatus::COMPLETE; } @@ -103,6 +106,7 @@ class AllocatedBuffer { private: std::weak_ptr allocator_; std::string segment_name_; + FileBufferID file_id_; BufStatus status{BufStatus::INIT}; void* buffer_ptr_{nullptr}; std::size_t size_{0}; @@ -152,7 +156,8 @@ class CachelibBufferAllocator : public BufferAllocatorBase, public std::enable_shared_from_this { public: - CachelibBufferAllocator(std::string segment_name, size_t base, size_t size); + CachelibBufferAllocator(std::string segment_name, size_t base, size_t size, + FileBufferID file_id = 0); ~CachelibBufferAllocator() override; @@ -170,6 +175,7 @@ class CachelibBufferAllocator const size_t base_; const size_t total_size_; std::atomic_size_t cur_size_; + const FileBufferID file_id_; // metrics - removed allocated_bytes_ member // ylt::metric::gauge_t* allocated_bytes_{nullptr}; @@ -189,7 +195,8 @@ class OffsetBufferAllocator : public BufferAllocatorBase, public std::enable_shared_from_this { public: - OffsetBufferAllocator(std::string segment_name, size_t base, size_t size); + OffsetBufferAllocator(std::string segment_name, size_t base, size_t size, + FileBufferID file_id = 0); ~OffsetBufferAllocator() override; @@ -207,6 +214,7 @@ class OffsetBufferAllocator const size_t base_; const size_t total_size_; std::atomic_size_t cur_size_; + const FileBufferID file_id_; // offset allocator implementation std::shared_ptr offset_allocator_; diff --git a/mooncake-store/include/client.h b/mooncake-store/include/client.h index b5ad9625a..8658b8a6d 100644 --- a/mooncake-store/include/client.h +++ b/mooncake-store/include/client.h @@ -179,6 +179,20 @@ class Client { tl::expected UnmountSegment(const void* buffer, size_t size); + /** + * @brief Register a file to master for allocation + * @param path The file path + * @return ErrorCode indicating success/failure + */ + tl::expected MountFileSegment(const std::string& path); + + /** + * @brief Unregisters a file segment from master + * @param path File path to 
unregister + * @return ErrorCode indicating success/failure + */ + tl::expected UnmountFileSegment(const std::string& path); + /** * @brief Registers memory buffer with TransferEngine for data transfer * @param addr Memory address to register diff --git a/mooncake-store/include/types.h b/mooncake-store/include/types.h index 077926ba7..45d94da40 100644 --- a/mooncake-store/include/types.h +++ b/mooncake-store/include/types.h @@ -45,6 +45,7 @@ class Replica; using ObjectKey = std::string; using Version = uint64_t; using SegmentId = int64_t; +using FileBufferID = uint32_t; using TaskID = int64_t; using BufHandleList = std::vector>; // using ReplicaList = std::vector; @@ -164,20 +165,61 @@ const static uint64_t kMaxSliceSize = facebook::cachelib::Slab::kSize - 16; // should be lower than limit /** - * @brief Represents a contiguous memory region + * @brief Type of segments. + */ +enum class SegmentType { + UNKNOWN = -1, + MEMORY, + FILE, +}; + +/** + * @brief Stream operator for SegmentType + */ +inline std::ostream& operator<<(std::ostream& os, + const SegmentType& type) noexcept { + static const std::unordered_map type_strings{ + {SegmentType::UNKNOWN, "UNKNOWN"}, + {SegmentType::MEMORY, "MEMORY"}, + {SegmentType::FILE, "FILE"}}; + + os << (type_strings.count(type) ? type_strings.at(type) : "UNKNOWN"); + return os; +} + +/** + * @brief Represents a contiguous storage region, could be memory or file. */ struct Segment { UUID id{0, 0}; + SegmentType type{SegmentType::UNKNOWN}; std::string name{}; // The name of the segment, also might be the // hostname of the server that owns the segment uintptr_t base{0}; size_t size{0}; + // For a file segment, this will be the path of the file. + std::string path{}; + // For a file segment, this will be the id of the file buffer. 
+ FileBufferID file_id{0}; Segment() = default; Segment(const UUID& id, const std::string& name, uintptr_t base, size_t size) - : id(id), name(name), base(base), size(size) {} + : id(id), + type(SegmentType::MEMORY), + name(name), + base(base), + size(size) {} + Segment(const UUID& id, const std::string& name, uintptr_t base, + size_t size, const std::string& path, FileBufferID file_id) + : id(id), + type(SegmentType::FILE), + name(name), + base(base), + size(size), + path(path), + file_id(file_id) {} }; -YLT_REFL(Segment, id, name, base, size); +YLT_REFL(Segment, id, type, name, base, size, path, file_id); /** * @brief Client status from the master's perspective diff --git a/mooncake-store/src/allocator.cpp b/mooncake-store/src/allocator.cpp index ad32a393d..4087ffe77 100644 --- a/mooncake-store/src/allocator.cpp +++ b/mooncake-store/src/allocator.cpp @@ -23,7 +23,7 @@ AllocatedBuffer::~AllocatedBuffer() { // Implementation of get_descriptor AllocatedBuffer::Descriptor AllocatedBuffer::get_descriptor() const { - return {segment_name_, static_cast(size()), + return {segment_name_, file_id_, static_cast(size()), reinterpret_cast(buffer_ptr_), status}; } @@ -31,6 +31,7 @@ AllocatedBuffer::Descriptor AllocatedBuffer::get_descriptor() const { std::ostream& operator<<(std::ostream& os, const AllocatedBuffer& buffer) { return os << "AllocatedBuffer: { " << "segment_name: " << buffer.segment_name_ << ", " + << "file_id: " << buffer.file_id_ << ", " << "size: " << buffer.size() << ", " << "status: " << buffer.status << ", " << "buffer_ptr: " << static_cast(buffer.data()) << " }"; @@ -38,14 +39,16 @@ std::ostream& operator<<(std::ostream& os, const AllocatedBuffer& buffer) { // Removed allocated_bytes parameter and member initialization CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, - size_t base, size_t size) + size_t base, size_t size, + FileBufferID file_id) : segment_name_(segment_name), base_(base), total_size_(size), - cur_size_(0) { + 
cur_size_(0), + file_id_(file_id) { VLOG(1) << "initializing_buffer_allocator segment_name=" << segment_name << " base_address=" << reinterpret_cast(base) - << " size=" << size; + << " size=" << size << " file_id=" << file_id; // Calculate the size of the header region. header_region_size_ = @@ -107,7 +110,7 @@ std::unique_ptr CachelibBufferAllocator::allocate( cur_size_.fetch_add(size); MasterMetricManager::instance().inc_allocated_size(size); return std::make_unique(shared_from_this(), segment_name_, - buffer, size); + file_id_, buffer, size); } void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { @@ -133,14 +136,16 @@ void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { // OffsetBufferAllocator implementation OffsetBufferAllocator::OffsetBufferAllocator(std::string segment_name, - size_t base, size_t size) + size_t base, size_t size, + FileBufferID file_id) : segment_name_(segment_name), base_(base), total_size_(size), - cur_size_(0) { + cur_size_(0), + file_id_(file_id) { VLOG(1) << "initializing_offset_buffer_allocator segment_name=" << segment_name << " base_address=" << reinterpret_cast(base) - << " size=" << size; + << " size=" << size << " file_id=" << file_id; try { // 1k <= init_capacity <= 64k @@ -196,7 +201,7 @@ std::unique_ptr OffsetBufferAllocator::allocate(size_t size) { // Create a custom AllocatedBuffer that manages the // OffsetAllocationHandle allocated_buffer = std::make_unique( - shared_from_this(), segment_name_, buffer_ptr, size, + shared_from_this(), segment_name_, file_id_, buffer_ptr, size, std::move(allocation_handle)); VLOG(1) << "allocation_succeeded size=" << size << " segment=" << segment_name_ << " address=" << buffer_ptr; diff --git a/mooncake-store/src/client.cpp b/mooncake-store/src/client.cpp index 536405a88..90a1f0b1d 100644 --- a/mooncake-store/src/client.cpp +++ b/mooncake-store/src/client.cpp @@ -10,6 +10,10 @@ #include #include #include +#include +#include +#include +#include #include 
"transfer_engine.h" #include "transfer_task.h" @@ -35,6 +39,45 @@ namespace mooncake { return slice_size; } +static size_t getFileSize(const std::string& file) { + size_t size = 0; + struct stat st; + int rc; + + int fd = open(file.c_str(), O_RDONLY); + if (fd < 0) { + LOG(ERROR) << "Failed to open file " << file << ", errno=" << errno; + goto out; + } + + rc = fstat(fd, &st); + if (rc < 0) { + LOG(ERROR) << "Failed fstat on file " << file << ", errno=" << errno; + goto close_file; + } + + if (S_ISLNK(st.st_mode)) { + LOG(ERROR) << "File " << file << " is a symbolic link"; + goto close_file; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + rc = ioctl(fd, BLKGETSIZE64, &size); + if (rc < 0) { + LOG(ERROR) << "Failed ioctl on file " << file + << ", errno=" << errno; + size = 0; + } + } else if (S_ISREG(st.st_mode)) { + size = st.st_size; + } + +close_file: + close(fd); +out: + return size; +} + Client::Client(const std::string& local_hostname, const std::string& metadata_connstring) : metrics_(ClientMetric::Create()), @@ -72,8 +115,21 @@ Client::~Client() { } for (auto& segment : segments_to_unmount) { - auto result = - UnmountSegment(reinterpret_cast(segment.base), segment.size); + tl::expected result; + switch (segment.type) { + case SegmentType::MEMORY: + result = UnmountSegment(reinterpret_cast(segment.base), + segment.size); + break; + case SegmentType::FILE: + result = UnmountFileSegment(segment.path); + break; + default: + result = tl::unexpected(ErrorCode::INVALID_PARAMS); + LOG(ERROR) << "Unknown segment type: " << segment.type; + break; + } + if (!result) { LOG(ERROR) << "Failed to unmount segment: " << toString(result.error()); @@ -237,6 +293,9 @@ ErrorCode Client::InitTransferEngine(const std::string& local_hostname, << e.what() << "\""; return ErrorCode::INTERNAL_ERROR; } + } else if (protocol == "nvmeof_generic") { + LOG(INFO) << "transport_type=" << protocol; + transport = transfer_engine_.installTransport(protocol, protocol_args); } else { 
LOG(ERROR) << "unsupported_protocol protocol=" << protocol; return ErrorCode::INVALID_PARAMS; @@ -1034,6 +1093,10 @@ tl::expected Client::MountSegment(const void* buffer, // Check if the segment overlaps with any existing segment for (auto& it : mounted_segments_) { auto& mtseg = it.second; + // Skip non-memory segments. + if (mtseg.type != SegmentType::MEMORY) { + continue; + } uintptr_t l1 = reinterpret_cast(mtseg.base); uintptr_t r1 = reinterpret_cast(mtseg.size) + l1; uintptr_t l2 = reinterpret_cast(buffer); @@ -1076,7 +1139,8 @@ tl::expected Client::UnmountSegment(const void* buffer, for (auto it = mounted_segments_.begin(); it != mounted_segments_.end(); ++it) { - if (it->second.base == reinterpret_cast(buffer) && + if (it->second.type == SegmentType::MEMORY && + it->second.base == reinterpret_cast(buffer) && it->second.size == size) { segment = it; break; @@ -1113,6 +1177,93 @@ tl::expected Client::UnmountSegment(const void* buffer, return {}; } +tl::expected Client::MountFileSegment( + const std::string& path) { + const size_t size = getFileSize(path); + if (size <= 0) { + LOG(ERROR) << "Invalid file " << path << " to mount"; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + std::lock_guard lock(mounted_segments_mutex_); + + for (auto& it : mounted_segments_) { + auto& mtseg = it.second; + // Skip non-file segments. 
+ if (mtseg.type != SegmentType::FILE) { + continue; + } + + if (mtseg.path == path) { + LOG(ERROR) << "Duplicated file segment path=" << mtseg.path; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + } + + FileBufferID file_id; + int rc = transfer_engine_.registerLocalFile(path, size, file_id); + if (rc != 0) { + LOG(ERROR) << "register_local_file_failed path=" << path + << " size=" << size << ", error=" << rc; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + Segment segment(generate_uuid(), local_hostname_, 0, size, path, file_id); + + auto mount_result = master_client_.MountSegment(segment, client_id_); + if (!mount_result) { + ErrorCode err = mount_result.error(); + LOG(ERROR) << "mount_segment_to_master_failed path=" << path + << " size=" << size << ", error=" << err; + return tl::unexpected(err); + } + + mounted_segments_[segment.id] = segment; + return {}; +} + +tl::expected Client::UnmountFileSegment( + const std::string& path) { + std::lock_guard lock(mounted_segments_mutex_); + + auto segment = mounted_segments_.end(); + for (auto it = mounted_segments_.begin(); it != mounted_segments_.end(); + it++) { + if (it->second.type == SegmentType::FILE && it->second.path == path) { + segment = it; + break; + } + } + if (segment == mounted_segments_.end()) { + LOG(ERROR) << "segment_not_found path=" << path; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + auto unmount_result = + master_client_.UnmountSegment(segment->second.id, client_id_); + if (!unmount_result) { + ErrorCode err = unmount_result.error(); + LOG(ERROR) << "Failed to unmount segment from master: " + << toString(err); + return tl::unexpected(err); + } + + int rc = transfer_engine_.unregisterLocalFile(segment->second.path); + if (rc != 0) { + LOG(ERROR) << "Failed to unregister file with transfer " + "engine ret is " + << rc; + if (rc != ERR_ADDRESS_NOT_REGISTERED) { + return tl::unexpected(ErrorCode::INTERNAL_ERROR); + } + // Otherwise, the segment is already 
unregistered from transfer + // engine, we can continue + } + + mounted_segments_.erase(segment); + return {}; +} + tl::expected Client::RegisterLocalMemory( void* addr, size_t length, const std::string& location, bool remote_accessible, bool update_metadata) { diff --git a/mooncake-store/src/segment.cpp b/mooncake-store/src/segment.cpp index ae634fde4..66e0d1646 100644 --- a/mooncake-store/src/segment.cpp +++ b/mooncake-store/src/segment.cpp @@ -10,7 +10,7 @@ ErrorCode ScopedSegmentAccess::MountSegment(const Segment& segment, const size_t size = segment.size; // Check if parameters are valid before allocating memory. - if (buffer == 0 || size == 0) { + if ((segment.type == SegmentType::MEMORY && buffer == 0) || size == 0) { LOG(ERROR) << "buffer=" << buffer << " or size=" << size << " is invalid"; return ErrorCode::INVALID_PARAMS; @@ -50,11 +50,11 @@ ErrorCode ScopedSegmentAccess::MountSegment(const Segment& segment, switch (segment_manager_->memory_allocator_) { case BufferAllocatorType::CACHELIB: allocator = std::make_shared( - segment.name, buffer, size); + segment.name, buffer, size, segment.file_id); break; case BufferAllocatorType::OFFSET: allocator = std::make_shared( - segment.name, buffer, size); + segment.name, buffer, size, segment.file_id); break; default: LOG(ERROR) << "segment_name=" << segment.name diff --git a/mooncake-store/src/transfer_task.cpp b/mooncake-store/src/transfer_task.cpp index 27b550b7c..5f03cceed 100644 --- a/mooncake-store/src/transfer_task.cpp +++ b/mooncake-store/src/transfer_task.cpp @@ -495,6 +495,7 @@ std::optional TransferSubmitter::submitTransferEngineOperation( request.opcode = op_code; request.source = static_cast(slice.ptr); request.target_id = seg; + request.file_id = handle.file_id_; request.target_offset = handle.buffer_address_; request.length = handle.size_; From 142696e85ed194189aea35ee6370f6b37aa58975 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 22 Aug 2025 07:42:47 +0000 Subject: [PATCH 06/15] 
[Store][bind]: Add setup_with_files to setup file-based client Signed-off-by: Jinlong Chen --- mooncake-integration/store/store_py.cpp | 12 +++++ mooncake-store/include/pybind_client.h | 23 ++++++++ mooncake-store/src/pybind_client.cpp | 71 +++++++++++++++++++++++-- 3 files changed, 101 insertions(+), 5 deletions(-) diff --git a/mooncake-integration/store/store_py.cpp b/mooncake-integration/store/store_py.cpp index 4a45e195f..071f7f0f1 100644 --- a/mooncake-integration/store/store_py.cpp +++ b/mooncake-integration/store/store_py.cpp @@ -343,6 +343,18 @@ PYBIND11_MODULE(store, m) { local_buffer_size, protocol, rdma_devices, master_server_addr); }) + .def("setup_with_files", + [](MooncakeStorePyWrapper &self, const std::string &local_hostname, + const std::string &metadata_server, + const std::vector &files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string &protocol = "nvmeof_generic", + const std::string &protocol_arg = "", + const std::string &master_server_addr = "127.0.0.1:50051") { + return self.store_.setup_with_files( + local_hostname, metadata_server, files, local_buffer_size, + protocol, protocol_arg, master_server_addr); + }) .def("init_all", [](MooncakeStorePyWrapper &self, const std::string &protocol, const std::string &device_name, diff --git a/mooncake-store/include/pybind_client.h b/mooncake-store/include/pybind_client.h index 207c9449d..3d8140e8d 100644 --- a/mooncake-store/include/pybind_client.h +++ b/mooncake-store/include/pybind_client.h @@ -79,6 +79,14 @@ class PyClient { const std::string &rdma_devices = "", const std::string &master_server_addr = "127.0.0.1:50051"); + int setup_with_files( + const std::string &local_hostname, const std::string &metadata_server, + const std::vector &files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string &protocol = "nvmeof_generic", + const std::string &protocol_arg = "", + const std::string &master_server_addr = "127.0.0.1:50051"); + int initAll(const std::string &protocol, 
const std::string &device_name, size_t mount_segment_size = 1024 * 1024 * 16); // Default 16MB @@ -226,6 +234,13 @@ class PyClient { int64_t getSize(const std::string &key); // Internal versions that return tl::expected + + tl::expected common_setup_internal( + const std::string &local_hostname, const std::string &metadata_server, + size_t local_buffer_size, const std::string &protocol, + const std::string &protocol_args, + const std::string &master_server_addr); + tl::expected setup_internal( const std::string &local_hostname, const std::string &metadata_server, size_t global_segment_size = 1024 * 1024 * 16, @@ -234,6 +249,14 @@ class PyClient { const std::string &rdma_devices = "", const std::string &master_server_addr = "127.0.0.1:50051"); + tl::expected setup_with_files_internal( + const std::string &local_hostname, const std::string &metadata_server, + const std::vector &files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string &protocol = "nvmeof_generic", + const std::string &protocol_arg = "", + const std::string &master_server_addr = "127.0.0.1:50051"); + tl::expected initAll_internal( const std::string &protocol, const std::string &device_name, size_t mount_segment_size = 1024 * 1024 * 16); diff --git a/mooncake-store/src/pybind_client.cpp b/mooncake-store/src/pybind_client.cpp index c0d9e06a9..ab2732233 100644 --- a/mooncake-store/src/pybind_client.cpp +++ b/mooncake-store/src/pybind_client.cpp @@ -89,11 +89,10 @@ PyClient::~PyClient() { ResourceTracker::getInstance().unregisterInstance(this); } -tl::expected PyClient::setup_internal( +tl::expected PyClient::common_setup_internal( const std::string &local_hostname, const std::string &metadata_server, - size_t global_segment_size, size_t local_buffer_size, - const std::string &protocol, const std::string &rdma_devices, - const std::string &master_server_addr) { + size_t local_buffer_size, const std::string &protocol, + const std::string &protocol_args, const std::string 
&master_server_addr) { this->protocol = protocol; // Remove port if hostname already contains one @@ -112,7 +111,13 @@ tl::expected PyClient::setup_internal( this->local_hostname = local_hostname; } - void **args = (protocol == "rdma") ? rdma_args(rdma_devices) : nullptr; + void **args = nullptr; + if (protocol == "rdma") { + args = rdma_args(protocol_args); + } else if (protocol == "nvmeof_generic" && !protocol_args.empty()) { + args = (void **)calloc(2, sizeof(void *)); + args[0] = (void *)protocol_args.c_str(); + } auto client_opt = mooncake::Client::Create(this->local_hostname, metadata_server, protocol, args, master_server_addr); @@ -139,6 +144,23 @@ tl::expected PyClient::setup_internal( LOG(INFO) << "Local buffer size is 0, skip registering local memory"; } + return {}; +} + +tl::expected PyClient::setup_internal( + const std::string &local_hostname, const std::string &metadata_server, + size_t global_segment_size, size_t local_buffer_size, + const std::string &protocol, const std::string &rdma_devices, + const std::string &master_server_addr) { + // Common setups. + auto result = common_setup_internal(local_hostname, metadata_server, + local_buffer_size, protocol, + rdma_devices, master_server_addr); + if (!result.has_value()) { + LOG(ERROR) << "Failed to setup PyClient"; + return tl::unexpected(result.error()); + } + // If global_segment_size is 0, skip mount segment; // If global_segment_size is larger than max_mr_size, split to multiple // segments. @@ -182,6 +204,45 @@ int PyClient::setup(const std::string &local_hostname, protocol, rdma_devices, master_server_addr)); } +tl::expected PyClient::setup_with_files_internal( + const std::string &local_hostname, const std::string &metadata_server, + const std::vector &files, size_t local_buffer_size, + const std::string &protocol, const std::string &protocol_arg, + const std::string &master_server_addr) { + // Common setups. 
+ auto result = common_setup_internal(local_hostname, metadata_server, + local_buffer_size, protocol, + protocol_arg, master_server_addr); + if (!result.has_value()) { + LOG(ERROR) << "Failed to setup PyClient"; + return tl::unexpected(result.error()); + } + + // Mount file segments. + for (auto &file : files) { + auto result = client_->MountFileSegment(file); + if (!result.has_value()) { + LOG(ERROR) << "Failed to mount file " << file + << ", error=" << result.error(); + return tl::unexpected(result.error()); + } + } + + return {}; +} + +int PyClient::setup_with_files(const std::string &local_hostname, + const std::string &metadata_server, + const std::vector &files, + size_t local_buffer_size, + const std::string &protocol, + const std::string &protocol_arg, + const std::string &master_server_addr) { + return to_py_ret(setup_with_files_internal( + local_hostname, metadata_server, files, local_buffer_size, protocol, + protocol_arg, master_server_addr)); +} + tl::expected PyClient::initAll_internal( const std::string &protocol_, const std::string &device_name, size_t mount_segment_size) { From 6273dc1d14993dae343a9599da78075c7401c3a4 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Mon, 25 Aug 2025 13:47:44 +0000 Subject: [PATCH 07/15] [Store]: Update stress_cluster_benchmark.py to support file segments Signed-off-by: Jinlong Chen --- .../tests/stress_cluster_benchmark.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mooncake-store/tests/stress_cluster_benchmark.py b/mooncake-store/tests/stress_cluster_benchmark.py index cfd806d55..8d5445723 100644 --- a/mooncake-store/tests/stress_cluster_benchmark.py +++ b/mooncake-store/tests/stress_cluster_benchmark.py @@ -183,20 +183,26 @@ def setup(self): # Setup store protocol = self.args.protocol - device_name = self.args.device_name + protocol_args = self.args.protocol_args local_hostname = self.args.local_hostname metadata_server = self.args.metadata_server - global_segment_size = 
self.args.global_segment_size * 1024 * 1024 local_buffer_size = self.args.local_buffer_size * 1024 * 1024 master_server_address = self.args.master_server logger.info(f"Setting up {self.args.role} instance with batch_size={self.args.batch_size}") - logger.info(f" Protocol: {protocol}, Device: {device_name}") - logger.info(f" Global segment: {global_segment_size // (1024*1024)} MB") + logger.info(f" Protocol: {protocol}, Protocol args: {protocol_args}") logger.info(f" Local buffer: {local_buffer_size // (1024*1024)} MB") - retcode = self.store.setup(local_hostname, metadata_server, global_segment_size, - local_buffer_size, protocol, device_name, master_server_address) + if self.args.files is None: + global_segment_size = self.args.global_segment_size * 1024 * 1024 + logger.info(f" Global segment: {global_segment_size // (1024*1024)} MB") + retcode = self.store.setup(local_hostname, metadata_server, global_segment_size, + local_buffer_size, protocol, protocol_args, master_server_address) + else: + files = self.args.files.split() + logger.info(f" Files: {files}") + retcode = self.store.setup_with_files(local_hostname, metadata_server, files, + local_buffer_size, protocol, protocol_args, master_server_address) if retcode: logger.error(f"Store setup failed with return code {retcode}") exit(1) @@ -428,14 +434,18 @@ def parse_arguments(): # Network and connection settings parser.add_argument("--protocol", type=str, default="rdma", help="Communication protocol to use") - parser.add_argument("--device-name", type=str, default="erdma_0", help="Network device name for RDMA") + parser.add_argument("--protocol-args", "--device-name", dest="protocol_args", type=str, + default="erdma_0", help="Protocol specific args, e.g. 
Network device name for RDMA") parser.add_argument("--local-hostname", type=str, default="localhost", help="Local hostname") parser.add_argument("--metadata-server", type=str, default="http://127.0.0.1:8080/metadata", help="Metadata server address") parser.add_argument("--master-server", type=str, default="localhost:50051", help="Master server address") # Memory and storage settings - parser.add_argument("--global-segment-size", type=int, default=10000, help="Global segment size in MB") parser.add_argument("--local-buffer-size", type=int, default=512, help="Local buffer size in MB") + # Only one of --global-segment-size and --files should be specified. + group = parser.add_mutually_exclusive_group() + group.add_argument("--global-segment-size", type=int, default=10000, help="Global segment size in MB") + group.add_argument("--files", type=str, default=None, help="Files to be registered as global segments") # Test parameters parser.add_argument("--max-requests", type=int, default=1200, help="Maximum number of requests to process") From 8ada073baf69b0ba80c253753f157c500997f564 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 26 Aug 2025 13:12:39 +0000 Subject: [PATCH 08/15] [Doc]: Add documents for nvmeof_generic transport Signed-off-by: Jinlong Chen --- doc/en/nvmeof_generic_transport.md | 304 +++++++++++++++++++++++++++++ doc/zh/nvmeof_generic_transport.md | 304 +++++++++++++++++++++++++++++ 2 files changed, 608 insertions(+) create mode 100644 doc/en/nvmeof_generic_transport.md create mode 100644 doc/zh/nvmeof_generic_transport.md diff --git a/doc/en/nvmeof_generic_transport.md b/doc/en/nvmeof_generic_transport.md new file mode 100644 index 000000000..e4f96db26 --- /dev/null +++ b/doc/en/nvmeof_generic_transport.md @@ -0,0 +1,304 @@ +# Generic NVMeoF Transport + +## Overview + +NVMeoFGenericTransport is a more complete NVMeoF protocol-based TransferEngine Transport, designed to eventually replace the existing NVMeoFTransport and provide TransferEngine with 
the ability to manage and access file Segments. + +Compared to the legacy NVMeoFTransport, NVMeoFGenericTransport offers the following advantages: + +- **More Complete:** Provides a full set of management interfaces consistent with memory Segments, including registering/unregistering local files, mounting/unmounting remote files, etc. +- **More Generic:** No longer depends on cuFile, and can be deployed and used in environments without CUDA support. +- **Higher Performance:** Supports multi-threaded I/O and Direct I/O, fully leveraging the performance potential of NICs and SSDs. +- **More Reliable:** Ensures that unavailability of a single file or storage device does not affect the availability of others, through a more flexible multi-file management scheme. + +## Component Support + +Both TransferEngine and Mooncake Store have added full support for NVMeoFGenericTransport. The relevant API interfaces are listed below: + +### TransferEngine Support + +`TransferEngine` now supports registering and reading/writing file segments. This mainly includes adding fields related to file management and access in `SegmentDesc` and `TransferRequest`, and introducing interfaces for registering and unregistering files. + +#### SegmentDesc + +To support file registration management, the `file_buffers` field has been added to `SegmentDesc`. + +```cpp +using FileBufferID = uint32_t; +struct FileBufferDesc { + FileBufferID id; // File ID, used to identify the file within a Segment + std::string path; // File path on the owning node + std::size_t size; // Available space size of the file + std::size_t align; // For future usage. +}; + +struct SegmentDesc { + std::string name; + std::string protocol; + // Generic file buffers. + std::vector<FileBufferDesc> file_buffers; + + // Other fields... +}; +``` + +#### TransferRequest + +To support multi-file registration and access, the `file_id` field has been added to `TransferRequest` to identify the file to be read from or written to.
+ +```cpp +struct TransferRequest { + enum OpCode { READ, WRITE }; + OpCode opcode; + void *source; + SegmentID target_id; + uint64_t target_offset; // When accessing a file, target_offset indicates the offset within the target file + size_t length; + int advise_retry_cnt = 0; + FileBufferID file_id; // Target file ID, required only when accessing files, used with target_id to locate the target file +}; +``` + +`file_id` is the ID assigned by the target `TransferEngine` when registering the target file, and can be obtained from the `SegmentDesc` of the target `Segment`. + +#### installTransport + +```cpp +Transport *installTransport(const std::string &proto, void **args) +``` + +The `TransferEngine::installTransport` interface now supports directly passing the `args` parameter to the `install` interface of the corresponding Transport, enabling Transport-specific initialization parameters. + +For `NVMeoFGenericTransport`, if the current TransferEngine instance does not need to share local files, the `args` parameter can be `nullptr`. Otherwise, `args` should be a valid pointer array, where the first pointer points to a `char *` that references a string containing NVMeoF Target configuration parameters. For example: + +```cpp +// NVMeoF Target configuration parameters +char *trid_str = "trtype= adrfam= traddr= trsvcid="; + +// Arguments for installTransport +void **args = (void **)&trid_str; +``` + +#### registerLocalFile + +```cpp +int registerLocalFile(const std::string &path, size_t size, FileBufferID &id); +``` + +Registers a local file into TransferEngine, enabling cross-node access. The file can be a regular file or a block device file. 
**Note: Using a block device file for registration may cause data corruption or complete loss on the device—use with caution!** + +- `path`: File path, can be any regular file or block device file such as `/dev/nvmeXnY`; +- `size`: Available space size of the file, can be less than or equal to the physical size; +- `id`: ID assigned by `TransferEngine` to the file, used to distinguish each file when multiple files are registered; +- Return value: Returns 0 on success, otherwise returns a negative error code; + +#### unregisterLocalFile + +```cpp +int unregisterLocalFile(const std::string &path); +``` + +Unregisters a local file. + +- `path`: File path, must match the path used during registration; + +### Mooncake Store Support + +Mooncake Store now supports using files as shared storage space for storing objects. This capability is based on two newly added interfaces: + +#### MountFileSegment + +```cpp +tl::expected<void, ErrorCode> MountFileSegment(const std::string& path); +``` + +Mounts the local file at `path` as part of the shared storage space. + +#### UnmountFileSegment + +```cpp +tl::expected<void, ErrorCode> UnmountFileSegment(const std::string& path); +``` + +Unmounts a previously mounted file. + +### Mooncake Store Python API + +The Mooncake Store Python API now supports specifying a set of local files as shared storage space. + +#### setup_with_files + +```python +def setup_with_files( + local_hostname: str, + metadata_server: str, + files: List[str], + local_buffer_size: int, + protocol: str, + protocol_arg: str, + master_server_addr: str + ): + pass +``` + +Starts a Mooncake Store Client instance and registers the specified files as shared storage space. + +## Running Tests + +Users can test NVMeoFGenericTransport at both the TransferEngine and Mooncake Store levels.
+ +### Environment Requirements + +In addition to the original compilation and runtime environment of the Mooncake project, NVMeoFGenericTransport has additional requirements: + +#### Kernel Version and Drivers + +NVMeoFGenericTransport currently relies on the Linux kernel's nvme and nvmet driver suite, including the following kernel modules: + +- NVMeoF RDMA: Requires Linux Kernel 4.8 or higher, install drivers: + +```bash +# Initiator driver, required for accessing remote files +modprobe nvme_rdma + +# Target driver, required for sharing local files +modprobe nvmet_rdma +``` + +- NVMeoF TCP: Requires Linux Kernel 5.0 or higher, install drivers: + +```bash +# Initiator driver, required for accessing remote files +modprobe nvme_tcp + +# Target driver, required for sharing local files +modprobe nvmet_tcp +``` + +#### Dependencies + +NVMeoFGenericTransport depends on the following third-party libraries: + +```bash +apt install -y libaio-dev libnvme-dev +``` + +### Build Options + +To enable NVMeoFGenericTransport, the `USE_NVMEOF_GENERIC` build option must be turned on: + +```bash +cmake .. -DUSE_NVMEOF_GENERIC=ON +``` + +### Runtime Options + +NVMeoFGenericTransport supports configuring the following runtime options via environment variables: + +- `MC_NVMEOF_GENERIC_DIRECT_IO`: Use Direct I/O when reading/writing NVMeoF SSDs. Disabled by default. Enabling this option can significantly improve performance, but requires that buffer addresses, SSD locations, and I/O lengths all meet alignment requirements (typically 512-byte alignment, 4 KiB alignment recommended). +- `MC_NVMEOF_GENERIC_NUM_WORKERS`: Number of threads used for reading/writing NVMeoF SSDs. Default is 8. + +### TransferEngine Testing + +After enabling the `USE_NVMEOF_GENERIC` option and completing the build, an executable named `transfer_engine_nvmeof_generic_bench` can be found under `build/mooncake-transfer-engine/example`. This program can be used to test the performance of NVMeoFGenericTransport. 
+ +#### Start Metadata Service + +Same as the `transfer_engine_bench` test tool. Refer to [transfer-engine.md](./transfer-engine.md#example-transfer-engine-bench) for details. + +Assume the metadata service address is `http://127.0.0.1:8080/metadata` (using HTTP metadata service as an example). + +#### Start Target + +**Note: After file registration, existing data may be corrupted or completely lost—use with extreme caution!!** + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=target \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### Start Initiator + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8082 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=initiator \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB +``` + +#### Loopback Mode + +For quick validation, loopback mode can also be used to test on a single machine: + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=loopback \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### Performance Tuning + +- For workloads involving many files, increasing `MC_NVMEOF_GENERIC_NUM_WORKERS` appropriately usually improves performance.
+- If `--block_size` meets `4 KiB` alignment, set environment variable `MC_NVMEOF_GENERIC_DIRECT_IO=on` to significantly boost performance on SSD devices. + +### Mooncake Store Testing + +Use `mooncake-store/tests/stress_cluster_benchmark.py` to test the performance of Mooncake Store based on NVMeoFGenericTransport. + +#### Start Metadata Service + +Follow the instructions in [transfer-engine.md](./transfer-engine.md#example-transfer-engine-bench) and [mooncake-store-preview.md](./mooncake-store-preview.md#starting-the-master-service) to start the metadata service and Master service respectively. + +#### Start Prefill Instance + +```bash +python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### Start Decode Instance + +```bash +python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" +``` + +#### Performance Tuning + +- For workloads involving many files, increasing `MC_NVMEOF_GENERIC_NUM_WORKERS` appropriately usually improves performance. +- Mooncake Store currently cannot guarantee allocation of buffers that meet Direct I/O alignment requirements; therefore, Direct I/O is not currently supported. 
\ No newline at end of file diff --git a/doc/zh/nvmeof_generic_transport.md b/doc/zh/nvmeof_generic_transport.md new file mode 100644 index 000000000..0da4bebb4 --- /dev/null +++ b/doc/zh/nvmeof_generic_transport.md @@ -0,0 +1,304 @@ +# Generic NVMeoF Transport + +## 概述 + +NVMeoFGenericTransport是一个更完善的、基于NVMeoF协议的TransferEngine Transport,旨在最终替代已有的NVMeoFTransport,为TransferEngine提供管理和访问文件Segment的能力。 + +相较于旧的NVMeoFTransport,NVMeoFGenericTransport具备以下优势: + +- **更完善:** 提供了与内存Segment一致的全套管理接口,包括注册/取消注册本地文件,挂载/取消挂载远端文件等; +- **更通用:** 不再依赖于cuFile,可在没有cuda支持的环境部署和使用; +- **更高性能:** 支持了多线程I/O和Direct I/O,可充分挖掘网卡与SSD的性能潜力; +- **更可靠:** 通过更加灵活的多文件管理方案,可以保证单个文件或存储设备不可用不影响其他文件或存储设备的可用性; + +## 组件支持 + +TransferEngine和Mooncake Store中均已增加了对NVMeoFGenericTransport的完整支持,相关的API接口如下: + +### TransferEngine支持 + +`TransferEngine`现在支持了注册和读写文件segment,主要包括在`SegmentDesc`和`TransferRequest`中加入了与文件管理和访问相关的字段,并增加了注册和取消注册文件的接口。 + +#### SegmentDesc + +为了支持文件注册管理,`SegmentDesc`中增加了`file_buffers`字段。 + +```cpp +using FileBufferID = uint32_t; +struct FileBufferDesc { + FileBufferID id; // 文件ID,用于在Segment中标识文件 + std::string path; // 文件在所属节点上的路径 + std::size_t size; // 文件的可用空间大小 + std::size_t align; // For future usage. +}; + +struct SegmentDesc { + std::string name; + std::string protocol; + // Generic file buffers. + std::vector<FileBufferDesc> file_buffers; + + // Other fields...
+}; +``` + +#### TransferRequest + +为了支持多文件注册与访问,`TransferRequest`中增加了`file_id`字段,用于标识需要读写的文件。 + +```cpp +struct TransferRequest { + enum OpCode { READ, WRITE }; + OpCode opcode; + void *source; + SegmentID target_id; + uint64_t target_offset; // 访问文件时,target_offset表示在目标文件中的偏移量 + size_t length; + int advise_retry_cnt = 0; + FileBufferID file_id; // 目标文件ID,只在访问文件时需要,与target_id一起定位目标文件 +}; +``` + +`file_id`是目标`TransferEngine`在注册目标文件时分配的ID,可从目标`Segment`的`SegmentDesc`中获得。 + +#### installTransport + +```cpp +Transport *installTransport(const std::string &proto, void **args) +``` + +`TransferEngine::installTransport`接口现在支持将`args`参数直接传递给相应Transport的`install`接口,以支持Transport特有的初始化参数。 + +对于`NVMeoFGenericTransport`来说,如果当前TransferEngine实例不需要共享本地文件,则`args`参数可以为`nullptr`。否则,`args`参数应为一个有效的指针数组,其中的第一个指针为一个`char *`类型的指针,指向一个包含NVMeoF Target配置参数的字符串。如下所示: + +```cpp +// NVMeoF Target配置参数 +char *trid_str = "trtype= adrfam= traddr= trsvcid="; + +// 用于installTransport的参数 +void **args = (void **)&trid_str; +``` + +#### registerLocalFile + +```cpp +int registerLocalFile(const std::string &path, size_t size, FileBufferID &id); +``` + +将一个本地文件注册到TransferEngine,使其能够被跨节点访问。文件可以是普通文件或块设备文件。**注意:注册使用块设备文件会导致设备上原有的数据损坏或完全丢失,请谨慎使用!** + +- `path`: 文件路径,可以是任意普通文件,或块设备文件,如`/dev/nvmeXnY`; +- `size`: 文件的可用空间大小,可以小于等于文件的物理空间大小; +- `id`: `TransferEngine`为文件分配的ID,用于在注册了多个文件的情况下区分每个文件; +- 返回值:注册成功时返回0,否则返回负的错误码; + +#### unregisterLocalFile + +```cpp +int unregisterLocalFile(const std::string &path); +``` + +取消注册一个本地文件。 + +- `path`: 文件路径,需要与注册时使用的路径一致; + +### Mooncake Store支持 + +Mooncake Store现在支持了使用文件作为共享存储空间存储对象。这一能力基于两个新增的接口: + +#### MountFileSegment + +```cpp +tl::expected<void, ErrorCode> MountFileSegment(const std::string& path); +``` + +挂载路径为`path`的本地文件作为共享存储空间的一部分。 + +#### UnmountFileSegment + +```cpp +tl::expected<void, ErrorCode> UnmountFileSegment(const std::string& path); +``` + +取消挂载先前挂载了的文件。 + +### Mooncake Store Python API + +Mooncake Store Python API现在支持指定一组本地文件作为共享存储空间。 + +#### setup_with_files + +```python +def
setup_with_files( + local_hostname: str, + metadata_server: str, + files: List[str], + local_buffer_size: int, + protocol: str, + protocol_arg: str, + master_server_addr: str + ): + pass +``` + +启动Mooncake Store Client实例,并将指定的文件注册为共享存储空间。 + +## 运行测试 + +用户可以从TransferEngine和Mooncake Store两个层面对NVMeoFGenericTransport进行测试。 + +### 环境要求 + +在Mooncake项目原有编译运行环境的基础上,NVMeoFGenericTransport还有一些额外的要求: + +#### 内核版本与驱动 + +NVMeoFGenericTransport当前依赖Linux内核的nvme和nvmet驱动组,包含以下内核模块: + +- NVMeoF RDMA: 依赖 Linux Kernel 4.8 及以上版本,安装驱动: + +```bash +# Initiator 驱动,访问远端文件需要 +modprobe nvme_rdma + +# Target 驱动,共享本地文件需要 +modprobe nvmet_rdma +``` + +- NVMeoF TCP: 依赖 Linux Kernel 5.0 及以上版本,安装驱动: + +```bash +# Initiator 驱动,访问远端文件需要 +modprobe nvme_tcp + +# Target 驱动,共享本地文件需要 +modprobe nvmet_tcp +``` + +#### 依赖库 + +NVMeoFGenericTransport依赖于以下第三方库: + +```bash +apt install -y libaio-dev libnvme-dev +``` + +### 编译选项 + +要启用NVMeoFGenericTransport,需要开启`USE_NVMEOF_GENERIC`编译选项: + +```bash +cmake .. -DUSE_NVMEOF_GENERIC=ON +``` + +### 运行时选项 + +NVMeoFGenericTransport支持通过环境变量配置以下运行时选项: + +- `MC_NVMEOF_GENERIC_DIRECT_IO` 在读写NVMeoF SSD时使用Direct I/O,默认关闭。开启这一选项可以大幅提升性能,但要求读写操作使用的buffer地址、读写的SSD位置以及读写长度全部满足对齐要求(通常是512字节对齐,建议4 KiB对齐) +- `MC_NVMEOF_GENERIC_NUM_WORKERS` 读写NVMeoF SSD时使用的线程数量,默认为8 + +### TransferEngine测试 + +开启`USE_NVMEOF_GENERIC`选项并完成编译后,在`build/mooncake-transfer-engine/example`下可以找到名为`transfer_engine_nvmeof_generic_bench`的可执行文件。此程序可用于测试NVMeoFGenericTransport的性能。 + +#### 启动元数据服务 + +与`transfer_engine_bench`测试工具相同,具体可参考 [transfer-engine.md](../zh/transfer-engine.md#范例程序transfer-engine-bench) + +后续以HTTP元数据服务为例,假设元数据服务地址为`http://127.0.0.1:8080/metadata` + +#### 启动target + +**注意:文件注册使用后,其中原有的数据将损坏甚至全部丢失,请谨慎使用!!!** + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=target \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 
/path/to/file1 ..." +``` + +#### 启动initiator + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8082 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=initiator \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB +``` + +#### Loopback模式 + +为了快速验证,也可以使用loopback模式在单机上进行测试: + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=loopback \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### 性能调优 + +- 对于大量文件,适当调大`MC_NVMEOF_GENERIC_NUM_WORKERS`通常可以提升性能; +- 在`--block_size`满足`4 KiB`对齐的前提下,可以设置环境变量`MC_NVMEOF_GENERIC_DIRECT_IO=on`,对于SSD设备可以大幅提升性能; + +### Mooncake Store测试 + +使用`mooncake-store/tests/stress_cluster_benchmark.py`可以测试基于NVMeoFGenericTransport的Mooncake Store的性能。 + +#### 启动元数据服务 + +按照 [transfer-engine.md](./transfer-engine.md#范例程序transfer-engine-bench) 和 [mooncake-store-preview.md](./mooncake-store-preview.md#启动-master-service) 的说明分别启动元数据服务和Master服务。 + +#### 启动prefill实例 + +```bash +python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." 
+``` + +#### 启动decode实例 + +```bash +python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" +``` + +#### 性能调优 + +- 对于大量文件,适当调大`MC_NVMEOF_GENERIC_NUM_WORKERS`通常可以提升性能; +- Mooncake Store目前无法保证分配满足Direct I/O对齐要求的Buffer,因此暂无法启用Direct I/O; From fb7ed2ff89e4eab52491af9cf18cb40e0295f8c5 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 2 Sep 2025 01:54:40 +0000 Subject: [PATCH 09/15] [TransferEngine]: Use installTransportWithArgs to avoid wrong dispatch Signed-off-by: Jinlong Chen --- mooncake-transfer-engine/include/multi_transport.h | 2 +- .../transport/nvmeof_generic_transport/nvmeof_transport.h | 5 +++-- mooncake-transfer-engine/include/transport/transport.h | 5 +++-- mooncake-transfer-engine/src/multi_transport.cpp | 6 +++--- mooncake-transfer-engine/src/transfer_engine.cpp | 2 +- .../nvmeof_generic_transport/nvmeof_transport.cpp | 8 ++++---- mooncake-transfer-engine/src/transport/transport.cpp | 5 +++-- 7 files changed, 18 insertions(+), 15 deletions(-) diff --git a/mooncake-transfer-engine/include/multi_transport.h b/mooncake-transfer-engine/include/multi_transport.h index 91d484601..5e80c771e 100644 --- a/mooncake-transfer-engine/include/multi_transport.h +++ b/mooncake-transfer-engine/include/multi_transport.h @@ -51,7 +51,7 @@ class MultiTransport { bool transportNeedArgs(const std::string &proto); - Transport *installTransport(const std::string &proto, void **args); + Transport *installTransportWithArgs(const std::string &proto, void **args); Transport *getTransport(const std::string &proto); diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h index fd15ace1d..af9c0431a 100644 --- a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h 
+++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h @@ -54,8 +54,9 @@ class NVMeoFGenericTransport : public Transport { TransferStatus &status) override; private: - int install(std::string &local_server_name, - std::shared_ptr meta, void **args) override; + int installWithArgs(std::string &local_server_name, + std::shared_ptr meta, + void **args) override; int registerLocalMemory(void *addr, size_t length, const std::string &location, bool remote_accessible, diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h index bd6f3d553..6ce3b0f97 100644 --- a/mooncake-transfer-engine/include/transport/transport.h +++ b/mooncake-transfer-engine/include/transport/transport.h @@ -271,8 +271,9 @@ class Transport { std::shared_ptr meta, std::shared_ptr topo); - virtual int install(std::string &local_server_name, - std::shared_ptr meta, void **args); + virtual int installWithArgs(std::string &local_server_name, + std::shared_ptr meta, + void **args); std::string local_server_name_; std::shared_ptr metadata_; diff --git a/mooncake-transfer-engine/src/multi_transport.cpp b/mooncake-transfer-engine/src/multi_transport.cpp index 6ec55e26f..4563899a9 100644 --- a/mooncake-transfer-engine/src/multi_transport.cpp +++ b/mooncake-transfer-engine/src/multi_transport.cpp @@ -252,8 +252,8 @@ bool MultiTransport::transportNeedArgs(const std::string &proto) { return false; } -Transport *MultiTransport::installTransport(const std::string &proto, - void **args) { +Transport *MultiTransport::installTransportWithArgs(const std::string &proto, + void **args) { std::shared_ptr transport = nullptr; #ifdef USE_NVMEOF_GENERIC @@ -268,7 +268,7 @@ Transport *MultiTransport::installTransport(const std::string &proto, return nullptr; } - int rc = transport->install(local_server_name_, metadata_, args); + int rc = transport->installWithArgs(local_server_name_, metadata_, args); if (rc != 0) { 
LOG(ERROR) << "Failed to install transport " << proto << ", rc=" << rc; return nullptr; diff --git a/mooncake-transfer-engine/src/transfer_engine.cpp b/mooncake-transfer-engine/src/transfer_engine.cpp index 972627f26..bbf7e1e72 100644 --- a/mooncake-transfer-engine/src/transfer_engine.cpp +++ b/mooncake-transfer-engine/src/transfer_engine.cpp @@ -260,7 +260,7 @@ Transport *TransferEngine::installTransport(const std::string &proto, } if (multi_transports_->transportNeedArgs(proto)) { - transport = multi_transports_->installTransport(proto, args); + transport = multi_transports_->installTransportWithArgs(proto, args); } else { if (args != nullptr && args[0] != nullptr) { const std::string nic_priority_matrix = diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp index 4ec88baa4..ea04144f5 100644 --- a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp @@ -146,10 +146,10 @@ Status NVMeoFGenericTransport::submitTransfer( return this->submitTransferTask(task_list); } -int NVMeoFGenericTransport::install(std::string &local_server_name, - std::shared_ptr meta, - void **args) { - int rc = Transport::install(local_server_name, meta, args); +int NVMeoFGenericTransport::installWithArgs( + std::string &local_server_name, std::shared_ptr meta, + void **args) { + int rc = Transport::installWithArgs(local_server_name, meta, args); if (rc != 0) { LOG(ERROR) << "Transport::install failed, rc=" << rc; return rc; diff --git a/mooncake-transfer-engine/src/transport/transport.cpp b/mooncake-transfer-engine/src/transport/transport.cpp index 7c8fed42b..c4d139a30 100644 --- a/mooncake-transfer-engine/src/transport/transport.cpp +++ b/mooncake-transfer-engine/src/transport/transport.cpp @@ -65,8 +65,9 @@ int 
Transport::install(std::string &local_server_name, return 0; } -int Transport::install(std::string &local_server_name, - std::shared_ptr meta, void **args) { +int Transport::installWithArgs(std::string &local_server_name, + std::shared_ptr meta, + void **args) { local_server_name_ = local_server_name; metadata_ = meta; return 0; From 8bd7ccbf066c5c1def120d8c19230552f9340ca9 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 5 Sep 2025 08:47:01 +0000 Subject: [PATCH 10/15] [TransferEngine]: Fix NVMeoFQueue::submitRequest --- .../transport/nvmeof_generic_transport/nvmeof_initiator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp index 2f4541999..9e11c1a94 100644 --- a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp @@ -283,10 +283,10 @@ int NVMeoFQueue::submitRequest(Slice *slice) { struct iocb *iocb = &slice->nvmeof_generic.iocb; if (slice->opcode == Transport::TransferRequest::READ) { io_prep_pread(iocb, fd, slice->source_addr, slice->length, - slice->nvmeof.offset); + slice->nvmeof_generic.offset); } else { io_prep_pwrite(iocb, fd, slice->source_addr, slice->length, - slice->nvmeof.offset); + slice->nvmeof_generic.offset); } iocb->data = slice; From 112d089836918a8d343c78d5c3a5d92ac77a47b8 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Mon, 15 Sep 2025 08:38:10 +0000 Subject: [PATCH 11/15] [Doc]: Update nvmeof_generic transport documents Signed-off-by: Jinlong Chen --- doc/en/nvmeof_generic_transport.md | 28 ++++++++++++++-------------- doc/zh/nvmeof_generic_transport.md | 28 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/doc/en/nvmeof_generic_transport.md b/doc/en/nvmeof_generic_transport.md 
index e4f96db26..c5b14c33f 100644 --- a/doc/en/nvmeof_generic_transport.md +++ b/doc/en/nvmeof_generic_transport.md @@ -277,25 +277,25 @@ Follow the instructions in [transfer-engine.md](./transfer-engine.md#example-tra #### Start Prefill Instance ```bash -python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ - --local-hostname=127.0.0.1:8081 \ - --role=prefill \ - --protocol=nvmeof_generic \ - --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ - --local-buffer-size=1024 \ - --files="/path/to/file0 /path/to/file1 ..." +MC_MS_AUTO_DISC=0 python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." ``` #### Start Decode Instance ```bash -python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ - --local-hostname=127.0.0.1:8082 \ - --role=decode \ - --protocol=nvmeof_generic \ - --protocol-args="" \ - --local-buffer-size=1024 \ - --files="" +MC_MS_AUTO_DISC=0 python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" ``` #### Performance Tuning diff --git a/doc/zh/nvmeof_generic_transport.md b/doc/zh/nvmeof_generic_transport.md index 0da4bebb4..c3ac0c5d0 100644 --- a/doc/zh/nvmeof_generic_transport.md +++ b/doc/zh/nvmeof_generic_transport.md @@ -277,25 +277,25 @@ NVMeoFGenericTransport支持通过环境变量配置以下运行时选项: #### 启动prefill实例 ```bash -python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ - --local-hostname=127.0.0.1:8081 \ - --role=prefill \ - --protocol=nvmeof_generic \ - --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ - --local-buffer-size=1024 \ - --files="/path/to/file0 /path/to/file1 ..." 
+MC_MS_AUTO_DISC=0 python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." ``` #### 启动decode实例 ```bash -python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ - --local-hostname=127.0.0.1:8082 \ - --role=decode \ - --protocol=nvmeof_generic \ - --protocol-args="" \ - --local-buffer-size=1024 \ - --files="" +MC_MS_AUTO_DISC=0 python3 ../mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" ``` #### 性能调优 From 52761ff8aecd2adfa591d64b4d7504b6a2e1c1c0 Mon Sep 17 00:00:00 2001 From: Chen Jinlong Date: Mon, 29 Sep 2025 09:49:57 +0000 Subject: [PATCH 12/15] [TransferEngine]: Fix open namespace problem in NVMeoFController::rescan --- .../nvmeof_initiator.h | 2 - .../nvmeof_initiator.cpp | 217 +++++++++++++----- 2 files changed, 154 insertions(+), 65 deletions(-) diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h index 8c1d8608c..4b92d7a5e 100644 --- a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h +++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h @@ -81,8 +81,6 @@ class NVMeoFController : public std::enable_shared_from_this { const std::string &traddr, const std::string &trsvcid, const std::string &subnqn); - nvme_ctrl_t findCtrl(); - int connect(); int disconnect(); diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp index 
9e11c1a94..92f0a4066 100644 --- a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp @@ -15,8 +15,74 @@ #include "transport/nvmeof_generic_transport/nvmeof_initiator.h" #include +#include namespace mooncake { +static constexpr auto kMaxRescanDuration = std::chrono::seconds(15); + +static nvme_ctrl_t nvme_find_ctrl(nvme_root_t root, nvme_host_t host, + const std::string &trtype, + const std::string &traddr, + const std::string &trsvcid, + const std::string &subnqn) { + nvme_subsystem_t subsys; + nvme_ctrl_t ctrl; + + // Scan the topology first. + nvme_scan_topology(root, NULL, NULL); + + nvme_for_each_subsystem(host, subsys) { + nvme_subsystem_for_each_ctrl(subsys, ctrl) { + if (strcasecmp(nvme_ctrl_get_transport(ctrl), trtype.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_traddr(ctrl), traddr.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_trsvcid(ctrl), trsvcid.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_subsysnqn(ctrl), subnqn.c_str())) { + continue; + } + + return ctrl; + } + } + + return nullptr; +} + +static int nvme_get_active_ns_list(nvme_ctrl_t ctrl, + std::unordered_set &ns_list) { + struct nvme_ns_list ns_list_ = {0}; + + int fd = nvme_ctrl_get_fd(ctrl); + if (fd < 0) { + LOG(ERROR) << "Invalid fd " << fd << " of controller " + << nvme_ctrl_get_subsysnqn(ctrl); + return -EINVAL; + } + + int rc = nvme_identify_active_ns_list(fd, 0, &ns_list_); + if (rc != 0) { + LOG(ERROR) << "Failed to identify active ns list of controller " + << nvme_ctrl_get_subsysnqn(ctrl) << ", rc=" << rc; + return -EIO; + } + + for (size_t i = 0; i < NVME_ID_NS_LIST_MAX; i++) { + if (ns_list_.ns[i] > 0) { + ns_list.insert(ns_list_.ns[i]); + } + } + + return 0; +} + std::shared_ptr NVMeoFInitiator::create(bool direct_io) { auto initiator = std::shared_ptr(new NVMeoFInitiator(direct_io)); @@ -104,40 +170,9 @@ 
NVMeoFController::~NVMeoFController() { } } -nvme_ctrl_t NVMeoFController::findCtrl() { - nvme_subsystem_t subsys; - nvme_ctrl_t ctrl; - - // Scan the topology first. - nvme_scan_topology(initiator->root, NULL, NULL); - - nvme_for_each_subsystem(initiator->host, subsys) { - nvme_subsystem_for_each_ctrl(subsys, ctrl) { - if (strcasecmp(nvme_ctrl_get_transport(ctrl), trtype.c_str())) { - continue; - } - - if (strcmp(nvme_ctrl_get_traddr(ctrl), traddr.c_str())) { - continue; - } - - if (strcmp(nvme_ctrl_get_trsvcid(ctrl), trsvcid.c_str())) { - continue; - } - - if (strcmp(nvme_ctrl_get_subsysnqn(ctrl), subnqn.c_str())) { - continue; - } - - return ctrl; - } - } - - return nullptr; -} - int NVMeoFController::connect() { - ctrl = findCtrl(); + ctrl = nvme_find_ctrl(initiator->root, initiator->host, trtype, traddr, + trsvcid, subnqn); if (ctrl != nullptr) { // The controller has been connected. rescan(); @@ -161,9 +196,6 @@ int NVMeoFController::connect() { // We connected the controller, so we are responsible for disconnecting it. should_disconnect_ctrl = true; - // Wait a moment to ensure all namespaces are attached. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - // Trigger rescan to open namespaces. rescan(); @@ -176,41 +208,100 @@ void NVMeoFController::rescan() { return; } - // Rescan the topology. - nvme_scan_topology(initiator->root, NULL, NULL); - RWSpinlock::WriteGuard guard(ns_lock); - nvme_ns_t ns; - char ns_dev[64]; - - nvme_ctrl_for_each_ns(ctrl, ns) { - auto nsid = static_cast(nvme_ns_get_nsid(ns)); - auto it = namespaces.find(nsid); - if (it != namespaces.end() && it->second.fd >= 0) { - // Namespace has been open. - continue; + const auto rescan_timeout = + std::chrono::steady_clock::now() + kMaxRescanDuration; + + while (true) { + // Retrieve active namespace list via NVMe Identify command. 
+ std::unordered_set active_ns; + int rc = nvme_get_active_ns_list(ctrl, active_ns); + if (rc != 0) { + LOG(ERROR) << "Failed to get active ns list of controller " + << nvme_ctrl_get_name(ctrl) << ", rc=" << rc; + break; + } + + // Remove invalid namespaces. + auto it = namespaces.begin(); + while (it != namespaces.end()) { + if (!active_ns.contains(it->first)) { + it = namespaces.erase(it); + } else { + it++; + } } - const char *name = nvme_ns_get_name(ns); - int rc = snprintf(ns_dev, sizeof(ns_dev), "/dev/%s", name); - if (rc <= 0) { - LOG(ERROR) << "Invalid namespace device name " << name; - continue; + // Scan controller sysfs directory to get attached namespaces. + struct dirent **ns_dirents = NULL; + int num_ns_dirents = nvme_scan_ctrl_namespaces(ctrl, &ns_dirents); + if (num_ns_dirents < 0) { + LOG(ERROR) << "Failed to scan namespaces of controller " + << nvme_ctrl_get_name(ctrl) << ", errno=" << errno; + break; } - int flags = O_RDWR; - if (initiator->direct_io) flags |= O_DIRECT; + // Open namespace block devices. + for (int i = 0; i < num_ns_dirents; i++) { + char ns_dev[256]; + rc = snprintf(ns_dev, sizeof(ns_dev), "/dev/%s", + ns_dirents[i]->d_name); + if (rc <= 0) { + LOG(ERROR) << "Invalid namespace device name " + << ns_dirents[i]->d_name; + continue; + } + + int flags = O_RDWR; + if (initiator->direct_io) flags |= O_DIRECT; + + int fd = open(ns_dev, flags); + if (fd < 0) { + LOG(ERROR) << "Failed to open nvme namespace " << ns_dev + << ", errno=" << errno; + continue; + } + + uint32_t nsid; + rc = nvme_get_nsid(fd, &nsid); + if (rc != 0) { + LOG(ERROR) << "Failed to get nsid of namespace " + << ns_dirents[i]->d_name << ", errno=" << errno; + close(fd); + continue; + } + + if (namespaces.contains(nsid) && namespaces[nsid].fd >= 0) { + // The namespace has been open. + close(fd); + continue; + } + + LOG(INFO) << "Added namespace " << nsid << " of controller " + << nvme_ctrl_get_name(ctrl); + namespaces[nsid] = {nsid, fd}; + } + + // Free dirents. 
+ for (int i = 0; i < num_ns_dirents; i++) { + free(ns_dirents[i]); + } + free(ns_dirents); + + // Check if all active namespaces are open. + if (namespaces.size() == active_ns.size()) { + break; + } - int fd = open(ns_dev, flags); - if (fd < 0) { - LOG(ERROR) << "Failed to open nvme namespace " << ns_dev - << ", errno=" << errno; - continue; + if (std::chrono::steady_clock::now() >= rescan_timeout) { + LOG(ERROR) << "Timedout to wait for namespaces of " << subnqn + << " to be attached, expected " << active_ns.size() + << ", attached " << namespaces.size(); + break; } - LOG(INFO) << "Added namespace " << nsid << " to controller " - << nvme_ctrl_get_name(ctrl); - namespaces[nsid] = {nsid, fd}; + // Wait a moment for namespaces to be attached. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); } } From 12578a2b3218437d6e6e15a9752eb600e688f904 Mon Sep 17 00:00:00 2001 From: Chen Jinlong Date: Mon, 29 Sep 2025 10:51:36 +0000 Subject: [PATCH 13/15] [Test]: Fix unit tests --- mooncake-store/tests/master_service_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/mooncake-store/tests/master_service_test.cpp b/mooncake-store/tests/master_service_test.cpp index 1170397ff..e9f10e850 100644 --- a/mooncake-store/tests/master_service_test.cpp +++ b/mooncake-store/tests/master_service_test.cpp @@ -35,6 +35,7 @@ class MasterServiceTest : public ::testing::Test { size_t size = kDefaultSegmentSize) const { Segment segment; segment.id = generate_uuid(); + segment.type = SegmentType::MEMORY; segment.name = std::move(name); segment.base = base; segment.size = size; From 41cdb498322c2c5aaaa9a17a892d36373bdea8b0 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Wed, 15 Oct 2025 06:21:29 +0000 Subject: [PATCH 14/15] Resolve code review problems --- doc/en/nvmeof_generic_transport.md | 2 +- mooncake-store/src/allocator.cpp | 3 ++- mooncake-store/src/client.cpp | 10 +++++----- .../example/transfer_engine_nvmeof_generic_bench.cpp | 10 +++++----- 4 files changed, 13 
insertions(+), 12 deletions(-) diff --git a/doc/en/nvmeof_generic_transport.md b/doc/en/nvmeof_generic_transport.md index 6ef4304e7..dd9fd9a8c 100644 --- a/doc/en/nvmeof_generic_transport.md +++ b/doc/en/nvmeof_generic_transport.md @@ -215,7 +215,7 @@ Assume the metadata service address is `http://127.0.0.1:8080/metadata` (using H ```bash ./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ --local_server_name=127.0.0.1:8081 \ - --metadata_server=http://127.0.0.0.0:8080/metadata \ + --metadata_server=http://127.0.0.1:8080/metadata \ --mode=target \ --trtype=tcp \ --traddr=127.0.0.1 \ diff --git a/mooncake-store/src/allocator.cpp b/mooncake-store/src/allocator.cpp index bcb5de071..a49f66cc8 100644 --- a/mooncake-store/src/allocator.cpp +++ b/mooncake-store/src/allocator.cpp @@ -79,7 +79,8 @@ CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, LOG_ASSERT(header_region_start_); - // Add a padding to base to support zero-based buffers. + /// Zero is not a valid buffer base address for CachelibAllocator. + /// Therefore, we add a padding to the base to support zero-based buffers. auto padded_base = base + facebook::cachelib::Slab::kSize; // Initialize the CacheLib MemoryAllocator. 
diff --git a/mooncake-store/src/client.cpp b/mooncake-store/src/client.cpp index 1e57609ef..e6fd62f47 100644 --- a/mooncake-store/src/client.cpp +++ b/mooncake-store/src/client.cpp @@ -47,18 +47,20 @@ static size_t getFileSize(const std::string& file) { int fd = open(file.c_str(), O_RDONLY); if (fd < 0) { LOG(ERROR) << "Failed to open file " << file << ", errno=" << errno; - goto out; + return 0; } rc = fstat(fd, &st); if (rc < 0) { LOG(ERROR) << "Failed fstat on file " << file << ", errno=" << errno; - goto close_file; + close(fd); + return 0; } if (S_ISLNK(st.st_mode)) { LOG(ERROR) << "File " << file << " is a symbolic link"; - goto close_file; + close(fd); + return 0; } if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { @@ -72,9 +74,7 @@ static size_t getFileSize(const std::string& file) { size = st.st_size; } -close_file: close(fd); -out: return size; } diff --git a/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp index 61633de37..ec6986707 100644 --- a/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp +++ b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp @@ -268,15 +268,17 @@ static size_t getFileSize(const std::string &file) { int fd = open(file.c_str(), O_RDONLY); if (fd < 0) { - goto err_out; + return 0; } if (fstat(fd, &st) != 0) { - goto err_close_file; + close(fd); + return 0; } if (S_ISLNK(st.st_mode)) { - goto err_close_file; + close(fd); + return 0; } if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { @@ -285,9 +287,7 @@ static size_t getFileSize(const std::string &file) { size = st.st_size; } -err_close_file: close(fd); -err_out: return size; } From 52c54f12b0ff6c5a748dc370095fbadde6d85a05 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Fri, 17 Oct 2025 03:28:06 +0000 Subject: [PATCH 15/15] Fix mistakes --- mooncake-store/include/pybind_client.h | 200 +++++++++++++------------ 
mooncake-store/src/client.cpp | 6 +- 2 files changed, 104 insertions(+), 102 deletions(-) diff --git a/mooncake-store/include/pybind_client.h b/mooncake-store/include/pybind_client.h index 995e3ac5d..9a459d4e6 100644 --- a/mooncake-store/include/pybind_client.h +++ b/mooncake-store/include/pybind_client.h @@ -22,7 +22,7 @@ constexpr bool is_supported_return_type_v = template requires is_supported_return_type_v -int64_t to_py_ret(const tl::expected &exp) noexcept { +int64_t to_py_ret(const tl::expected& exp) noexcept { if (!exp) { return static_cast(toInt(exp.error())); } @@ -40,18 +40,18 @@ int64_t to_py_ret(const tl::expected &exp) noexcept { class ResourceTracker { public: // Get the singleton instance - static ResourceTracker &getInstance(); + static ResourceTracker& getInstance(); // Register a DistributedObjectStore instance for cleanup - void registerInstance(const std::shared_ptr &instance); + void registerInstance(const std::shared_ptr& instance); private: ResourceTracker(); ~ResourceTracker(); // Prevent copying - ResourceTracker(const ResourceTracker &) = delete; - ResourceTracker &operator=(const ResourceTracker &) = delete; + ResourceTracker(const ResourceTracker&) = delete; + ResourceTracker& operator=(const ResourceTracker&) = delete; // Cleanup all registered resources void cleanupAllResources(); @@ -82,32 +82,33 @@ class PyClient { // Factory to create shared instances and auto-register to ResourceTracker static std::shared_ptr create(); - int setup(const std::string &local_hostname, - const std::string &metadata_server, + int setup(const std::string& local_hostname, + const std::string& metadata_server, size_t global_segment_size = 1024 * 1024 * 16, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "tcp", - const std::string &rdma_devices = "", - const std::string &master_server_addr = "127.0.0.1:50051", - const std::shared_ptr &transfer_engine = nullptr); + const std::string& protocol = "tcp", + const std::string& 
rdma_devices = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); int setup_with_files( - const std::string &local_hostname, const std::string &metadata_server, - const std::vector &files, + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "nvmeof_generic", - const std::string &protocol_arg = "", - const std::string &master_server_addr = "127.0.0.1:50051"); + const std::string& protocol = "nvmeof_generic", + const std::string& protocol_arg = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); - int initAll(const std::string &protocol, const std::string &device_name, + int initAll(const std::string& protocol, const std::string& device_name, size_t mount_segment_size = 1024 * 1024 * 16); // Default 16MB - int put(const std::string &key, std::span value, - const ReplicateConfig &config = ReplicateConfig{}); + int put(const std::string& key, std::span value, + const ReplicateConfig& config = ReplicateConfig{}); - int register_buffer(void *buffer, size_t size); + int register_buffer(void* buffer, size_t size); - int unregister_buffer(void *buffer); + int unregister_buffer(void* buffer); /** * @brief Get object data directly into a pre-allocated buffer @@ -119,7 +120,7 @@ class PyClient { * @note The buffer address must be previously registered with * register_buffer() for zero-copy operations */ - int64_t get_into(const std::string &key, void *buffer, size_t size); + int64_t get_into(const std::string& key, void* buffer, size_t size); /** * @brief Get object data directly into pre-allocated buffers for multiple @@ -132,9 +133,9 @@ class PyClient { * @note The buffer addresses must be previously registered with * register_buffer() for zero-copy operations */ - std::vector batch_get_into(const std::vector &keys, - 
const std::vector &buffers, - const std::vector &sizes); + std::vector batch_get_into(const std::vector& keys, + const std::vector& buffers, + const std::vector& sizes); /** * @brief Get object data directly into pre-allocated buffers for multiple @@ -149,9 +150,9 @@ class PyClient { * register_buffer() for zero-copy operations */ std::vector batch_get_into_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_same_node); /** @@ -164,8 +165,8 @@ class PyClient { * @note The buffer address must be previously registered with * register_buffer() for zero-copy operations */ - int put_from(const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config = ReplicateConfig{}); + int put_from(const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from pre-allocated buffers for multiple @@ -183,9 +184,9 @@ class PyClient { * register_buffer() for zero-copy operations */ int put_from_with_metadata( - const std::string &key, void *buffer, void *metadata_buffer, + const std::string& key, void* buffer, void* metadata_buffer, size_t size, size_t metadata_size, - const ReplicateConfig &config = ReplicateConfig{}); + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from pre-allocated buffers for multiple @@ -201,9 +202,9 @@ class PyClient { */ std::vector batch_put_from( - const std::vector &keys, - const std::vector &buffers, const std::vector &sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes, + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from multiple pre-allocated buffers for @@ -219,18 +220,18 @@ class PyClient { * register_buffer() 
for zero-copy operations */ std::vector batch_put_from_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, + const ReplicateConfig& config = ReplicateConfig{}); - int put_parts(const std::string &key, + int put_parts(const std::string& key, std::vector> values, - const ReplicateConfig &config = ReplicateConfig{}); + const ReplicateConfig& config = ReplicateConfig{}); - int put_batch(const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config = ReplicateConfig{}); + int put_batch(const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config = ReplicateConfig{}); [[nodiscard]] std::string get_hostname() const; @@ -240,7 +241,7 @@ class PyClient { * @return std::shared_ptr Buffer containing the data, or * nullptr if error */ - std::shared_ptr get_buffer(const std::string &key); + std::shared_ptr get_buffer(const std::string& key); /** * @brief Get buffers containing the data for multiple keys (batch version) @@ -249,11 +250,11 @@ class PyClient { * data, or nullptr for each key if error */ std::vector> batch_get_buffer( - const std::vector &keys); + const std::vector& keys); - int remove(const std::string &key); + int remove(const std::string& key); - long removeByRegex(const std::string &str); + long removeByRegex(const std::string& str); long removeAll(); @@ -264,7 +265,7 @@ class PyClient { * @param key Key to check * @return 1 if exists, 0 if not exists, -1 if error */ - int isExist(const std::string &key); + int isExist(const std::string& key); /** * @brief Check if multiple objects exist @@ -272,7 +273,7 @@ class PyClient { * @return Vector of existence results: 1 if exists, 0 if not exists, -1 if * error */ - std::vector batchIsExist(const std::vector &keys); + std::vector batchIsExist(const std::vector& 
keys); /** * @brief Get the size of an object @@ -280,111 +281,112 @@ class PyClient { * @return Size of the object in bytes, or -1 if error or object doesn't * exist */ - int64_t getSize(const std::string &key); + int64_t getSize(const std::string& key); // Internal versions that return tl::expected tl::expected common_setup_internal( - const std::string &local_hostname, const std::string &metadata_server, - size_t local_buffer_size, const std::string &protocol, - const std::string &protocol_args, - const std::string &master_server_addr); + const std::string& local_hostname, const std::string& metadata_server, + size_t local_buffer_size, const std::string& protocol, + const std::string& protocol_args, const std::string& master_server_addr, + const std::shared_ptr& transfer_engine); tl::expected setup_internal( - const std::string &local_hostname, const std::string &metadata_server, + const std::string& local_hostname, const std::string& metadata_server, size_t global_segment_size = 1024 * 1024 * 16, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "tcp", - const std::string &rdma_devices = "", - const std::string &master_server_addr = "127.0.0.1:50051", - const std::shared_ptr &transfer_engine = nullptr); + const std::string& protocol = "tcp", + const std::string& rdma_devices = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); tl::expected setup_with_files_internal( - const std::string &local_hostname, const std::string &metadata_server, - const std::vector &files, + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "nvmeof_generic", - const std::string &protocol_arg = "", - const std::string &master_server_addr = "127.0.0.1:50051"); + const std::string& protocol = "nvmeof_generic", + const std::string& protocol_arg = "", + const std::string& 
master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); tl::expected initAll_internal( - const std::string &protocol, const std::string &device_name, + const std::string& protocol, const std::string& device_name, size_t mount_segment_size = 1024 * 1024 * 16); - tl::expected unregister_buffer_internal(void *buffer); + tl::expected unregister_buffer_internal(void* buffer); tl::expected put_internal( - const std::string &key, std::span value, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, std::span value, + const ReplicateConfig& config = ReplicateConfig{}); - tl::expected register_buffer_internal(void *buffer, + tl::expected register_buffer_internal(void* buffer, size_t size); - tl::expected get_into_internal(const std::string &key, - void *buffer, + tl::expected get_into_internal(const std::string& key, + void* buffer, size_t size); std::vector> batch_get_into_internal( - const std::vector &keys, - const std::vector &buffers, const std::vector &sizes); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes); std::vector> batch_get_into_multi_buffers_internal( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_same_node); tl::expected put_from_internal( - const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config = ReplicateConfig{}); std::vector> batch_put_from_internal( - const std::vector &keys, - const std::vector &buffers, const std::vector &sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes, + const ReplicateConfig& config = ReplicateConfig{}); std::vector> batch_put_from_multi_buffers_internal( - 
const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, + const ReplicateConfig& config = ReplicateConfig{}); tl::expected put_parts_internal( - const std::string &key, std::vector> values, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, std::vector> values, + const ReplicateConfig& config = ReplicateConfig{}); tl::expected put_batch_internal( - const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config = ReplicateConfig{}); - tl::expected remove_internal(const std::string &key); + tl::expected remove_internal(const std::string& key); tl::expected removeByRegex_internal( - const std::string &str); + const std::string& str); tl::expected removeAll_internal(); tl::expected tearDownAll_internal(); - tl::expected isExist_internal(const std::string &key); + tl::expected isExist_internal(const std::string& key); std::vector> batchIsExist_internal( - const std::vector &keys); + const std::vector& keys); - tl::expected getSize_internal(const std::string &key); + tl::expected getSize_internal(const std::string& key); std::vector> batch_get_buffer_internal( - const std::vector &keys); + const std::vector& keys); std::shared_ptr client_ = nullptr; std::shared_ptr client_buffer_allocator_ = nullptr; std::unique_ptr port_binder_ = nullptr; struct SegmentDeleter { - void operator()(void *ptr) { + void operator()(void* ptr) { if (ptr) { free(ptr); } @@ -392,7 +394,7 @@ class PyClient { }; struct AscendSegmentDeleter { - void operator()(void *ptr) { + void operator()(void* ptr) { if (ptr) { free_memory("ascend", ptr); } diff --git a/mooncake-store/src/client.cpp b/mooncake-store/src/client.cpp index 73baa839c..775b685db 100644 --- 
a/mooncake-store/src/client.cpp +++ b/mooncake-store/src/client.cpp @@ -1578,7 +1578,7 @@ tl::expected Client::MountFileSegment( } FileBufferID file_id; - int rc = transfer_engine_.registerLocalFile(path, size, file_id); + int rc = transfer_engine_->registerLocalFile(path, size, file_id); if (rc != 0) { LOG(ERROR) << "register_local_file_failed path=" << path << " size=" << size << ", error=" << rc; @@ -1590,7 +1590,7 @@ tl::expected Client::MountFileSegment( // negotiated by the transfer engine. Otherwise, keep the logical hostname // so metadata backends (HTTP/etcd/redis) can resolve the segment by name. if (metadata_connstring_ == P2PHANDSHAKE) { - te_endpoint = transfer_engine_.getLocalIpAndPort(); + te_endpoint = transfer_engine_->getLocalIpAndPort(); } else { te_endpoint = local_hostname_; } @@ -1636,7 +1636,7 @@ tl::expected Client::UnmountFileSegment( return tl::unexpected(err); } - int rc = transfer_engine_.unregisterLocalFile(segment->second.path); + int rc = transfer_engine_->unregisterLocalFile(segment->second.path); if (rc != 0) { LOG(ERROR) << "Failed to unregister file with transfer " "engine ret is "