diff --git a/dependencies.sh b/dependencies.sh index 7c9ef329d..20a69b3bb 100755 --- a/dependencies.sh +++ b/dependencies.sh @@ -122,7 +122,9 @@ SYSTEM_PACKAGES="build-essential \ libhiredis-dev \ libjemalloc-dev \ pkg-config \ - patchelf" + patchelf \ + libaio-dev \ + libnvme-dev" apt-get install -y $SYSTEM_PACKAGES check_success "Failed to install system packages" diff --git a/doc/en/nvmeof_generic_transport.md b/doc/en/nvmeof_generic_transport.md new file mode 100644 index 000000000..dd9fd9a8c --- /dev/null +++ b/doc/en/nvmeof_generic_transport.md @@ -0,0 +1,304 @@ +# Generic NVMeoF Transport + +## Overview + +NVMeoFGenericTransport is a more complete NVMeoF protocol-based TransferEngine Transport, designed to eventually replace the existing NVMeoFTransport and provide TransferEngine with the ability to manage and access file Segments. + +Compared to the legacy NVMeoFTransport, NVMeoFGenericTransport offers the following advantages: + +- **More Complete:** Provides a full set of management interfaces consistent with memory Segments, including registering/unregistering local files, mounting/unmounting remote files, etc. +- **More Generic:** No longer depends on cuFile, and can be deployed and used in environments without CUDA support. +- **Higher Performance:** Supports multi-threaded I/O and Direct I/O, fully leveraging the performance potential of NICs and SSDs. +- **More Reliable:** Ensures that unavailability of a single file or storage device does not affect the availability of others, through a more flexible multi-file management scheme. + +## Component Support + +Both TransferEngine and Mooncake Store have added full support for NVMeoFGenericTransport. The relevant API interfaces are listed below: + +### TransferEngine Support + +`TransferEngine` now supports registering and reading/writing file segments. 
This mainly includes adding fields related to file management and access in `SegmentDesc` and `TransferRequest`, and introducing interfaces for registering and unregistering files. + +#### SegmentDesc + +To support file registration management, the `file_buffers` field has been added to `SegmentDesc`. + +```cpp +using FileBufferID = uint32_t; +struct FileBufferDesc { + FileBufferID id; // File ID, used to identify the file within a Segment + std::string path; // File path on the owning node + std::size_t size; // Available space size of the file + std::size_t align; // For future usage. +}; + +struct SegmentDesc { + std::string name; + std::string protocol; + // Generic file buffers. + std::vector<FileBufferDesc> file_buffers; + + // Other fields... +}; +``` + +#### TransferRequest + +To support multi-file registration and access, the `file_id` field has been added to `TransferRequest` to identify the file to be read from or written to. + +```cpp +struct TransferRequest { + enum OpCode { READ, WRITE }; + OpCode opcode; + void *source; + SegmentID target_id; + uint64_t target_offset; // When accessing a file, target_offset indicates the offset within the target file + size_t length; + int advise_retry_cnt = 0; + FileBufferID file_id; // Target file ID, required only when accessing files, used with target_id to locate the target file +}; +``` + +`file_id` is the ID assigned by the target `TransferEngine` when registering the target file, and can be obtained from the `SegmentDesc` of the target `Segment`. + +#### installTransport + +```cpp +Transport *installTransport(const std::string &proto, void **args) +``` + +The `TransferEngine::installTransport` interface now supports directly passing the `args` parameter to the `install` interface of the corresponding Transport, enabling Transport-specific initialization parameters. + +For `NVMeoFGenericTransport`, if the current TransferEngine instance does not need to share local files, the `args` parameter can be `nullptr`. 
Otherwise, `args` should be a valid pointer array, where the first pointer points to a `char *` that references a string containing NVMeoF Target configuration parameters. For example: + +```cpp +// NVMeoF Target configuration parameters +char *trid_str = "trtype=<trtype> adrfam=<adrfam> traddr=<traddr> trsvcid=<trsvcid>"; + +// Arguments for installTransport +void **args = (void **)&trid_str; +``` + +#### registerLocalFile + +```cpp +int registerLocalFile(const std::string &path, size_t size, FileBufferID &id); +``` + +Registers a local file into TransferEngine, enabling cross-node access. The file can be a regular file or a block device file. **Note: Using a block device file for registration may cause data corruption or complete loss on the device—use with caution!** + +- `path`: File path, can be any regular file or block device file such as `/dev/nvmeXnY`; +- `size`: Available space size of the file, can be less than or equal to the physical size; +- `id`: ID assigned by `TransferEngine` to the file, used to distinguish each file when multiple files are registered; +- Return value: Returns 0 on success, otherwise returns a negative error code; + +#### unregisterLocalFile + +```cpp +int unregisterLocalFile(const std::string &path); +``` + +Unregisters a local file. + +- `path`: File path, must match the path used during registration; + +### Mooncake Store Support + +Mooncake Store now supports using files as shared storage space for storing objects. This capability is based on two newly added interfaces: + +#### MountFileSegment + +```cpp +tl::expected<void, ErrorCode> MountFileSegment(const std::string& path); +``` + +Mounts the local file at `path` as part of the shared storage space. + +#### UnmountFileSegment + +```cpp +tl::expected<void, ErrorCode> UnmountFileSegment(const std::string& path); +``` + +Unmounts a previously mounted file. + +### Mooncake Store Python API + +The Mooncake Store Python API now supports specifying a set of local files as shared storage space. 
+ +#### setup_with_files + +```python +def setup_with_files( + local_hostname: str, + metadata_server: str, + files: List[str], + local_buffer_size: int, + protocol: str, + protocol_arg: str, + master_server_addr: str + ): + pass +``` + +Starts a Mooncake Store Client instance and registers the specified files as shared storage space. + +## Running Tests + +Users can test NVMeoFGenericTransport at both the TransferEngine and Mooncake Store levels. + +### Environment Requirements + +In addition to the original compilation and runtime environment of the Mooncake project, NVMeoFGenericTransport has additional requirements: + +#### Kernel Version and Drivers + +NVMeoFGenericTransport currently relies on the Linux kernel's nvme and nvmet driver suite, including the following kernel modules: + +- NVMeoF RDMA: Requires Linux Kernel 4.8 or higher, install drivers: + +```bash +# Initiator driver, required for accessing remote files +modprobe nvme_rdma + +# Target driver, required for sharing local files +modprobe nvmet_rdma +``` + +- NVMeoF TCP: Requires Linux Kernel 5.0 or higher, install drivers: + +```bash +# Initiator driver, required for accessing remote files +modprobe nvme_tcp + +# Target driver, required for sharing local files +modprobe nvmet_tcp +``` + +#### Dependencies + +NVMeoFGenericTransport depends on the following third-party libraries: + +```bash +apt install -y libaio-dev libnvme-dev +``` + +### Build Options + +To enable NVMeoFGenericTransport, the `USE_NVMEOF_GENERIC` build option must be turned on: + +```bash +cmake .. -DUSE_NVMEOF_GENERIC=ON +``` + +### Runtime Options + +NVMeoFGenericTransport supports configuring the following runtime options via environment variables: + +- `MC_NVMEOF_GENERIC_DIRECT_IO`: Use Direct I/O when reading/writing NVMeoF SSDs. Disabled by default. 
Enabling this option can significantly improve performance, but requires that buffer addresses, SSD locations, and I/O lengths all meet alignment requirements (typically 512-byte alignment, 4 KiB alignment recommended). +- `MC_NVMEOF_GENERIC_NUM_WORKERS`: Number of threads used for reading/writing NVMeoF SSDs. Default is 8. + +### TransferEngine Testing + +After enabling the `USE_NVMEOF_GENERIC` option and completing the build, an executable named `transfer_engine_nvmeof_generic_bench` can be found under `build/mooncake-transfer-engine/example`. This program can be used to test the performance of NVMeoFGenericTransport. + +#### Start Metadata Service + +Same as the `transfer_engine_bench` test tool. Refer to [transfer-engine.md](../zh/transfer-engine.md#范例程序transfer-engine-bench) for details. + +Assume the metadata service address is `http://127.0.0.1:8080/metadata` (using HTTP metadata service as an example). + +#### Start Target + +**Note: After file registration, existing data may be corrupted or completely lost—use with extreme caution!!** + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=target \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." 
+``` + +#### Start Initiator + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8082 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=initiator \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB +``` + +#### Loopback Mode + +For quick validation, loopback mode can also be used to test on a single machine: + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=loopback \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### Performance Tuning + +- For workloads involving many files, increasing `MC_NVMEOF_GENERIC_NUM_WORKERS` appropriately usually improves performance. +- If `--block_size` meets `4 KiB` alignment, set environment variable `MC_NVMEOF_GENERIC_DIRECT_IO=on` to significantly boost performance on SSD devices. + +### Mooncake Store Testing + +Use `mooncake-store/tests/stress_cluster_benchmark.py` to test the performance of Mooncake Store based on NVMeoFGenericTransport. + +#### Start Metadata Service + +Follow the instructions in [transfer-engine.md](./transfer-engine.md#example-transfer-engine-bench) and [mooncake-store-preview.md](./mooncake-store-preview.md#starting-the-master-service) to start the metadata service and Master service respectively. 
+ +#### Start Prefill Instance + +```bash +python3 mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### Start Decode Instance + +```bash +python3 mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" +``` + +#### Performance Tuning + +- For workloads involving many files, increasing `MC_NVMEOF_GENERIC_NUM_WORKERS` appropriately usually improves performance. +- Mooncake Store currently cannot guarantee allocation of buffers that meet Direct I/O alignment requirements; therefore, Direct I/O is not currently supported. \ No newline at end of file diff --git a/doc/zh/nvmeof_generic_transport.md b/doc/zh/nvmeof_generic_transport.md new file mode 100644 index 000000000..c5b4f6d2d --- /dev/null +++ b/doc/zh/nvmeof_generic_transport.md @@ -0,0 +1,304 @@ +# Generic NVMeoF Transport + +## 概述 + +NVMeoFGenericTransport是一个更完善基于NVMeoF协议的TransferEngine Transport,旨在最终替代已有的NVMeoFTransport,为TransferEngine提供管理和访问文件Segment的能力。 + +相较于旧的NVMeoFTransport,NVMeoFGenericTransport具备以下优势: + +- **更完善:** 提供了与内存Segment一致的全套管理接口,包括注册/取消注册本地文件,挂载/取消挂载远端文件等; +- **更通用:** 不再依赖于cuFile,可在没有cuda支持的环境部署和使用; +- **更高性能:** 支持了多线程I/O和Direct I/O,可充分挖掘网卡与SSD的性能潜力; +- **更可靠:** 通过更加灵活的多文件管理方案,可以保证单个文件或存储设备不可用不影响其他文件或存储设备的可用性; + +## 组件支持 + +TransferEngine和Mooncake Store中均已增加了对NVMeoFGenericTransport完整支持,相关的API接口如下: + +### TransferEngine支持 + +`TransferEngine`现在支持了注册和读写文件segment,主要包括在`SegmentDesc`和`TransferRequest`中加入了与文件管理和访问相关的字段,并增加了注册和取消注册文件的接口。 + +#### SegmentDesc + +为了支持文件注册管理,`SegmentDesc`中增加了`file_buffers`字段。 + +```cpp +using FileBufferID = uint32_t; +struct FileBufferDesc { + FileBufferID id; // 文件ID,用于在Segment中标识文件 + 
std::string path; // 文件在所属节点上的路径 + std::size_t size; // 文件的可用空间大小 + std::size_t align; // For future usage. +}; + +struct SegmentDesc { + std::string name; + std::string protocol; + // Generic file buffers. + std::vector<FileBufferDesc> file_buffers; + + // Other fields... +}; +``` + +#### TransferRequest + +为了支持多文件注册与访问,`TransferRequest`中增加了`file_id`字段,用于标识需要读写的文件。 + +```cpp +struct TransferRequest { + enum OpCode { READ, WRITE }; + OpCode opcode; + void *source; + SegmentID target_id; + uint64_t target_offset; // 访问文件时,target_offset表示在目标文件中的偏移量 + size_t length; + int advise_retry_cnt = 0; + FileBufferID file_id; // 目标文件ID,只在访问文件时需要,与target_id一起定位目标文件 +}; +``` + +`file_id`是目标`TransferEngine`在注册目标文件时分配的ID,可从目标`Segment`的`SegmentDesc`中获得。 + +#### installTransport + +```cpp +Transport *installTransport(const std::string &proto, void **args) +``` + +`TransferEngine::installTransport`接口现在支持将`args`参数直接传递给相应Transport的`install`接口,以支持Transport特有的初始化参数。 + +对于`NVMeoFGenericTransport`来说,如果当前TransferEngine实例不需要共享本地文件,则`args`参数可以为`nullptr`。否则,`args`参数应为一个有效的指针数组,其中的第一个指针为一个`char *`类型的指针,指向一个包含NVMeoF Target配置参数的字符串。如下所示: + +```cpp +// NVMeoF Target配置参数 +char *trid_str = "trtype=<trtype> adrfam=<adrfam> traddr=<traddr> trsvcid=<trsvcid>"; + +// 用于installTransport的参数 +void **args = (void **)&trid_str; +``` + +#### registerLocalFile + +```cpp +int registerLocalFile(const std::string &path, size_t size, FileBufferID &id); +``` + +将一个本地文件注册到TransferEngine,使其能够被跨节点访问。文件可以是普通文件或块设备文件。**注意:注册使用块设备文件会导致设备上原有的数据损坏或完全丢失,请谨慎使用!** + +- `path`: 文件路径,可以是任意普通文件,或块设备文件,如`/dev/nvmeXnY`; +- `size`: 文件的可用空间大小,可以小于等于文件的物理空间大小; +- `id`: `TransferEngine`为文件分配的ID,用于在注册了多个文件的情况下区分每个文件; +- 返回值:注册成功时返回0,否则返回负的错误码; + +#### unregisterLocalFile + +```cpp +int unregisterLocalFile(const std::string &path); +``` + +取消注册一个本地文件。 + +- `path`: 文件路径,需要与注册时使用的路径一致; + +### Mooncake Store支持 + +Mooncake Store现在支持了使用文件作为共享存储空间存储对象。这一能力基于两个新增的接口: + +#### MountFileSegment + +```cpp +tl::expected<void, ErrorCode> MountFileSegment(const std::string& path); +``` + 
+挂载路径为`path`的本地文件作为共享存储空间的一部分。 + +#### UnmountFileSegment + +```cpp +tl::expected UnmountFileSegment(const std::string& path); +``` + +取消挂载先前挂载了的文件。 + +### Mooncake Store Python API + +Mooncake Store Python API现在支持指定一组本地文件作为共享存储空间。 + +#### setup_with_files + +```python +def setup_with_files( + local_hostname: str, + metadata_server: str, + files: List[str], + local_buffer_size: int, + protocol: str, + protocol_arg: str, + master_server_addr: str + ): + pass +``` + +启动Mooncake Store Client实例,并将指定的文件注册为共享存储空间。 + +## 运行测试 + +用户可以从TransferEngine和Mooncake Store两个层面对NVMeoFGenericTransport进行测试。 + +### 环境要求 + +在Mooncake项目原有编译运行环境的基础上,NVMeoFGenericTransport还有一些额外的要求: + +#### 内核版本与驱动 + +NVMeoFGenericTransport当前依赖Linux内核的nvme和nvmet驱动组,包含以下内核模块: + +- NVMeoF RDMA: 依赖 Linux Kernel 4.8 及以上版本,安装驱动: + +```bash +# Initiator 驱动,访问远端文件需要 +modprobe nvme_rdma + +# Target 驱动,共享本地文件需要 +modprobe nvmet_rdma +``` + +- NVMeoF TCP: 依赖 Linux Kernel 5.0 及以上版本,安装驱动: + +```bash +# Initiator 驱动,访问远端文件需要 +modprobe nvme_tcp + +# Target 驱动,共享本地文件需要 +modprobe nvmet_tcp +``` + +#### 依赖库 + +NVMeoFGenericTransport依赖于以下第三方库: + +```bash +apt install -y libaio-dev libnvme-dev +``` + +### 编译选项 + +要启用NVMeoFGenericTransport,需要开启`USE_NVMEOF_GENERIC`编译选项: + +```bash +cmake .. 
-DUSE_NVMEOF_GENERIC=ON +``` + +### 运行时选项 + +NVMeoFGenericTransport支持通过环境变量配置以下运行时选项: + +- `MC_NVMEOF_GENERIC_DIRECT_IO` 在读写NVMeoF SSD时使用Direct I/O,默认关闭。开启这一选项可以大幅提升性能,但要求读写操作使用的buffer地址、读写的SSD位置以及读写长度全部满足对齐要求(通常是512字节对齐,建议4 KiB对齐) +- `MC_NVMEOF_GENERIC_NUM_WORKERS` 读写NVMeoF SSD时使用的线程数量,默认为8 + +### TransferEngine测试 + +开启`USE_NVMEOF_GENERIC`选项并完成编译后,在`build/mooncake-transfer-engine/example`下可以找到名为`transfer_engine_nvmeof_generic_bench`的可执行文件。此程序可用于测试NVMeoFGenericTransport的性能。 + +#### 启动元数据服务 + +与`transfer_engine_bench`测试工具相同,具体可参考 [transfer-engine.md](../zh/transfer-engine.md#范例程序transfer-engine-bench) + +后续以HTTP元数据服务为例,假设元数据服务地址为`http://127.0.0.1:8080/metadata` + +#### 启动target + +**注意:文件注册使用后,其中原有的数据将损坏甚至全部丢失,请谨慎使用!!!** + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=target \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### 启动initiator + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8082 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=initiator \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB +``` + +#### Loopback模式 + +为了快速验证,也可以使用loopback模式在单机上进行测试: + +```bash +./build/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench \ + --local_server_name=127.0.0.1:8081 \ + --metadata_server=http://127.0.0.1:8080/metadata \ + --mode=loopback \ + --operation=read \ + --segment_id=127.0.0.1:8081 \ + --batch_size=4096 \ + --block_size=65536 \ + --duration=30 \ + --threads=1 \ + --report_unit=GB \ + --trtype=tcp \ + --traddr=127.0.0.1 \ + --trsvcid=4420 \ + --files="/path/to/file0 /path/to/file1 ..." 
+``` + +#### 性能调优 + +- 对于大量文件,适当调大`MC_NVMEOF_GENERIC_NUM_WORKERS`通常可以提升性能; +- 在`--block_size`满足`4 KiB`对齐的前提下,可以设置环境变量`MC_NVMEOF_GENERIC_DIRECT_IO=on`,对于SSD设备可以大幅提升性能; + +### Mooncake Store测试 + +使用`mooncake-store/tests/stress_cluster_benchmark.py`可以测试基于NVMeoFGenericTransport的Mooncake Store的性能。 + +#### 启动元数据服务 + +按照 [transfer-engine.md](./transfer-engine.md#范例程序transfer-engine-bench) 和 [mooncake-store-preview.md](./mooncake-store-preview.md#启动-master-service) 的说明分别启动元数据服务和Master服务。 + +#### 启动prefill实例 + +```bash +python3 mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8081 \ + --role=prefill \ + --protocol=nvmeof_generic \ + --protocol-args="trtype=tcp adrfam=ipv4 traddr=127.0.0.1 trsvcid=4420" \ + --local-buffer-size=1024 \ + --files="/path/to/file0 /path/to/file1 ..." +``` + +#### 启动decode实例 + +```bash +python3 mooncake-store/tests/stress_cluster_benchmark.py \ + --local-hostname=127.0.0.1:8082 \ + --role=decode \ + --protocol=nvmeof_generic \ + --protocol-args="" \ + --local-buffer-size=1024 \ + --files="" +``` + +#### 性能调优 + +- 对于大量文件,适当调大`MC_NVMEOF_GENERIC_NUM_WORKERS`通常可以提升性能; +- Mooncake Store目前无法保证分配满足Direct I/O对齐要求的Buffer,因此暂无法启用Direct I/O; diff --git a/mooncake-common/common.cmake b/mooncake-common/common.cmake index f117c4b83..b9d7fe6de 100644 --- a/mooncake-common/common.cmake +++ b/mooncake-common/common.cmake @@ -59,6 +59,7 @@ option(BUILD_UNIT_TESTS "Build unit tests" ON) option(USE_CUDA "option for enabling gpu features" OFF) option(USE_MUSA "option for enabling Moore Threads gpu features by leveraging MUSA (Meta-computing Unified System Architecture)" OFF) option(USE_NVMEOF "option for using NVMe over Fabric" OFF) +option(USE_NVMEOF_GENERIC "option for using generic NVMe over Fabric transport" OFF) option(USE_TCP "option for using TCP transport" ON) option(USE_ASCEND "option for using npu with HCCL" OFF) option(USE_ASCEND_DIRECT "option for using ascend npu with adxl engine" OFF) @@ -89,6 +90,11 @@ if (USE_NVMEOF) 
message(STATUS "NVMe-oF support is enabled") endif() +if (USE_NVMEOF_GENERIC) + add_compile_definitions(USE_NVMEOF_GENERIC) + message(STATUS "Generic NVMe-oF support is enabled") +endif() + if (USE_MNNVL) set(USE_CUDA ON) add_compile_definitions(USE_MNNVL) diff --git a/mooncake-integration/store/store_py.cpp b/mooncake-integration/store/store_py.cpp index 20c66ee00..5bd834ace 100644 --- a/mooncake-integration/store/store_py.cpp +++ b/mooncake-integration/store/store_py.cpp @@ -14,15 +14,15 @@ namespace py = pybind11; namespace mooncake { namespace { -std::vector> CastAddrs2Ptrs( - const std::vector> &all_buffer_ptrs) { - std::vector> all_buffers; +std::vector> CastAddrs2Ptrs( + const std::vector>& all_buffer_ptrs) { + std::vector> all_buffers; all_buffers.reserve(all_buffer_ptrs.size()); - for (auto &buffer_ptrs : all_buffer_ptrs) { - std::vector ptrs; + for (auto& buffer_ptrs : all_buffer_ptrs) { + std::vector ptrs; ptrs.reserve(buffer_ptrs.size()); for (uintptr_t ptr : buffer_ptrs) { - ptrs.push_back(reinterpret_cast(ptr)); + ptrs.push_back(reinterpret_cast(ptr)); } all_buffers.emplace_back(std::move(ptrs)); } @@ -36,7 +36,7 @@ class MooncakeStorePyWrapper { MooncakeStorePyWrapper() : store_(PyClient::create()) {} - pybind11::bytes get(const std::string &key) { + pybind11::bytes get(const std::string& key) { if (!store_ || !store_->client_) { LOG(ERROR) << "Client is not initialized"; return pybind11::bytes("\\0", 0); @@ -53,13 +53,13 @@ class MooncakeStorePyWrapper { } py::gil_scoped_acquire acquire_gil; - return pybind11::bytes((char *)buffer_handle->ptr(), + return pybind11::bytes((char*)buffer_handle->ptr(), buffer_handle->size()); } } std::vector get_batch( - const std::vector &keys) { + const std::vector& keys) { const auto kNullString = pybind11::bytes("\\0", 0); if (!store_ || !store_->client_) { LOG(ERROR) << "Client is not initialized"; @@ -79,9 +79,9 @@ class MooncakeStorePyWrapper { std::vector results; results.reserve(batch_data.size()); - for (const 
auto &data : batch_data) { + for (const auto& data : batch_data) { results.emplace_back( - data ? pybind11::bytes((char *)data->ptr(), data->size()) + data ? pybind11::bytes((char*)data->ptr(), data->size()) : kNullString); } @@ -89,7 +89,7 @@ class MooncakeStorePyWrapper { } } - pybind11::object get_tensor(const std::string &key) { + pybind11::object get_tensor(const std::string& key) { if (!store_ || !store_->client_) { LOG(ERROR) << "Client is not initialized"; return pybind11::none(); @@ -105,7 +105,7 @@ class MooncakeStorePyWrapper { } // Create contiguous buffer and copy data auto total_length = buffer_handle->size(); - char *exported_data = new char[total_length]; + char* exported_data = new char[total_length]; if (!exported_data) { py::gil_scoped_acquire acquire_gil; LOG(ERROR) << "Invalid data format: insufficient data for " @@ -165,13 +165,13 @@ class MooncakeStorePyWrapper { torch_module().attr("from_numpy")(np_array); return tensor; - } catch (const pybind11::error_already_set &e) { + } catch (const pybind11::error_already_set& e) { LOG(ERROR) << "Failed to get tensor data: " << e.what(); return pybind11::none(); } } - int put_tensor(const std::string &key, pybind11::object tensor) { + int put_tensor(const std::string& key, pybind11::object tensor) { if (!store_ || !store_->client_) { LOG(ERROR) << "Client is not initialized"; return -static_cast(ErrorCode::INVALID_PARAMS); @@ -221,8 +221,8 @@ class MooncakeStorePyWrapper { // Section with GIL released py::gil_scoped_release release_gil; - char *buffer = reinterpret_cast(data_ptr); - char *metadata_buffer = reinterpret_cast(&metadata); + char* buffer = reinterpret_cast(data_ptr); + char* metadata_buffer = reinterpret_cast(&metadata); std::vector> values; values.emplace_back( std::span(metadata_buffer, sizeof(TensorMetadata))); @@ -235,7 +235,7 @@ class MooncakeStorePyWrapper { } return 0; - } catch (const pybind11::error_already_set &e) { + } catch (const pybind11::error_already_set& e) { LOG(ERROR) << 
"Failed to access tensor data: " << e.what(); return -static_cast(ErrorCode::INVALID_PARAMS); } @@ -251,7 +251,7 @@ PYBIND11_MODULE(store, m) { .def_readwrite("preferred_segment", &ReplicateConfig::preferred_segment) .def_readwrite("prefer_alloc_in_same_node", &ReplicateConfig::prefer_alloc_in_same_node) - .def("__str__", [](const ReplicateConfig &config) { + .def("__str__", [](const ReplicateConfig& config) { std::ostringstream oss; oss << config; return oss.str(); @@ -261,13 +261,13 @@ PYBIND11_MODULE(store, m) { py::class_>( m, "BufferHandle", py::buffer_protocol()) .def("ptr", - [](const BufferHandle &self) { + [](const BufferHandle& self) { // Return the pointer as an integer for Python return reinterpret_cast(self.ptr()); }) .def("size", &BufferHandle::size) .def("__len__", &BufferHandle::size) - .def_buffer([](BufferHandle &self) -> py::buffer_info { + .def_buffer([](BufferHandle& self) -> py::buffer_info { // BufferHandle now always contains contiguous memory if (self.size() > 0) { return py::buffer_info( @@ -301,14 +301,14 @@ PYBIND11_MODULE(store, m) { .def(py::init<>()) .def( "setup", - [](MooncakeStorePyWrapper &self, const std::string &local_hostname, - const std::string &metadata_server, + [](MooncakeStorePyWrapper& self, const std::string& local_hostname, + const std::string& metadata_server, size_t global_segment_size = 1024 * 1024 * 16, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "tcp", - const std::string &rdma_devices = "", - const std::string &master_server_addr = "127.0.0.1:50051", - const py::object &engine = py::none()) { + const std::string& protocol = "tcp", + const std::string& rdma_devices = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const py::object& engine = py::none()) { if (!self.store_) { self.store_ = PyClient::create(); } @@ -326,9 +326,31 @@ PYBIND11_MODULE(store, m) { py::arg("global_segment_size"), py::arg("local_buffer_size"), py::arg("protocol"), py::arg("rdma_devices"), 
py::arg("master_server_addr"), py::arg("engine") = py::none()) + .def("setup_with_files", + [](MooncakeStorePyWrapper& self, const std::string& local_hostname, + const std::string& metadata_server, + const std::vector& files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string& protocol = "nvmeof_generic", + const std::string& protocol_arg = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const py::object& engine = py::none()) { + if (!self.store_) { + self.store_ = PyClient::create(); + } + std::shared_ptr transfer_engine = nullptr; + if (!engine.is_none()) { + transfer_engine = + engine.cast>(); + } + return self.store_->setup_with_files( + local_hostname, metadata_server, files, local_buffer_size, + protocol, protocol_arg, master_server_addr, + transfer_engine); + }) .def("init_all", - [](MooncakeStorePyWrapper &self, const std::string &protocol, - const std::string &device_name, + [](MooncakeStorePyWrapper& self, const std::string& protocol, + const std::string& device_name, size_t mount_segment_size = 1024 * 1024 * 16) { return self.store_->initAll(protocol, device_name, mount_segment_size); @@ -337,27 +359,27 @@ PYBIND11_MODULE(store, m) { .def("get_batch", &MooncakeStorePyWrapper::get_batch) .def( "get_buffer", - [](MooncakeStorePyWrapper &self, const std::string &key) { + [](MooncakeStorePyWrapper& self, const std::string& key) { py::gil_scoped_release release; return self.store_->get_buffer(key); }, py::return_value_policy::take_ownership) .def( "batch_get_buffer", - [](MooncakeStorePyWrapper &self, - const std::vector &keys) { + [](MooncakeStorePyWrapper& self, + const std::vector& keys) { py::gil_scoped_release release; return self.store_->batch_get_buffer(keys); }, py::return_value_policy::take_ownership) .def("remove", - [](MooncakeStorePyWrapper &self, const std::string &key) { + [](MooncakeStorePyWrapper& self, const std::string& key) { py::gil_scoped_release release; return self.store_->remove(key); }) .def( 
"remove_by_regex", - [](MooncakeStorePyWrapper &self, const std::string &str) { + [](MooncakeStorePyWrapper& self, const std::string& str) { py::gil_scoped_release release; return self.store_->removeByRegex(str); }, @@ -365,19 +387,19 @@ PYBIND11_MODULE(store, m) { "Removes objects from the store whose keys match the given " "regular expression.") .def("remove_all", - [](MooncakeStorePyWrapper &self) { + [](MooncakeStorePyWrapper& self) { py::gil_scoped_release release; return self.store_->removeAll(); }) .def("is_exist", - [](MooncakeStorePyWrapper &self, const std::string &key) { + [](MooncakeStorePyWrapper& self, const std::string& key) { py::gil_scoped_release release; return self.store_->isExist(key); }) .def( "batch_is_exist", - [](MooncakeStorePyWrapper &self, - const std::vector &keys) { + [](MooncakeStorePyWrapper& self, + const std::vector& keys) { py::gil_scoped_release release; return self.store_->batchIsExist(keys); }, @@ -385,14 +407,14 @@ PYBIND11_MODULE(store, m) { "Check if multiple objects exist. 
Returns list of results: 1 if " "exists, 0 if not exists, -1 if error") .def("close", - [](MooncakeStorePyWrapper &self) { + [](MooncakeStorePyWrapper& self) { if (!self.store_) return 0; int rc = self.store_->tearDownAll(); self.store_.reset(); return rc; }) .def("get_size", - [](MooncakeStorePyWrapper &self, const std::string &key) { + [](MooncakeStorePyWrapper& self, const std::string& key) { py::gil_scoped_release release; return self.store_->getSize(key); }) @@ -402,10 +424,10 @@ PYBIND11_MODULE(store, m) { py::arg("tensor"), "Put a PyTorch tensor into the store") .def( "register_buffer", - [](MooncakeStorePyWrapper &self, uintptr_t buffer_ptr, + [](MooncakeStorePyWrapper& self, uintptr_t buffer_ptr, size_t size) { // Register memory buffer for RDMA operations - void *buffer = reinterpret_cast(buffer_ptr); + void* buffer = reinterpret_cast(buffer_ptr); py::gil_scoped_release release; return self.store_->register_buffer(buffer, size); }, @@ -413,9 +435,9 @@ PYBIND11_MODULE(store, m) { "Register a memory buffer for direct access operations") .def( "unregister_buffer", - [](MooncakeStorePyWrapper &self, uintptr_t buffer_ptr) { + [](MooncakeStorePyWrapper& self, uintptr_t buffer_ptr) { // Unregister memory buffer - void *buffer = reinterpret_cast(buffer_ptr); + void* buffer = reinterpret_cast(buffer_ptr); py::gil_scoped_release release; return self.store_->unregister_buffer(buffer); }, @@ -424,10 +446,10 @@ PYBIND11_MODULE(store, m) { "buffer for direct access operations") .def( "get_into", - [](MooncakeStorePyWrapper &self, const std::string &key, + [](MooncakeStorePyWrapper& self, const std::string& key, uintptr_t buffer_ptr, size_t size) { // Get data directly into user-provided buffer - void *buffer = reinterpret_cast(buffer_ptr); + void* buffer = reinterpret_cast(buffer_ptr); py::gil_scoped_release release; return self.store_->get_into(key, buffer, size); }, @@ -435,14 +457,14 @@ PYBIND11_MODULE(store, m) { "Get object data directly into a pre-allocated 
buffer") .def( "batch_get_into", - [](MooncakeStorePyWrapper &self, - const std::vector &keys, - const std::vector &buffer_ptrs, - const std::vector &sizes) { - std::vector buffers; + [](MooncakeStorePyWrapper& self, + const std::vector& keys, + const std::vector& buffer_ptrs, + const std::vector& sizes) { + std::vector buffers; buffers.reserve(buffer_ptrs.size()); for (uintptr_t ptr : buffer_ptrs) { - buffers.push_back(reinterpret_cast(ptr)); + buffers.push_back(reinterpret_cast(ptr)); } py::gil_scoped_release release; return self.store_->batch_get_into(keys, buffers, sizes); @@ -453,11 +475,11 @@ PYBIND11_MODULE(store, m) { "keys") .def( "put_from", - [](MooncakeStorePyWrapper &self, const std::string &key, + [](MooncakeStorePyWrapper& self, const std::string& key, uintptr_t buffer_ptr, size_t size, - const ReplicateConfig &config = ReplicateConfig{}) { + const ReplicateConfig& config = ReplicateConfig{}) { // Put data directly from user-provided buffer - void *buffer = reinterpret_cast(buffer_ptr); + void* buffer = reinterpret_cast(buffer_ptr); py::gil_scoped_release release; return self.store_->put_from(key, buffer, size, config); }, @@ -466,15 +488,15 @@ PYBIND11_MODULE(store, m) { "Put object data directly from a pre-allocated buffer") .def( "put_from_with_metadata", - [](MooncakeStorePyWrapper &self, const std::string &key, + [](MooncakeStorePyWrapper& self, const std::string& key, uintptr_t buffer_ptr, uintptr_t metadata_buffer_ptr, size_t size, size_t metadata_size, - const ReplicateConfig &config = ReplicateConfig{}) { + const ReplicateConfig& config = ReplicateConfig{}) { // Put data directly from user-provided buffer with // metadata - void *buffer = reinterpret_cast(buffer_ptr); - void *metadata_buffer = - reinterpret_cast(metadata_buffer_ptr); + void* buffer = reinterpret_cast(buffer_ptr); + void* metadata_buffer = + reinterpret_cast(metadata_buffer_ptr); py::gil_scoped_release release; return self.store_->put_from_with_metadata( key, buffer, 
metadata_buffer, size, metadata_size, config); @@ -486,15 +508,15 @@ PYBIND11_MODULE(store, m) { "metadata") .def( "batch_put_from", - [](MooncakeStorePyWrapper &self, - const std::vector &keys, - const std::vector &buffer_ptrs, - const std::vector &sizes, - const ReplicateConfig &config = ReplicateConfig{}) { - std::vector buffers; + [](MooncakeStorePyWrapper& self, + const std::vector& keys, + const std::vector& buffer_ptrs, + const std::vector& sizes, + const ReplicateConfig& config = ReplicateConfig{}) { + std::vector buffers; buffers.reserve(buffer_ptrs.size()); for (uintptr_t ptr : buffer_ptrs) { - buffers.push_back(reinterpret_cast(ptr)); + buffers.push_back(reinterpret_cast(ptr)); } py::gil_scoped_release release; return self.store_->batch_put_from(keys, buffers, sizes, @@ -507,14 +529,14 @@ PYBIND11_MODULE(store, m) { "keys") .def( "put", - [](MooncakeStorePyWrapper &self, const std::string &key, + [](MooncakeStorePyWrapper& self, const std::string& key, py::buffer buf, - const ReplicateConfig &config = ReplicateConfig{}) { + const ReplicateConfig& config = ReplicateConfig{}) { py::buffer_info info = buf.request(/*writable=*/false); py::gil_scoped_release release; return self.store_->put( key, - std::span(static_cast(info.ptr), + std::span(static_cast(info.ptr), static_cast(info.size)), config); }, @@ -522,24 +544,24 @@ PYBIND11_MODULE(store, m) { py::arg("config") = ReplicateConfig{}) .def( "put_parts", - [](MooncakeStorePyWrapper &self, const std::string &key, + [](MooncakeStorePyWrapper& self, const std::string& key, py::args parts, - const ReplicateConfig &config = ReplicateConfig{}) { + const ReplicateConfig& config = ReplicateConfig{}) { // 1) Python buffer → span std::vector infos; std::vector> spans; infos.reserve(parts.size()); spans.reserve(parts.size()); - for (auto &obj : parts) { + for (auto& obj : parts) { py::buffer buf = py::reinterpret_borrow(obj); infos.emplace_back(buf.request(false)); - const auto &info = infos.back(); + const auto& 
info = infos.back(); if (info.ndim != 1 || info.itemsize != 1) throw std::runtime_error( "parts must be 1-D bytes-like"); - spans.emplace_back(static_cast(info.ptr), + spans.emplace_back(static_cast(info.ptr), static_cast(info.size)); } @@ -550,20 +572,20 @@ PYBIND11_MODULE(store, m) { py::arg("key"), py::arg("config") = ReplicateConfig{}) .def( "put_batch", - [](MooncakeStorePyWrapper &self, - const std::vector &keys, - const std::vector &buffers, - const ReplicateConfig &config = ReplicateConfig{}) { + [](MooncakeStorePyWrapper& self, + const std::vector& keys, + const std::vector& buffers, + const ReplicateConfig& config = ReplicateConfig{}) { // Convert pybuffers to spans without copying std::vector infos; std::vector> spans; infos.reserve(buffers.size()); spans.reserve(buffers.size()); - for (const auto &buf : buffers) { + for (const auto& buf : buffers) { infos.emplace_back(buf.request(/*writable=*/false)); - const auto &info = infos.back(); - spans.emplace_back(static_cast(info.ptr), + const auto& info = infos.back(); + spans.emplace_back(static_cast(info.ptr), static_cast(info.size)); } @@ -573,16 +595,16 @@ PYBIND11_MODULE(store, m) { py::arg("keys"), py::arg("values"), py::arg("config") = ReplicateConfig{}) .def("get_hostname", - [](MooncakeStorePyWrapper &self) { + [](MooncakeStorePyWrapper& self) { return self.store_->get_hostname(); }) .def( "batch_put_from_multi_buffers", - [](MooncakeStorePyWrapper &self, - const std::vector &keys, - const std::vector> &all_buffer_ptrs, - const std::vector> &all_sizes, - const ReplicateConfig &config = ReplicateConfig{}) { + [](MooncakeStorePyWrapper& self, + const std::vector& keys, + const std::vector>& all_buffer_ptrs, + const std::vector>& all_sizes, + const ReplicateConfig& config = ReplicateConfig{}) { py::gil_scoped_release release; return self.store_->batch_put_from_multi_buffers( keys, CastAddrs2Ptrs(all_buffer_ptrs), all_sizes, config); @@ -594,10 +616,10 @@ PYBIND11_MODULE(store, m) { "keys") .def( 
"batch_get_into_multi_buffers", - [](MooncakeStorePyWrapper &self, - const std::vector &keys, - const std::vector> &all_buffer_ptrs, - const std::vector> &all_sizes, + [](MooncakeStorePyWrapper& self, + const std::vector& keys, + const std::vector>& all_buffer_ptrs, + const std::vector>& all_sizes, bool prefer_alloc_in_same_node = false) { py::gil_scoped_release release; return self.store_->batch_get_into_multi_buffers( diff --git a/mooncake-store/include/allocator.h b/mooncake-store/include/allocator.h index c80a97f72..bb60658fb 100644 --- a/mooncake-store/include/allocator.h +++ b/mooncake-store/include/allocator.h @@ -67,7 +67,9 @@ class AllocatedBuffer { uint64_t size_; uintptr_t buffer_address_; std::string transport_endpoint_; - YLT_REFL(Descriptor, size_, buffer_address_, transport_endpoint_); + FileBufferID file_id_; + YLT_REFL(Descriptor, size_, buffer_address_, transport_endpoint_, + file_id_); }; private: @@ -93,6 +95,7 @@ class BufferAllocatorBase { virtual size_t size() const = 0; virtual std::string getSegmentName() const = 0; virtual std::string getTransportEndpoint() const = 0; + virtual FileBufferID getFileID() const = 0; /** * Returns the largest free region available in this allocator. 
@@ -133,7 +136,8 @@ class CachelibBufferAllocator public std::enable_shared_from_this { public: CachelibBufferAllocator(std::string segment_name, size_t base, size_t size, - std::string transport_endpoint); + std::string transport_endpoint, + FileBufferID file_id = 0); ~CachelibBufferAllocator() override; @@ -147,6 +151,7 @@ class CachelibBufferAllocator std::string getTransportEndpoint() const override { return transport_endpoint_; } + FileBufferID getFileID() const override { return file_id_; } /** * For CacheLib, return kAllocatorUnknownFreeSpace as we don't have exact @@ -164,6 +169,7 @@ class CachelibBufferAllocator const size_t total_size_; std::atomic_size_t cur_size_; const std::string transport_endpoint_; + const FileBufferID file_id_; // metrics - removed allocated_bytes_ member // ylt::metric::gauge_t* allocated_bytes_{nullptr}; @@ -184,7 +190,8 @@ class OffsetBufferAllocator public std::enable_shared_from_this { public: OffsetBufferAllocator(std::string segment_name, size_t base, size_t size, - std::string transport_endpoint); + std::string transport_endpoint, + FileBufferID file_id = 0); ~OffsetBufferAllocator() override; @@ -198,6 +205,7 @@ class OffsetBufferAllocator std::string getTransportEndpoint() const override { return transport_endpoint_; } + FileBufferID getFileID() const override { return file_id_; } /** * Returns the actual largest free region from the offset allocator. 
@@ -211,6 +219,7 @@ class OffsetBufferAllocator const size_t total_size_; std::atomic_size_t cur_size_; const std::string transport_endpoint_; + const FileBufferID file_id_; // offset allocator implementation std::shared_ptr offset_allocator_; diff --git a/mooncake-store/include/client.h b/mooncake-store/include/client.h index bb19d0ccf..534f18486 100644 --- a/mooncake-store/include/client.h +++ b/mooncake-store/include/client.h @@ -208,6 +208,20 @@ class Client { tl::expected UnmountSegment(const void* buffer, size_t size); + /** + * @brief Register a file to master for allocation + * @param path The file path + * @return ErrorCode indicating success/failure + */ + tl::expected MountFileSegment(const std::string& path); + + /** + * @brief Unregisters a file segment from master + * @param path File path to unregister + * @return ErrorCode indicating success/failure + */ + tl::expected UnmountFileSegment(const std::string& path); + /** * @brief Registers memory buffer with TransferEngine for data transfer * @param addr Memory address to register diff --git a/mooncake-store/include/pybind_client.h b/mooncake-store/include/pybind_client.h index d79c0aab0..9a459d4e6 100644 --- a/mooncake-store/include/pybind_client.h +++ b/mooncake-store/include/pybind_client.h @@ -22,7 +22,7 @@ constexpr bool is_supported_return_type_v = template requires is_supported_return_type_v -int64_t to_py_ret(const tl::expected &exp) noexcept { +int64_t to_py_ret(const tl::expected& exp) noexcept { if (!exp) { return static_cast(toInt(exp.error())); } @@ -40,18 +40,18 @@ int64_t to_py_ret(const tl::expected &exp) noexcept { class ResourceTracker { public: // Get the singleton instance - static ResourceTracker &getInstance(); + static ResourceTracker& getInstance(); // Register a DistributedObjectStore instance for cleanup - void registerInstance(const std::shared_ptr &instance); + void registerInstance(const std::shared_ptr& instance); private: ResourceTracker(); ~ResourceTracker(); // Prevent 
copying - ResourceTracker(const ResourceTracker &) = delete; - ResourceTracker &operator=(const ResourceTracker &) = delete; + ResourceTracker(const ResourceTracker&) = delete; + ResourceTracker& operator=(const ResourceTracker&) = delete; // Cleanup all registered resources void cleanupAllResources(); @@ -82,24 +82,33 @@ class PyClient { // Factory to create shared instances and auto-register to ResourceTracker static std::shared_ptr create(); - int setup(const std::string &local_hostname, - const std::string &metadata_server, + int setup(const std::string& local_hostname, + const std::string& metadata_server, size_t global_segment_size = 1024 * 1024 * 16, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "tcp", - const std::string &rdma_devices = "", - const std::string &master_server_addr = "127.0.0.1:50051", - const std::shared_ptr &transfer_engine = nullptr); + const std::string& protocol = "tcp", + const std::string& rdma_devices = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); + + int setup_with_files( + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string& protocol = "nvmeof_generic", + const std::string& protocol_arg = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); - int initAll(const std::string &protocol, const std::string &device_name, + int initAll(const std::string& protocol, const std::string& device_name, size_t mount_segment_size = 1024 * 1024 * 16); // Default 16MB - int put(const std::string &key, std::span value, - const ReplicateConfig &config = ReplicateConfig{}); + int put(const std::string& key, std::span value, + const ReplicateConfig& config = ReplicateConfig{}); - int register_buffer(void *buffer, size_t size); + int register_buffer(void* buffer, size_t size); - 
int unregister_buffer(void *buffer); + int unregister_buffer(void* buffer); /** * @brief Get object data directly into a pre-allocated buffer @@ -111,7 +120,7 @@ class PyClient { * @note The buffer address must be previously registered with * register_buffer() for zero-copy operations */ - int64_t get_into(const std::string &key, void *buffer, size_t size); + int64_t get_into(const std::string& key, void* buffer, size_t size); /** * @brief Get object data directly into pre-allocated buffers for multiple @@ -124,9 +133,9 @@ class PyClient { * @note The buffer addresses must be previously registered with * register_buffer() for zero-copy operations */ - std::vector batch_get_into(const std::vector &keys, - const std::vector &buffers, - const std::vector &sizes); + std::vector batch_get_into(const std::vector& keys, + const std::vector& buffers, + const std::vector& sizes); /** * @brief Get object data directly into pre-allocated buffers for multiple @@ -141,9 +150,9 @@ class PyClient { * register_buffer() for zero-copy operations */ std::vector batch_get_into_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_same_node); /** @@ -156,8 +165,8 @@ class PyClient { * @note The buffer address must be previously registered with * register_buffer() for zero-copy operations */ - int put_from(const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config = ReplicateConfig{}); + int put_from(const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from pre-allocated buffers for multiple @@ -175,9 +184,9 @@ class PyClient { * register_buffer() for zero-copy operations */ int put_from_with_metadata( - const std::string &key, void *buffer, void *metadata_buffer, + const std::string& key, void* buffer, void* 
metadata_buffer, size_t size, size_t metadata_size, - const ReplicateConfig &config = ReplicateConfig{}); + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from pre-allocated buffers for multiple @@ -193,9 +202,9 @@ class PyClient { */ std::vector batch_put_from( - const std::vector &keys, - const std::vector &buffers, const std::vector &sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes, + const ReplicateConfig& config = ReplicateConfig{}); /** * @brief Put object data directly from multiple pre-allocated buffers for @@ -211,18 +220,18 @@ class PyClient { * register_buffer() for zero-copy operations */ std::vector batch_put_from_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, + const ReplicateConfig& config = ReplicateConfig{}); - int put_parts(const std::string &key, + int put_parts(const std::string& key, std::vector> values, - const ReplicateConfig &config = ReplicateConfig{}); + const ReplicateConfig& config = ReplicateConfig{}); - int put_batch(const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config = ReplicateConfig{}); + int put_batch(const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config = ReplicateConfig{}); [[nodiscard]] std::string get_hostname() const; @@ -232,7 +241,7 @@ class PyClient { * @return std::shared_ptr Buffer containing the data, or * nullptr if error */ - std::shared_ptr get_buffer(const std::string &key); + std::shared_ptr get_buffer(const std::string& key); /** * @brief Get buffers containing the data for multiple keys (batch version) @@ -241,11 +250,11 @@ class PyClient { * data, or nullptr for each key if error */ std::vector> 
batch_get_buffer( - const std::vector &keys); + const std::vector& keys); - int remove(const std::string &key); + int remove(const std::string& key); - long removeByRegex(const std::string &str); + long removeByRegex(const std::string& str); long removeAll(); @@ -256,7 +265,7 @@ class PyClient { * @param key Key to check * @return 1 if exists, 0 if not exists, -1 if error */ - int isExist(const std::string &key); + int isExist(const std::string& key); /** * @brief Check if multiple objects exist @@ -264,7 +273,7 @@ class PyClient { * @return Vector of existence results: 1 if exists, 0 if not exists, -1 if * error */ - std::vector batchIsExist(const std::vector &keys); + std::vector batchIsExist(const std::vector& keys); /** * @brief Get the size of an object @@ -272,96 +281,112 @@ class PyClient { * @return Size of the object in bytes, or -1 if error or object doesn't * exist */ - int64_t getSize(const std::string &key); + int64_t getSize(const std::string& key); // Internal versions that return tl::expected + + tl::expected common_setup_internal( + const std::string& local_hostname, const std::string& metadata_server, + size_t local_buffer_size, const std::string& protocol, + const std::string& protocol_args, const std::string& master_server_addr, + const std::shared_ptr& transfer_engine); + tl::expected setup_internal( - const std::string &local_hostname, const std::string &metadata_server, + const std::string& local_hostname, const std::string& metadata_server, size_t global_segment_size = 1024 * 1024 * 16, size_t local_buffer_size = 1024 * 1024 * 16, - const std::string &protocol = "tcp", - const std::string &rdma_devices = "", - const std::string &master_server_addr = "127.0.0.1:50051", - const std::shared_ptr &transfer_engine = nullptr); + const std::string& protocol = "tcp", + const std::string& rdma_devices = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); + + tl::expected 
setup_with_files_internal( + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, + size_t local_buffer_size = 1024 * 1024 * 16, + const std::string& protocol = "nvmeof_generic", + const std::string& protocol_arg = "", + const std::string& master_server_addr = "127.0.0.1:50051", + const std::shared_ptr& transfer_engine = nullptr); tl::expected initAll_internal( - const std::string &protocol, const std::string &device_name, + const std::string& protocol, const std::string& device_name, size_t mount_segment_size = 1024 * 1024 * 16); - tl::expected unregister_buffer_internal(void *buffer); + tl::expected unregister_buffer_internal(void* buffer); tl::expected put_internal( - const std::string &key, std::span value, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, std::span value, + const ReplicateConfig& config = ReplicateConfig{}); - tl::expected register_buffer_internal(void *buffer, + tl::expected register_buffer_internal(void* buffer, size_t size); - tl::expected get_into_internal(const std::string &key, - void *buffer, + tl::expected get_into_internal(const std::string& key, + void* buffer, size_t size); std::vector> batch_get_into_internal( - const std::vector &keys, - const std::vector &buffers, const std::vector &sizes); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes); std::vector> batch_get_into_multi_buffers_internal( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_same_node); tl::expected put_from_internal( - const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config = ReplicateConfig{}); std::vector> batch_put_from_internal( - const std::vector &keys, - const 
std::vector &buffers, const std::vector &sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes, + const ReplicateConfig& config = ReplicateConfig{}); std::vector> batch_put_from_multi_buffers_internal( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, + const ReplicateConfig& config = ReplicateConfig{}); tl::expected put_parts_internal( - const std::string &key, std::vector> values, - const ReplicateConfig &config = ReplicateConfig{}); + const std::string& key, std::vector> values, + const ReplicateConfig& config = ReplicateConfig{}); tl::expected put_batch_internal( - const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config = ReplicateConfig{}); + const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config = ReplicateConfig{}); - tl::expected remove_internal(const std::string &key); + tl::expected remove_internal(const std::string& key); tl::expected removeByRegex_internal( - const std::string &str); + const std::string& str); tl::expected removeAll_internal(); tl::expected tearDownAll_internal(); - tl::expected isExist_internal(const std::string &key); + tl::expected isExist_internal(const std::string& key); std::vector> batchIsExist_internal( - const std::vector &keys); + const std::vector& keys); - tl::expected getSize_internal(const std::string &key); + tl::expected getSize_internal(const std::string& key); std::vector> batch_get_buffer_internal( - const std::vector &keys); + const std::vector& keys); std::shared_ptr client_ = nullptr; std::shared_ptr client_buffer_allocator_ = nullptr; std::unique_ptr port_binder_ = nullptr; struct SegmentDeleter { - void operator()(void *ptr) { + void operator()(void* ptr) { if (ptr) { 
free(ptr); } @@ -369,7 +394,7 @@ class PyClient { }; struct AscendSegmentDeleter { - void operator()(void *ptr) { + void operator()(void* ptr) { if (ptr) { free_memory("ascend", ptr); } diff --git a/mooncake-store/include/types.h b/mooncake-store/include/types.h index d9e617df3..8e4f6ed02 100644 --- a/mooncake-store/include/types.h +++ b/mooncake-store/include/types.h @@ -48,6 +48,7 @@ class Replica; using ObjectKey = std::string; using Version = uint64_t; using SegmentId = int64_t; +using FileBufferID = uint32_t; using TaskID = int64_t; using BufHandleList = std::vector>; // using ReplicaList = std::vector; @@ -170,18 +171,65 @@ const static uint64_t kMaxSliceSize = facebook::cachelib::Slab::kSize - 16; // should be lower than limit /** - * @brief Represents a contiguous memory region + * @brief Type of segments. + */ +enum class SegmentType { + UNKNOWN = -1, + MEMORY, + FILE, +}; + +/** + * @brief Stream operator for SegmentType + */ +inline std::ostream& operator<<(std::ostream& os, + const SegmentType& type) noexcept { + static const std::unordered_map type_strings{ + {SegmentType::UNKNOWN, "UNKNOWN"}, + {SegmentType::MEMORY, "MEMORY"}, + {SegmentType::FILE, "FILE"}}; + + os << (type_strings.count(type) ? type_strings.at(type) : "UNKNOWN"); + return os; +} + +/** + * @brief Represents a contiguous storage region, could be memory or file. */ struct Segment { UUID id{0, 0}; - std::string name{}; // Logical segment name used for preferred allocation + SegmentType type{SegmentType::UNKNOWN}; + std::string name{}; // The name of the segment, also might be the + // hostname of the server that owns the segment uintptr_t base{0}; size_t size{0}; - // TE p2p endpoint (ip:port) for transport-only addressing std::string te_endpoint{}; + // For a file segment, this will be the path of the file. + std::string path{}; + // For a file segment, this will be the id of the file buffer. 
+ FileBufferID file_id{0}; Segment() = default; + Segment(const UUID& id, const std::string& name, uintptr_t base, + size_t size, const std::string& te_endpoint) + : id(id), + type(SegmentType::MEMORY), + name(name), + base(base), + size(size), + te_endpoint(te_endpoint) {} + Segment(const UUID& id, const std::string& name, uintptr_t base, + size_t size, const std::string& te_endpoint, + const std::string& path, FileBufferID file_id) + : id(id), + type(SegmentType::FILE), + name(name), + base(base), + size(size), + te_endpoint(te_endpoint), + path(path), + file_id(file_id) {} }; -YLT_REFL(Segment, id, name, base, size, te_endpoint); +YLT_REFL(Segment, id, type, name, base, size, te_endpoint, path, file_id); /** * @brief Client status from the master's perspective diff --git a/mooncake-store/src/allocator.cpp b/mooncake-store/src/allocator.cpp index 5df83a44a..7b8a9ebb9 100644 --- a/mooncake-store/src/allocator.cpp +++ b/mooncake-store/src/allocator.cpp @@ -32,39 +32,43 @@ AllocatedBuffer::~AllocatedBuffer() { AllocatedBuffer::Descriptor AllocatedBuffer::get_descriptor() const { auto alloc = allocator_.lock(); std::string endpoint; + FileBufferID file_id = 0; if (alloc) { endpoint = alloc->getTransportEndpoint(); + file_id = alloc->getFileID(); } else { LOG(ERROR) << "allocator=expired_or_null in get_descriptor"; } return {static_cast(size()), - reinterpret_cast(buffer_ptr_), endpoint}; + reinterpret_cast(buffer_ptr_), endpoint, file_id}; } // Define operator<< using public accessors or get_descriptor if appropriate std::ostream& operator<<(std::ostream& os, const AllocatedBuffer& buffer) { + auto alloc = buffer.allocator_.lock(); return os << "AllocatedBuffer: { " << "segment_name: " - << (buffer.allocator_.lock() - ? buffer.allocator_.lock()->getSegmentName() - : std::string("")) + << (alloc ? 
alloc->getSegmentName() : std::string("")) << ", " << "size: " << buffer.size() << ", " - << "buffer_ptr: " << static_cast(buffer.data()) << " }"; + << "buffer_ptr: " << static_cast(buffer.data()) << "," + << "file_id: " << (alloc ? alloc->getFileID() : 0) << " }"; } // Removed allocated_bytes parameter and member initialization CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, size_t base, size_t size, - std::string transport_endpoint) + std::string transport_endpoint, + FileBufferID file_id) : segment_name_(segment_name), base_(base), total_size_(size), cur_size_(0), - transport_endpoint_(std::move(transport_endpoint)) { + transport_endpoint_(std::move(transport_endpoint)), + file_id_(file_id) { VLOG(1) << "initializing_buffer_allocator segment_name=" << segment_name << " base_address=" << reinterpret_cast(base) - << " size=" << size; + << " size=" << size << " file_id=" << file_id; // Calculate the size of the header region. header_region_size_ = @@ -75,12 +79,16 @@ CachelibBufferAllocator::CachelibBufferAllocator(std::string segment_name, LOG_ASSERT(header_region_start_); + /// Zero is not a valid buffer base address for CachelibAllocator. + /// Therefore, we add a padding to the base to support zero-based buffers. + auto padded_base = base + facebook::cachelib::Slab::kSize; + // Initialize the CacheLib MemoryAllocator. memory_allocator_ = std::make_unique( facebook::cachelib::MemoryAllocator::Config( facebook::cachelib::MemoryAllocator::generateAllocSizes()), reinterpret_cast(header_region_start_.get()), - header_region_size_, reinterpret_cast(base), size); + header_region_size_, reinterpret_cast(padded_base), size); if (!memory_allocator_) { LOG(ERROR) << "status=failed_to_init_facebook_memory_allocator"; @@ -107,6 +115,10 @@ std::unique_ptr CachelibBufferAllocator::allocate( << " current_size=" << cur_size_; return nullptr; } + + // Un-padding the buffer. 
+ buffer = reinterpret_cast(reinterpret_cast(buffer) - + facebook::cachelib::Slab::kSize); } catch (const std::exception& e) { LOG(ERROR) << "allocation_exception error=" << e.what(); return nullptr; @@ -124,7 +136,10 @@ std::unique_ptr CachelibBufferAllocator::allocate( void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { try { // Deallocate memory using CacheLib. - memory_allocator_->free(handle->buffer_ptr_); + auto buffer = reinterpret_cast( + reinterpret_cast(handle->buffer_ptr_) + + facebook::cachelib::Slab::kSize); + memory_allocator_->free(buffer); size_t freed_size = handle->size_; // Store size before handle might become invalid cur_size_.fetch_sub(freed_size); @@ -141,15 +156,17 @@ void CachelibBufferAllocator::deallocate(AllocatedBuffer* handle) { // OffsetBufferAllocator implementation OffsetBufferAllocator::OffsetBufferAllocator(std::string segment_name, size_t base, size_t size, - std::string transport_endpoint) + std::string transport_endpoint, + FileBufferID file_id) : segment_name_(segment_name), base_(base), total_size_(size), cur_size_(0), - transport_endpoint_(std::move(transport_endpoint)) { + transport_endpoint_(std::move(transport_endpoint)), + file_id_(file_id) { VLOG(1) << "initializing_offset_buffer_allocator segment_name=" << segment_name << " base_address=" << reinterpret_cast(base) - << " size=" << size; + << " size=" << size << " file_id=" << file_id; try { // 1k <= init_capacity <= 64k diff --git a/mooncake-store/src/client.cpp b/mooncake-store/src/client.cpp index 349291213..775b685db 100644 --- a/mooncake-store/src/client.cpp +++ b/mooncake-store/src/client.cpp @@ -10,6 +10,10 @@ #include #include #include +#include +#include +#include +#include #include "transfer_engine.h" #include "transfer_task.h" @@ -35,6 +39,45 @@ namespace mooncake { return slice_size; } +static size_t getFileSize(const std::string& file) { + size_t size = 0; + struct stat st; + int rc; + + int fd = open(file.c_str(), O_RDONLY); + if (fd < 0) { 
+ LOG(ERROR) << "Failed to open file " << file << ", errno=" << errno; + return 0; + } + + rc = fstat(fd, &st); + if (rc < 0) { + LOG(ERROR) << "Failed fstat on file " << file << ", errno=" << errno; + close(fd); + return 0; + } + + if (S_ISLNK(st.st_mode)) { + LOG(ERROR) << "File " << file << " is a symbolic link"; + close(fd); + return 0; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + rc = ioctl(fd, BLKGETSIZE64, &size); + if (rc < 0) { + LOG(ERROR) << "Failed ioctl on file " << file + << ", errno=" << errno; + size = 0; + } + } else if (S_ISREG(st.st_mode)) { + size = st.st_size; + } + + close(fd); + return size; +} + Client::Client(const std::string& local_hostname, const std::string& metadata_connstring) : metrics_(ClientMetric::Create()), @@ -72,8 +115,21 @@ Client::~Client() { } for (auto& segment : segments_to_unmount) { - auto result = - UnmountSegment(reinterpret_cast(segment.base), segment.size); + tl::expected result; + switch (segment.type) { + case SegmentType::MEMORY: + result = UnmountSegment(reinterpret_cast(segment.base), + segment.size); + break; + case SegmentType::FILE: + result = UnmountFileSegment(segment.path); + break; + default: + result = tl::unexpected(ErrorCode::INVALID_PARAMS); + LOG(ERROR) << "Unknown segment type: " << segment.type; + break; + } + if (!result) { LOG(ERROR) << "Failed to unmount segment: " << toString(result.error()); @@ -343,6 +399,19 @@ ErrorCode Client::InitTransferEngine( LOG(ERROR) << "Failed to install Ascend transport"; return ErrorCode::INTERNAL_ERROR; } + } else if (protocol == "nvmeof_generic") { + void* args[2]; + args[0] = device_names.has_value() + ? 
(void*)device_names.value().c_str() + : nullptr; + args[1] = nullptr; + + transport = + transfer_engine_->installTransport("nvmeof_generic", args); + if (!transport) { + LOG(ERROR) << "Failed to install Generic NVMeoF transport"; + return ErrorCode::INTERNAL_ERROR; + } } else { LOG(ERROR) << "unsupported_protocol protocol=" << protocol; return ErrorCode::INVALID_PARAMS; @@ -1390,6 +1459,10 @@ tl::expected Client::MountSegment(const void* buffer, // Check if the segment overlaps with any existing segment for (auto& it : mounted_segments_) { auto& mtseg = it.second; + // Skip non-memory segments. + if (mtseg.type != SegmentType::MEMORY) { + continue; + } uintptr_t l1 = reinterpret_cast(mtseg.base); uintptr_t r1 = reinterpret_cast(mtseg.size) + l1; uintptr_t l2 = reinterpret_cast(buffer); @@ -1410,21 +1483,20 @@ tl::expected Client::MountSegment(const void* buffer, return tl::unexpected(ErrorCode::INVALID_PARAMS); } - // Build segment with logical name; attach TE endpoint for transport - Segment segment; - segment.id = generate_uuid(); - segment.name = local_hostname_; - segment.base = reinterpret_cast(buffer); - segment.size = size; + std::string te_endpoint; // For P2P handshake mode, publish the actual transport endpoint that was // negotiated by the transfer engine. Otherwise, keep the logical hostname // so metadata backends (HTTP/etcd/redis) can resolve the segment by name. 
if (metadata_connstring_ == P2PHANDSHAKE) { - segment.te_endpoint = transfer_engine_->getLocalIpAndPort(); + te_endpoint = transfer_engine_->getLocalIpAndPort(); } else { - segment.te_endpoint = local_hostname_; + te_endpoint = local_hostname_; } + // Build segment with logical name; attach TE endpoint for transport + Segment segment(generate_uuid(), local_hostname_, + reinterpret_cast(buffer), size, te_endpoint); + auto mount_result = master_client_.MountSegment(segment, client_id_); if (!mount_result) { ErrorCode err = mount_result.error(); @@ -1444,7 +1516,8 @@ tl::expected Client::UnmountSegment(const void* buffer, for (auto it = mounted_segments_.begin(); it != mounted_segments_.end(); ++it) { - if (it->second.base == reinterpret_cast(buffer) && + if (it->second.type == SegmentType::MEMORY && + it->second.base == reinterpret_cast(buffer) && it->second.size == size) { segment = it; break; @@ -1481,6 +1554,104 @@ tl::expected Client::UnmountSegment(const void* buffer, return {}; } +tl::expected Client::MountFileSegment( + const std::string& path) { + const size_t size = getFileSize(path); + if (size <= 0) { + LOG(ERROR) << "Invalid file " << path << " to mount"; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + std::lock_guard lock(mounted_segments_mutex_); + + for (auto& it : mounted_segments_) { + auto& mtseg = it.second; + // Skip non-file segments. 
+ if (mtseg.type != SegmentType::FILE) { + continue; + } + + if (mtseg.path == path) { + LOG(ERROR) << "Duplicated file segment path=" << mtseg.path; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + } + + FileBufferID file_id; + int rc = transfer_engine_->registerLocalFile(path, size, file_id); + if (rc != 0) { + LOG(ERROR) << "register_local_file_failed path=" << path + << " size=" << size << ", error=" << rc; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + std::string te_endpoint; + // For P2P handshake mode, publish the actual transport endpoint that was + // negotiated by the transfer engine. Otherwise, keep the logical hostname + // so metadata backends (HTTP/etcd/redis) can resolve the segment by name. + if (metadata_connstring_ == P2PHANDSHAKE) { + te_endpoint = transfer_engine_->getLocalIpAndPort(); + } else { + te_endpoint = local_hostname_; + } + + Segment segment(generate_uuid(), local_hostname_, 0, size, te_endpoint, + path, file_id); + + auto mount_result = master_client_.MountSegment(segment, client_id_); + if (!mount_result) { + ErrorCode err = mount_result.error(); + LOG(ERROR) << "mount_segment_to_master_failed path=" << path + << " size=" << size << ", error=" << err; + return tl::unexpected(err); + } + + mounted_segments_[segment.id] = segment; + return {}; +} + +tl::expected Client::UnmountFileSegment( + const std::string& path) { + std::lock_guard lock(mounted_segments_mutex_); + + auto segment = mounted_segments_.end(); + for (auto it = mounted_segments_.begin(); it != mounted_segments_.end(); + it++) { + if (it->second.type == SegmentType::FILE && it->second.path == path) { + segment = it; + break; + } + } + if (segment == mounted_segments_.end()) { + LOG(ERROR) << "segment_not_found path=" << path; + return tl::unexpected(ErrorCode::INVALID_PARAMS); + } + + auto unmount_result = + master_client_.UnmountSegment(segment->second.id, client_id_); + if (!unmount_result) { + ErrorCode err = unmount_result.error(); + LOG(ERROR) 
<< "Failed to unmount segment from master: " + << toString(err); + return tl::unexpected(err); + } + + int rc = transfer_engine_->unregisterLocalFile(segment->second.path); + if (rc != 0) { + LOG(ERROR) << "Failed to unregister file with transfer " + "engine ret is " + << rc; + if (rc != ERR_ADDRESS_NOT_REGISTERED) { + return tl::unexpected(ErrorCode::INTERNAL_ERROR); + } + // Otherwise, the segment is already unregistered from transfer + // engine, we can continue + } + + mounted_segments_.erase(segment); + return {}; +} + tl::expected Client::RegisterLocalMemory( void* addr, size_t length, const std::string& location, bool remote_accessible, bool update_metadata) { diff --git a/mooncake-store/src/pybind_client.cpp b/mooncake-store/src/pybind_client.cpp index ef8e074cc..d4acd4f7e 100644 --- a/mooncake-store/src/pybind_client.cpp +++ b/mooncake-store/src/pybind_client.cpp @@ -23,8 +23,8 @@ namespace mooncake { // ResourceTracker implementation using singleton pattern // Use a deliberately leaked heap object to avoid static destruction // order issues with atexit/signal handlers during process teardown. 
-ResourceTracker &ResourceTracker::getInstance() { - static ResourceTracker *instance = new ResourceTracker(); +ResourceTracker& ResourceTracker::getInstance() { + static ResourceTracker* instance = new ResourceTracker(); return *instance; } @@ -40,7 +40,7 @@ ResourceTracker::~ResourceTracker() { } void ResourceTracker::registerInstance( - const std::shared_ptr &instance) { + const std::shared_ptr& instance) { MutexLocker locker(&mutex_); instances_.push_back(instance); } @@ -55,7 +55,7 @@ void ResourceTracker::cleanupAllResources() { MutexLocker locker(&mutex_); - for (auto &wp : instances_) { + for (auto& wp : instances_) { if (auto sp = wp.lock()) { LOG(INFO) << "Cleaning up DistributedObjectStore instance"; sp->tearDownAll(); @@ -150,12 +150,11 @@ std::shared_ptr PyClient::create() { return sp; } -tl::expected PyClient::setup_internal( - const std::string &local_hostname, const std::string &metadata_server, - size_t global_segment_size, size_t local_buffer_size, - const std::string &protocol, const std::string &rdma_devices, - const std::string &master_server_addr, - const std::shared_ptr &transfer_engine) { +tl::expected PyClient::common_setup_internal( + const std::string& local_hostname, const std::string& metadata_server, + size_t local_buffer_size, const std::string& protocol, + const std::string& protocol_args, const std::string& master_server_addr, + const std::shared_ptr& transfer_engine) { this->protocol = protocol; // Remove port if hostname already contains one @@ -174,9 +173,9 @@ tl::expected PyClient::setup_internal( this->local_hostname = local_hostname; } - std::optional device_name = - (rdma_devices.empty() ? std::nullopt - : std::make_optional(rdma_devices)); + std::optional protocol_args_opt = + (protocol_args.empty() ? 
std::nullopt + : std::make_optional(protocol_args)); auto client_opt = mooncake::Client::Create( this->local_hostname, metadata_server, protocol, device_name, @@ -205,6 +204,24 @@ tl::expected PyClient::setup_internal( LOG(INFO) << "Local buffer size is 0, skip registering local memory"; } + return {}; +} + +tl::expected PyClient::setup_internal( + const std::string& local_hostname, const std::string& metadata_server, + size_t global_segment_size, size_t local_buffer_size, + const std::string& protocol, const std::string& rdma_devices, + const std::string& master_server_addr, + const std::shared_ptr& transfer_engine) { + // Common setups. + auto result = common_setup_internal( + local_hostname, metadata_server, local_buffer_size, protocol, + rdma_devices, master_server_addr, transfer_engine); + if (!result.has_value()) { + LOG(ERROR) << "Failed to setup PyClient"; + return tl::unexpected(result.error()); + } + // If global_segment_size is 0, skip mount segment; // If global_segment_size is larger than max_mr_size, split to multiple // segments. 
@@ -217,7 +234,7 @@ tl::expected PyClient::setup_internal( current_glbseg_size += segment_size; LOG(INFO) << "Mounting segment: " << segment_size << " bytes, " << current_glbseg_size << " of " << total_glbseg_size; - void *ptr = + void* ptr = allocate_buffer_allocator_memory(segment_size, this->protocol); if (!ptr) { LOG(ERROR) << "Failed to allocate segment memory"; @@ -242,20 +259,59 @@ tl::expected PyClient::setup_internal( return {}; } -int PyClient::setup(const std::string &local_hostname, - const std::string &metadata_server, +int PyClient::setup(const std::string& local_hostname, + const std::string& metadata_server, size_t global_segment_size, size_t local_buffer_size, - const std::string &protocol, - const std::string &rdma_devices, - const std::string &master_server_addr, - const std::shared_ptr &transfer_engine) { + const std::string& protocol, + const std::string& rdma_devices, + const std::string& master_server_addr, + const std::shared_ptr& transfer_engine) { return to_py_ret(setup_internal( local_hostname, metadata_server, global_segment_size, local_buffer_size, protocol, rdma_devices, master_server_addr, transfer_engine)); } +tl::expected PyClient::setup_with_files_internal( + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, size_t local_buffer_size, + const std::string& protocol, const std::string& protocol_arg, + const std::string& master_server_addr, + const std::shared_ptr& transfer_engine) { + // Common setups. + auto result = common_setup_internal( + local_hostname, metadata_server, local_buffer_size, protocol, + protocol_arg, master_server_addr, transfer_engine); + if (!result.has_value()) { + LOG(ERROR) << "Failed to setup PyClient"; + return tl::unexpected(result.error()); + } + + // Mount file segments. 
+ for (auto& file : files) { + auto result = client_->MountFileSegment(file); + if (!result.has_value()) { + LOG(ERROR) << "Failed to mount file " << file + << ", error=" << result.error(); + return tl::unexpected(result.error()); + } + } + + return {}; +} + +int PyClient::setup_with_files( + const std::string& local_hostname, const std::string& metadata_server, + const std::vector& files, size_t local_buffer_size, + const std::string& protocol, const std::string& protocol_arg, + const std::string& master_server_addr, + const std::shared_ptr& transfer_engine) { + return to_py_ret(setup_with_files_internal( + local_hostname, metadata_server, files, local_buffer_size, protocol, + protocol_arg, master_server_addr, transfer_engine)); +} + tl::expected PyClient::initAll_internal( - const std::string &protocol_, const std::string &device_name, + const std::string& protocol_, const std::string& device_name, size_t mount_segment_size) { if (client_) { LOG(ERROR) << "Client is already initialized"; @@ -267,8 +323,8 @@ tl::expected PyClient::initAll_internal( device_name); } -int PyClient::initAll(const std::string &protocol_, - const std::string &device_name, +int PyClient::initAll(const std::string& protocol_, + const std::string& device_name, size_t mount_segment_size) { return to_py_ret( initAll_internal(protocol_, device_name, mount_segment_size)); @@ -299,8 +355,8 @@ tl::expected PyClient::tearDownAll_internal() { int PyClient::tearDownAll() { return to_py_ret(tearDownAll_internal()); } tl::expected PyClient::put_internal( - const std::string &key, std::span value, - const ReplicateConfig &config) { + const std::string& key, std::span value, + const ReplicateConfig& config) { if (config.prefer_alloc_in_same_node) { LOG(ERROR) << "prefer_alloc_in_same_node is not supported."; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -315,7 +371,7 @@ tl::expected PyClient::put_internal( << key << ", value size: " << value.size(); return 
tl::unexpected(ErrorCode::INVALID_PARAMS); } - auto &buffer_handle = *alloc_result; + auto& buffer_handle = *alloc_result; memcpy(buffer_handle.ptr(), value.data(), value.size_bytes()); std::vector slices = split_into_slices(buffer_handle); @@ -328,15 +384,15 @@ tl::expected PyClient::put_internal( return {}; } -int PyClient::put(const std::string &key, std::span value, - const ReplicateConfig &config) { +int PyClient::put(const std::string& key, std::span value, + const ReplicateConfig& config) { return to_py_ret(put_internal(key, value, config)); } tl::expected PyClient::put_batch_internal( - const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config) { + const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config) { if (config.prefer_alloc_in_same_node) { LOG(ERROR) << "prefer_alloc_in_same_node is not supported."; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -354,8 +410,8 @@ tl::expected PyClient::put_batch_internal( batched_slices.reserve(keys.size()); for (size_t i = 0; i < keys.size(); ++i) { - auto &key = keys[i]; - auto &value = values[i]; + auto& key = keys[i]; + auto& value = values[i]; auto alloc_result = client_buffer_allocator_->allocate(value.size_bytes()); if (!alloc_result) { @@ -364,7 +420,7 @@ tl::expected PyClient::put_batch_internal( << key << ", value size: " << value.size(); return tl::unexpected(ErrorCode::INVALID_PARAMS); } - auto &buffer_handle = *alloc_result; + auto& buffer_handle = *alloc_result; memcpy(buffer_handle.ptr(), value.data(), value.size_bytes()); auto slices = split_into_slices(buffer_handle); buffer_handles.emplace_back(std::move(*alloc_result)); @@ -374,7 +430,7 @@ tl::expected PyClient::put_batch_internal( // Convert unordered_map to vector format expected by BatchPut std::vector> ordered_batched_slices; ordered_batched_slices.reserve(keys.size()); - for (const auto &key : keys) { + for (const auto& key : keys) { auto it = batched_slices.find(key); if (it != 
batched_slices.end()) { ordered_batched_slices.emplace_back(it->second); @@ -395,15 +451,15 @@ tl::expected PyClient::put_batch_internal( return {}; } -int PyClient::put_batch(const std::vector &keys, - const std::vector> &values, - const ReplicateConfig &config) { +int PyClient::put_batch(const std::vector& keys, + const std::vector>& values, + const ReplicateConfig& config) { return to_py_ret(put_batch_internal(keys, values, config)); } tl::expected PyClient::put_parts_internal( - const std::string &key, std::vector> values, - const ReplicateConfig &config) { + const std::string& key, std::vector> values, + const ReplicateConfig& config) { if (config.prefer_alloc_in_same_node) { LOG(ERROR) << "prefer_alloc_in_same_node is not supported."; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -415,7 +471,7 @@ tl::expected PyClient::put_parts_internal( // Calculate total size needed size_t total_size = 0; - for (const auto &value : values) { + for (const auto& value : values) { total_size += value.size_bytes(); } @@ -432,12 +488,12 @@ tl::expected PyClient::put_parts_internal( return tl::unexpected(ErrorCode::INVALID_PARAMS); } - auto &buffer_handle = *alloc_result; + auto& buffer_handle = *alloc_result; // Copy all parts into the contiguous buffer size_t offset = 0; - for (const auto &value : values) { - memcpy(static_cast(buffer_handle.ptr()) + offset, value.data(), + for (const auto& value : values) { + memcpy(static_cast(buffer_handle.ptr()) + offset, value.data(), value.size_bytes()); offset += value.size_bytes(); } @@ -456,14 +512,14 @@ tl::expected PyClient::put_parts_internal( return {}; } -int PyClient::put_parts(const std::string &key, +int PyClient::put_parts(const std::string& key, std::vector> values, - const ReplicateConfig &config) { + const ReplicateConfig& config) { return to_py_ret(put_parts_internal(key, values, config)); } tl::expected PyClient::remove_internal( - const std::string &key) { + const std::string& key) { if (!client_) { LOG(ERROR) << 
"Client is not initialized"; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -475,12 +531,12 @@ tl::expected PyClient::remove_internal( return {}; } -int PyClient::remove(const std::string &key) { +int PyClient::remove(const std::string& key) { return to_py_ret(remove_internal(key)); } tl::expected PyClient::removeByRegex_internal( - const std::string &str) { + const std::string& str) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -488,7 +544,7 @@ tl::expected PyClient::removeByRegex_internal( return client_->RemoveByRegex(str); } -long PyClient::removeByRegex(const std::string &str) { +long PyClient::removeByRegex(const std::string& str) { return to_py_ret(removeByRegex_internal(str)); } @@ -503,7 +559,7 @@ tl::expected PyClient::removeAll_internal() { long PyClient::removeAll() { return to_py_ret(removeAll_internal()); } tl::expected PyClient::isExist_internal( - const std::string &key) { + const std::string& key) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -511,7 +567,7 @@ tl::expected PyClient::isExist_internal( return client_->IsExist(key); } -int PyClient::isExist(const std::string &key) { +int PyClient::isExist(const std::string& key) { auto result = isExist_internal(key); if (result.has_value()) { @@ -521,12 +577,12 @@ int PyClient::isExist(const std::string &key) { } } -std::vector PyClient::batchIsExist(const std::vector &keys) { +std::vector PyClient::batchIsExist(const std::vector& keys) { auto internal_results = batchIsExist_internal(keys); std::vector results; results.reserve(internal_results.size()); - for (const auto &result : internal_results) { + for (const auto& result : internal_results) { if (result.has_value()) { results.push_back(result.value() ? 
1 : 0); // 1 if exists, 0 if not } else { @@ -538,7 +594,7 @@ std::vector PyClient::batchIsExist(const std::vector &keys) { } tl::expected PyClient::getSize_internal( - const std::string &key) { + const std::string& key) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -550,13 +606,13 @@ tl::expected PyClient::getSize_internal( return tl::unexpected(query_result.error()); } - const std::vector &replica_list = + const std::vector& replica_list = query_result.value().replicas; // Calculate total size from all replicas' handles int64_t total_size = 0; if (!replica_list.empty()) { - auto &replica = replica_list[0]; + auto& replica = replica_list[0]; total_size = calculate_total_size(replica); } else { LOG(ERROR) << "Internal error: replica_list is empty"; @@ -566,12 +622,12 @@ tl::expected PyClient::getSize_internal( return total_size; } -int64_t PyClient::getSize(const std::string &key) { +int64_t PyClient::getSize(const std::string& key) { return to_py_ret(getSize_internal(key)); } // Implementation of get_buffer method -std::shared_ptr PyClient::get_buffer(const std::string &key) { +std::shared_ptr PyClient::get_buffer(const std::string& key) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return nullptr; @@ -589,14 +645,14 @@ std::shared_ptr PyClient::get_buffer(const std::string &key) { return nullptr; } - const std::vector &replica_list = + const std::vector& replica_list = query_result.value().replicas; if (replica_list.empty()) { LOG(ERROR) << "Empty replica list for key: " << key; return nullptr; } - const auto &replica = replica_list[0]; + const auto& replica = replica_list[0]; uint64_t total_length = calculate_total_size(replica); if (total_length == 0) { @@ -610,7 +666,7 @@ std::shared_ptr PyClient::get_buffer(const std::string &key) { return nullptr; } - auto &buffer_handle = *alloc_result; + auto& buffer_handle = *alloc_result; // Create slices for the allocated buffer 
std::vector slices; @@ -631,7 +687,7 @@ std::shared_ptr PyClient::get_buffer(const std::string &key) { // Implementation of batch_get_buffer_internal method std::vector> PyClient::batch_get_buffer_internal( - const std::vector &keys) { + const std::vector& keys) { std::vector> final_results(keys.size(), nullptr); @@ -659,7 +715,7 @@ std::vector> PyClient::batch_get_buffer_internal( valid_ops.reserve(keys.size()); for (size_t i = 0; i < keys.size(); ++i) { - const auto &key = keys[i]; + const auto& key = keys[i]; if (!query_results[i]) { if (query_results[i].error() != ErrorCode::OBJECT_NOT_FOUND && @@ -676,7 +732,7 @@ std::vector> PyClient::batch_get_buffer_internal( continue; } - const auto &replica = query_result_values.replicas[0]; + const auto& replica = query_result_values.replicas[0]; uint64_t total_size = calculate_total_size(replica); if (total_size == 0) { continue; @@ -712,7 +768,7 @@ std::vector> PyClient::batch_get_buffer_internal( batch_keys.reserve(valid_ops.size()); batch_query_results.reserve(valid_ops.size()); - for (auto &op : valid_ops) { + for (auto& op : valid_ops) { batch_keys.push_back(op.key); batch_query_results.push_back(op.query_result); batch_slices[op.key] = op.slices; @@ -724,7 +780,7 @@ std::vector> PyClient::batch_get_buffer_internal( // 4. 
Process results and create BufferHandles for (size_t i = 0; i < valid_ops.size(); ++i) { if (batch_get_results[i]) { - auto &op = valid_ops[i]; + auto& op = valid_ops[i]; final_results[op.original_index] = std::make_shared(std::move(*op.buffer_handle)); } else { @@ -738,11 +794,11 @@ std::vector> PyClient::batch_get_buffer_internal( // Implementation of batch_get_buffer method std::vector> PyClient::batch_get_buffer( - const std::vector &keys) { + const std::vector& keys) { return batch_get_buffer_internal(keys); } -tl::expected PyClient::register_buffer_internal(void *buffer, +tl::expected PyClient::register_buffer_internal(void* buffer, size_t size) { if (!client_) { LOG(ERROR) << "Client is not initialized"; @@ -752,12 +808,12 @@ tl::expected PyClient::register_buffer_internal(void *buffer, true); } -int PyClient::register_buffer(void *buffer, size_t size) { +int PyClient::register_buffer(void* buffer, size_t size) { return to_py_ret(register_buffer_internal(buffer, size)); } tl::expected PyClient::unregister_buffer_internal( - void *buffer) { + void* buffer) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return tl::unexpected(ErrorCode::INVALID_PARAMS); @@ -771,12 +827,12 @@ tl::expected PyClient::unregister_buffer_internal( return {}; } -int PyClient::unregister_buffer(void *buffer) { +int PyClient::unregister_buffer(void* buffer) { return to_py_ret(unregister_buffer_internal(buffer)); } tl::expected PyClient::get_into_internal( - const std::string &key, void *buffer, size_t size) { + const std::string& key, void* buffer, size_t size) { // NOTE: The buffer address must be previously registered with // register_buffer() for zero-copy RDMA operations to work correctly if (!client_) { @@ -797,7 +853,7 @@ tl::expected PyClient::get_into_internal( return tl::unexpected(query_result.error()); } - const std::vector &replica_list = + const std::vector& replica_list = query_result.value().replicas; // Calculate total size from replica list @@ -806,7 
+862,7 @@ tl::expected PyClient::get_into_internal( return tl::unexpected(ErrorCode::INVALID_PARAMS); } - auto &replica = replica_list[0]; + auto& replica = replica_list[0]; uint64_t total_size = calculate_total_size(replica); // Check if user buffer is large enough @@ -824,14 +880,14 @@ tl::expected PyClient::get_into_internal( if (replica.is_memory_replica() == false) { while (offset < total_size) { auto chunk_size = std::min(total_size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffer) + offset; + void* chunk_ptr = static_cast(buffer) + offset; slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } } else { - for (auto &handle : + for (auto& handle : replica.get_memory_descriptor().buffer_descriptors) { - void *chunk_ptr = static_cast(buffer) + offset; + void* chunk_ptr = static_cast(buffer) + offset; slices.emplace_back(Slice{chunk_ptr, handle.size_}); offset += handle.size_; } @@ -848,22 +904,22 @@ tl::expected PyClient::get_into_internal( return static_cast(total_size); } -int64_t PyClient::get_into(const std::string &key, void *buffer, size_t size) { +int64_t PyClient::get_into(const std::string& key, void* buffer, size_t size) { return to_py_ret(get_into_internal(key, buffer, size)); } std::string PyClient::get_hostname() const { return local_hostname; } -std::vector PyClient::batch_put_from(const std::vector &keys, - const std::vector &buffers, - const std::vector &sizes, - const ReplicateConfig &config) { +std::vector PyClient::batch_put_from(const std::vector& keys, + const std::vector& buffers, + const std::vector& sizes, + const ReplicateConfig& config) { auto internal_results = batch_put_from_internal(keys, buffers, sizes, config); std::vector results; results.reserve(internal_results.size()); - for (const auto &result : internal_results) { + for (const auto& result : internal_results) { results.push_back(to_py_ret(result)); } @@ -871,8 +927,8 @@ std::vector PyClient::batch_put_from(const std::vector &keys, } 
std::vector> PyClient::batch_put_from_internal( - const std::vector &keys, const std::vector &buffers, - const std::vector &sizes, const ReplicateConfig &config) { + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes, const ReplicateConfig& config) { if (config.prefer_alloc_in_same_node) { LOG(ERROR) << "prefer_alloc_in_same_node is not supported."; return std::vector>( @@ -894,8 +950,8 @@ std::vector> PyClient::batch_put_from_internal( // Create slices from user buffers for (size_t i = 0; i < keys.size(); ++i) { - const std::string &key = keys[i]; - void *buffer = buffers[i]; + const std::string& key = keys[i]; + void* buffer = buffers[i]; size_t size = sizes[i]; std::vector slices; @@ -903,7 +959,7 @@ std::vector> PyClient::batch_put_from_internal( while (offset < size) { auto chunk_size = std::min(size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffer) + offset; + void* chunk_ptr = static_cast(buffer) + offset; slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } @@ -913,7 +969,7 @@ std::vector> PyClient::batch_put_from_internal( std::vector> ordered_batched_slices; ordered_batched_slices.reserve(keys.size()); - for (const auto &key : keys) { + for (const auto& key : keys) { auto it = all_slices.find(key); if (it != all_slices.end()) { ordered_batched_slices.emplace_back(it->second); @@ -929,8 +985,8 @@ std::vector> PyClient::batch_put_from_internal( } tl::expected PyClient::put_from_internal( - const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config) { + const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config) { // NOTE: The buffer address must be previously registered with // register_buffer() for zero-copy RDMA operations to work correctly if (config.prefer_alloc_in_same_node) { @@ -953,7 +1009,7 @@ tl::expected PyClient::put_from_internal( while (offset < size) { auto chunk_size = std::min(size - offset, kMaxSliceSize); - void *chunk_ptr = 
static_cast(buffer) + offset; + void* chunk_ptr = static_cast(buffer) + offset; slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } @@ -966,19 +1022,19 @@ tl::expected PyClient::put_from_internal( return {}; } -int PyClient::put_from(const std::string &key, void *buffer, size_t size, - const ReplicateConfig &config) { +int PyClient::put_from(const std::string& key, void* buffer, size_t size, + const ReplicateConfig& config) { return to_py_ret(put_from_internal(key, buffer, size, config)); } std::vector PyClient::batch_get_into( - const std::vector &keys, const std::vector &buffers, - const std::vector &sizes) { + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes) { auto internal_results = batch_get_into_internal(keys, buffers, sizes); std::vector results; results.reserve(internal_results.size()); - for (const auto &result : internal_results) { + for (const auto& result : internal_results) { results.push_back(to_py_ret(result)); } @@ -986,8 +1042,8 @@ std::vector PyClient::batch_get_into( } std::vector> PyClient::batch_get_into_internal( - const std::vector &keys, const std::vector &buffers, - const std::vector &sizes) { + const std::vector& keys, const std::vector& buffers, + const std::vector& sizes) { // Validate preconditions if (!client_) { LOG(ERROR) << "Client is not initialized"; @@ -1027,7 +1083,7 @@ std::vector> PyClient::batch_get_into_internal( valid_operations.reserve(num_keys); for (size_t i = 0; i < num_keys; ++i) { - const auto &key = keys[i]; + const auto& key = keys[i]; // Handle query failures if (!query_results[i]) { @@ -1050,7 +1106,7 @@ std::vector> PyClient::batch_get_into_internal( } // Calculate required buffer size - const auto &replica = query_result_values.replicas[0]; + const auto& replica = query_result_values.replicas[0]; uint64_t total_size = calculate_total_size(replica); // Validate buffer capacity @@ -1068,14 +1124,14 @@ std::vector> PyClient::batch_get_into_internal( if 
(replica.is_memory_replica() == false) { while (offset < total_size) { auto chunk_size = std::min(total_size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffers[i]) + offset; + void* chunk_ptr = static_cast(buffers[i]) + offset; key_slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } } else { - for (auto &handle : + for (auto& handle : replica.get_memory_descriptor().buffer_descriptors) { - void *chunk_ptr = static_cast(buffers[i]) + offset; + void* chunk_ptr = static_cast(buffers[i]) + offset; key_slices.emplace_back(Slice{chunk_ptr, handle.size_}); offset += handle.size_; } @@ -1106,7 +1162,7 @@ std::vector> PyClient::batch_get_into_internal( batch_keys.reserve(valid_operations.size()); batch_query_results.reserve(valid_operations.size()); - for (const auto &op : valid_operations) { + for (const auto& op : valid_operations) { batch_keys.push_back(op.key); batch_query_results.push_back(op.query_result); batch_slices[op.key] = op.slices; @@ -1118,7 +1174,7 @@ std::vector> PyClient::batch_get_into_internal( // Process transfer results for (size_t j = 0; j < batch_get_results.size(); ++j) { - const auto &op = valid_operations[j]; + const auto& op = valid_operations[j]; if (!batch_get_results[j]) { const auto error = batch_get_results[j].error(); @@ -1132,7 +1188,7 @@ std::vector> PyClient::batch_get_into_internal( } std::vector> PyClient::batchIsExist_internal( - const std::vector &keys) { + const std::vector& keys) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return std::vector>( @@ -1148,10 +1204,10 @@ std::vector> PyClient::batchIsExist_internal( return client_->BatchIsExist(keys); } -int PyClient::put_from_with_metadata(const std::string &key, void *buffer, - void *metadata_buffer, size_t size, +int PyClient::put_from_with_metadata(const std::string& key, void* buffer, + void* metadata_buffer, size_t size, size_t metadata_size, - const ReplicateConfig &config) { + const ReplicateConfig& config) { // NOTE: The 
buffer address must be previously registered with // register_buffer() for zero-copy RDMA operations to work correctly if (config.prefer_alloc_in_same_node) { @@ -1175,8 +1231,8 @@ int PyClient::put_from_with_metadata(const std::string &key, void *buffer, while (metadata_offset < metadata_size) { auto metadata_chunk_size = std::min(metadata_size - metadata_offset, kMaxSliceSize); - void *metadata_chunk_ptr = - static_cast(metadata_buffer) + metadata_offset; + void* metadata_chunk_ptr = + static_cast(metadata_buffer) + metadata_offset; slices.emplace_back(Slice{metadata_chunk_ptr, metadata_chunk_size}); metadata_offset += metadata_chunk_size; } @@ -1184,7 +1240,7 @@ int PyClient::put_from_with_metadata(const std::string &key, void *buffer, uint64_t offset = 0; while (offset < size) { auto chunk_size = std::min(size - offset, kMaxSliceSize); - void *chunk_ptr = static_cast(buffer) + offset; + void* chunk_ptr = static_cast(buffer) + offset; slices.emplace_back(Slice{chunk_ptr, chunk_size}); offset += chunk_size; } @@ -1198,10 +1254,10 @@ int PyClient::put_from_with_metadata(const std::string &key, void *buffer, } std::vector PyClient::batch_put_from_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &sizes, - const ReplicateConfig &config) { + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& sizes, + const ReplicateConfig& config) { auto start = std::chrono::steady_clock::now(); auto internal_results = @@ -1209,7 +1265,7 @@ std::vector PyClient::batch_put_from_multi_buffers( std::vector results; results.reserve(internal_results.size()); - for (const auto &result : internal_results) { + for (const auto& result : internal_results) { results.push_back(to_py_ret(result)); } @@ -1222,10 +1278,10 @@ std::vector PyClient::batch_put_from_multi_buffers( std::vector> PyClient::batch_put_from_multi_buffers_internal( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> 
&all_sizes, - const ReplicateConfig &config) { + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, + const ReplicateConfig& config) { if (!client_) { LOG(ERROR) << "Client is not initialized"; return std::vector>( @@ -1241,8 +1297,8 @@ PyClient::batch_put_from_multi_buffers_internal( std::vector> batched_slices(keys.size()); for (size_t i = 0; i < all_buffers.size(); ++i) { - const auto &buffers = all_buffers[i]; - const auto &sizes = all_sizes[i]; + const auto& buffers = all_buffers[i]; + const auto& sizes = all_sizes[i]; if (buffers.size() != sizes.size()) { LOG(ERROR) << "Mismatched buffers and sizes of key:" << keys[i]; return std::vector>( @@ -1258,9 +1314,9 @@ PyClient::batch_put_from_multi_buffers_internal( } std::vector PyClient::batch_get_into_multi_buffers( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_alloc_in_same_node) { auto start = std::chrono::steady_clock::now(); auto internal_results = batch_get_into_multi_buffers_internal( @@ -1268,7 +1324,7 @@ std::vector PyClient::batch_get_into_multi_buffers( std::vector results; results.reserve(internal_results.size()); - for (const auto &result : internal_results) { + for (const auto& result : internal_results) { results.push_back(to_py_ret(result)); } auto duration_call = std::chrono::duration_cast( @@ -1280,9 +1336,9 @@ std::vector PyClient::batch_get_into_multi_buffers( std::vector> PyClient::batch_get_into_multi_buffers_internal( - const std::vector &keys, - const std::vector> &all_buffers, - const std::vector> &all_sizes, + const std::vector& keys, + const std::vector>& all_buffers, + const std::vector>& all_sizes, bool prefer_alloc_in_same_node) { // Validate preconditions if (!client_) { @@ -1319,7 +1375,7 @@ PyClient::batch_get_into_multi_buffers_internal( std::vector valid_operations; 
valid_operations.reserve(num_keys); for (size_t i = 0; i < num_keys; ++i) { - const auto &key = keys[i]; + const auto& key = keys[i]; // Handle query failures if (!query_results[i]) { const auto error = query_results[i].error(); @@ -1338,11 +1394,11 @@ PyClient::batch_get_into_multi_buffers_internal( continue; } // Calculate required buffer size - const auto &replica = query_result_values.replicas[0]; + const auto& replica = query_result_values.replicas[0]; uint64_t total_size = calculate_total_size(replica); - const auto &sizes = all_sizes[i]; + const auto& sizes = all_sizes[i]; uint64_t dst_total_size = 0; - for (auto &size : sizes) { + for (auto& size : sizes) { dst_total_size += size; } if (dst_total_size < total_size) { @@ -1353,7 +1409,7 @@ PyClient::batch_get_into_multi_buffers_internal( continue; } // Create slices for this key's buffer - const auto &buffers = all_buffers[i]; + const auto& buffers = all_buffers[i]; std::vector key_slices; key_slices.reserve(buffers.size()); if (replica.is_memory_replica()) { @@ -1386,7 +1442,7 @@ PyClient::batch_get_into_multi_buffers_internal( std::unordered_map> batch_slices; batch_keys.reserve(valid_operations.size()); batch_query_results.reserve(valid_operations.size()); - for (auto &op : valid_operations) { + for (auto& op : valid_operations) { batch_keys.push_back(op.key); batch_query_results.push_back(op.query_result); batch_slices[op.key] = op.slices; @@ -1398,7 +1454,7 @@ PyClient::batch_get_into_multi_buffers_internal( // Process transfer results for (size_t j = 0; j < batch_get_results.size(); ++j) { - const auto &op = valid_operations[j]; + const auto& op = valid_operations[j]; if (!batch_get_results[j]) { const auto error = batch_get_results[j].error(); diff --git a/mooncake-store/src/segment.cpp b/mooncake-store/src/segment.cpp index 1d6b81b06..b4fcbedc4 100644 --- a/mooncake-store/src/segment.cpp +++ b/mooncake-store/src/segment.cpp @@ -10,7 +10,7 @@ ErrorCode ScopedSegmentAccess::MountSegment(const Segment& 
segment, const size_t size = segment.size; // Check if parameters are valid before allocating memory. - if (buffer == 0 || size == 0) { + if ((segment.type == SegmentType::MEMORY && buffer == 0) || size == 0) { LOG(ERROR) << "buffer=" << buffer << " or size=" << size << " is invalid"; return ErrorCode::INVALID_PARAMS; @@ -50,11 +50,13 @@ ErrorCode ScopedSegmentAccess::MountSegment(const Segment& segment, switch (segment_manager_->memory_allocator_) { case BufferAllocatorType::CACHELIB: allocator = std::make_shared( - segment.name, buffer, size, segment.te_endpoint); + segment.name, buffer, size, segment.te_endpoint, + segment.file_id); break; case BufferAllocatorType::OFFSET: allocator = std::make_shared( - segment.name, buffer, size, segment.te_endpoint); + segment.name, buffer, size, segment.te_endpoint, + segment.file_id); break; default: LOG(ERROR) << "segment_name=" << segment.name diff --git a/mooncake-store/src/transfer_task.cpp b/mooncake-store/src/transfer_task.cpp index b09fc15cc..3bd92cdef 100644 --- a/mooncake-store/src/transfer_task.cpp +++ b/mooncake-store/src/transfer_task.cpp @@ -579,6 +579,7 @@ std::optional TransferSubmitter::submitTransferEngineOperation( request.opcode = op_code; request.source = static_cast(slice.ptr); request.target_id = seg; + request.file_id = handle.file_id_; request.target_offset = handle.buffer_address_; request.length = handle.size_; diff --git a/mooncake-store/tests/client_buffer_test.cpp b/mooncake-store/tests/client_buffer_test.cpp index 6cdd25f27..68d13483b 100644 --- a/mooncake-store/tests/client_buffer_test.cpp +++ b/mooncake-store/tests/client_buffer_test.cpp @@ -273,14 +273,17 @@ TEST_F(ClientBufferTest, CalculateTotalSizeMemoryReplica) { AllocatedBuffer::Descriptor buf1; buf1.size_ = 1024; buf1.buffer_address_ = 0x1000; + buf1.file_id_ = 0; AllocatedBuffer::Descriptor buf2; buf2.size_ = 2048; buf2.buffer_address_ = 0x2000; + buf2.file_id_ = 0; AllocatedBuffer::Descriptor buf3; buf3.size_ = 512; 
buf3.buffer_address_ = 0x3000; + buf3.file_id_ = 0; mem_desc.buffer_descriptors = {buf1, buf2, buf3}; replica.descriptor_variant = mem_desc; diff --git a/mooncake-store/tests/master_service_test.cpp b/mooncake-store/tests/master_service_test.cpp index 1170397ff..e9f10e850 100644 --- a/mooncake-store/tests/master_service_test.cpp +++ b/mooncake-store/tests/master_service_test.cpp @@ -35,6 +35,7 @@ class MasterServiceTest : public ::testing::Test { size_t size = kDefaultSegmentSize) const { Segment segment; segment.id = generate_uuid(); + segment.type = SegmentType::MEMORY; segment.name = std::move(name); segment.base = base; segment.size = size; diff --git a/mooncake-store/tests/stress_cluster_benchmark.py b/mooncake-store/tests/stress_cluster_benchmark.py index 427b8f7ed..c753d67f8 100644 --- a/mooncake-store/tests/stress_cluster_benchmark.py +++ b/mooncake-store/tests/stress_cluster_benchmark.py @@ -186,20 +186,26 @@ def setup(self): # Setup store protocol = self.args.protocol - device_name = self.args.device_name + protocol_args = self.args.protocol_args local_hostname = self.args.local_hostname metadata_server = self.args.metadata_server - global_segment_size = self.args.global_segment_size * 1024 * 1024 local_buffer_size = self.args.local_buffer_size * 1024 * 1024 master_server_address = self.args.master_server logger.info(f"Setting up {self.args.role} instance with batch_size={self.args.batch_size}") - logger.info(f" Protocol: {protocol}, Device: {device_name}") - logger.info(f" Global segment: {global_segment_size // (1024*1024)} MB") + logger.info(f" Protocol: {protocol}, Protocol args: {protocol_args}") logger.info(f" Local buffer: {local_buffer_size // (1024*1024)} MB") - retcode = self.store.setup(local_hostname, metadata_server, global_segment_size, - local_buffer_size, protocol, device_name, master_server_address) + if self.args.files is None: + global_segment_size = self.args.global_segment_size * 1024 * 1024 + logger.info(f" Global segment: 
{global_segment_size // (1024*1024)} MB") + retcode = self.store.setup(local_hostname, metadata_server, global_segment_size, + local_buffer_size, protocol, protocol_args, master_server_address) + else: + files = self.args.files.split() + logger.info(f" Files: {files}") + retcode = self.store.setup_with_files(local_hostname, metadata_server, files, + local_buffer_size, protocol, protocol_args, master_server_address) if retcode: logger.error(f"Store setup failed with return code {retcode}") exit(1) @@ -431,14 +437,18 @@ def parse_arguments(): # Network and connection settings parser.add_argument("--protocol", type=str, default="rdma", help="Communication protocol to use") - parser.add_argument("--device-name", type=str, default="erdma_0", help="Network device name for RDMA") + parser.add_argument("--protocol-args", "--device-name", dest="protocol_args", type=str, + default="erdma_0", help="Protocol specific args, e.g. Network device name for RDMA") parser.add_argument("--local-hostname", type=str, default="localhost", help="Local hostname") parser.add_argument("--metadata-server", type=str, default="http://127.0.0.1:8080/metadata", help="Metadata server address") parser.add_argument("--master-server", type=str, default="localhost:50051", help="Master server address") # Memory and storage settings - parser.add_argument("--global-segment-size", type=int, default=10000, help="Global segment size in MB") parser.add_argument("--local-buffer-size", type=int, default=512, help="Local buffer size in MB") + # Only one of --global-segment-size and --files should be specified. 
+ group = parser.add_mutually_exclusive_group() + group.add_argument("--global-segment-size", type=int, default=10000, help="Global segment size in MB") + group.add_argument("--files", type=str, default=None, help="Files to be registered as global segments") # Test parameters parser.add_argument("--max-requests", type=int, default=1200, help="Maximum number of requests to process") diff --git a/mooncake-transfer-engine/example/CMakeLists.txt b/mooncake-transfer-engine/example/CMakeLists.txt index 167276a88..11edf619a 100644 --- a/mooncake-transfer-engine/example/CMakeLists.txt +++ b/mooncake-transfer-engine/example/CMakeLists.txt @@ -27,3 +27,8 @@ if (USE_ASCEND_HETEROGENEOUS) add_executable(transfer_engine_heterogeneous_ascend_perf_initiator transfer_engine_heterogeneous_ascend_perf_initiator.cpp) target_link_libraries(transfer_engine_heterogeneous_ascend_perf_initiator PUBLIC transfer_engine) endif() + +if (USE_NVMEOF_GENERIC) + add_executable(transfer_engine_nvmeof_generic_bench transfer_engine_nvmeof_generic_bench.cpp) + target_link_libraries(transfer_engine_nvmeof_generic_bench PUBLIC transfer_engine) +endif() diff --git a/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp new file mode 100644 index 000000000..ec6986707 --- /dev/null +++ b/mooncake-transfer-engine/example/transfer_engine_nvmeof_generic_bench.cpp @@ -0,0 +1,396 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "transfer_engine.h" + +// Common arguments. +DEFINE_string(local_server_name, mooncake::getHostname(), + "Local server name for segment discovery"); +DEFINE_string(metadata_server, "192.168.3.77:2379", "etcd server host address"); +DEFINE_string( + mode, "loopback", + "Running mode: initiator, target, or loopback. Initiator node read/write " + "data blocks from target node"); + +// Initiator arguments. +DEFINE_string(operation, "read", "Operation type: read or write"); +DEFINE_string(segment_id, "192.168.3.76", "Segment ID to access data"); +DEFINE_int32(batch_size, 4096, "Batch size"); +DEFINE_uint64(block_size, 65536, "Block size for each transfer request"); +DEFINE_int32(duration, 30, "Test duration in seconds"); +DEFINE_int32(threads, 1, "Task submission threads"); +DEFINE_string(report_unit, "GB", "Report unit: GB|GiB|Gb|MB|MiB|Mb|KB|KiB|Kb"); +DEFINE_uint32(report_precision, 2, "Report precision"); + +// Target arguments. +DEFINE_string(trtype, "tcp", "TRTYPE of NVMeoF: tcp|rdma"); +DEFINE_string(adrfam, "ipv4", "ADRFAM of NVMeoF: ipv4|ipv6"); +DEFINE_string(traddr, "127.0.0.1", + "TRADDR of NVMeoF, i.e. service listen address"); +DEFINE_string(trsvcid, "4420", "TRSVCID of NVMeoF, i.e. 
service listen port"); +DEFINE_string(files, "", + "Files to register as buffers, separated by space, e.g.: " + "\"/dev/nvme0n1 /dev/nvme1n1\""); + +using namespace mooncake; + +const static std::unordered_map RATE_UNIT_MP = { + {"GB", 1000ull * 1000ull * 1000ull}, + {"GiB", 1ull << 30}, + {"Gb", 1000ull * 1000ull * 1000ull / 8}, + {"MB", 1000ull * 1000ull}, + {"MiB", 1ull << 20}, + {"Mb", 1000ull * 1000ull / 8}, + {"KB", 1000ull}, + {"KiB", 1ull << 10}, + {"Kb", 1000ull / 8}}; + +static inline std::string calculateRate(uint64_t data_bytes, double duration) { + if (std::fabs(duration) < 1e-10) { + LOG(ERROR) << "Invalid args: duration shouldn't be 0"; + return ""; + } + + if (!RATE_UNIT_MP.count(FLAGS_report_unit)) { + LOG(WARNING) << "Invalid flag: report_unit only support " + "GB|GiB|Gb|MB|MiB|Mb|KB|KiB|Kb, not support " + << FLAGS_report_unit + << " . Now use GB(default) as report_unit"; + FLAGS_report_unit = "GB"; + } + + std::ostringstream oss; + oss << std::fixed << std::setprecision(FLAGS_report_precision) + << 1.0 * data_bytes / duration / RATE_UNIT_MP.at(FLAGS_report_unit) + << " " << FLAGS_report_unit << "/s"; + return oss.str(); +} + +static std::unique_ptr initTransferEngine() { + // Disable topology auto discovery for testing. 
+ auto engine = std::make_unique(); + if (engine == nullptr) { + LOG(ERROR) << "Failed to create transfer engine"; + exit(EXIT_FAILURE); + } + + auto hostname_port = parseHostNameWithPort(FLAGS_local_server_name); + int rc = + engine->init(FLAGS_metadata_server, FLAGS_local_server_name.c_str(), + hostname_port.first.c_str(), hostname_port.second); + if (rc != 0) { + LOG(ERROR) << "Failed to init transfer engine, rc=" << rc; + exit(EXIT_FAILURE); + } + + const std::string trStr = + "trtype=" + FLAGS_trtype + " adrfam=" + FLAGS_adrfam + + " traddr=" + FLAGS_traddr + " trsvcid=" + FLAGS_trsvcid; + LOG(INFO) << "Using Trid: " << trStr; + + Transport *xport = nullptr; + void *args[2] = {(void *)trStr.c_str(), nullptr}; + xport = engine->installTransport("nvmeof_generic", args); + if (xport == nullptr) { + LOG(ERROR) << "Failed to install nvmeof_generic transport"; + exit(EXIT_FAILURE); + } + + return engine; +} + +static volatile bool initiator_running = true; +static std::atomic total_batch_count(0); + +static Status initiatorWorker(TransferEngine *engine, SegmentID segment_id, + int thread_id, void *addr) { + TransferRequest::OpCode opcode; + if (FLAGS_operation == "read") + opcode = TransferRequest::READ; + else if (FLAGS_operation == "write") + opcode = TransferRequest::WRITE; + else { + LOG(ERROR) << "Unsupported operation: must be 'read' or 'write'"; + exit(EXIT_FAILURE); + } + + auto segment_desc = engine->getMetadata()->getSegmentDescByID(segment_id); + if (!segment_desc) { + LOG(ERROR) << "Unable to get target segment ID, please recheck"; + exit(EXIT_FAILURE); + } + + auto &file_buffers = segment_desc->file_buffers; + if (file_buffers.size() <= 0) { + LOG(ERROR) << "No file buffer registered in segment, please check"; + exit(EXIT_FAILURE); + } + + size_t batch_count = 0; + while (initiator_running) { + std::vector requests; + for (int i = 0; i < FLAGS_batch_size; ++i) { + auto buffer_offset = + FLAGS_block_size * (i * FLAGS_threads + thread_id); + // Randomly 
pick a file. + auto file_index = std::rand() % file_buffers.size(); + // Randomly pick a file offset. + auto file_unit_cnt = file_buffers[file_index].size / + FLAGS_block_size / FLAGS_threads; + auto target_offset = + FLAGS_block_size * + ((std::rand() % file_unit_cnt) * FLAGS_threads + thread_id); + + TransferRequest entry; + entry.opcode = opcode; + entry.length = FLAGS_block_size; + entry.source = (void *)((uintptr_t)(addr) + buffer_offset); + entry.target_id = segment_id; + entry.file_id = file_buffers[file_index].id; + entry.target_offset = target_offset; + requests.emplace_back(entry); + } + + auto batch_id = engine->allocateBatchID(FLAGS_batch_size); + Status s = engine->submitTransfer(batch_id, requests); + if (!s.ok()) { + LOG(ERROR) << "Failed to submit request: " << s.ToString(); + } + + for (int task_id = 0; task_id < FLAGS_batch_size; ++task_id) { + bool completed = false; + TransferStatus status; + while (!completed) { + Status s = engine->getTransferStatus(batch_id, task_id, status); + LOG_ASSERT(s.ok()); + if (status.s == TransferStatusEnum::COMPLETED) + completed = true; + else if (status.s == TransferStatusEnum::FAILED) { + LOG(INFO) << "FAILED"; + completed = true; + exit(EXIT_FAILURE); + } + } + } + + s = engine->freeBatchID(batch_id); + LOG_ASSERT(s.ok()); + batch_count++; + } + + LOG(INFO) << "Worker " << thread_id << " stopped!"; + total_batch_count.fetch_add(batch_count); + return Status::OK(); +} + +static void startInitiator(TransferEngine *engine) { + auto buffer_size = FLAGS_block_size * FLAGS_batch_size * FLAGS_threads; + void *addr = std::aligned_alloc(4096, buffer_size); + if (addr == nullptr) { + LOG(ERROR) << "Failed to allocate buffer"; + exit(EXIT_FAILURE); + } + + int rc = engine->registerLocalMemory(addr, buffer_size); + if (rc != 0) { + LOG(ERROR) << "Failed to register buffer, rc=" << rc; + exit(EXIT_FAILURE); + } + + auto segment_id = engine->openSegment(FLAGS_segment_id.c_str()); + + struct timeval start_tv; + 
gettimeofday(&start_tv, nullptr); + + std::vector workers(FLAGS_threads); + for (int i = 0; i < FLAGS_threads; ++i) { + workers[i] = std::thread(initiatorWorker, engine, segment_id, i, addr); + } + + sleep(FLAGS_duration); + initiator_running = false; + + for (int i = 0; i < FLAGS_threads; ++i) { + workers[i].join(); + } + + struct timeval stop_tv; + gettimeofday(&stop_tv, nullptr); + + auto duration = (stop_tv.tv_sec - start_tv.tv_sec) + + (stop_tv.tv_usec - start_tv.tv_usec) / 1000000.0; + auto batch_count = total_batch_count.load(); + LOG(INFO) << "Test completed: duration " << std::fixed + << std::setprecision(2) << duration << ", batch count " + << batch_count << ", throughput " + << calculateRate( + batch_count * FLAGS_batch_size * FLAGS_block_size, + duration); + + engine->unregisterLocalMemory(addr); + std::free(addr); +} + +static volatile bool target_started = false; +static volatile bool target_running = true; + +static size_t getFileSize(const std::string &file) { + size_t size = 0; + struct stat st; + + int fd = open(file.c_str(), O_RDONLY); + if (fd < 0) { + return 0; + } + + if (fstat(fd, &st) != 0) { + close(fd); + return 0; + } + + if (S_ISLNK(st.st_mode)) { + close(fd); + return 0; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + ioctl(fd, BLKGETSIZE64, &size); + } else if (S_ISREG(st.st_mode)) { + size = st.st_size; + } + + close(fd); + return size; +} + +static void startTarget(TransferEngine *engine) { + std::vector files; + std::istringstream s(FLAGS_files); + std::string file; + while (s >> file) { + if (file.size() <= 0) { + LOG(ERROR) << "Invalid file path " << file; + exit(EXIT_FAILURE); + } + + auto size = getFileSize(file); + if (size == 0) { + LOG(ERROR) << "Invalid file " << file; + exit(EXIT_FAILURE); + } + + FileBufferID id; + int rc = engine->registerLocalFile(file, size, id); + if (rc != 0) { + LOG(ERROR) << "Failed to register file " << file << ", rc=" << rc; + exit(EXIT_FAILURE); + } + + files.push_back(file); + } + + 
if (files.size() <= 0) { + LOG(ERROR) << "No valid file in \"" << FLAGS_files << "\""; + exit(EXIT_FAILURE); + } + + target_started = true; + while (target_running) sleep(1); + + for (auto &file : files) { + engine->unregisterLocalFile(file); + } +} + +static int initiator() { + auto engine = initTransferEngine(); + startInitiator(engine.get()); + return 0; +} + +static void signalHandler(int signum) { + LOG(INFO) << "Received signal " << signum << ", stopping target server..."; + target_running = false; +} + +static int target() { + signal(SIGINT, signalHandler); + signal(SIGTERM, signalHandler); + + auto engine = initTransferEngine(); + startTarget(engine.get()); + + return 0; +} + +static int loopback() { + auto engine = initTransferEngine(); + + // Start target thread. + auto target_thread = std::thread(startTarget, engine.get()); + size_t wait_cnt = 0; + while (!target_started && wait_cnt < 60) { + sleep(1); + wait_cnt++; + } + + if (!target_started) { + LOG(ERROR) << "Target initialization timedout"; + exit(EXIT_FAILURE); + } + + // Start initiator thread. + auto initiator_thread = std::thread(startInitiator, engine.get()); + + // Wait initiator to complete. + initiator_thread.join(); + + // Terminate target. 
+ target_running = false; + target_thread.join(); + + return 0; +} + +int main(int argc, char **argv) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + + if (FLAGS_mode == "initiator") + return initiator(); + else if (FLAGS_mode == "target") + return target(); + else if (FLAGS_mode == "loopback") + return loopback(); + + LOG(ERROR) + << "Unsupported mode: must be 'initiator', 'target', or 'loopback'"; + exit(EXIT_FAILURE); +} diff --git a/mooncake-transfer-engine/include/config.h b/mooncake-transfer-engine/include/config.h index 33e71322e..2ed101884 100644 --- a/mooncake-transfer-engine/include/config.h +++ b/mooncake-transfer-engine/include/config.h @@ -50,6 +50,10 @@ struct GlobalConfig { bool use_ipv6 = false; size_t fragment_limit = 16384; bool enable_dest_device_affinity = false; +#ifdef USE_NVMEOF_GENERIC + bool nvmeof_generic_direct_io = false; + uint32_t nvmeof_generic_num_workers = 8; +#endif }; void loadGlobalConfig(GlobalConfig &config); diff --git a/mooncake-transfer-engine/include/multi_transport.h b/mooncake-transfer-engine/include/multi_transport.h index b5214b58c..5e80c771e 100644 --- a/mooncake-transfer-engine/include/multi_transport.h +++ b/mooncake-transfer-engine/include/multi_transport.h @@ -49,6 +49,10 @@ class MultiTransport { Transport *installTransport(const std::string &proto, std::shared_ptr topo); + bool transportNeedArgs(const std::string &proto); + + Transport *installTransportWithArgs(const std::string &proto, void **args); + Transport *getTransport(const std::string &proto); std::vector listTransports(); diff --git a/mooncake-transfer-engine/include/transfer_engine.h b/mooncake-transfer-engine/include/transfer_engine.h index 0807ef690..9cf019445 100644 --- a/mooncake-transfer-engine/include/transfer_engine.h +++ b/mooncake-transfer-engine/include/transfer_engine.h @@ -46,12 +46,14 @@ using SegmentHandle = Transport::SegmentHandle; using SegmentID = Transport::SegmentID; using BatchID = Transport::BatchID; using BufferEntry = 
Transport::BufferEntry; +using FileBufferID = TransferMetadata::FileBufferID; class TransferEngine { public: TransferEngine(bool auto_discover = false) : metadata_(nullptr), local_topology_(std::make_shared()), + next_file_id_(1), auto_discover_(auto_discover) { #ifdef WITH_METRICS InitializeMetricsConfig(); @@ -62,6 +64,7 @@ class TransferEngine { TransferEngine(bool auto_discover, const std::vector &filter) : metadata_(nullptr), local_topology_(std::make_shared()), + next_file_id_(1), auto_discover_(auto_discover), filter_(filter) { #ifdef WITH_METRICS @@ -111,6 +114,25 @@ class TransferEngine { int unregisterLocalMemoryBatch(const std::vector &addr_list); + bool supportFileBuffer(); + + /** + * @brief Register a local file as a shared buffer. + * @param[in] path Local path of the file. + * @param[in] size Available size of the file. + * @param[out] id The id of the registered file buffer. + * @return 0 on success, or error number on failure. + */ + int registerLocalFile(const std::string &path, size_t size, + FileBufferID &id); + + /** + * @brief Unregister a previously registered file. + * @param[in] path The path of the registered file buffer. + * @return 0 on success, or error number on failure. 
+ */ + int unregisterLocalFile(const std::string &path); + BatchID allocateBatchID(size_t batch_size) { return multi_transports_->allocateBatchID(batch_size); } @@ -233,6 +255,12 @@ class TransferEngine { bool remote_accessible; }; + struct LocalFile { + FileBufferID id; + std::string path; + std::size_t size; + }; + std::shared_ptr metadata_; std::string local_server_name_; std::shared_ptr multi_transports_; @@ -240,6 +268,9 @@ class TransferEngine { std::vector local_memory_regions_; std::shared_ptr local_topology_; + std::atomic next_file_id_; + std::unordered_map local_files_; + RWSpinlock send_notifies_lock_; std::unordered_map> diff --git a/mooncake-transfer-engine/include/transfer_engine_c.h b/mooncake-transfer-engine/include/transfer_engine_c.h index 453821908..c194d090c 100644 --- a/mooncake-transfer-engine/include/transfer_engine_c.h +++ b/mooncake-transfer-engine/include/transfer_engine_c.h @@ -17,6 +17,7 @@ #include #include +#include #ifdef __cplusplus extern "C" { @@ -24,6 +25,7 @@ extern "C" { #define segment_handle_t int32_t #define segment_id_t int32_t +#define file_id_t uint32_t #define batch_id_t uint64_t #define LOCAL_SEGMENT (0) #define INVALID_BATCH UINT64_MAX @@ -35,6 +37,7 @@ struct transfer_request { int opcode; void *source; segment_id_t target_id; + file_id_t file_id; uint64_t target_offset; uint64_t length; }; @@ -135,6 +138,13 @@ int registerLocalMemoryBatch(transfer_engine_t engine, int unregisterLocalMemoryBatch(transfer_engine_t engine, void **addr_list, size_t addr_len); +bool supportFileBuffer(transfer_engine_t engine); + +int registerLocalFile(transfer_engine_t engine, const char *path, size_t size, + file_id_t *id); + +int unregisterLocalFile(transfer_engine_t engine, const char *path); + batch_id_t allocateBatchID(transfer_engine_t engine, size_t batch_size); int submitTransfer(transfer_engine_t engine, batch_id_t batch_id, diff --git a/mooncake-transfer-engine/include/transfer_metadata.h 
b/mooncake-transfer-engine/include/transfer_metadata.h index 70f15c8d4..13d9b55ce 100644 --- a/mooncake-transfer-engine/include/transfer_metadata.h +++ b/mooncake-transfer-engine/include/transfer_metadata.h @@ -64,6 +64,16 @@ class TransferMetadata { std::unordered_map local_path_map; }; + // Identify a single file in a segment's file buffers. + using FileBufferID = uint32_t; + + struct FileBufferDesc { + FileBufferID id; + std::string path; + std::size_t size; + std::size_t align; // For future usage. + }; + struct RankInfoDesc { uint64_t rankId = 0xFFFFFFFF; // rank id, user rank std::string hostIp; @@ -76,6 +86,17 @@ class TransferMetadata { uint64_t pid; }; +#ifdef USE_NVMEOF_GENERIC + // NVMeoF transport id. + struct NVMeoFGenericTrid { + std::string trtype; + std::string adrfam; + std::string traddr; + std::string trsvcid; + std::string subnqn; + }; +#endif + using SegmentID = uint64_t; struct SegmentDesc { @@ -87,6 +108,8 @@ class TransferMetadata { std::vector buffers; // this is for nvmeof. std::vector nvmeof_buffers; + // Generic file buffers. + std::vector file_buffers; // this is for cxl. 
std::string cxl_name; uint64_t cxl_base_addr; @@ -94,6 +117,10 @@ class TransferMetadata { std::string timestamp; // this is for ascend RankInfoDesc rank_info; +#ifdef USE_NVMEOF_GENERIC + // this is for nvmeof_generic + NVMeoFGenericTrid nvmeof_generic_trid; +#endif int tcp_data_port; @@ -148,6 +175,10 @@ class TransferMetadata { int removeLocalMemoryBuffer(void *addr, bool update_metadata); + int addFileBuffer(const FileBufferDesc &buffer_desc, bool update_metadata); + + int removeFileBuffer(FileBufferID id, bool update_metadata); + int addLocalSegment(SegmentID segment_id, const std::string &segment_name, std::shared_ptr &&desc); diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h new file mode 100644 index 000000000..4b92d7a5e --- /dev/null +++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_initiator.h @@ -0,0 +1,126 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef NVMEOF_GENERIC_INITIATOR_H_ +#define NVMEOF_GENERIC_INITIATOR_H_ + +#include + +#include +#include + +#include "transport/transport.h" + +namespace mooncake { + +using Slice = Transport::Slice; +using NamespaceID = Transport::FileBufferID; + +class NVMeoFQueue; +class NVMeoFController; + +class NVMeoFInitiator : public std::enable_shared_from_this { + friend class NVMeoFController; + + public: + static std::shared_ptr create(bool direct_io = false); + + ~NVMeoFInitiator(); + + std::shared_ptr attachController( + const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid, + const std::string &subnqn); + + void detachController(std::shared_ptr ctrlr); + + private: + NVMeoFInitiator(bool direct_io); + + int setup(); + + const bool direct_io; + struct nvme_fabrics_config cfg; + nvme_root_t root; + nvme_host_t host; +}; + +class NVMeoFController : public std::enable_shared_from_this { + friend class NVMeoFInitiator; + + public: + ~NVMeoFController(); + + void rescan(); + + std::unique_ptr createQueue(size_t queueDepth); + + int getNsFd(NamespaceID nsid); + + private: + struct NVMeoFNamespace { + NamespaceID nsid; + int fd; + + ~NVMeoFNamespace() { close(fd); } + }; + + NVMeoFController(std::shared_ptr initiator, + const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid, + const std::string &subnqn); + + int connect(); + + int disconnect(); + + const std::shared_ptr initiator; + const std::string trtype; + const std::string adrfam; + const std::string traddr; + const std::string trsvcid; + const std::string subnqn; + + nvme_ctrl_t ctrl; + bool should_disconnect_ctrl; + + RWSpinlock ns_lock; + std::unordered_map namespaces; +}; + +class NVMeoFQueue { + friend class NVMeoFController; + + public: + ~NVMeoFQueue(); + + int submitRequest(Slice *slice); + + void reapCompletions(); + + std::shared_ptr getCtrlr() { return this->ctrlr; } + + private: + 
NVMeoFQueue(std::shared_ptr ctrlr, size_t queueDepth); + + int setup(); + + std::shared_ptr ctrlr; + size_t depth; + io_context_t io_ctx; + std::vector events; +}; + +} // namespace mooncake +#endif \ No newline at end of file diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h new file mode 100644 index 000000000..3a664ff1a --- /dev/null +++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_target.h @@ -0,0 +1,109 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef NVMEOF_GENERIC_TARGET_H_ +#define NVMEOF_GENERIC_TARGET_H_ + +#include +#include +#include +#include +#include + +namespace mooncake { +using FileBufferID = uint32_t; +using NamespaceID = FileBufferID; + +namespace nvmeof_target { +class NVMeoFNamespace { + public: + const std::string subnqn; + const NamespaceID nsid; + const std::string file; + + NVMeoFNamespace(const std::string &subnqn, NamespaceID nsid, + const std::string &file); + ~NVMeoFNamespace(); + + int setup(); +}; + +class NVMeoFSubsystem { + public: + const std::string subnqn; + + NVMeoFSubsystem(const std::string &subnqn); + ~NVMeoFSubsystem(); + + int setup(); + + int addNamespace(NamespaceID nsid, const std::string &file); + + int removeNamespace(NamespaceID nsid); + + private: + std::unordered_map> + namespaces; +}; + +class NVMeoFListener { + public: + const std::string trtype; + const std::string adrfam; + const std::string traddr; + const std::string trsvcid; + + NVMeoFListener(const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid); + ~NVMeoFListener(); + + int setup(); + + int addSubsystem(std::shared_ptr subsys); + + int removeSubsystem(std::shared_ptr subsys); + + private: + static std::atomic next_id; + + const unsigned int id; + std::vector> subsystems; +}; +} // namespace nvmeof_target + +class NVMeoFTarget { + public: + NVMeoFTarget(const std::string &hostname); + ~NVMeoFTarget(); + + int setup(const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid); + + int addFile(FileBufferID file_id, const std::string &file); + + int removeFile(FileBufferID file_id); + + const std::string &getSubNQN() { return subsystem->subnqn; } + + private: + const std::string hostname; + + std::mutex mutex; + std::unique_ptr listener; + std::shared_ptr subsystem; +}; + +} // namespace mooncake + +#endif \ No newline at end of file diff --git 
a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h new file mode 100644 index 000000000..af9c0431a --- /dev/null +++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/nvmeof_transport.h @@ -0,0 +1,111 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef NVMEOF_GENERIC_TRANSPORT_H_ +#define NVMEOF_GENERIC_TRANSPORT_H_ + +#include + +#include +#include +#include +#include +#include + +#include "nvmeof_target.h" +#include "nvmeof_initiator.h" +#include "worker_pool.h" +#include "transfer_metadata.h" +#include "transport/transport.h" + +namespace mooncake { +using FileBufferID = TransferMetadata::FileBufferID; +using FileBufferDesc = TransferMetadata::FileBufferDesc; +using NVMeoFTrid = TransferMetadata::NVMeoFGenericTrid; + +class NVMeoFGenericTransport : public Transport { + public: + NVMeoFGenericTransport(); + + ~NVMeoFGenericTransport(); + + BatchID allocateBatchID(size_t batch_size) override; + + Status freeBatchID(BatchID batch_id) override; + + Status submitTransferTask( + const std::vector &task_list) override; + + Status submitTransfer(BatchID batch_id, + const std::vector &entries) override; + + Status getTransferStatus(BatchID batch_id, size_t task_id, + TransferStatus &status) override; + + private: + int installWithArgs(std::string 
&local_server_name, + std::shared_ptr meta, + void **args) override; + + int registerLocalMemory(void *addr, size_t length, + const std::string &location, bool remote_accessible, + bool update_metadata) override; + + int unregisterLocalMemory(void *addr, + bool update_metadata = false) override; + + int registerLocalMemoryBatch( + const std::vector &buffer_list, + const std::string &location) override { + return 0; + } + + int unregisterLocalMemoryBatch( + const std::vector &addr_list) override { + return 0; + } + + int setupLocalSegment(); + + bool supportFileBuffer() override { return true; } + + int registerLocalFile(FileBufferID id, const std::string &path, + size_t size) override; + + int unregisterLocalFile(FileBufferID id) override; + + const char *getName() const override { return "nvmeof_generic"; } + + int parseTrid(const std::string &trStr); + + bool validateTrid(const NVMeoFTrid &local_trid); + + int setupInitiator(); + + std::shared_ptr getOrCreateController( + SegmentHandle handle); + + std::shared_ptr initiator; + std::unique_ptr worker_pool; + + NVMeoFTrid local_trid; + std::unique_ptr target; + + RWSpinlock controller_lock_; + std::unordered_map> + segment_to_controller_; +}; +} // namespace mooncake + +#endif diff --git a/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h new file mode 100644 index 000000000..1c7c99ec3 --- /dev/null +++ b/mooncake-transfer-engine/include/transport/nvmeof_generic_transport/worker_pool.h @@ -0,0 +1,83 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef NVMEOF_GENERIC_WORKER_POOL_H_ +#define NVMEOF_GENERIC_WORKER_POOL_H_ + +#include +#include +#include +#include +#include + +#include + +#include "nvmeof_initiator.h" + +namespace mooncake { +struct NVMeoFWorkerTask { + NVMeoFController *ctrlr; + Slice *slice; + uint64_t timestamp; +}; + +class NVMeoFWorker { + friend class NVMeoFWorkerPool; + + public: + NVMeoFWorker(size_t id); + ~NVMeoFWorker(); + + void addController(std::shared_ptr ctrlr); + void removeController(NVMeoFController *ctrlr); + + private: + void sendMsg(const std::function &func); + int submitTask(NVMeoFController *ctrlr, Slice *slice); + void dispatchTasks(); + void poll(); + + const size_t id; + std::thread thread; + bool stopping; + std::atomic clock; + + boost::lockfree::queue *> + msg_queue; + std::unordered_map> queues; + + NVMeoFWorkerTask *tasks; + NVMeoFWorkerTask *curr_task; + boost::lockfree::queue free_tasks; + boost::lockfree::queue task_queue; +}; + +class NVMeoFWorkerPool { + public: + NVMeoFWorkerPool(size_t num_workers); + ~NVMeoFWorkerPool(); + + int addController(std::shared_ptr ctrlr); + int removeController(NVMeoFController *ctrlr); + + int submitTask(NVMeoFController *ctrlr, Slice *slice); + + private: + const size_t num_workers; + std::atomic next_worker; + std::vector> workers; +}; +} // namespace mooncake + +#endif \ No newline at end of file diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h index 1aaac3cf7..66dfd834a 100644 --- 
a/mooncake-transfer-engine/include/transport/transport.h +++ b/mooncake-transfer-engine/include/transport/transport.h @@ -27,6 +27,10 @@ #include #include +#ifdef USE_NVMEOF_GENERIC +#include +#endif + #include "common/base/status.h" #include "transfer_metadata.h" @@ -43,6 +47,8 @@ class Transport { using SegmentID = uint64_t; using SegmentHandle = SegmentID; + using FileBufferID = TransferMetadata::FileBufferID; + using BatchID = uint64_t; const static BatchID INVALID_BATCH_ID = UINT64_MAX; @@ -60,6 +66,7 @@ class Transport { uint64_t target_offset; size_t length; int advise_retry_cnt = 0; + FileBufferID file_id; }; enum TransferStatusEnum { @@ -92,6 +99,7 @@ class Transport { SliceStatus status; TransferTask *task; bool from_cache; + FileBufferID file_id; union { struct { @@ -124,6 +132,12 @@ class Transport { struct { uint64_t dest_addr; } ascend_direct; +#ifdef USE_NVMEOF_GENERIC + struct { + uint64_t offset; + struct iocb iocb; + } nvmeof_generic; +#endif }; public: @@ -263,6 +277,10 @@ class Transport { std::shared_ptr meta, std::shared_ptr topo); + virtual int installWithArgs(std::string &local_server_name, + std::shared_ptr meta, + void **args); + std::string local_server_name_; std::shared_ptr metadata_; @@ -287,6 +305,17 @@ class Transport { virtual int unregisterLocalMemoryBatch( const std::vector &addr_list) = 0; + virtual bool supportFileBuffer() { return false; }; + + virtual int registerLocalFile(FileBufferID id, const std::string &path, + size_t size) { + return ERR_NOT_IMPLEMENTED; + } + + virtual int unregisterLocalFile(FileBufferID id) { + return ERR_NOT_IMPLEMENTED; + } + virtual const char *getName() const = 0; }; } // namespace mooncake diff --git a/mooncake-transfer-engine/src/CMakeLists.txt b/mooncake-transfer-engine/src/CMakeLists.txt index 769359bfc..bfb8bdf49 100644 --- a/mooncake-transfer-engine/src/CMakeLists.txt +++ b/mooncake-transfer-engine/src/CMakeLists.txt @@ -78,3 +78,8 @@ if (USE_ASCEND_HETEROGENEOUS) 
link_directories(${ASCEND_LIB_DIR}) target_link_libraries(transfer_engine PUBLIC ascendcl ascend_transport) endif() + +if (USE_NVMEOF_GENERIC) + target_link_libraries(transfer_engine PUBLIC nvmeof_generic_transport aio nvme) +endif() + \ No newline at end of file diff --git a/mooncake-transfer-engine/src/config.cpp b/mooncake-transfer-engine/src/config.cpp index 9bc4b76fb..5fcfa8275 100644 --- a/mooncake-transfer-engine/src/config.cpp +++ b/mooncake-transfer-engine/src/config.cpp @@ -278,6 +278,28 @@ void loadGlobalConfig(GlobalConfig &config) { if (std::getenv("MC_ENABLE_DEST_DEVICE_AFFINITY")) { config.enable_dest_device_affinity = true; } + +#ifdef USE_NVMEOF_GENERIC + const char *nvmeof_generic_direct_io = + std::getenv("MC_NVMEOF_GENERIC_DIRECT_IO"); + if (nvmeof_generic_direct_io != nullptr && + strlen(nvmeof_generic_direct_io) > 0) { + LOG(INFO) << "Enabling direct I/O for nvmeof_generic transport"; + config.nvmeof_generic_direct_io = true; + } + + const char *nvmeof_generic_num_workers = + std::getenv("MC_NVMEOF_GENERIC_NUM_WORKERS"); + if (nvmeof_generic_num_workers != NULL) { + int val = atoi(nvmeof_generic_num_workers); + if (val > 0) { + config.nvmeof_generic_num_workers = val; + } else { + LOG(ERROR) << "Invalid value for MC_NVMEOF_GENERIC_NUM_WORKERS: " + << nvmeof_generic_num_workers; + } + } +#endif } std::string mtuLengthToString(ibv_mtu mtu) { @@ -326,6 +348,12 @@ void dumpGlobalConfig() { LOG(INFO) << "max_wr = " << config.max_wr; LOG(INFO) << "max_inline = " << config.max_inline; LOG(INFO) << "mtu_length = " << mtuLengthToString(config.mtu_length); +#ifdef USE_NVMEOF_GENERIC + LOG(INFO) << "nvmeof_generic_direct_io = " + << config.nvmeof_generic_direct_io; + LOG(INFO) << "nvmeof_generic_num_workers = " + << config.nvmeof_generic_num_workers; +#endif } GlobalConfig &globalConfig() { diff --git a/mooncake-transfer-engine/src/multi_transport.cpp b/mooncake-transfer-engine/src/multi_transport.cpp index 9c24836a2..51633567b 100644 --- 
a/mooncake-transfer-engine/src/multi_transport.cpp +++ b/mooncake-transfer-engine/src/multi_transport.cpp @@ -39,6 +39,9 @@ #ifdef USE_CXL #include "transport/cxl_transport/cxl_transport.h" #endif +#ifdef USE_NVMEOF_GENERIC +#include "transport/nvmeof_generic_transport/nvmeof_transport.h" +#endif #include @@ -252,6 +255,41 @@ Transport *MultiTransport::installTransport(const std::string &proto, return transport; } +bool MultiTransport::transportNeedArgs(const std::string &proto) { +#ifdef USE_NVMEOF_GENERIC + if (proto == "nvmeof_generic") { + return true; + } +#endif + return false; +} + +Transport *MultiTransport::installTransportWithArgs(const std::string &proto, + void **args) { + std::shared_ptr transport = nullptr; + +#ifdef USE_NVMEOF_GENERIC + if (proto == "nvmeof_generic") { + transport = std::make_shared(); + } +#endif + + if (!transport) { + LOG(ERROR) << "Unsupported transport " << proto + << ", please rebuild Mooncake"; + return nullptr; + } + + int rc = transport->installWithArgs(local_server_name_, metadata_, args); + if (rc != 0) { + LOG(ERROR) << "Failed to install transport " << proto << ", rc=" << rc; + return nullptr; + } + + transport_map_[proto] = transport; + return transport.get(); +} + Status MultiTransport::selectTransport(const TransferRequest &entry, Transport *&transport) { auto target_segment_desc = metadata_->getSegmentDescByID(entry.target_id); diff --git a/mooncake-transfer-engine/src/transfer_engine.cpp b/mooncake-transfer-engine/src/transfer_engine.cpp index 32b46b13f..cb9d468a3 100644 --- a/mooncake-transfer-engine/src/transfer_engine.cpp +++ b/mooncake-transfer-engine/src/transfer_engine.cpp @@ -267,16 +267,22 @@ Transport *TransferEngine::installTransport(const std::string &proto, return transport; } - if (args != nullptr && args[0] != nullptr) { - const std::string nic_priority_matrix = static_cast(args[0]); - int ret = local_topology_->parse(nic_priority_matrix); - if (ret) { - LOG(ERROR) << "Failed to parse NIC priority 
matrix"; - return nullptr; + if (multi_transports_->transportNeedArgs(proto)) { + transport = multi_transports_->installTransportWithArgs(proto, args); + } else { + if (args != nullptr && args[0] != nullptr) { + const std::string nic_priority_matrix = + static_cast(args[0]); + int ret = local_topology_->parse(nic_priority_matrix); + if (ret) { + LOG(ERROR) << "Failed to parse NIC priority matrix"; + return nullptr; + } + } + + transport = multi_transports_->installTransport(proto, local_topology_); } - transport = multi_transports_->installTransport(proto, local_topology_); if (!transport) return nullptr; // Since installTransport() is only called once during initialization @@ -289,6 +295,15 @@ Transport *TransferEngine::installTransport(const std::string &proto, entry.addr, entry.length, entry.location, entry.remote_accessible); if (ret < 0) return nullptr; } + + if (transport->supportFileBuffer()) { + for (auto &file : local_files_) { + int ret = transport->registerLocalFile( + file.second.id, file.second.path, file.second.size); + if (ret < 0) return nullptr; + } + } + return transport; } @@ -438,6 +453,77 @@ int TransferEngine::unregisterLocalMemoryBatch( return 0; } +bool TransferEngine::supportFileBuffer() { + bool supported = false; + for (auto &transport : multi_transports_->listTransports()) { + supported = supported || transport->supportFileBuffer(); + } + return supported; +} + +int TransferEngine::registerLocalFile(const std::string &path, size_t size, + FileBufferID &id) { + if (!supportFileBuffer()) { + LOG(ERROR) << "File buffers not supported"; + return ERR_NOT_IMPLEMENTED; + } + + std::unique_lock lock(mutex_); + if (local_files_.count(path) > 0) { + LOG(ERROR) << "Registering an already registered file: " << path; + return ERR_ADDRESS_OVERLAPPED; + } + + const auto id_ = next_file_id_.fetch_add(1); + + for (auto &transport : multi_transports_->listTransports()) { + if (!transport->supportFileBuffer()) { + continue; + } + + int ret = 
transport->registerLocalFile(id_, path, size); + if (ret != 0) { + LOG(ERROR) << "Failed to register file " << path << " to transport " + << transport->getName() << ", ret=" << ret; + return ret; + } + } + + local_files_[path] = {id_, path, size}; + id = id_; + return 0; +} + +int TransferEngine::unregisterLocalFile(const std::string &path) { + if (!supportFileBuffer()) { + LOG(ERROR) << "File buffers not supported"; + return ERR_NOT_IMPLEMENTED; + } + + std::unique_lock lock(mutex_); + auto it = local_files_.find(path); + if (it == local_files_.end()) { + return ERR_ADDRESS_NOT_REGISTERED; + } + + for (auto &transport : multi_transports_->listTransports()) { + if (!transport->supportFileBuffer()) { + continue; + } + + int ret = transport->unregisterLocalFile(it->second.id); + if (ret != 0 && ret != ERR_ADDRESS_NOT_REGISTERED) { + LOG(ERROR) << "Failed to unregister file " << path + << " from transport " << transport->getName() + << ", ret=" << ret; + return ret; + } + } + + local_files_.erase(it); + return 0; +} + #ifdef WITH_METRICS // Helper function to convert string to lowercase for case-insensitive // comparison diff --git a/mooncake-transfer-engine/src/transfer_engine_c.cpp b/mooncake-transfer-engine/src/transfer_engine_c.cpp index 89667f225..3a3816ce6 100644 --- a/mooncake-transfer-engine/src/transfer_engine_c.cpp +++ b/mooncake-transfer-engine/src/transfer_engine_c.cpp @@ -117,6 +117,22 @@ int unregisterLocalMemoryBatch(transfer_engine_t engine, void **addr_list, size_t buffer_len) { TransferEngine *native = (TransferEngine *)engine; return native->unregisterLocalMemoryBatch(native_addr_list); } +bool supportFileBuffer(transfer_engine_t engine) { + TransferEngine *native = (TransferEngine *)engine; + return native->supportFileBuffer(); +} + +int registerLocalFile(transfer_engine_t engine, const char *path, size_t size, + file_id_t *id) { + TransferEngine *native = (TransferEngine *)engine; + return native->registerLocalFile(path, size, *id); +} + +int unregisterLocalFile(transfer_engine_t engine, const char *path) { + 
TransferEngine *native = (TransferEngine *)engine; + return native->unregisterLocalFile(path); +} + batch_id_t allocateBatchID(transfer_engine_t engine, size_t batch_size) { TransferEngine *native = (TransferEngine *)engine; return (batch_id_t)native->allocateBatchID(batch_size); @@ -132,6 +148,7 @@ int submitTransfer(transfer_engine_t engine, batch_id_t batch_id, (Transport::TransferRequest::OpCode)entries[index].opcode; native_entries[index].source = entries[index].source; native_entries[index].target_id = entries[index].target_id; + native_entries[index].file_id = entries[index].file_id; native_entries[index].target_offset = entries[index].target_offset; native_entries[index].length = entries[index].length; } diff --git a/mooncake-transfer-engine/src/transfer_metadata.cpp b/mooncake-transfer-engine/src/transfer_metadata.cpp index 5b8953894..2f3669ebd 100644 --- a/mooncake-transfer-engine/src/transfer_metadata.cpp +++ b/mooncake-transfer-engine/src/transfer_metadata.cpp @@ -226,11 +226,33 @@ int TransferMetadata::encodeSegmentDesc(const SegmentDesc &desc, buffersJSON.append(bufferJSON); } segmentJSON["buffers"] = buffersJSON; +#ifdef USE_NVMEOF_GENERIC + } else if (segmentJSON["protocol"] == "nvmeof_generic") { + Json::Value tridJSON; + tridJSON["trtype"] = desc.nvmeof_generic_trid.trtype; + tridJSON["adrfam"] = desc.nvmeof_generic_trid.adrfam; + tridJSON["traddr"] = desc.nvmeof_generic_trid.traddr; + tridJSON["trsvcid"] = desc.nvmeof_generic_trid.trsvcid; + tridJSON["subnqn"] = desc.nvmeof_generic_trid.subnqn; + segmentJSON["nvmeof_generic_trid"] = tridJSON; +#endif } else { LOG(ERROR) << "Unsupported segment descriptor for register, name " << desc.name << " protocol " << desc.protocol; return ERR_METADATA; } + + Json::Value fileBuffersJson(Json::arrayValue); + for (const auto &fileBuffer : desc.file_buffers) { + Json::Value bufferJSON; + bufferJSON["id"] = fileBuffer.id; + bufferJSON["path"] = fileBuffer.path; + bufferJSON["size"] = fileBuffer.size; + 
bufferJSON["align"] = fileBuffer.align; + fileBuffersJson.append(bufferJSON); + } + segmentJSON["file_buffers"] = fileBuffersJson; + return 0; } @@ -410,11 +432,44 @@ TransferMetadata::decodeSegmentDesc(Json::Value &segmentJSON, } desc->buffers.push_back(buffer); } +#ifdef USE_NVMEOF_GENERIC + } else if (desc->protocol == "nvmeof_generic") { + if (!segmentJSON.isMember("nvmeof_generic_trid")) { + LOG(WARNING) << "Corrupted segment descriptor, name " + << segment_name << " protocol " << desc->protocol; + return nullptr; + } + + Json::Value tridJson = segmentJSON["nvmeof_generic_trid"]; + if (!tridJson.isMember("trtype") || !tridJson.isMember("adrfam") || + !tridJson.isMember("traddr") || !tridJson.isMember("trsvcid") || + !tridJson.isMember("subnqn")) { + LOG(WARNING) << "Corrupted segment descriptor, name " + << segment_name << " protocol " << desc->protocol; + return nullptr; + } + + desc->nvmeof_generic_trid.trtype = tridJson["trtype"].asString(); + desc->nvmeof_generic_trid.adrfam = tridJson["adrfam"].asString(); + desc->nvmeof_generic_trid.traddr = tridJson["traddr"].asString(); + desc->nvmeof_generic_trid.trsvcid = tridJson["trsvcid"].asString(); + desc->nvmeof_generic_trid.subnqn = tridJson["subnqn"].asString(); +#endif } else { LOG(ERROR) << "Unsupported segment descriptor, name " << segment_name << " protocol " << desc->protocol; return nullptr; } + + for (const auto &bufferJSON : segmentJSON["file_buffers"]) { + FileBufferDesc buffer; + buffer.id = bufferJSON["id"].asUInt(); + buffer.path = bufferJSON["path"].asString(); + buffer.size = bufferJSON["size"].asUInt64(); + buffer.align = bufferJSON["align"].asUInt64(); + desc->file_buffers.push_back(buffer); + } + return desc; } @@ -605,6 +660,38 @@ int TransferMetadata::removeLocalMemoryBuffer(void *addr, return ERR_ADDRESS_NOT_REGISTERED; } +int TransferMetadata::addFileBuffer(const FileBufferDesc &buffer_desc, + bool update_metadata) { + { + RWSpinlock::WriteGuard guard(segment_lock_); + auto &segment_desc 
= segment_id_to_desc_map_[LOCAL_SEGMENT_ID]; + segment_desc->file_buffers.push_back(buffer_desc); + } + if (update_metadata) return updateLocalSegmentDesc(); + return 0; +} + +int TransferMetadata::removeFileBuffer(FileBufferID id, bool update_metadata) { + bool buffer_exist = false; + { + RWSpinlock::WriteGuard guard(segment_lock_); + auto &segment_desc = segment_id_to_desc_map_[LOCAL_SEGMENT_ID]; + for (auto iter = segment_desc->file_buffers.begin(); + iter != segment_desc->file_buffers.end(); ++iter) { + if (iter->id == id) { + segment_desc->file_buffers.erase(iter); + buffer_exist = true; + break; + } + } + } + if (buffer_exist) { + if (update_metadata) return updateLocalSegmentDesc(); + return 0; + } + return ERR_ADDRESS_NOT_REGISTERED; +} + int TransferMetadata::addRpcMetaEntry(const std::string &server_name, RpcMetaDesc &desc) { local_rpc_meta_ = desc; diff --git a/mooncake-transfer-engine/src/transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/CMakeLists.txt index 5517a5ddc..bac9988cc 100644 --- a/mooncake-transfer-engine/src/transport/CMakeLists.txt +++ b/mooncake-transfer-engine/src/transport/CMakeLists.txt @@ -14,6 +14,11 @@ if (USE_NVMEOF) target_sources(transport PUBLIC $) endif() +if (USE_NVMEOF_GENERIC) + add_subdirectory(nvmeof_generic_transport) + target_sources(transport PUBLIC $) +endif() + if (USE_CXL) add_subdirectory(cxl_transport) target_sources(transport PUBLIC $) diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt new file mode 100644 index 000000000..10b03bca3 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/CMakeLists.txt @@ -0,0 +1,4 @@ +file(GLOB NVMEOF_GENERIC_SOURCES "*.cpp") + +add_library(nvmeof_generic_transport OBJECT ${NVMEOF_GENERIC_SOURCES}) +target_include_directories(nvmeof_generic_transport PUBLIC) \ No newline at end of file diff --git 
a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp new file mode 100644 index 000000000..92f0a4066 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_initiator.cpp @@ -0,0 +1,411 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "transport/nvmeof_generic_transport/nvmeof_initiator.h" + +#include +#include + +namespace mooncake { +static constexpr auto kMaxRescanDuration = std::chrono::seconds(15); + +static nvme_ctrl_t nvme_find_ctrl(nvme_root_t root, nvme_host_t host, + const std::string &trtype, + const std::string &traddr, + const std::string &trsvcid, + const std::string &subnqn) { + nvme_subsystem_t subsys; + nvme_ctrl_t ctrl; + + // Scan the topology first. 
+ nvme_scan_topology(root, NULL, NULL); + + nvme_for_each_subsystem(host, subsys) { + nvme_subsystem_for_each_ctrl(subsys, ctrl) { + if (strcasecmp(nvme_ctrl_get_transport(ctrl), trtype.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_traddr(ctrl), traddr.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_trsvcid(ctrl), trsvcid.c_str())) { + continue; + } + + if (strcmp(nvme_ctrl_get_subsysnqn(ctrl), subnqn.c_str())) { + continue; + } + + return ctrl; + } + } + + return nullptr; +} + +static int nvme_get_active_ns_list(nvme_ctrl_t ctrl, + std::unordered_set &ns_list) { + struct nvme_ns_list ns_list_ = {0}; + + int fd = nvme_ctrl_get_fd(ctrl); + if (fd < 0) { + LOG(ERROR) << "Invalid fd " << fd << " of controller " + << nvme_ctrl_get_subsysnqn(ctrl); + return -EINVAL; + } + + int rc = nvme_identify_active_ns_list(fd, 0, &ns_list_); + if (rc != 0) { + LOG(ERROR) << "Failed to identify active ns list of controller " + << nvme_ctrl_get_subsysnqn(ctrl) << ", rc=" << rc; + return -EIO; + } + + for (size_t i = 0; i < NVME_ID_NS_LIST_MAX; i++) { + if (ns_list_.ns[i] > 0) { + ns_list.insert(ns_list_.ns[i]); + } + } + + return 0; +} + +std::shared_ptr NVMeoFInitiator::create(bool direct_io) { + auto initiator = + std::shared_ptr(new NVMeoFInitiator(direct_io)); + int rc = initiator->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to create nvmeof initiator, rc=" << rc; + return nullptr; + } + + return initiator; +} + +NVMeoFInitiator::NVMeoFInitiator(bool direct_io) + : direct_io(direct_io), root(nullptr), host(nullptr) {} + +NVMeoFInitiator::~NVMeoFInitiator() { + if (root != nullptr) { + nvme_free_tree(root); + } +} + +int NVMeoFInitiator::setup() { + nvmf_default_config(&cfg); + + // Disconnect the controller immediately on error. 
+ cfg.ctrl_loss_tmo = 0; + + root = nvme_scan(NULL); + if (root == NULL) { + LOG(ERROR) << "Failed to create NVMe root"; + return -ENOMEM; + } + + host = nvme_default_host(root); + if (host == NULL) { + LOG(ERROR) << "Failed to create default NVMe host"; + return -ENOMEM; + } + + return 0; +} + +std::shared_ptr NVMeoFInitiator::attachController( + const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid, + const std::string &subnqn) { + auto ctrlr = std::shared_ptr(new NVMeoFController( + shared_from_this(), trtype, adrfam, traddr, trsvcid, subnqn)); + int rc = ctrlr->connect(); + if (rc != 0) { + LOG(ERROR) << "Failed to connect controller " << subnqn + << ", rc=" << rc; + return nullptr; + } + + return ctrlr; +} + +void NVMeoFInitiator::detachController( + std::shared_ptr ctrlr) { + ctrlr->disconnect(); +} + +NVMeoFController::NVMeoFController(std::shared_ptr initiator, + const std::string &trtype, + const std::string &adrfam, + const std::string &traddr, + const std::string &trsvcid, + const std::string &subnqn) + : initiator(initiator), + trtype(trtype), + adrfam(adrfam), + traddr(traddr), + trsvcid(trsvcid), + subnqn(subnqn), + ctrl(nullptr), + should_disconnect_ctrl(false) {} + +NVMeoFController::~NVMeoFController() { + if (ctrl != nullptr) { + if (should_disconnect_ctrl) { + nvme_disconnect_ctrl(ctrl); + } + nvme_free_ctrl(ctrl); + } +} + +int NVMeoFController::connect() { + ctrl = nvme_find_ctrl(initiator->root, initiator->host, trtype, traddr, + trsvcid, subnqn); + if (ctrl != nullptr) { + // The controller has been connected. 
+ rescan(); + return 0; + } + + ctrl = nvme_create_ctrl(initiator->root, subnqn.c_str(), trtype.c_str(), + traddr.c_str(), NULL, NULL, trsvcid.c_str()); + if (ctrl == NULL) { + LOG(ERROR) << "Failed to create nvme controller " << subnqn; + return -ENOMEM; + } + + int rc = nvmf_add_ctrl(initiator->host, ctrl, &initiator->cfg); + if (rc != 0) { + LOG(ERROR) << "Failed to connect to controller, " << subnqn + << " rc=" << rc; + return rc; + } + + // We connected the controller, so we are responsible for disconnecting it. + should_disconnect_ctrl = true; + + // Trigger rescan to open namespaces. + rescan(); + + return 0; +} + +void NVMeoFController::rescan() { + if (ctrl == nullptr) { + // Do not scan disconnected controller. + return; + } + + RWSpinlock::WriteGuard guard(ns_lock); + const auto rescan_timeout = + std::chrono::steady_clock::now() + kMaxRescanDuration; + + while (true) { + // Retrieve active namespace list via NVMe Identify command. + std::unordered_set active_ns; + int rc = nvme_get_active_ns_list(ctrl, active_ns); + if (rc != 0) { + LOG(ERROR) << "Failed to get active ns list of controller " + << nvme_ctrl_get_name(ctrl) << ", rc=" << rc; + break; + } + + // Remove invalid namespaces. + auto it = namespaces.begin(); + while (it != namespaces.end()) { + if (!active_ns.contains(it->first)) { + it = namespaces.erase(it); + } else { + it++; + } + } + + // Scan controller sysfs directory to get attached namespaces. + struct dirent **ns_dirents = NULL; + int num_ns_dirents = nvme_scan_ctrl_namespaces(ctrl, &ns_dirents); + if (num_ns_dirents < 0) { + LOG(ERROR) << "Failed to scan namespaces of controller " + << nvme_ctrl_get_name(ctrl) << ", errno=" << errno; + break; + } + + // Open namespace block devices. 
+ for (int i = 0; i < num_ns_dirents; i++) { + char ns_dev[256]; + rc = snprintf(ns_dev, sizeof(ns_dev), "/dev/%s", + ns_dirents[i]->d_name); + if (rc <= 0) { + LOG(ERROR) << "Invalid namespace device name " + << ns_dirents[i]->d_name; + continue; + } + + int flags = O_RDWR; + if (initiator->direct_io) flags |= O_DIRECT; + + int fd = open(ns_dev, flags); + if (fd < 0) { + LOG(ERROR) << "Failed to open nvme namespace " << ns_dev + << ", errno=" << errno; + continue; + } + + uint32_t nsid; + rc = nvme_get_nsid(fd, &nsid); + if (rc != 0) { + LOG(ERROR) << "Failed to get nsid of namespace " + << ns_dirents[i]->d_name << ", errno=" << errno; + close(fd); + continue; + } + + if (namespaces.contains(nsid) && namespaces[nsid].fd >= 0) { + // The namespace has already been opened. + close(fd); + continue; + } + + LOG(INFO) << "Added namespace " << nsid << " of controller " + << nvme_ctrl_get_name(ctrl); + namespaces[nsid] = {nsid, fd}; + } + + // Free dirents. + for (int i = 0; i < num_ns_dirents; i++) { + free(ns_dirents[i]); + } + free(ns_dirents); + + // Check if all active namespaces are open. + if (namespaces.size() == active_ns.size()) { + break; + } + + if (std::chrono::steady_clock::now() >= rescan_timeout) { + LOG(ERROR) << "Timed out waiting for namespaces of " << subnqn + << " to be attached, expected " << active_ns.size() + << ", attached " << namespaces.size(); + break; + } + + // Wait a moment for namespaces to be attached. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } +} + +int NVMeoFController::disconnect() { + { + RWSpinlock::WriteGuard guard(ns_lock); + namespaces.clear(); + } + + if (ctrl != nullptr) { + if (should_disconnect_ctrl) { + should_disconnect_ctrl = false; + nvme_disconnect_ctrl(ctrl); + } + nvme_free_ctrl(ctrl); + ctrl = nullptr; + } + + return 0; +} + +std::unique_ptr NVMeoFController::createQueue(size_t queueDepth) { + auto queue = std::unique_ptr( + new NVMeoFQueue(shared_from_this(), queueDepth)); + int rc = queue->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to create queue, rc=" << rc; + return nullptr; + } + + return queue; +} + +int NVMeoFController::getNsFd(NamespaceID nsid) { + RWSpinlock::ReadGuard guard(ns_lock); + auto it = namespaces.find(nsid); + if (it == namespaces.end()) { + return -1; + } + return it->second.fd; +} + +NVMeoFQueue::NVMeoFQueue(std::shared_ptr ctrlr, + size_t queueDepth) + : ctrlr(ctrlr), depth(queueDepth), io_ctx(nullptr), events(depth) {} + +NVMeoFQueue::~NVMeoFQueue() { + if (io_ctx != nullptr) { + io_destroy(this->io_ctx); + } +} + +int NVMeoFQueue::setup() { + int rc = io_setup(this->depth, &this->io_ctx); + if (rc != 0) { + LOG(ERROR) << "Failed to setup aio context, rc=" << rc; + return rc; + } + return 0; +} + +int NVMeoFQueue::submitRequest(Slice *slice) { + int fd = ctrlr->getNsFd(slice->file_id); + if (fd < 0) { + LOG(ERROR) << "No namespace " << slice->file_id + << " in nvme controller"; + return -ENOENT; + } + + struct iocb *iocb = &slice->nvmeof_generic.iocb; + if (slice->opcode == Transport::TransferRequest::READ) { + io_prep_pread(iocb, fd, slice->source_addr, slice->length, + slice->nvmeof_generic.offset); + } else { + io_prep_pwrite(iocb, fd, slice->source_addr, slice->length, + slice->nvmeof_generic.offset); + } + iocb->data = slice; + + int rc = io_submit(this->io_ctx, 1, &iocb); + return rc > 0 ? 
0 : rc; +} + +void NVMeoFQueue::reapCompletions() { + struct timespec timeout = { + .tv_sec = 0, + .tv_nsec = 0, + }; + Slice *slice = nullptr; + + int rc = io_getevents(this->io_ctx, 0, this->depth, this->events.data(), + &timeout); + if (rc < 0) { + LOG(ERROR) << "Failed to poll aio events, rc = " << rc; + return; + } + + for (int i = 0; i < rc; i++) { + slice = (Slice *)(events[i].data); + if (events[i].res == slice->length) { + slice->markSuccess(); + } else { + slice->markFailed(); + } + } +} +}; // namespace mooncake \ No newline at end of file diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp new file mode 100644 index 000000000..d41c98fc9 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_target.cpp @@ -0,0 +1,333 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "transport/nvmeof_generic_transport/nvmeof_target.h" + +#include +#include +#include +#include + +namespace mooncake { +namespace nvmeof_target { +static const std::filesystem::path kNVMeTConfigPath = + "/sys/kernel/config/nvmet"; + +static inline std::filesystem::path makePortPath(uint32_t portid) { + return kNVMeTConfigPath / "ports" / std::to_string(portid); +} + +static inline std::filesystem::path makeSubsysPath(const std::string &subnqn) { + return kNVMeTConfigPath / "subsystems" / subnqn; +} + +static inline std::filesystem::path makeSubsysLinkPath( + const std::string &subnqn, uint32_t portid) { + return makePortPath(portid) / "subsystems" / subnqn; +} + +static inline std::filesystem::path makeNamespacePath(const std::string &subnqn, + NamespaceID nsid) { + return makeSubsysPath(subnqn) / "namespaces" / std::to_string(nsid); +} + +static inline bool pathExists(const std::filesystem::path &path) { + return access(path.c_str(), F_OK) == 0; +} + +static inline int mkDir(const std::filesystem::path &path) { + int rc = mkdir(path.c_str(), 0644); + if (rc != 0) { + LOG(ERROR) << "Failed to make directory " << path + << ", errno=" << errno; + return -errno; + } + return 0; +} + +static inline int rmDir(const std::filesystem::path &path) { + int rc = rmdir(path.c_str()); + if (rc != 0) { + LOG(ERROR) << "Failed to remove directory " << path + << ", errno=" << errno; + return -errno; + } + return 0; +} + +static inline int setAttr(const std::filesystem::path &path, + const std::string &value) { + int fd = open(path.c_str(), O_RDWR); + if (fd < 0) { + LOG(ERROR) << "Failed to open file " << path << ", errno=" << errno; + return -errno; + } + + int rc = write(fd, value.c_str(), value.size()); + if (rc < 0) { + LOG(ERROR) << "Failed to write \"" << value << "\" to file " << path + << ", errno=" << errno; + rc = -errno; + } + + close(fd); + + return rc >= 0 ? 
0 : rc; +} + +static inline int symLink(const std::filesystem::path &dest, + const std::filesystem::path &name) { + int rc = symlink(dest.c_str(), name.c_str()); + if (rc != 0) { + LOG(ERROR) << "Failed to create symlink to " << dest << " at " << name + << ", errno=" << errno; + return -errno; + } + return 0; +} + +static inline int unLink(const std::filesystem::path &name) { + int rc = unlink(name.c_str()); + if (rc != 0) { + LOG(ERROR) << "Failed to unlink " << name << ", errno=" << errno; + return -errno; + } + return 0; +} + +std::atomic NVMeoFListener::next_id = 1; + +NVMeoFNamespace::NVMeoFNamespace(const std::string &subnqn, NamespaceID nsid, + const std::string &file) + : subnqn(subnqn), nsid(nsid), file(file) {} + +NVMeoFNamespace::~NVMeoFNamespace() { + auto path = makeNamespacePath(subnqn, nsid); + if (pathExists(path)) { + if (pathExists(path / "enable")) { + setAttr(path / "enable", "0"); + } + rmDir(path); + } +} + +int NVMeoFNamespace::setup() { + auto path = makeNamespacePath(subnqn, nsid); + + int rc = mkDir(path); + if (rc != 0) { + LOG(ERROR) << "Failed to create ns " << std::to_string(nsid) + << " of subsys " << subnqn; + return rc; + } + + rc = setAttr(path / "device_path", file); + if (rc != 0) { + LOG(ERROR) << "Failed to set device_path for ns " + << std::to_string(nsid) << " of subsys " << subnqn; + return rc; + } + + rc = setAttr(path / "enable", "1"); + if (rc != 0) { + LOG(ERROR) << "Failed to enable ns " << std::to_string(nsid) + << " of subsys " << subnqn; + return rc; + } + + return 0; +} + +NVMeoFSubsystem::NVMeoFSubsystem(const std::string &subnqn) : subnqn(subnqn) {} + +NVMeoFSubsystem::~NVMeoFSubsystem() { + // Remove namespaces before removing the subsystem. 
+ namespaces.clear(); + + auto path = makeSubsysPath(subnqn); + if (pathExists(path)) { + rmDir(path); + } +} + +int NVMeoFSubsystem::setup() { + auto path = makeSubsysPath(subnqn); + + int rc = mkDir(path); + if (rc != 0) { + LOG(ERROR) << "Failed to create subsystem " << subnqn; + return rc; + } + + rc = setAttr(path / "attr_allow_any_host", "1"); + if (rc != 0) { + LOG(ERROR) << "Failed to set allow_any_host for subsystem " << subnqn; + return rc; + } + + return 0; +} + +int NVMeoFSubsystem::addNamespace(NamespaceID nsid, const std::string &file) { + for (auto &it : namespaces) { + if (it.first == nsid || it.second->file == file) { + LOG(ERROR) << "Duplicated namespace " << nsid << ", file=" << file; + return -EEXIST; + } + } + + auto ns = std::make_unique(subnqn, nsid, file); + int rc = ns->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to add namespace " << nsid << " to subsystem " + << subnqn << ", rc=" << rc; + return rc; + } + + namespaces[nsid] = std::move(ns); + return 0; +} + +int NVMeoFSubsystem::removeNamespace(NamespaceID nsid) { + namespaces.erase(nsid); + return 0; +} + +NVMeoFListener::NVMeoFListener(const std::string &trtype, + const std::string &adrfam, + const std::string &traddr, + const std::string &trsvcid) + : trtype(trtype), + adrfam(adrfam), + traddr(traddr), + trsvcid(trsvcid), + id(next_id++) {} + +NVMeoFListener::~NVMeoFListener() { + for (auto &subsys : subsystems) { + auto path = makeSubsysLinkPath(subsys->subnqn, id); + if (pathExists(path)) { + unLink(path); + } + } + subsystems.clear(); + + auto path = makePortPath(id); + if (pathExists(path)) { + rmDir(path); + } +} + +int NVMeoFListener::setup() { + auto path = makePortPath(id); + + int rc = mkDir(path); + if (rc != 0) { + LOG(ERROR) << "Failed to create port " << std::to_string(id); + return rc; + } + + rc = setAttr(path / "addr_trtype", trtype); + if (rc != 0) { + LOG(ERROR) << "Failed to set trtype " << trtype << " for port " + << std::to_string(id); + return rc; + } + + rc 
= setAttr(path / "addr_adrfam", adrfam); + if (rc != 0) { + LOG(ERROR) << "Failed to set adrfam " << adrfam << " for port " + << std::to_string(id); + return rc; + } + + rc = setAttr(path / "addr_traddr", traddr); + if (rc != 0) { + LOG(ERROR) << "Failed to set traddr " << traddr << " for port " + << std::to_string(id); + return rc; + } + + rc = setAttr(path / "addr_trsvcid", trsvcid); + if (rc != 0) { + LOG(ERROR) << "Failed to set trsvcid " << trsvcid << " for port " + << std::to_string(id); + return rc; + } + + return 0; +} + +int NVMeoFListener::addSubsystem(std::shared_ptr subsys) { + auto dest = makeSubsysPath(subsys->subnqn); + auto name = makeSubsysLinkPath(subsys->subnqn, id); + return symLink(dest, name); +} + +int NVMeoFListener::removeSubsystem(std::shared_ptr subsys) { + auto name = makeSubsysLinkPath(subsys->subnqn, id); + return unLink(name); +} +} // namespace nvmeof_target + +NVMeoFTarget::NVMeoFTarget(const std::string &hostname) + : hostname(hostname), listener(nullptr), subsystem(nullptr) {} + +NVMeoFTarget::~NVMeoFTarget() { + if (listener != nullptr && subsystem != nullptr) { + listener->removeSubsystem(subsystem); + } +} + +int NVMeoFTarget::setup(const std::string &trtype, const std::string &adrfam, + const std::string &traddr, const std::string &trsvcid) { + listener = std::make_unique(trtype, adrfam, + traddr, trsvcid); + int rc = listener->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup nvmeof target listener, trtype=" + << trtype << " adrfam=" << adrfam << " traddr=" << traddr + << " trsvcid=" << trsvcid << ", rc=" << rc; + return rc; + } + + auto subnqn = "nqn.2016-06.io.mc:" + hostname; + subsystem = std::make_shared(subnqn); + rc = subsystem->setup(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup nvmeof subsystem, subnqn=" << subnqn + << ", rc=" << rc; + return rc; + } + + rc = listener->addSubsystem(subsystem); + if (rc != 0) { + LOG(ERROR) << "Failed to add subsystem " << subsystem->subnqn + << " to listener, rc=" << 
rc; + return rc; + } + + return 0; +} + +int NVMeoFTarget::addFile(FileBufferID file_id, const std::string &file) { + std::lock_guard guard(this->mutex); + return subsystem->addNamespace(file_id, file); +} + +int NVMeoFTarget::removeFile(FileBufferID file_id) { + std::lock_guard guard(this->mutex); + return subsystem->removeNamespace(file_id); +} +} // namespace mooncake \ No newline at end of file diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp new file mode 100644 index 000000000..ea04144f5 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/nvmeof_transport.cpp @@ -0,0 +1,393 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "transport/nvmeof_generic_transport/nvmeof_transport.h" + +#include + +#include "common.h" +#include "config.h" +#include "transfer_engine.h" +#include "transfer_metadata.h" +#include "transport/transport.h" + +namespace mooncake { +NVMeoFGenericTransport::NVMeoFGenericTransport() + : initiator(nullptr), worker_pool(nullptr), target(nullptr) {} + +NVMeoFGenericTransport::~NVMeoFGenericTransport() { + for (auto &it : segment_to_controller_) { + worker_pool->removeController(it.second.get()); + initiator->detachController(it.second); + } + + segment_to_controller_.clear(); + worker_pool.reset(); + initiator.reset(); + target.reset(); +} + +BatchID NVMeoFGenericTransport::allocateBatchID(size_t batch_size) { + auto batch_id = Transport::allocateBatchID(batch_size); + return batch_id; +} + +Status NVMeoFGenericTransport::freeBatchID(BatchID batch_id) { + Status rc = Transport::freeBatchID(batch_id); + return rc; +} + +Status NVMeoFGenericTransport::getTransferStatus(BatchID batch_id, + size_t task_id, + TransferStatus &status) { + auto &batch_desc = *((BatchDesc *)(batch_id)); + const size_t task_count = batch_desc.task_list.size(); + if (task_id >= task_count) { + return Status::InvalidArgument("Task ID " + std::to_string(task_id) + + " out of range " + + std::to_string(task_count)); + } + + auto &task = batch_desc.task_list[task_id]; + status.transferred_bytes = task.transferred_bytes; + + auto success_slice_count = task.success_slice_count; + auto failed_slice_count = task.failed_slice_count; + if (success_slice_count + failed_slice_count < task.slice_count) { + status.s = Transport::TransferStatusEnum::WAITING; + } else { + task.is_finished = true; + if (failed_slice_count) { + status.s = Transport::TransferStatusEnum::FAILED; + } else { + status.s = Transport::TransferStatusEnum::COMPLETED; + } + } + + return Status::OK(); +} + +Status NVMeoFGenericTransport::submitTransferTask( + const std::vector &task_list) { + std::unordered_map> slice_to_submit; + 
+ for (auto task : task_list) { + auto request = task->request; + Slice *slice = getSliceCache().allocate(); + slice->task = task; + slice->source_addr = request->source; + slice->length = request->length; + slice->opcode = request->opcode; + slice->target_id = request->target_id; + slice->file_id = request->file_id; + slice->nvmeof_generic.offset = request->target_offset; + slice->status = Slice::PENDING; + slice->ts = 0; + + task->slice_list.push_back(slice); + task->total_bytes += request->length; + __sync_fetch_and_add(&task->slice_count, 1); + + slice_to_submit[request->target_id].push_back(slice); + } + + for (auto it : slice_to_submit) { + auto ctrlr = getOrCreateController(it.first); + if (ctrlr != nullptr) { + for (auto &slice : it.second) { + worker_pool->submitTask(ctrlr.get(), slice); + } + } else { + for (auto slice : it.second) { + slice->markFailed(); + } + } + it.second.clear(); + } + + return Status::OK(); +} + +Status NVMeoFGenericTransport::submitTransfer( + BatchID batch_id, const std::vector &entries) { + auto &batch_desc = *((BatchDesc *)(batch_id)); + if (batch_desc.task_list.size() + entries.size() > batch_desc.batch_size) { + LOG(ERROR) << "NVMeoFGenericTransport: Exceed the limitation of " + "current batch's " + "capacity"; + return Status::InvalidArgument( + "NVMeoFGenericTransport: Exceed the limitation of capacity, batch " + "id: " + + std::to_string(batch_id)); + } + + size_t task_id = batch_desc.task_list.size(); + batch_desc.task_list.resize(task_id + entries.size()); + + std::vector task_list; + for (auto &request : entries) { + auto &task = batch_desc.task_list[task_id++]; + task.batch_id = batch_id; + task.request = &request; + task_list.push_back(&task); + } + + return this->submitTransferTask(task_list); +} + +int NVMeoFGenericTransport::installWithArgs( + std::string &local_server_name, std::shared_ptr meta, + void **args) { + int rc = Transport::installWithArgs(local_server_name, meta, args); + if (rc != 0) { + LOG(ERROR) << 
"Transport::install failed, rc=" << rc; + return rc; + } + + if (args != nullptr && args[0] != nullptr) { + std::string trStr = static_cast(args[0]); + rc = parseTrid(trStr); + if (rc != 0) { + LOG(ERROR) << "Failed to parse nvmeof trid \"" << trStr + << "\", rc=" << rc; + return rc; + } + } + + return 0; +} + +int NVMeoFGenericTransport::setupLocalSegment() { + if (this->target != nullptr) { + return 0; + } + + if (!validateTrid(local_trid)) { + LOG(ERROR) << "NVMeoF trid not specified"; + return ERR_INVALID_ARGUMENT; + } + + this->target = std::make_unique(local_server_name_); + int rc = this->target->setup(local_trid.trtype, local_trid.adrfam, + local_trid.traddr, local_trid.trsvcid); + if (rc != 0) { + LOG(ERROR) << "Failed to create nvmeof target, rc=" << rc; + return ERR_INVALID_ARGUMENT; + } + + auto desc = std::make_shared(); + if (!desc) { + LOG(ERROR) << "Failed to create local segment"; + this->target.reset(); + return ERR_MEMORY; + } + + desc->name = local_server_name_; + desc->protocol = "nvmeof_generic"; + desc->nvmeof_generic_trid = local_trid; + desc->nvmeof_generic_trid.subnqn = this->target->getSubNQN(); + + metadata_->addLocalSegment(LOCAL_SEGMENT_ID, local_server_name_, + std::move(desc)); + return 0; +} + +int NVMeoFGenericTransport::registerLocalMemory(void *addr, size_t length, + const std::string &location, + bool remote_accessible, + bool update_metadata) { + return 0; +} + +int NVMeoFGenericTransport::unregisterLocalMemory(void *addr, + bool update_metadata) { + return 0; +} + +int NVMeoFGenericTransport::registerLocalFile(FileBufferID id, + const std::string &path, + size_t size) { + int rc = setupLocalSegment(); + if (rc != 0) { + LOG(ERROR) << "Failed to allocate local segment, rc=" << rc; + return ERR_MEMORY; + } + + rc = this->target->addFile(id, path); + if (rc != 0) { + LOG(ERROR) << "Failed to add file " << path << ", rc=" << rc; + return rc; + } + + FileBufferDesc buffer_desc; + buffer_desc.id = id; + buffer_desc.path = path; + 
buffer_desc.size = size; + /// TODO: Set align according to file type. + buffer_desc.align = 0; + + rc = this->metadata_->addFileBuffer(buffer_desc, true); + if (rc != 0) { + LOG(ERROR) << "Failed to add file buffer " << path << ", rc=" << rc; + this->target->removeFile(id); + return rc; + } + + return 0; +} + +int NVMeoFGenericTransport::unregisterLocalFile(FileBufferID id) { + if (this->target == nullptr) { + LOG(ERROR) << "NVMeoFGenericTransport::target has not been initialized"; + return ERR_ADDRESS_NOT_REGISTERED; + } + + int rc = this->metadata_->removeFileBuffer(id, true); + if (rc != 0) { + LOG(ERROR) << "Failed to remove file buffer " << id << ", rc=" << rc; + return rc; + } + + this->target->removeFile(id); + return 0; +} + +int NVMeoFGenericTransport::parseTrid(const std::string &trStr) { + std::istringstream stream(trStr); + std::string option; + + while (stream >> option) { + auto sep = option.find('='); + if (sep == option.npos) { + sep = option.find(':'); + if (sep == option.npos) { + LOG(ERROR) << "No separator '=' or ':' found in trid string \"" + << trStr << "\""; + return ERR_INVALID_ARGUMENT; + } + } + + auto key = option.substr(0, sep); + auto value = option.substr(sep + 1); + if (key.empty() || value.empty()) { + LOG(ERROR) << "Invalid trid option: key=" << key + << " value=" << value; + return ERR_INVALID_ARGUMENT; + } + + if (key == "trtype") { + local_trid.trtype = value; + } else if (key == "adrfam") { + local_trid.adrfam = value; + } else if (key == "traddr") { + local_trid.traddr = value; + } else if (key == "trsvcid") { + local_trid.trsvcid = value; + } else { + LOG(ERROR) << "Invalid trid string operation: key=" << key + << ", value=" << value; + return ERR_INVALID_ARGUMENT; + } + } + + if (!validateTrid(local_trid)) { + LOG(ERROR) << "Invalid trid: trtype=" << local_trid.trtype + << ", adrfam=" << local_trid.adrfam + << ", traddr=" << local_trid.traddr + << ", trsvcid=" << local_trid.trsvcid; + return ERR_INVALID_ARGUMENT; + } + + 
return 0; +} + +bool NVMeoFGenericTransport::validateTrid(const NVMeoFTrid &local_trid) { + return !(local_trid.trtype.empty() || local_trid.adrfam.empty() || + local_trid.traddr.empty() || local_trid.trsvcid.empty()); +} + +int NVMeoFGenericTransport::setupInitiator() { + if (this->initiator == nullptr) { + this->initiator = + NVMeoFInitiator::create(globalConfig().nvmeof_generic_direct_io); + if (this->initiator == nullptr) { + LOG(ERROR) << "Failed to create nvmeof initiator"; + return ERR_MEMORY; + } + } + + if (this->worker_pool == nullptr) { + this->worker_pool = std::make_unique( + globalConfig().nvmeof_generic_num_workers); + if (this->worker_pool == nullptr) { + LOG(ERROR) << "Failed to create nvmeof worker pool"; + return ERR_MEMORY; + } + } + + return 0; +} + +std::shared_ptr NVMeoFGenericTransport::getOrCreateController( + SegmentHandle handle) { + { + RWSpinlock::ReadGuard guard(controller_lock_); + auto it = segment_to_controller_.find(handle); + if (it != segment_to_controller_.end()) { + return it->second; + } + } + + auto desc = metadata_->getSegmentDescByID(handle); + if (desc == nullptr || desc->protocol != "nvmeof_generic" || + desc->file_buffers.size() <= 0) { + LOG(ERROR) << "Invalid segment " << desc; + return nullptr; + } + + RWSpinlock::WriteGuard guard(controller_lock_); + auto it = segment_to_controller_.find(handle); + if (it != segment_to_controller_.end()) { + // Someone else attached the controller. 
+ return it->second; + } + + int rc = setupInitiator(); + if (rc != 0) { + LOG(ERROR) << "Failed to setup initiator, rc=" << rc; + return nullptr; + } + + auto &trid = desc->nvmeof_generic_trid; + auto controller = initiator->attachController( + trid.trtype, trid.adrfam, trid.traddr, trid.trsvcid, trid.subnqn); + if (controller == nullptr) { + LOG(ERROR) << "Failed to attach controller trtype=" << trid.trtype + << " adrfam=" << trid.adrfam << " traddr=" << trid.traddr + << " trsvcid=" << trid.trsvcid << " subnqn=" << trid.subnqn; + return nullptr; + } + + rc = this->worker_pool->addController(controller); + if (rc != 0) { + LOG(ERROR) << "Failed to add controller to worker pool, rc=" << rc; + initiator->detachController(controller); + return nullptr; + } + + segment_to_controller_[handle] = controller; + return controller; +} + +} // namespace mooncake diff --git a/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp new file mode 100644 index 000000000..47ed19f98 --- /dev/null +++ b/mooncake-transfer-engine/src/transport/nvmeof_generic_transport/worker_pool.cpp @@ -0,0 +1,226 @@ +// Copyright 2025 Alibaba Cloud and its affiliates +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "transport/nvmeof_generic_transport/worker_pool.h" + +#include + +#include + +#define WORKER_QUEUE_DEPTH 256 +#define WORKER_MAX_NUM_TASKS 4096 + +namespace mooncake { +NVMeoFWorker::NVMeoFWorker(size_t id) + : id(id), stopping(false), clock(0), tasks(nullptr), curr_task(nullptr) { + this->tasks = new NVMeoFWorkerTask[WORKER_MAX_NUM_TASKS]; + for (size_t i = 0; i < WORKER_MAX_NUM_TASKS; i++) { + this->free_tasks.push(&this->tasks[i]); + } + + this->thread = std::thread(std::bind(&NVMeoFWorker::poll, this)); +} + +NVMeoFWorker::~NVMeoFWorker() { + this->stopping = true; + if (this->thread.joinable()) { + this->thread.join(); + } + + delete[] this->tasks; +} + +void NVMeoFWorker::addController(std::shared_ptr ctrlr) { + auto it = this->queues.find(ctrlr.get()); + if (it != this->queues.end()) { + LOG(WARNING) << "Controller exists: " << ctrlr.get(); + return; + } + + auto queue = ctrlr->createQueue(WORKER_QUEUE_DEPTH); + if (queue == nullptr) { + LOG(ERROR) << "Failed to create nvmeof queue"; + return; + } + + this->queues[ctrlr.get()] = std::move(queue); +} + +void NVMeoFWorker::removeController(NVMeoFController *ctrlr) { + auto it = this->queues.find(ctrlr); + if (it == this->queues.end()) { + return; + } + + this->queues.erase(it); +} + +void NVMeoFWorker::sendMsg(const std::function &func) { + this->msg_queue.push(&func); +} + +int NVMeoFWorker::submitTask(NVMeoFController *ctrlr, Slice *slice) { + NVMeoFWorkerTask *task; + + if (!this->free_tasks.pop(task)) { + return -ENOMEM; + } + + task->ctrlr = ctrlr; + task->slice = slice; + task->timestamp = this->clock.load(); + this->task_queue.push(task); + + return 0; +} + +void NVMeoFWorker::dispatchTasks() { + uint64_t prev = this->clock.fetch_add(1); + + if (this->curr_task == nullptr && !this->task_queue.pop(this->curr_task)) { + return; + } + + do { + auto task = std::exchange(this->curr_task, nullptr); + + auto queue = this->queues.find(task->ctrlr); + if (queue == this->queues.end()) { + 
task->slice->markFailed(); + this->free_tasks.push(task); + } else { + int rc = queue->second->submitRequest(task->slice); + if (rc == 0) { + this->free_tasks.push(task); + } else if (rc == -EAGAIN || rc == -EWOULDBLOCK) { + task->timestamp = this->clock.load(); + this->task_queue.push(task); + } else { + LOG(ERROR) << "Failed to submit request, rc = " << rc; + task->slice->markFailed(); + this->free_tasks.push(task); + } + } + } while (this->task_queue.pop(this->curr_task) && + this->curr_task->timestamp == prev); +} + +void NVMeoFWorker::poll() { + // Allow thread to be scheduled to different CPU cores. + cpu_set_t cpuset; + memset(&cpuset, -1, sizeof(cpuset)); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + std::function *func; + while (!this->stopping) { + while (this->msg_queue.pop(func)) { + (*func)(this); + } + + this->dispatchTasks(); + + // Take a break before reaping completions. + std::this_thread::yield(); + + for (auto &it : this->queues) { + it.second->reapCompletions(); + } + } + + while (this->msg_queue.pop(func)) { + (*func)(this); + } + + if (this->curr_task != nullptr) { + this->curr_task->slice->markFailed(); + this->free_tasks.push(this->curr_task); + this->curr_task = nullptr; + } + + while (this->task_queue.pop(this->curr_task)) { + this->curr_task->slice->markFailed(); + this->free_tasks.push(this->curr_task); + this->curr_task = nullptr; + } +} + +NVMeoFWorkerPool::NVMeoFWorkerPool(size_t num_workers) + : num_workers(num_workers), next_worker(0) { + for (size_t i = 0; i < num_workers; i++) { + auto worker = std::make_unique(i); + this->workers.push_back(std::move(worker)); + } +} + +NVMeoFWorkerPool::~NVMeoFWorkerPool() { this->workers.clear(); } + +int NVMeoFWorkerPool::addController(std::shared_ptr ctrlr) { + std::latch latch(this->num_workers); + auto msg_fn = [&ctrlr, &latch](NVMeoFWorker *worker) { + worker->addController(ctrlr); + latch.count_down(); + }; + + for (size_t i = 0; i < this->num_workers; i++) { + 
auto worker = this->workers[i].get(); + worker->sendMsg(msg_fn); + } + + latch.wait(); + return 0; +} + +int NVMeoFWorkerPool::removeController(NVMeoFController *ctrlr) { + std::latch latch(this->num_workers); + auto msg_fn = [&ctrlr, &latch](NVMeoFWorker *worker) { + worker->removeController(ctrlr); + latch.count_down(); + }; + + for (size_t i = 0; i < this->num_workers; i++) { + auto worker = this->workers[i].get(); + worker->sendMsg(msg_fn); + } + + latch.wait(); + return 0; +} + +int NVMeoFWorkerPool::submitTask(NVMeoFController *ctrlr, Slice *slice) { + uint32_t worker_idx; + int rc; + + /// Randomly pick a worker. + worker_idx = std::rand() % this->num_workers; + rc = this->workers[worker_idx]->submitTask(ctrlr, slice); + if (rc == 0) { + return 0; + } + + /// Try all workers. + uint32_t failed_worker_idx = worker_idx; + worker_idx = (worker_idx + 1) % this->num_workers; + while (rc != 0 && worker_idx != failed_worker_idx) { + rc = this->workers[worker_idx]->submitTask(ctrlr, slice); + worker_idx = (worker_idx + 1) % this->num_workers; + } + + if (rc != 0) { + LOG(ERROR) << "Failed to submit transfer task"; + } + + return rc; +} + +} // namespace mooncake diff --git a/mooncake-transfer-engine/src/transport/transport.cpp b/mooncake-transfer-engine/src/transport/transport.cpp index 00e8bf96f..c4d139a30 100644 --- a/mooncake-transfer-engine/src/transport/transport.cpp +++ b/mooncake-transfer-engine/src/transport/transport.cpp @@ -64,4 +64,12 @@ int Transport::install(std::string &local_server_name, metadata_ = meta; return 0; } + +int Transport::installWithArgs(std::string &local_server_name, + std::shared_ptr meta, + void **args) { + local_server_name_ = local_server_name; + metadata_ = meta; + return 0; +} } // namespace mooncake \ No newline at end of file