Skip to content

Commit 27b7cca

Browse files
author
Ziye Yang
committed
blk/spdk: Add support for using an NVMe device provided by an NVMe-oF target
This patch adds support for using an NVMe device provided by an NVMe-oF target. Signed-off-by: Ziye Yang <[email protected]>
1 parent c5684e4 commit 27b7cca

File tree

4 files changed

+69
-32
lines changed

4 files changed

+69
-32
lines changed

PendingReleaseNotes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@
33
* RGW's default backend for `rgw_enable_ops_log` changed from RADOS to file.
44
The default value of `rgw_ops_log_rados` is now false, and `rgw_ops_log_file_path`
55
defaults to "/var/log/ceph/ops-log-$cluster-$name.log".
6+
* The SPDK backend for BlueStore is now able to connect to an NVMeoF target.
7+
Please note that this is not an officially supported feature.

doc/rados/configuration/bluestore-config-ref.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,14 +357,19 @@ The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF`
357357

358358
and then set::
359359

360-
bluestore_block_path = spdk:0000:01:00.0
360+
bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0"
361361

362362
Where ``0000:01:00.0`` is the device selector found in the output of the ``lspci``
363363
command above.
364364

365+
You may also specify a remote NVMeoF target over the TCP transport as in the
366+
following example::
367+
368+
bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
369+
365370
To run multiple SPDK instances per node, you must specify the
366371
amount of DPDK memory in MB that each instance will use, to make sure each
367-
instance uses its own dpdk memory
372+
instance uses its own DPDK memory.
368373

369374
In most cases, a single device can be used for data, DB, and WAL. We describe
370375
this strategy as *colocating* these components. Be sure to enter the below

src/blk/spdk/NVMEDevice.cc

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ static constexpr uint32_t data_buffer_size = 8192;
5555

5656
static constexpr uint16_t inline_segment_num = 32;
5757

58+
/* Default to 10 seconds for the keep alive value. This value is arbitrary. */
59+
static constexpr uint32_t nvme_ctrlr_keep_alive_timeout_in_ms = 10000;
60+
5861
static void io_complete(void *t, const struct spdk_nvme_cpl *completion);
5962

6063
struct IORequest {
@@ -78,6 +81,7 @@ class SharedDriverData {
7881
spdk_nvme_ns *ns;
7982
uint32_t block_size = 0;
8083
uint64_t size = 0;
84+
std::thread admin_thread;
8185

8286
public:
8387
std::vector<NVMEDevice*> registered_devices;
@@ -90,12 +94,30 @@ class SharedDriverData {
9094
ns(ns_) {
9195
block_size = spdk_nvme_ns_get_extended_sector_size(ns);
9296
size = spdk_nvme_ns_get_size(ns);
97+
if (trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
98+
return;
99+
}
100+
101+
// For Non-PCIe transport, we need to send keep-alive periodically.
102+
admin_thread = std::thread(
103+
[this]() {
104+
int rc;
105+
while (true) {
106+
rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
107+
ceph_assert(rc >= 0);
108+
sleep(1);
109+
}
110+
}
111+
);
93112
}
94113

95114
bool is_equal(const spdk_nvme_transport_id& trid2) const {
96115
return spdk_nvme_transport_id_compare(&trid, &trid2) == 0;
97116
}
98117
~SharedDriverData() {
118+
if (admin_thread.joinable()) {
119+
admin_thread.join();
120+
}
99121
}
100122

101123
void register_device(NVMEDevice *device) {
@@ -146,7 +168,7 @@ class SharedDriverQueueData {
146168
struct spdk_nvme_io_qpair_opts opts = {};
147169
spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
148170
opts.qprio = SPDK_NVME_QPRIO_URGENT;
149-
// usable queue depth should minus 1 to aovid overflow.
171+
// the usable queue depth is one less than the queue size, to avoid overflow.
150172
max_queue_depth = opts.io_queue_size - 1;
151173
qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
152174
ceph_assert(qpair != NULL);
@@ -478,23 +500,31 @@ static NVMEManager manager;
478500
static bool probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr_opts *opts)
479501
{
480502
NVMEManager::ProbeContext *ctx = static_cast<NVMEManager::ProbeContext*>(cb_ctx);
503+
bool do_attach = false;
481504

482-
if (trid->trtype != SPDK_NVME_TRANSPORT_PCIE) {
483-
dout(0) << __func__ << " only probe local nvme device" << dendl;
484-
return false;
505+
if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
506+
do_attach = spdk_nvme_transport_id_compare(&ctx->trid, trid) == 0;
507+
if (!do_attach) {
508+
dout(0) << __func__ << " device traddr (" << ctx->trid.traddr
509+
<< ") not match " << trid->traddr << dendl;
510+
}
511+
} else {
512+
// for non-pcie devices, should always match the specified trid
513+
assert(!spdk_nvme_transport_id_compare(&ctx->trid, trid));
514+
do_attach = true;
485515
}
486516

487-
dout(0) << __func__ << " found device at: "
488-
<< "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", "
489-
<< "traddr=" << trid->traddr << dendl;
490-
if (spdk_nvme_transport_id_compare(&ctx->trid, trid)) {
491-
dout(0) << __func__ << " device traddr (" << ctx->trid.traddr << ") not match " << trid->traddr << dendl;
492-
return false;
493-
}
517+
if (do_attach) {
518+
dout(0) << __func__ << " found device at: "
519+
<< "trtype=" << spdk_nvme_transport_id_trtype_str(trid->trtype) << ", "
520+
<< "traddr=" << trid->traddr << dendl;
494521

495-
opts->io_queue_size = UINT16_MAX;
522+
opts->io_queue_size = UINT16_MAX;
523+
opts->io_queue_requests = UINT16_MAX;
524+
opts->keep_alive_timeout_ms = nvme_ctrlr_keep_alive_timeout_in_ms;
525+
}
496526

497-
return true;
527+
return do_attach;
498528
}
499529

500530
static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
@@ -543,12 +573,6 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **
543573
}
544574
}
545575

546-
struct spdk_pci_addr pci_addr;
547-
int rc = spdk_pci_addr_parse(&pci_addr, trid.traddr);
548-
if (rc < 0) {
549-
derr << __func__ << " invalid transport address: " << trid.traddr << dendl;
550-
return -ENOENT;
551-
}
552576
auto coremask_arg = g_conf().get_val<std::string>("bluestore_spdk_coremask");
553577
int m_core_arg = find_first_bitset(coremask_arg);
554578
// at least one core is needed for using spdk
@@ -563,18 +587,24 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **
563587

564588
if (!dpdk_thread.joinable()) {
565589
dpdk_thread = std::thread(
566-
[this, coremask_arg, m_core_arg, mem_size_arg, pci_addr]() {
590+
[this, coremask_arg, m_core_arg, mem_size_arg, trid]() {
567591
struct spdk_env_opts opts;
568-
struct spdk_pci_addr addr = pci_addr;
592+
struct spdk_pci_addr addr;
569593
int r;
570594

571-
spdk_env_opts_init(&opts);
595+
bool local_pci_device = false;
596+
int rc = spdk_pci_addr_parse(&addr, trid.traddr);
597+
if (!rc) {
598+
local_pci_device = true;
599+
opts.pci_whitelist = &addr;
600+
opts.num_pci_addr = 1;
601+
}
602+
603+
spdk_env_opts_init(&opts);
572604
opts.name = "nvme-device-manager";
573605
opts.core_mask = coremask_arg.c_str();
574606
opts.master_core = m_core_arg;
575607
opts.mem_size = mem_size_arg;
576-
opts.pci_whitelist = &addr;
577-
opts.num_pci_addr = 1;
578608
spdk_env_init(&opts);
579609
spdk_unaffinitize_thread();
580610

@@ -583,7 +613,7 @@ int NVMEManager::try_get(const spdk_nvme_transport_id& trid, SharedDriverData **
583613
if (!probe_queue.empty()) {
584614
ProbeContext* ctxt = probe_queue.front();
585615
probe_queue.pop_front();
586-
r = spdk_nvme_probe(NULL, ctxt, probe_cb, attach_cb, NULL);
616+
r = spdk_nvme_probe(local_pci_device ? NULL : &trid, ctxt, probe_cb, attach_cb, NULL);
587617
if (r < 0) {
588618
ceph_assert(!ctxt->driver);
589619
derr << __func__ << " device probe nvme failed" << dendl;
@@ -714,7 +744,8 @@ int NVMEDevice::open(const string& p)
714744
return r;
715745
}
716746
if (int r = manager.try_get(trid, &driver); r < 0) {
717-
derr << __func__ << " failed to get nvme device with transport address " << trid.traddr << dendl;
747+
derr << __func__ << " failed to get nvme device with transport address "
748+
<< trid.traddr << " type " << trid.trtype << dendl;
718749
return r;
719750
}
720751

src/os/bluestore/BlueStore.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6870,12 +6870,11 @@ int BlueStore::_setup_block_symlink_or_file(
68706870
return r;
68716871
}
68726872
// write the Transport ID of the NVMe device
6873-
// a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
6873+
// a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0"
68746874
// where "0000:02:00.0" is the selector of a PCI device, see
68756875
// the first column of "lspci -mm -n -D"
6876-
string trid{"trtype:PCIe "};
6877-
trid += "traddr:";
6878-
trid += epath.substr(strlen(SPDK_PREFIX));
6876+
// a transport id for TCP looks like: "trtype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420"
6877+
string trid = epath.substr(strlen(SPDK_PREFIX));
68796878
r = ::write(fd, trid.c_str(), trid.size());
68806879
ceph_assert(r == static_cast<int>(trid.size()));
68816880
dout(1) << __func__ << " created " << name << " symlink to "

0 commit comments

Comments
 (0)