Skip to content
69 changes: 68 additions & 1 deletion src/bio/bio_device.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2020-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1067,3 +1067,70 @@ bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, u
return led_manage(xs_ctxt, pci_addr, (Ctl__LedAction)action, (Ctl__LedState *)state,
duration);
}

static void
set_power_mgmt_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
struct bio_bdev *d_bdev = cb_arg;
int sc;
int sct;
uint32_t cdw0;

spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
if (sc) {
D_ERROR("Set power management failed for device %s, NVMe status code/type: %d/%d\n",
d_bdev->bb_name, sc, sct);
} else {
D_INFO("Power management value set on device %s\n", d_bdev->bb_name);
}

spdk_bdev_free_io(bdev_io);
}

int
bio_set_power_mgmt(struct bio_bdev *d_bdev, struct spdk_io_channel *channel)
{
struct spdk_bdev *bdev;
struct spdk_nvme_cmd cmd;
int rc;

/* If default has not been overwritten, skip setting the value */
if (bio_spdk_power_mgmt_val == UINT32_MAX)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to make this a constant - BIO_SPDK_POWER_MGMT_UNINIT or something like that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will address if I have to re-push

return 0;

D_ASSERT(d_bdev != NULL);
D_ASSERT(d_bdev->bb_desc != NULL);
D_ASSERT(channel != NULL);

bdev = spdk_bdev_desc_get_bdev(d_bdev->bb_desc);
if (bdev == NULL) {
D_ERROR("No bdev associated with device descriptor\n");
return -DER_INVAL;
}

if (get_bdev_type(bdev) != BDEV_CLASS_NVME) {
D_DEBUG(DB_MGMT, "Device %s is not NVMe, skipping power management\n",
d_bdev->bb_name);
return -DER_NOTSUPPORTED;
}

if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) {
D_ERROR("Bdev NVMe admin passthru not supported for %s\n", d_bdev->bb_name);
return -DER_NOTSUPPORTED;
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'bdev' can be acquired by spdk_bdev_get_by_name(), these checking should be done before spdk_bdev_open_ext()


memset(&cmd, 0, sizeof(cmd));
cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
cmd.cdw10 = SPDK_NVME_FEAT_POWER_MANAGEMENT;
cmd.cdw11 = bio_spdk_power_mgmt_val;

rc = spdk_bdev_nvme_admin_passthru(d_bdev->bb_desc, channel, &cmd, NULL, 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This uses bdh_io_channel but doesn't hold reference count, that could cause issue when bio_dev_health being finalized before the completion. See bio_fini_health_monitoring().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've decoupled implementation from health monitoring, is it better now?

set_power_mgmt_completion, d_bdev);
if (rc) {
D_ERROR("Failed to submit power management command for %s, rc:%d\n",
d_bdev->bb_name, rc);
return daos_errno2der(-rc);
}

return 0;
}
5 changes: 4 additions & 1 deletion src/bio/bio_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2018-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -606,6 +606,7 @@ extern unsigned int bio_numa_node;
extern unsigned int bio_spdk_max_unmap_cnt;
extern unsigned int bio_max_async_sz;
extern unsigned int bio_io_timeout;
extern unsigned int bio_spdk_power_mgmt_val;

int xs_poll_completion(struct bio_xs_context *ctxt, unsigned int *inflights,
uint64_t timeout);
Expand Down Expand Up @@ -720,6 +721,8 @@ void trigger_faulty_reaction(struct bio_blobstore *bbs);
int fill_in_traddr(struct bio_dev_info *b_info, char *dev_name);
struct bio_dev_info *
alloc_dev_info(uuid_t dev_id, struct bio_bdev *d_bdev, struct smd_dev_info *s_info);
int
bio_set_power_mgmt(struct bio_bdev *d_bdev, struct spdk_io_channel *channel);

/* bio_config.c */
int
Expand Down
8 changes: 7 additions & 1 deletion src/bio/bio_monitor.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1014,6 +1014,12 @@ bio_init_health_monitoring(struct bio_blobstore *bb, char *bdev_name)
D_ASSERT(channel != NULL);
bb->bb_dev_health.bdh_io_channel = channel;

/* Set NVMe power management */
rc = bio_set_power_mgmt(bb->bb_dev, channel);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't want this being skipped when health monitor is disabled? (see bypass_health_collect() at the beginning of this function).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it okay to move this function and the channel fetch to before the bypass check?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this function uses the io_channel created by bio_init_health_monitoring().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

if (rc != 0 && rc != -DER_NOTSUPPORTED)
D_WARN("Failed to set power management for device %s: " DF_RC "\n", bdev_name,
DP_RC(rc));

/* Set the NVMe SSD PCI Vendor ID */
bio_set_vendor_id(bb, bdev_name);
/* Register DAOS metrics to export NVMe SSD health stats */
Expand Down
9 changes: 8 additions & 1 deletion src/bio/bio_xstream.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* (C) Copyright 2018-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -61,6 +61,8 @@ bool bio_spdk_inited;
bool bio_vmd_enabled;
/* SPDK subsystem fini timeout */
unsigned int bio_spdk_subsys_timeout = 25000; /* ms */
/* SPDK NVMe power management value */
unsigned int bio_spdk_power_mgmt_val = UINT32_MAX; /* usually 0-4 */
/* How many blob unmap calls can be called in a row */
unsigned int bio_spdk_max_unmap_cnt = 32;
unsigned int bio_max_async_sz = (1UL << 15) /* 32k */;
Expand Down Expand Up @@ -270,6 +272,11 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size,
d_getenv_bool("DAOS_SCM_RDMA_ENABLED", &bio_scm_rdma);
D_INFO("RDMA to SCM is %s\n", bio_scm_rdma ? "enabled" : "disabled");

d_getenv_uint("DAOS_NVME_POWER_MGMT", &bio_spdk_power_mgmt_val);
if (bio_spdk_power_mgmt_val != UINT32_MAX)
D_INFO("NVMe power management setting to be applied is %u\n",
bio_spdk_power_mgmt_val);

d_getenv_uint("DAOS_SPDK_SUBSYS_TIMEOUT", &bio_spdk_subsys_timeout);
D_INFO("SPDK subsystem fini timeout is %u ms\n", bio_spdk_subsys_timeout);

Expand Down
Loading