Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 68 additions & 1 deletion src/bio/bio_device.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2020-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1067,3 +1067,70 @@ bio_led_manage(struct bio_xs_context *xs_ctxt, char *tr_addr, uuid_t dev_uuid, u
return led_manage(xs_ctxt, pci_addr, (Ctl__LedAction)action, (Ctl__LedState *)state,
duration);
}

static void
set_power_mgmt_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
struct bio_bdev *d_bdev = cb_arg;
int sc;
int sct;
uint32_t cdw0;

spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
if (sc) {
D_ERROR("Set power management failed for device %s, NVMe status code/type: %d/%d\n",
d_bdev->bb_name, sc, sct);
} else {
D_INFO("Power management value set on device %s\n", d_bdev->bb_name);
}

spdk_bdev_free_io(bdev_io);
}

int
bio_set_power_mgmt(struct bio_bdev *d_bdev, struct spdk_io_channel *channel)
{
struct spdk_bdev *bdev;
struct spdk_nvme_cmd cmd;
int rc;

/* If default has not been overwritten, skip setting the value */
if (bio_spdk_power_mgmt_val == UINT32_MAX)
return 0;

D_ASSERT(d_bdev != NULL);
D_ASSERT(d_bdev->bb_desc != NULL);
D_ASSERT(channel != NULL);

bdev = spdk_bdev_desc_get_bdev(d_bdev->bb_desc);
if (bdev == NULL) {
D_ERROR("No bdev associated with device descriptor\n");
return -DER_INVAL;
}

if (get_bdev_type(bdev) != BDEV_CLASS_NVME) {
D_DEBUG(DB_MGMT, "Device %s is not NVMe, skipping power management\n",
d_bdev->bb_name);
return -DER_NOTSUPPORTED;
}

if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) {
D_ERROR("Bdev NVMe admin passthru not supported for %s\n", d_bdev->bb_name);
return -DER_NOTSUPPORTED;
}

memset(&cmd, 0, sizeof(cmd));
cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
cmd.cdw10 = SPDK_NVME_FEAT_POWER_MANAGEMENT;
cmd.cdw11 = bio_spdk_power_mgmt_val;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

union spdk_nvme_cmd_cdw11 has a member union spdk_nvme_feat_power_management feat_power_management that has multiple fields, not just the power state. So I think we should be using that to make sure we're setting the value correctly.

IMO we should also check the value taken from the env variable is in an acceptable range, preferably when we first ingest it. I had some trouble finding definitions for the power state values. Does SPDK define those somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I wait for @Michael-Hennecke to verify the PR. this is more of an enablement change that will unblock some testing. Once verified we can look at tidying things.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like we at least need to check the value. I was wondering about what would happen if we set the env variable to some value larger than 5 bits, potentially overflowing into the other bit field ("workload hint"). Here's the struct definition I found: https://spdk.io/doc/unionspdk__nvme__feat__power__management.html

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Michael-Hennecke thoughts on acceptable values?


rc = spdk_bdev_nvme_admin_passthru(d_bdev->bb_desc, channel, &cmd, NULL, 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This uses bdh_io_channel but doesn't hold reference count, that could cause issue when bio_dev_health being finalized before the completion. See bio_fini_health_monitoring().

set_power_mgmt_completion, d_bdev);
if (rc) {
D_ERROR("Failed to submit power management command for %s, rc:%d\n",
d_bdev->bb_name, rc);
return daos_errno2der(-rc);
}

return 0;
}
5 changes: 4 additions & 1 deletion src/bio/bio_internal.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2018-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -606,6 +606,7 @@ extern unsigned int bio_numa_node;
extern unsigned int bio_spdk_max_unmap_cnt;
extern unsigned int bio_max_async_sz;
extern unsigned int bio_io_timeout;
extern unsigned int bio_spdk_power_mgmt_val;

int xs_poll_completion(struct bio_xs_context *ctxt, unsigned int *inflights,
uint64_t timeout);
Expand Down Expand Up @@ -720,6 +721,8 @@ void trigger_faulty_reaction(struct bio_blobstore *bbs);
int fill_in_traddr(struct bio_dev_info *b_info, char *dev_name);
struct bio_dev_info *
alloc_dev_info(uuid_t dev_id, struct bio_bdev *d_bdev, struct smd_dev_info *s_info);
int
bio_set_power_mgmt(struct bio_bdev *d_bdev, struct spdk_io_channel *channel);

/* bio_config.c */
int
Expand Down
8 changes: 7 additions & 1 deletion src/bio/bio_monitor.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
* (C) Copyright 2019-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1014,6 +1014,12 @@ bio_init_health_monitoring(struct bio_blobstore *bb, char *bdev_name)
D_ASSERT(channel != NULL);
bb->bb_dev_health.bdh_io_channel = channel;

/* Set NVMe power management */
rc = bio_set_power_mgmt(bb->bb_dev, channel);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't want this being skipped when health monitor is disabled? (see bypass_health_collect() at the beginning of this function).

if (rc != 0 && rc != -DER_NOTSUPPORTED)
D_WARN("Failed to set power management for device %s: " DF_RC "\n", bdev_name,
DP_RC(rc));

/* Set the NVMe SSD PCI Vendor ID */
bio_set_vendor_id(bb, bdev_name);
/* Register DAOS metrics to export NVMe SSD health stats */
Expand Down
9 changes: 8 additions & 1 deletion src/bio/bio_xstream.c
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* (C) Copyright 2018-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -61,6 +61,8 @@ bool bio_spdk_inited;
bool bio_vmd_enabled;
/* SPDK subsystem fini timeout */
unsigned int bio_spdk_subsys_timeout = 25000; /* ms */
/* SPDK NVMe power management value */
unsigned int bio_spdk_power_mgmt_val = UINT32_MAX; /* usually 0-4 */
/* How many blob unmap calls can be called in a row */
unsigned int bio_spdk_max_unmap_cnt = 32;
unsigned int bio_max_async_sz = (1UL << 15) /* 32k */;
Expand Down Expand Up @@ -270,6 +272,11 @@ bio_nvme_init_ext(const char *nvme_conf, int numa_node, unsigned int mem_size,
d_getenv_bool("DAOS_SCM_RDMA_ENABLED", &bio_scm_rdma);
D_INFO("RDMA to SCM is %s\n", bio_scm_rdma ? "enabled" : "disabled");

d_getenv_uint("DAOS_NVME_POWER_MGMT", &bio_spdk_power_mgmt_val);
if (bio_spdk_power_mgmt_val != UINT32_MAX)
D_INFO("NVMe power management setting to be applied is %u\n",
bio_spdk_power_mgmt_val);

d_getenv_uint("DAOS_SPDK_SUBSYS_TIMEOUT", &bio_spdk_subsys_timeout);
D_INFO("SPDK subsystem fini timeout is %u ms\n", bio_spdk_subsys_timeout);

Expand Down
Loading