Skip to content

Commit 4985ab6

Browse files
Add patch for runtime PM and PMEs
1 parent ece7129 commit 4985ab6

File tree

2 files changed

+317
-0
lines changed

2 files changed

+317
-0
lines changed

kernel.spec.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch
151151
Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch
152152
Patch62: xen-pm-use-suspend.patch
153153
Patch63: xen-pciback-pm-suspend.patch
154+
Patch64: xen-pciback-pm-runtime.patch
154155

155156
%description
156157
Qubes Dom0 kernel.

xen-pciback-pm-runtime.patch

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
From 414fe8e7e2ead9fb8aa7dcf4a608ca539d757ca3 Mon Sep 17 00:00:00 2001
2+
From: Vertex X7-53 <vertex@glassway.net>
3+
Date: Sun, 17 Aug 2025 01:34:00 +0100
4+
Subject: [PATCH] xen/pciback: Improve runtime power management
5+
6+
An important part of S0ix runtime power management is the control of PCI device D-states.
7+
Without both the device and any applicable PCI bridges in D3cold, the PMC will
8+
keep power applied to the bus, and in most cases this will prevent the CPU from reaching package C-states deeper than Package C2.
9+
10+
The vast majority of devices depend on PME (Power Management Events) to
11+
wake from D3cold, so Linux will not attempt to put them into deeper
12+
sleep states if it detects the device does not support PME.
13+
PMEs can be delivered in a variety of ways, which include interrupts
14+
on the pcieport, ACPI events, and the setting of the PME status register in
15+
the PCI configuration space. Up until now, Xen has not supported the
16+
passthrough of PMEs to domains, and masks the relevant PME bits in the configuration space.
17+
18+
This first patch is a modification to the dom0 kernel, specifically pciback.
19+
We enable support for runtime PM in pciback, to allow the dom0 kernel
20+
to suspend upstream bridges. Then we allow domains to read PME capability registers.
21+
When dom0 receives a PME, it forwards this to pciback, and pciback then sets
22+
a special emulated flag on the device. This flag is cleared by the guest when it
23+
resets the register to 0, after handling the event. We also respond to requests
24+
from the guest to change the power state and place pciback in a PM state
25+
in dom0 accordingly, so that dom0 can opportunistically suspend any upstream PCIe ports.
26+
---
27+
.../xen/xen-pciback/conf_space_capability.c | 108 +++++++++++-------
28+
drivers/xen/xen-pciback/pci_stub.c | 52 +++++++++
29+
drivers/xen/xen-pciback/pciback.h | 2 +
30+
3 files changed, 122 insertions(+), 40 deletions(-)
31+
32+
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
33+
index cf568e899ee2..7d53766bf4ff 100644
34+
--- a/drivers/xen/xen-pciback/conf_space_capability.c
35+
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
36+
@@ -8,8 +8,11 @@
37+
38+
#include <linux/kernel.h>
39+
#include <linux/pci.h>
40+
+#include <linux/pm.h>
41+
+#include <linux/pm_runtime.h>
42+
#include "pciback.h"
43+
#include "conf_space.h"
44+
+#include "../../pci/pci.h"
45+
46+
static LIST_HEAD(capabilities);
47+
struct xen_pcibk_config_capability {
48+
@@ -91,39 +94,92 @@ static const struct config_field caplist_vpd[] = {
49+
{}
50+
};
51+
52+
-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
52+
+static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value,
53+
void *data)
54+
{
55+
int err;
56+
u16 real_value;
57+
58+
- err = pci_read_config_word(dev, offset, &real_value);
59+
- if (err)
60+
- goto out;
61+
+ pm_runtime_get_noresume(&dev->dev);
62+
+ pm_runtime_barrier(&dev->dev);
63+
+
64+
+ /* Read handler for PCI_PM_CTRL. Driver domains have no ability to wake devices from D3cold on their own, as they have no access to ACPI.
65+
+ * As a substitute, we fake D3hot to the guest so the register read succeeds. When the guest sends us a wakeup command,
66+
+ * we'll carry out the necessary steps to wake the device from D3cold using runtime PM functions.
67+
+ */
68+
+ err = pci_read_config_word(dev, offset, &real_value);
69+
+ if (err || PCI_POSSIBLE_ERROR(real_value)) {
70+
+ err = 0; /* Report success with a faked D3hot value. No soft reset needed by the guest, because the host side will perform one on transition out of D3cold. */
71+
+ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET;
72+
+ }
73+
+ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
74+
+ if (dev_data->pme_status) /* emulated PME flag is set: show PME status and enable bits to the guest */
75+
+ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE);
76+
+ pm_runtime_put_noidle(&dev->dev);
77+
78+
- *value = real_value & ~PCI_PM_CAP_PME_MASK;
79+
+ *value = real_value;
80+
81+
out:
82+
return err;
83+
}
85+
86+
-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
87+
- * Can't allow driver domain to enable PMEs - they're shared */
88+
-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
89+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */
90+
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_PME_ENABLE|PCI_PM_CTRL_DATA_SEL_MASK)
91+
92+
static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
93+
void *data)
94+
{
95+
int err;
96+
+ int pm_err;
97+
u16 old_value;
98+
pci_power_t new_state;
99+
+ pci_power_t current_state;
100+
+
101+
+ pm_runtime_get_noresume(&dev->dev);
102+
+ pm_runtime_barrier(&dev->dev);
103+
+
104+
+ /* PME status is RW1CS */
105+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
106+
+ if (new_value & PCI_PM_CTRL_PME_STATUS) {
107+
+ dev_data->pme_status = 0;
108+
+ }
109+
+
110+
+ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
111+
+
112+
+ /* First, use pm ops to transition state */
113+
+ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state);
114+
+ pm_runtime_put_noidle(&dev->dev);
115+
+
116+
+ bool runtime_pm = pm_runtime_enabled(&dev->dev);
117+
+ if (runtime_pm) {
118+
+ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) {
119+
+ pm_err = pm_runtime_resume(&dev->dev);
120+
+ if (pm_err < 0) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err);
121+
+ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) {
122+
+ pm_err = pm_runtime_suspend(&dev->dev);
123+
+ if (pm_err < 0) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err);
124+
+ }
125+
+ }
126+
+
127+
+ current_state = dev->current_state;
128+
+ if (current_state == PCI_D3cold)
129+
+ current_state = PCI_D3hot;
130+
+
131+
+ /* Otherwise, set it manually */
132+
+ if (!runtime_pm || current_state != new_state) {
133+
+ err = pci_set_power_state(dev, new_state);
134+
+ if (err) {
135+
+ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err);
136+
+ err = PCIBIOS_SET_FAILED;
137+
+ goto out;
138+
+ }
139+
+ }
140+
141+
+ /* This must happen here, after pm_runtime_resume is called */
142+
err = pci_read_config_word(dev, offset, &old_value);
143+
if (err)
144+
goto out;
145+
146+
- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
147+
-
148+
new_value &= PM_OK_BITS;
149+
if ((old_value & PM_OK_BITS) != new_value) {
150+
new_value = (old_value & ~PM_OK_BITS) | new_value;
151+
@@ -132,48 +188,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
152+
goto out;
153+
}
154+
155+
- /* Let pci core handle the power management change */
156+
- dev_dbg(&dev->dev, "set power state to %x\n", new_state);
157+
- err = pci_set_power_state(dev, new_state);
158+
- if (err) {
159+
- err = PCIBIOS_SET_FAILED;
160+
- goto out;
161+
- }
162+
-
163+
out:
164+
return err;
165+
}
166+
167+
-/* Ensure PMEs are disabled */
168+
-static void *pm_ctrl_init(struct pci_dev *dev, int offset)
169+
-{
170+
- int err;
171+
- u16 value;
172+
-
173+
- err = pci_read_config_word(dev, offset, &value);
174+
- if (err)
175+
- goto out;
176+
-
177+
- if (value & PCI_PM_CTRL_PME_ENABLE) {
178+
- value &= ~PCI_PM_CTRL_PME_ENABLE;
179+
- err = pci_write_config_word(dev, offset, value);
180+
- }
181+
-
182+
-out:
183+
- return err ? ERR_PTR(err) : NULL;
184+
-}
185+
-
186+
static const struct config_field caplist_pm[] = {
187+
{
188+
.offset = PCI_PM_PMC,
189+
.size = 2,
190+
- .u.w.read = pm_caps_read,
191+
+ .u.w.read = xen_pcibk_read_config_word,
192+
},
193+
{
194+
.offset = PCI_PM_CTRL,
195+
.size = 2,
196+
- .init = pm_ctrl_init,
197+
- .u.w.read = xen_pcibk_read_config_word,
198+
+ .u.w.read = pm_ctrl_read,
199+
.u.w.write = pm_ctrl_write,
200+
},
201+
{
202+
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
203+
index 073b259747e9..1f91b9670ace 100644
204+
--- a/drivers/xen/xen-pciback/pci_stub.c
205+
+++ b/drivers/xen/xen-pciback/pci_stub.c
206+
@@ -18,6 +18,8 @@
207+
#include <linux/wait.h>
208+
#include <linux/sched.h>
209+
#include <linux/atomic.h>
210+
+#include <linux/pm.h>
211+
+#include <linux/pm_runtime.h>
212+
#include <xen/events.h>
213+
#include <xen/pci.h>
214+
#include <xen/xen.h>
215+
@@ -153,6 +155,7 @@ static void pcistub_device_release(struct kref *kref)
216+
217+
kfree(dev_data);
218+
pci_set_drvdata(dev, NULL);
219+
+ pm_runtime_get_noresume(&dev->dev);
220+
221+
/* Clean-up the device */
222+
xen_pcibk_config_free_dyn_fields(dev);
223+
@@ -494,6 +497,8 @@ static int pcistub_init_device(struct pcistub_device *psdev)
224+
xen_pcibk_reset_device(dev);
225+
226+
pci_set_dev_assigned(dev);
227+
+ pm_runtime_put_noidle(&dev->dev);
228+
+
229+
return 0;
230+
231+
config_release:
232+
@@ -1042,6 +1047,16 @@ static void xen_pcibk_error_resume(struct pci_dev *dev)
233+
return;
234+
}
235+
236+
+static int xen_pcibk_prepare(struct device *dev) {
237+
+ // .prepare hook: clear the emulated PME status flag and set pme_blocked so xen_pcibk_pm_resume does not latch a new PME until xen_pcibk_resume_noirq clears the block. NOTE(review): dev_data is dereferenced without a NULL check - confirm drvdata is always set while the driver is bound.
238+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
239+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
240+
+ dev_data->pme_blocked = 1;
241+
+ dev_data->pme_status = 0;
242+
+
243+
+ return 0;
244+
+}
245+
+
246+
static int xen_pcibk_suspend_noirq(struct device *dev) {
247+
// Imitate pci_pm_suspend_noirq but with per-device opt-in and force
248+
// option.
249+
@@ -1073,6 +1088,39 @@ static int xen_pcibk_suspend_noirq(struct device *dev) {
250+
return 0;
251+
}
252+
253+
+static int xen_pcibk_resume_noirq(struct device *dev) {
254+
+ // .resume_noirq hook: clear pme_blocked (set by xen_pcibk_prepare) so later runtime resumes may latch the emulated PME status again. Always returns 0.
255+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
256+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
257+
+ dev_data->pme_blocked = 0;
258+
+
259+
+ return 0;
260+
+}
261+
+
262+
+/* .runtime_idle hook. Since this is a virtual representation of the PM state, we only allow the device
263+
+ * to enter the "suspended" state once current_state is D3hot or deeper (i.e. after the guest commands the device into D3hot) and no emulated PME is still pending; otherwise -EBUSY is returned.
264+
+*/
265+
+static int xen_pcibk_pm_idle(struct device *dev)
266+
+{
267+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
268+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
269+
+ if (dev_data->pme_status) return -EBUSY;
270+
+
271+
+ return pci_dev->current_state >= PCI_D3hot ? 0 : -EBUSY;
272+
+}
273+
+
274+
+static int xen_pcibk_pm_resume(struct device *dev)
275+
+{
276+
+ /* .runtime_resume hook: the emulated PME status flag is set on every wakeup unless pme_blocked is set, regardless of whether the device supports PME or not.
277+
+ * This is a non-issue, since guest kernel logic will just wake up the device if it isn't already awake. Always returns 0. */
278+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
279+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
280+
+ if (!dev_data->pme_blocked)
281+
+ dev_data->pme_status = 1;
282+
+
283+
+ return 0;
284+
+}
285+
+
286+
/*add xen_pcibk AER handling*/
287+
static const struct pci_error_handlers xen_pcibk_error_handler = {
288+
.error_detected = xen_pcibk_error_detected,
289+
@@ -1082,7 +1130,11 @@ static const struct pci_error_handlers xen_pcibk_error_handler = {
290+
};
291+
292+
static const struct dev_pm_ops xen_pcibk_pm_ops = {
293+
+ .prepare = xen_pcibk_prepare,
294+
.suspend_noirq = xen_pcibk_suspend_noirq,
295+
+ .resume_noirq = xen_pcibk_resume_noirq,
296+
+ .runtime_idle = xen_pcibk_pm_idle,
297+
+ .runtime_resume = xen_pcibk_pm_resume,
298+
};
299+
300+
/*
301+
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
302+
index cf6df6964664..ab1467d13278 100644
303+
--- a/drivers/xen/xen-pciback/pciback.h
304+
+++ b/drivers/xen/xen-pciback/pciback.h
305+
@@ -56,6 +56,8 @@ struct xen_pcibk_dev_data {
306+
unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
307+
unsigned int ack_intr:1; /* .. and ACK-ing */
308+
unsigned long handled;
309+
+ unsigned int pme_status:1;
310+
+ unsigned int pme_blocked:1;
311+
unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
312+
char irq_name[]; /* xen-pcibk[000:04:00.0] */
313+
};
314+
--
315+
2.49.0
316+

0 commit comments

Comments
 (0)