Skip to content

Commit 17c1f96

Browse files
Marc Zyngier authored and bjorn-helgaas committed
PCI: xgene-msi: Sanitise MSI allocation and affinity setting
Plugging a device that doesn't use managed affinity on an XGene-1 machine results in messages such as: genirq: irq_chip PCI-MSIX-0000:01:00.0 did not update eff. affinity mask of irq 39 As it turns out, the driver was never updated to populate the effective affinity on irq_set_affinity() call, and the core code is prickly about that. But upon further investigation, it appears that the driver keeps repainting the hwirq field of the irq_data structure as a way to track the affinity of the MSI, something that is very much frowned upon as it breaks the fundamentals of an IRQ domain (an array indexed by hwirq). Fixing this results more or less in a rewrite of the driver: - Define how a hwirq and a CPU affinity map onto the MSI termination registers - Allocate a single entry in the bitmap per MSI instead of *8* - Correctly track CPU affinity - Fix the documentation so that it actually means something (to me) - Use standard bitmap iterators - and plenty of other cleanups With this, the driver behaves correctly on my vintage Mustang board. Signed-off-by: Marc Zyngier <[email protected]> [lpieralisi: replaced open coded GENMASK(6, 4) with MSInRx_HWIRQ_MASK] Signed-off-by: Lorenzo Pieralisi <[email protected]> Signed-off-by: Bjorn Helgaas <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 011f4fc commit 17c1f96

File tree

1 file changed

+93
-129
lines changed

1 file changed

+93
-129
lines changed

drivers/pci/controller/pci-xgene-msi.c

Lines changed: 93 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* Author: Tanmay Inamdar <[email protected]>
77
* Duc Dang <[email protected]>
88
*/
9+
#include <linux/bitfield.h>
910
#include <linux/cpu.h>
1011
#include <linux/interrupt.h>
1112
#include <linux/irqdomain.h>
@@ -22,7 +23,15 @@
2223
#define IDX_PER_GROUP 8
2324
#define IRQS_PER_IDX 16
2425
#define NR_HW_IRQS 16
25-
#define NR_MSI_VEC (IDX_PER_GROUP * IRQS_PER_IDX * NR_HW_IRQS)
26+
#define NR_MSI_BITS (IDX_PER_GROUP * IRQS_PER_IDX * NR_HW_IRQS)
27+
#define NR_MSI_VEC (NR_MSI_BITS / num_possible_cpus())
28+
29+
#define MSI_GROUP_MASK GENMASK(22, 19)
30+
#define MSI_INDEX_MASK GENMASK(18, 16)
31+
#define MSI_INTR_MASK GENMASK(19, 16)
32+
33+
#define MSInRx_HWIRQ_MASK GENMASK(6, 4)
34+
#define DATA_HWIRQ_MASK GENMASK(3, 0)
2635

2736
struct xgene_msi {
2837
struct irq_domain *inner_domain;
@@ -37,8 +46,26 @@ struct xgene_msi {
3746
static struct xgene_msi *xgene_msi_ctrl;
3847

3948
/*
40-
* X-Gene v1 has 16 groups of MSI termination registers MSInIRx, where
41-
* n is group number (0..F), x is index of registers in each group (0..7)
49+
* X-Gene v1 has 16 frames of MSI termination registers MSInIRx, where n is
50+
* frame number (0..15), x is index of registers in each frame (0..7). Each
51+
* 32b register is at the beginning of a 64kB region, each frame occupying
52+
* 512kB (and the whole thing 8MB of PA space).
53+
*
54+
* Each register supports 16 MSI vectors (0..15) to generate interrupts. A
55+
* write to the MSInIRx from the PCI side generates an interrupt. A read
56+
* from the MSInRx on the CPU side returns a bitmap of the pending MSIs in
57+
* the lower 16 bits. A side effect of this read is that all pending
58+
* interrupts are acknowledged and cleared).
59+
*
60+
* Additionally, each MSI termination frame has 1 MSIINTn register (n is
61+
* 0..15) to indicate the MSI pending status caused by any of its 8
62+
* termination registers, reported as a bitmap in the lower 8 bits. Each 32b
63+
* register is at the beginning of a 64kB region (and overall occupying an
64+
* extra 1MB).
65+
*
66+
* There is one GIC IRQ assigned for each MSI termination frame, 16 in
67+
* total.
68+
*
4269
* The register layout is as follows:
4370
* MSI0IR0 base_addr
4471
* MSI0IR1 base_addr + 0x10000
@@ -59,107 +86,74 @@ static struct xgene_msi *xgene_msi_ctrl;
5986
* MSIINT1 base_addr + 0x810000
6087
* ... ...
6188
* MSIINTF base_addr + 0x8F0000
62-
*
63-
* Each index register supports 16 MSI vectors (0..15) to generate interrupt.
64-
* There are total 16 GIC IRQs assigned for these 16 groups of MSI termination
65-
* registers.
66-
*
67-
* Each MSI termination group has 1 MSIINTn register (n is 0..15) to indicate
68-
* the MSI pending status caused by 1 of its 8 index registers.
6989
*/
7090

7191
/* MSInIRx read helper */
72-
static u32 xgene_msi_ir_read(struct xgene_msi *msi,
73-
u32 msi_grp, u32 msir_idx)
92+
static u32 xgene_msi_ir_read(struct xgene_msi *msi, u32 msi_grp, u32 msir_idx)
7493
{
7594
return readl_relaxed(msi->msi_regs + MSI_IR0 +
76-
(msi_grp << 19) + (msir_idx << 16));
95+
(FIELD_PREP(MSI_GROUP_MASK, msi_grp) |
96+
FIELD_PREP(MSI_INDEX_MASK, msir_idx)));
7797
}
7898

7999
/* MSIINTn read helper */
80100
static u32 xgene_msi_int_read(struct xgene_msi *msi, u32 msi_grp)
81101
{
82-
return readl_relaxed(msi->msi_regs + MSI_INT0 + (msi_grp << 16));
102+
return readl_relaxed(msi->msi_regs + MSI_INT0 +
103+
FIELD_PREP(MSI_INTR_MASK, msi_grp));
83104
}
84105

85106
/*
86-
* With 2048 MSI vectors supported, the MSI message can be constructed using
87-
* following scheme:
88-
* - Divide into 8 256-vector groups
89-
* Group 0: 0-255
90-
* Group 1: 256-511
91-
* Group 2: 512-767
92-
* ...
93-
* Group 7: 1792-2047
94-
* - Each 256-vector group is divided into 16 16-vector groups
95-
* As an example: 16 16-vector groups for 256-vector group 0-255 is
96-
* Group 0: 0-15
97-
* Group 1: 16-32
98-
* ...
99-
* Group 15: 240-255
100-
* - The termination address of MSI vector in 256-vector group n and 16-vector
101-
* group x is the address of MSIxIRn
102-
* - The data for MSI vector in 16-vector group x is x
107+
* In order to allow an MSI to be moved from one CPU to another without
108+
* having to repaint both the address and the data (which cannot be done
109+
* atomically), we statically partitions the MSI frames between CPUs. Given
110+
* that XGene-1 has 8 CPUs, each CPU gets two frames assigned to it
111+
*
112+
* We adopt the convention that when an MSI is moved, it is configured to
113+
* target the same register number in the congruent frame assigned to the
114+
* new target CPU. This reserves a given MSI across all CPUs, and reduces
115+
* the MSI capacity from 2048 to 256.
116+
*
117+
* Effectively, this amounts to:
118+
* - hwirq[7]::cpu[2:0] is the target frame number (n in MSInIRx)
119+
* - hwirq[6:4] is the register index in any given frame (x in MSInIRx)
120+
* - hwirq[3:0] is the MSI data
103121
*/
104-
static u32 hwirq_to_reg_set(unsigned long hwirq)
122+
static irq_hw_number_t compute_hwirq(u8 frame, u8 index, u8 data)
105123
{
106-
return (hwirq / (NR_HW_IRQS * IRQS_PER_IDX));
107-
}
108-
109-
static u32 hwirq_to_group(unsigned long hwirq)
110-
{
111-
return (hwirq % NR_HW_IRQS);
112-
}
113-
114-
static u32 hwirq_to_msi_data(unsigned long hwirq)
115-
{
116-
return ((hwirq / NR_HW_IRQS) % IRQS_PER_IDX);
124+
return (FIELD_PREP(BIT(7), FIELD_GET(BIT(3), frame)) |
125+
FIELD_PREP(MSInRx_HWIRQ_MASK, index) |
126+
FIELD_PREP(DATA_HWIRQ_MASK, data));
117127
}
118128

119129
static void xgene_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
120130
{
121131
struct xgene_msi *msi = irq_data_get_irq_chip_data(data);
122-
u32 reg_set = hwirq_to_reg_set(data->hwirq);
123-
u32 group = hwirq_to_group(data->hwirq);
124-
u64 target_addr = msi->msi_addr + (((8 * group) + reg_set) << 16);
132+
u64 target_addr;
133+
u32 frame, msir;
134+
int cpu;
125135

126-
msg->address_hi = upper_32_bits(target_addr);
127-
msg->address_lo = lower_32_bits(target_addr);
128-
msg->data = hwirq_to_msi_data(data->hwirq);
129-
}
136+
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
137+
msir = FIELD_GET(MSInRx_HWIRQ_MASK, data->hwirq);
138+
frame = FIELD_PREP(BIT(3), FIELD_GET(BIT(7), data->hwirq)) | cpu;
130139

131-
/*
132-
* X-Gene v1 only has 16 MSI GIC IRQs for 2048 MSI vectors. To maintain
133-
* the expected behaviour of .set_affinity for each MSI interrupt, the 16
134-
* MSI GIC IRQs are statically allocated to 8 X-Gene v1 cores (2 GIC IRQs
135-
* for each core). The MSI vector is moved from 1 MSI GIC IRQ to another
136-
* MSI GIC IRQ to steer its MSI interrupt to correct X-Gene v1 core. As a
137-
* consequence, the total MSI vectors that X-Gene v1 supports will be
138-
* reduced to 256 (2048/8) vectors.
139-
*/
140-
static int hwirq_to_cpu(unsigned long hwirq)
141-
{
142-
return (hwirq % num_possible_cpus());
143-
}
140+
target_addr = msi->msi_addr;
141+
target_addr += (FIELD_PREP(MSI_GROUP_MASK, frame) |
142+
FIELD_PREP(MSI_INTR_MASK, msir));
144143

145-
static unsigned long hwirq_to_canonical_hwirq(unsigned long hwirq)
146-
{
147-
return (hwirq - hwirq_to_cpu(hwirq));
144+
msg->address_hi = upper_32_bits(target_addr);
145+
msg->address_lo = lower_32_bits(target_addr);
146+
msg->data = FIELD_GET(DATA_HWIRQ_MASK, data->hwirq);
148147
}
149148

150149
static int xgene_msi_set_affinity(struct irq_data *irqdata,
151150
const struct cpumask *mask, bool force)
152151
{
153152
int target_cpu = cpumask_first(mask);
154-
int curr_cpu;
155-
156-
curr_cpu = hwirq_to_cpu(irqdata->hwirq);
157-
if (curr_cpu == target_cpu)
158-
return IRQ_SET_MASK_OK_DONE;
159153

160-
/* Update MSI number to target the new CPU */
161-
irqdata->hwirq = hwirq_to_canonical_hwirq(irqdata->hwirq) + target_cpu;
154+
irq_data_update_effective_affinity(irqdata, cpumask_of(target_cpu));
162155

156+
/* Force the core code to regenerate the message */
163157
return IRQ_SET_MASK_OK;
164158
}
165159

@@ -173,23 +167,20 @@ static int xgene_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
173167
unsigned int nr_irqs, void *args)
174168
{
175169
struct xgene_msi *msi = domain->host_data;
176-
int msi_irq;
170+
irq_hw_number_t hwirq;
177171

178172
mutex_lock(&msi->bitmap_lock);
179173

180-
msi_irq = bitmap_find_next_zero_area(msi->bitmap, NR_MSI_VEC, 0,
181-
num_possible_cpus(), 0);
182-
if (msi_irq < NR_MSI_VEC)
183-
bitmap_set(msi->bitmap, msi_irq, num_possible_cpus());
184-
else
185-
msi_irq = -ENOSPC;
174+
hwirq = find_first_zero_bit(msi->bitmap, NR_MSI_VEC);
175+
if (hwirq < NR_MSI_VEC)
176+
set_bit(hwirq, msi->bitmap);
186177

187178
mutex_unlock(&msi->bitmap_lock);
188179

189-
if (msi_irq < 0)
190-
return msi_irq;
180+
if (hwirq >= NR_MSI_VEC)
181+
return -ENOSPC;
191182

192-
irq_domain_set_info(domain, virq, msi_irq,
183+
irq_domain_set_info(domain, virq, hwirq,
193184
&xgene_msi_bottom_irq_chip, domain->host_data,
194185
handle_simple_irq, NULL, NULL);
195186

@@ -201,12 +192,10 @@ static void xgene_irq_domain_free(struct irq_domain *domain,
201192
{
202193
struct irq_data *d = irq_domain_get_irq_data(domain, virq);
203194
struct xgene_msi *msi = irq_data_get_irq_chip_data(d);
204-
u32 hwirq;
205195

206196
mutex_lock(&msi->bitmap_lock);
207197

208-
hwirq = hwirq_to_canonical_hwirq(d->hwirq);
209-
bitmap_clear(msi->bitmap, hwirq, num_possible_cpus());
198+
clear_bit(d->hwirq, msi->bitmap);
210199

211200
mutex_unlock(&msi->bitmap_lock);
212201

@@ -263,55 +252,30 @@ static void xgene_msi_isr(struct irq_desc *desc)
263252
unsigned int *irqp = irq_desc_get_handler_data(desc);
264253
struct irq_chip *chip = irq_desc_get_chip(desc);
265254
struct xgene_msi *xgene_msi = xgene_msi_ctrl;
266-
int msir_index, msir_val, hw_irq, ret;
267-
u32 intr_index, grp_select, msi_grp;
255+
unsigned long grp_pending;
256+
int msir_idx;
257+
u32 msi_grp;
268258

269259
chained_irq_enter(chip, desc);
270260

271261
msi_grp = irqp - xgene_msi->gic_irq;
272262

273-
/*
274-
* MSIINTn (n is 0..F) indicates if there is a pending MSI interrupt
275-
* If bit x of this register is set (x is 0..7), one or more interrupts
276-
* corresponding to MSInIRx is set.
277-
*/
278-
grp_select = xgene_msi_int_read(xgene_msi, msi_grp);
279-
while (grp_select) {
280-
msir_index = ffs(grp_select) - 1;
281-
/*
282-
* Calculate MSInIRx address to read to check for interrupts
283-
* (refer to termination address and data assignment
284-
* described in xgene_compose_msi_msg() )
285-
*/
286-
msir_val = xgene_msi_ir_read(xgene_msi, msi_grp, msir_index);
287-
while (msir_val) {
288-
intr_index = ffs(msir_val) - 1;
289-
/*
290-
* Calculate MSI vector number (refer to the termination
291-
* address and data assignment described in
292-
* xgene_compose_msi_msg function)
293-
*/
294-
hw_irq = (((msir_index * IRQS_PER_IDX) + intr_index) *
295-
NR_HW_IRQS) + msi_grp;
296-
/*
297-
* As we have multiple hw_irq that maps to single MSI,
298-
* always look up the virq using the hw_irq as seen from
299-
* CPU0
300-
*/
301-
hw_irq = hwirq_to_canonical_hwirq(hw_irq);
302-
ret = generic_handle_domain_irq(xgene_msi->inner_domain, hw_irq);
263+
grp_pending = xgene_msi_int_read(xgene_msi, msi_grp);
264+
265+
for_each_set_bit(msir_idx, &grp_pending, IDX_PER_GROUP) {
266+
unsigned long msir;
267+
int intr_idx;
268+
269+
msir = xgene_msi_ir_read(xgene_msi, msi_grp, msir_idx);
270+
271+
for_each_set_bit(intr_idx, &msir, IRQS_PER_IDX) {
272+
irq_hw_number_t hwirq;
273+
int ret;
274+
275+
hwirq = compute_hwirq(msi_grp, msir_idx, intr_idx);
276+
ret = generic_handle_domain_irq(xgene_msi->inner_domain,
277+
hwirq);
303278
WARN_ON_ONCE(ret);
304-
msir_val &= ~(1 << intr_index);
305-
}
306-
grp_select &= ~(1 << msir_index);
307-
308-
if (!grp_select) {
309-
/*
310-
* We handled all interrupts happened in this group,
311-
* resample this group MSI_INTx register in case
312-
* something else has been made pending in the meantime
313-
*/
314-
grp_select = xgene_msi_int_read(xgene_msi, msi_grp);
315279
}
316280
}
317281

0 commit comments

Comments (0)