Commit db44dcb

Merge tag 'kvm-x86-pir-6.16' of https://github.com/kvm-x86/linux into HEAD
KVM x86 posted interrupt changes for 6.16: Refine and optimize KVM's software processing of the PIR, and ultimately share PIR harvesting code between KVM and the kernel's Posted MSI handler
2 parents 5d816c1 + edaf3ed

File tree

5 files changed: +95, -72 lines

arch/x86/include/asm/posted_intr.h

Lines changed: 71 additions & 7 deletions
@@ -1,19 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _X86_POSTED_INTR_H
 #define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
 #include <asm/irq_vectors.h>
 
+#include <linux/bitmap.h>
+
 #define POSTED_INTR_ON  0
 #define POSTED_INTR_SN  1
 
 #define PID_TABLE_ENTRY_VALID 1
 
+#define NR_PIR_VECTORS  256
+#define NR_PIR_WORDS    (NR_PIR_VECTORS / BITS_PER_LONG)
+
 /* Posted-Interrupt Descriptor */
 struct pi_desc {
-	union {
-		u32 pir[8];     /* Posted interrupt requested */
-		u64 pir64[4];
-	};
+	unsigned long pir[NR_PIR_WORDS];     /* Posted interrupt requested */
 	union {
 		struct {
 			u16	notifications; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
 	u32 rsvd[6];
 } __aligned(64);
 
+/*
+ * De-multiplexing posted interrupts is on the performance path; the code
+ * below is written to optimize cache performance based on the following
+ * considerations:
+ * 1. The posted interrupt descriptor (PID) fits in a cache line that is
+ *    frequently accessed by both the CPU and the IOMMU.
+ * 2. During software processing of posted interrupts, the CPU needs to do
+ *    natural-width reads and xchgs to check and clear the posted interrupt
+ *    request (PIR), a 256-bit field within the PID.
+ * 3. On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *    line when posting interrupts and setting control bits.
+ * 4. The CPU can access the cache line an order of magnitude faster than
+ *    the IOMMU.
+ * 5. Each interrupt the IOMMU posts to the PIR evicts the PID cache line.
+ *    The cache line states after each operation are as follows, assuming a
+ *    64-bit kernel:
+ *    CPU		IOMMU			PID cache line state
+ *    ---------------------------------------------------------------
+ *    read64					exclusive
+ *    lock xchg64				modified
+ *			post/atomic swap	invalid
+ *    ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache misses, it is important to avoid contention with
+ * the IOMMU's interrupt posting/atomic swap. Therefore, a copy of the PIR
+ * is used when processing posted interrupts in software, e.g. to dispatch
+ * interrupt handlers for posted MSIs, or to move interrupts from the PIR to
+ * the vIRR in KVM.
+ *
+ * In addition, the code tries to keep the cache line state as consistent as
+ * possible, e.g. when making a copy and clearing the PIR (assuming non-zero
+ * bits are present throughout the PIR), it does:
+ *     read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ *     read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+					   unsigned long *pir_vals)
+{
+	unsigned long pending = 0;
+	int i;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		pir_vals[i] = READ_ONCE(pir[i]);
+		pending |= pir_vals[i];
+	}
+
+	if (!pending)
+		return false;
+
+	for (i = 0; i < NR_PIR_WORDS; i++) {
+		if (!pir_vals[i])
+			continue;
+
+		pir_vals[i] = arch_xchg(&pir[i], 0);
+	}
+
+	return true;
+}
+
 static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
 {
 	return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
 
 static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 {
-	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+	return test_and_set_bit(vector, pi_desc->pir);
 }
 
 static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
 {
-	return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+	return bitmap_empty(pi_desc->pir, NR_VECTORS);
 }
 
 static inline void pi_set_sn(struct pi_desc *pi_desc)
@@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
 	if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
 		return false;
 
-	return test_bit(vector, (unsigned long *)pid->pir);
+	return test_bit(vector, pid->pir);
 }
 
 extern void intel_posted_msi_init(void);
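
The comment added above describes a two-pass harvest: read every PIR word first (plain loads, so the PID cache line can stay unmodified), then xchg only the non-zero words to fetch-and-clear them. Below is a minimal, illustration-only user-space sketch of that pattern; C11 atomics stand in for the kernel's READ_ONCE() and arch_xchg(), and harvest_pir() plus the demo in main() are hypothetical names, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_PIR_WORDS 4	/* 256 vectors / 64 bits per unsigned long */

static bool harvest_pir(_Atomic unsigned long *pir, unsigned long *vals)
{
	unsigned long pending = 0;

	/* Pass 1: reads only, so the cache line is not dirtied yet. */
	for (int i = 0; i < NR_PIR_WORDS; i++) {
		vals[i] = atomic_load_explicit(&pir[i], memory_order_relaxed);
		pending |= vals[i];
	}
	if (!pending)
		return false;

	/* Pass 2: atomically fetch-and-clear only the non-zero words. */
	for (int i = 0; i < NR_PIR_WORDS; i++) {
		if (vals[i])
			vals[i] = atomic_exchange(&pir[i], 0);
	}
	return true;
}

int main(void)
{
	_Atomic unsigned long pir[NR_PIR_WORDS] = {0};
	unsigned long copy[NR_PIR_WORDS];

	atomic_fetch_or(&pir[0], 1UL << 35);	/* pretend vector 35 was posted */
	if (harvest_pir(pir, copy))
		printf("word 0 harvested: %#lx\n", copy[0]);
	return 0;
}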

arch/x86/kernel/irq.c

Lines changed: 10 additions & 53 deletions
@@ -380,61 +380,18 @@ void intel_posted_msi_init(void)
 	this_cpu_write(posted_msi_pi_desc.ndst, destination);
 }
 
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- *   accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- *   for checking and clearing posted interrupt request (PIR), a 256 bit field
- *   within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- *   line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- *   cache line. The cache line states after each operation are as follows:
- *   CPU		IOMMU			PID Cache line state
- *   ---------------------------------------------------------------
- *   read64					exclusive
- *   lock xchg64				modified
- *			post/atomic swap	invalid
- *   ---------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- *     read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- *     read, xchg, read, xchg, read, xchg, read, xchg
- */
-static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
 {
-	int i, vec = FIRST_EXTERNAL_VECTOR;
-	unsigned long pir_copy[4];
-	bool handled = false;
+	unsigned long pir_copy[NR_PIR_WORDS];
+	int vec = FIRST_EXTERNAL_VECTOR;
 
-	for (i = 0; i < 4; i++)
-		pir_copy[i] = pir[i];
-
-	for (i = 0; i < 4; i++) {
-		if (!pir_copy[i])
-			continue;
+	if (!pi_harvest_pir(pir, pir_copy))
+		return false;
 
-		pir_copy[i] = arch_xchg(&pir[i], 0);
-		handled = true;
-	}
-
-	if (handled) {
-		for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
-			call_irq_handler(vec, regs);
-	}
+	for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+		call_irq_handler(vec, regs);
 
-	return handled;
+	return true;
 }
 
 /*
@@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 	 * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
 	 */
 	while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
-		if (!handle_pending_pir(pid->pir64, regs))
+		if (!handle_pending_pir(pid->pir, regs))
 			break;
 	}
 
@@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 	 * process PIR bits one last time such that handling the new interrupts
 	 * are not delayed until the next IRQ.
 	 */
-	handle_pending_pir(pid->pir, regs);
+	handle_pending_pir(pid->pir, regs);
 
 	apic_eoi();
 	irq_exit();
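
With the PIR copy in hand, handle_pending_pir() simply walks the set bits and calls one handler per pending vector. Below is a self-contained, illustration-only sketch of that dispatch step; the kernel uses for_each_set_bit_from(), for which __builtin_ctzl() stands in here, and the handler and all names are hypothetical.

#include <stdio.h>

#define NR_PIR_WORDS 4
#define BITS_PER_LONG 64

static void demo_irq_handler(unsigned int vector)
{
	printf("dispatching vector %u\n", vector);
}

static void dispatch_harvested(const unsigned long *pir_copy)
{
	for (int i = 0; i < NR_PIR_WORDS; i++) {
		unsigned long word = pir_copy[i];

		/* Each set bit in the harvested copy is one pending vector. */
		while (word) {
			unsigned int bit = __builtin_ctzl(word);

			demo_irq_handler(i * BITS_PER_LONG + bit);
			word &= word - 1;	/* clear the lowest set bit */
		}
	}
}

int main(void)
{
	unsigned long copy[NR_PIR_WORDS] = { 1UL << 35, 0, 1UL << 2, 0 };

	dispatch_harvested(copy);	/* prints vectors 35 and 130 */
	return 0;
}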

arch/x86/kvm/lapic.c

Lines changed: 11 additions & 9 deletions
@@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 {
+	unsigned long pir_vals[NR_PIR_WORDS];
+	u32 *__pir = (void *)pir_vals;
 	u32 i, vec;
-	u32 pir_val, irr_val, prev_irr_val;
+	u32 irr_val, prev_irr_val;
 	int max_updated_irr;
 
 	max_updated_irr = -1;
 	*max_irr = -1;
 
+	if (!pi_harvest_pir(pir, pir_vals))
+		return false;
+
 	for (i = vec = 0; i <= 7; i++, vec += 32) {
 		u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
 
-		irr_val = *p_irr;
-		pir_val = READ_ONCE(pir[i]);
-
-		if (pir_val) {
-			pir_val = xchg(&pir[i], 0);
+		irr_val = READ_ONCE(*p_irr);
 
+		if (__pir[i]) {
 			prev_irr_val = irr_val;
 			do {
-				irr_val = prev_irr_val | pir_val;
+				irr_val = prev_irr_val | __pir[i];
 			} while (prev_irr_val != irr_val &&
 				 !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
 
@@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
 }
 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
 
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
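
On the KVM side, each harvested 32-bit PIR chunk is merged into the vIRR with a compare-exchange loop that exits early if every PIR bit is already present, and retries only while a concurrent writer changed the IRR word underneath it. Here is a user-space analogue of that loop, for illustration only: C11 atomic_compare_exchange_weak() stands in for the kernel's try_cmpxchg(), and the names are hypothetical.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void merge_pir_into_irr(_Atomic uint32_t *p_irr, uint32_t pir_chunk)
{
	uint32_t prev = atomic_load(p_irr);
	uint32_t next;

	do {
		next = prev | pir_chunk;
		/*
		 * Stop if all PIR bits are already set in the IRR; otherwise
		 * CAS, which reloads prev into 'prev' on failure and retries.
		 */
	} while (prev != next &&
		 !atomic_compare_exchange_weak(p_irr, &prev, next));
}

int main(void)
{
	_Atomic uint32_t irr = 0x11;

	merge_pir_into_irr(&irr, 0x24);
	printf("irr = %#x\n", atomic_load(&irr));	/* prints 0x35 */
	return 0;
}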

arch/x86/kvm/lapic.h

Lines changed: 2 additions & 2 deletions
@@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int shorthand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		     struct dest_map *dest_map);

arch/x86/kvm/vmx/posted_intr.h

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
 {
 	int vec;
 
-	vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+	vec = find_last_bit(pi_desc->pir, 256);
 	return vec < 256 ? vec : -1;
 }
 
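pi_find_highest_vector() relies on find_last_bit(), which returns the index of the highest set bit, or the bitmap size when no bit is set. A tiny, illustration-only equivalent over the four PIR words, with __builtin_clzl() standing in for find_last_bit() and hypothetical names throughout:

#include <stdio.h>

#define NR_PIR_WORDS 4
#define BITS_PER_LONG 64

static int find_highest_vector(const unsigned long *pir)
{
	/* Scan from the top word down; the first set bit found is highest. */
	for (int i = NR_PIR_WORDS - 1; i >= 0; i--) {
		if (pir[i])
			return i * BITS_PER_LONG +
			       (BITS_PER_LONG - 1 - __builtin_clzl(pir[i]));
	}
	return -1;	/* mirrors "vec < 256 ? vec : -1" for an empty PIR */
}

int main(void)
{
	unsigned long pir[NR_PIR_WORDS] = { 1UL << 35, 0, 1UL << 2, 0 };

	printf("highest pending vector: %d\n", find_highest_vector(pir)); /* 130 */
	return 0;
}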