Skip to content

Commit 1a2a76c

Browse files
committed
Merge tag 'x86-urgent-2020-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Thomas Gleixner: "A set of fixes for X86: - Ensure that the PIT is set up when the local APIC is disabled or configured in legacy mode. This is caused by an ordering issue introduced in the recent changes which skip PIT initialization when the TSC and APIC frequencies are already known. - Handle malformed SRAT tables during early ACPI parsing which caused an infinite loop and a boot hang. - Fix a long-standing race in the affinity setting code which affects PCI devices with non-maskable MSI interrupts. The problem is caused by the non-atomic writes of the MSI address (destination APIC id) and data (vector) fields which the device uses to construct the MSI message. The non-atomic writes are mandated by PCI. If both fields change and the device raises an interrupt after writing address and before writing data, then the MSI block constructs an inconsistent message which causes interrupts to be lost and subsequent malfunction of the device. The fix is to redirect the interrupt to the new vector on the current CPU first and then switch it over to the new target CPU. This allows an interrupt which is raised in the transitional stage (old CPU, new vector) to be observed in the APIC IRR and retriggered on the new target CPU and the new vector. The potential spurious interrupts caused by this are harmless and can in the worst case expose a buggy driver (all handlers have to be able to deal with spurious interrupts as they can and do happen for various reasons). - Add the missing suspend/resume mechanism for the HYPERV hypercall page, the lack of which prevents resume from hibernation on HYPERV guests. This change got lost before the merge window. 
- Mask the IOAPIC before disabling the local APIC to prevent potentially stale IOAPIC remote IRR bits which cause stale interrupt lines after resume" * tag 'x86-urgent-2020-02-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/apic: Mask IOAPIC entries when disabling the local APIC x86/hyperv: Suspend/resume the hypercall page for hibernation x86/apic/msi: Plug non-maskable MSI affinity race x86/boot: Handle malformed SRAT tables during early ACPI parsing x86/timer: Don't skip PIT setup when APIC is disabled or in legacy mode
2 parents f413776 + 0f378d7 commit 1a2a76c

File tree

13 files changed

+260
-11
lines changed

13 files changed

+260
-11
lines changed

arch/x86/boot/compressed/acpi.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,13 @@ int count_immovable_mem_regions(void)
393393
table = table_addr + sizeof(struct acpi_table_srat);
394394

395395
while (table + sizeof(struct acpi_subtable_header) < table_end) {
396+
396397
sub_table = (struct acpi_subtable_header *)table;
398+
if (!sub_table->length) {
399+
debug_putstr("Invalid zero length SRAT subtable.\n");
400+
return 0;
401+
}
402+
397403
if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
398404
struct acpi_srat_mem_affinity *ma;
399405

arch/x86/hyperv/hv_init.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,15 @@
2121
#include <linux/hyperv.h>
2222
#include <linux/slab.h>
2323
#include <linux/cpuhotplug.h>
24+
#include <linux/syscore_ops.h>
2425
#include <clocksource/hyperv_timer.h>
2526

2627
void *hv_hypercall_pg;
2728
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
2829

30+
/* Storage to save the hypercall page temporarily for hibernation */
31+
static void *hv_hypercall_pg_saved;
32+
2933
u32 *hv_vp_index;
3034
EXPORT_SYMBOL_GPL(hv_vp_index);
3135

@@ -246,6 +250,48 @@ static int __init hv_pci_init(void)
246250
return 1;
247251
}
248252

253+
/*
 * Syscore suspend callback: stash the hypercall page pointer in
 * hv_hypercall_pg_saved, clear hv_hypercall_pg and clear the enable bit
 * in the HV_X64_MSR_HYPERCALL MSR.
 *
 * Always returns 0.
 */
static int hv_suspend(void)
254+
{
255+
union hv_x64_msr_hypercall_contents hypercall_msr;
256+
257+
/*
258+
* Reset the hypercall page as it is going to be invalidated
259+
* across hibernation. Setting hv_hypercall_pg to NULL ensures
260+
* that any subsequent hypercall operation fails safely instead of
261+
* crashing due to an access of an invalid page. The hypercall page
262+
* pointer is restored on resume.
263+
*/
264+
hv_hypercall_pg_saved = hv_hypercall_pg;
265+
hv_hypercall_pg = NULL;
266+
267+
/* Disable the hypercall page in the hypervisor */
268+
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
269+
hypercall_msr.enable = 0;
270+
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
271+
272+
return 0;
273+
}
274+
275+
/*
 * Syscore resume callback: point the HV_X64_MSR_HYPERCALL MSR at the
 * page frame of the saved hypercall page with the enable bit set, then
 * move the saved pointer back into hv_hypercall_pg and clear
 * hv_hypercall_pg_saved.
 */
static void hv_resume(void)
276+
{
277+
union hv_x64_msr_hypercall_contents hypercall_msr;
278+
279+
/* Re-enable the hypercall page */
280+
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
281+
hypercall_msr.enable = 1;
282+
hypercall_msr.guest_physical_address =
283+
vmalloc_to_pfn(hv_hypercall_pg_saved);
284+
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
285+
286+
hv_hypercall_pg = hv_hypercall_pg_saved;
287+
hv_hypercall_pg_saved = NULL;
288+
}
289+
290+
/*
 * Syscore callbacks which disable the hypercall page on suspend and
 * re-enable it on resume.
 */
static struct syscore_ops hv_syscore_ops = {
291+
.suspend = hv_suspend,
292+
.resume = hv_resume,
293+
};
294+
249295
/*
250296
* This function is to be invoked early in the boot sequence after the
251297
* hypervisor has been detected.
@@ -330,6 +376,8 @@ void __init hyperv_init(void)
330376

331377
x86_init.pci.arch_init = hv_pci_init;
332378

379+
register_syscore_ops(&hv_syscore_ops);
380+
333381
return;
334382

335383
remove_cpuhp_state:
@@ -349,6 +397,8 @@ void hyperv_cleanup(void)
349397
{
350398
union hv_x64_msr_hypercall_contents hypercall_msr;
351399

400+
unregister_syscore_ops(&hv_syscore_ops);
401+
352402
/* Reset our OS id */
353403
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
354404

arch/x86/include/asm/apic.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ extern void apic_soft_disable(void);
140140
extern void lapic_shutdown(void);
141141
extern void sync_Arb_IDs(void);
142142
extern void init_bsp_APIC(void);
143+
extern void apic_intr_mode_select(void);
143144
extern void apic_intr_mode_init(void);
144145
extern void init_apic_mappings(void);
145146
void register_lapic_address(unsigned long address);
@@ -188,6 +189,7 @@ static inline void disable_local_APIC(void) { }
188189
# define setup_secondary_APIC_clock x86_init_noop
189190
static inline void lapic_update_tsc_freq(void) { }
190191
static inline void init_bsp_APIC(void) { }
192+
static inline void apic_intr_mode_select(void) { }
191193
static inline void apic_intr_mode_init(void) { }
192194
static inline void lapic_assign_system_vectors(void) { }
193195
static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
@@ -452,6 +454,14 @@ static inline void ack_APIC_irq(void)
452454
apic_eoi();
453455
}
454456

457+
458+
/*
 * Check whether @vector is set in the local APIC IRR.
 *
 * The IRR is read in 32-bit chunks located 0x10 apart starting at
 * APIC_IRR: vector / 32 selects the chunk, vector % 32 the bit in it.
 *
 * Returns true if the bit is set, false otherwise.
 */
static inline bool lapic_vector_set_in_irr(unsigned int vector)
459+
{
460+
u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
461+
462+
return !!(irr & (1U << (vector % 32)));
463+
}
464+
455465
static inline unsigned default_get_apic_id(unsigned long x)
456466
{
457467
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));

arch/x86/include/asm/x86_init.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,14 @@ struct x86_init_resources {
5151
* are set up.
5252
* @intr_init: interrupt init code
5353
* @trap_init: platform specific trap setup
54+
* @intr_mode_select: interrupt delivery mode selection
5455
* @intr_mode_init: interrupt delivery mode setup
5556
*/
5657
struct x86_init_irqs {
5758
void (*pre_vector_init)(void);
5859
void (*intr_init)(void);
5960
void (*trap_init)(void);
61+
void (*intr_mode_select)(void);
6062
void (*intr_mode_init)(void);
6163
};
6264

arch/x86/kernel/apic/apic.c

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -830,8 +830,17 @@ bool __init apic_needs_pit(void)
830830
if (!tsc_khz || !cpu_khz)
831831
return true;
832832

833-
/* Is there an APIC at all? */
834-
if (!boot_cpu_has(X86_FEATURE_APIC))
833+
/* Is there an APIC at all or is it disabled? */
834+
if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic)
835+
return true;
836+
837+
/*
838+
* If interrupt delivery mode is legacy PIC or virtual wire without
839+
* configuration, the local APIC timer won't be set up. Make sure
840+
* that the PIT is initialized.
841+
*/
842+
if (apic_intr_mode == APIC_PIC ||
843+
apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
835844
return true;
836845

837846
/* Virt guests may lack ARAT, but still have DEADLINE */
@@ -1322,7 +1331,7 @@ void __init sync_Arb_IDs(void)
13221331

13231332
enum apic_intr_mode_id apic_intr_mode __ro_after_init;
13241333

1325-
static int __init apic_intr_mode_select(void)
1334+
static int __init __apic_intr_mode_select(void)
13261335
{
13271336
/* Check kernel option */
13281337
if (disable_apic) {
@@ -1384,6 +1393,12 @@ static int __init apic_intr_mode_select(void)
13841393
return APIC_SYMMETRIC_IO;
13851394
}
13861395

1396+
/*
 * Select the interrupt delivery mode for the BSP and store the result
 * in apic_intr_mode.
 */
1397+
void __init apic_intr_mode_select(void)
1398+
{
1399+
apic_intr_mode = __apic_intr_mode_select();
1400+
}
1401+
13871402
/*
13881403
* An initial setup of the virtual wire mode.
13891404
*/
@@ -1440,8 +1455,6 @@ void __init apic_intr_mode_init(void)
14401455
{
14411456
bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
14421457

1443-
apic_intr_mode = apic_intr_mode_select();
1444-
14451458
switch (apic_intr_mode) {
14461459
case APIC_PIC:
14471460
pr_info("APIC: Keep in PIC mode(8259)\n");
@@ -2626,6 +2639,13 @@ static int lapic_suspend(void)
26262639
#endif
26272640

26282641
local_irq_save(flags);
2642+
2643+
/*
2644+
* Mask IOAPIC before disabling the local APIC to prevent stale IRR
2645+
* entries on some implementations.
2646+
*/
2647+
mask_ioapic_entries();
2648+
26292649
disable_local_APIC();
26302650

26312651
irq_remapping_disable();

arch/x86/kernel/apic/msi.c

Lines changed: 125 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,8 @@
2323

2424
static struct irq_domain *msi_default_domain;
2525

26-
static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
26+
static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
2727
{
28-
struct irq_cfg *cfg = irqd_cfg(data);
29-
3028
msg->address_hi = MSI_ADDR_BASE_HI;
3129

3230
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
4745
MSI_DATA_VECTOR(cfg->vector);
4846
}
4947

48+
/*
 * Compose the MSI message for @data into @msg, using the irq_cfg
 * obtained via irqd_cfg(@data).
 */
static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
49+
{
50+
__irq_msi_compose_msg(irqd_cfg(data), msg);
51+
}
52+
53+
/*
 * Compose an MSI message from @cfg and hand it to the irq chip of @irqd
 * via its irq_write_msi_msg() callback.
 *
 * NOTE(review): msg[] has two entries, the second zero-initialized; only
 * msg[0] is composed here. Presumably the write callback may look at a
 * second message - confirm against the irq_write_msi_msg() users.
 */
static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
54+
{
55+
struct msi_msg msg[2] = { [1] = { }, };
56+
57+
__irq_msi_compose_msg(cfg, msg);
58+
irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
59+
}
60+
61+
/*
 * Set the affinity of an MSI interrupt.
 *
 * @irqd:  irq data of the MSI interrupt
 * @mask:  requested affinity mask, passed on to the parent chip
 * @force: passed on to the parent chip
 *
 * The parent chip's irq_set_affinity() callback selects the new target.
 * If that fails or returns IRQ_SET_MASK_OK_DONE the result is returned
 * as is. Otherwise the MSI message is rewritten, either directly or, for
 * the case described in the function body, in two steps: first to the
 * new vector on the current CPU, then to the final destination. If the
 * new vector is found pending in the local APIC IRR afterwards, the
 * interrupt is retriggered.
 *
 * Returns the value returned by the parent's irq_set_affinity().
 */
static int
62+
msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
63+
{
64+
struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
65+
struct irq_data *parent = irqd->parent_data;
66+
unsigned int cpu;
67+
int ret;
68+
69+
/* Save the current configuration */
70+
cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
71+
old_cfg = *cfg;
72+
73+
/* Allocate a new target vector */
74+
ret = parent->chip->irq_set_affinity(parent, mask, force);
75+
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
76+
return ret;
77+
78+
/*
79+
* For non-maskable and non-remapped MSI interrupts the migration
80+
* to a different destination CPU and a different vector has to be
81+
* done carefully to handle the possible stray interrupt which can be
82+
* caused by the non-atomic update of the address/data pair.
83+
*
84+
* Direct update is possible when:
85+
* - The MSI is maskable (remapped MSI does not use this code path).
86+
* The quirk bit is not set in this case.
87+
* - The new vector is the same as the old vector
88+
* - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
89+
* - The new destination CPU is the same as the old destination CPU
90+
*/
91+
if (!irqd_msi_nomask_quirk(irqd) ||
92+
cfg->vector == old_cfg.vector ||
93+
old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
94+
cfg->dest_apicid == old_cfg.dest_apicid) {
95+
irq_msi_update_msg(irqd, cfg);
96+
return ret;
97+
}
98+
99+
/*
100+
* Paranoia: Validate that the interrupt target is the local
101+
* CPU.
102+
*/
103+
if (WARN_ON_ONCE(cpu != smp_processor_id())) {
104+
irq_msi_update_msg(irqd, cfg);
105+
return ret;
106+
}
107+
108+
/*
109+
* Redirect the interrupt to the new vector on the current CPU
110+
* first. This might cause a spurious interrupt on this vector if
111+
* the device raises an interrupt right between this update and the
112+
* update to the final destination CPU.
113+
*
114+
* If the vector is in use then the installed device handler will
115+
* denote it as spurious which is no harm as this is a rare event
116+
* and interrupt handlers have to cope with spurious interrupts
117+
* anyway. If the vector is unused, then it is marked so it won't
118+
* trigger the 'No irq handler for vector' warning in do_IRQ().
119+
*
120+
* This requires holding the vector lock to prevent concurrent updates
121+
* to the affected vector.
122+
*/
123+
lock_vector_lock();
124+
125+
/*
126+
* Mark the new target vector on the local CPU if it is currently
127+
* unused. Reuse the VECTOR_RETRIGGERED state which is also used in
128+
* the CPU hotplug path for a similar purpose. This cannot be
129+
* undone here as the current CPU has interrupts disabled and
130+
* cannot handle the interrupt before the whole set_affinity()
131+
* section is done. In the CPU unplug case, the current CPU is
132+
* about to vanish and will not handle any interrupts anymore. The
133+
* vector is cleaned up when the CPU comes online again.
134+
*/
135+
if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
136+
this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
137+
138+
/* Redirect it to the new vector on the local CPU temporarily */
139+
old_cfg.vector = cfg->vector;
140+
irq_msi_update_msg(irqd, &old_cfg);
141+
142+
/* Now transition it to the target CPU */
143+
irq_msi_update_msg(irqd, cfg);
144+
145+
/*
146+
* All interrupts after this point are now targeted at the new
147+
* vector/CPU.
148+
*
149+
* Drop vector lock before testing whether the temporary assignment
150+
* to the local CPU was hit by an interrupt raised in the device,
151+
* because the retrigger function acquires vector lock again.
152+
*/
153+
unlock_vector_lock();
154+
155+
/*
156+
* Check whether the transition raced with a device interrupt and
157+
* is pending in the local APIC's IRR. It is safe to do this outside
158+
* of vector lock as the irq_desc::lock of this interrupt is still
159+
* held and interrupts are disabled: The check is not accessing the
160+
* underlying vector store. It's just checking the local APIC's
161+
* IRR.
162+
*/
163+
if (lapic_vector_set_in_irr(cfg->vector))
164+
irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
165+
166+
return ret;
167+
}
168+
50169
/*
51170
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
52171
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = {
58177
.irq_ack = irq_chip_ack_parent,
59178
.irq_retrigger = irq_chip_retrigger_hierarchy,
60179
.irq_compose_msi_msg = irq_msi_compose_msg,
180+
.irq_set_affinity = msi_set_affinity,
61181
.flags = IRQCHIP_SKIP_SET_WAKE,
62182
};
63183

@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
146266
}
147267
if (!msi_default_domain)
148268
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
269+
else
270+
msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
149271
}
150272

151273
#ifdef CONFIG_IRQ_REMAP

arch/x86/kernel/time.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,18 @@ void __init hpet_time_init(void)
9191

9292
static __init void x86_late_time_init(void)
9393
{
94+
/*
95+
* Before PIT/HPET init, select the interrupt mode. This is required
96+
* to make the decision whether PIT should be initialized correct.
97+
*/
98+
x86_init.irqs.intr_mode_select();
99+
100+
/* Setup the legacy timers */
94101
x86_init.timers.timer_init();
102+
95103
/*
96-
* After PIT/HPET timers init, select and setup
97-
* the final interrupt mode for delivering IRQs.
104+
* After PIT/HPET timers init, set up the final interrupt mode for
105+
* delivering IRQs.
98106
*/
99107
x86_init.irqs.intr_mode_init();
100108
tsc_init();

arch/x86/kernel/x86_init.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ struct x86_init_ops x86_init __initdata = {
8080
.pre_vector_init = init_ISA_irqs,
8181
.intr_init = native_init_IRQ,
8282
.trap_init = x86_init_noop,
83+
.intr_mode_select = apic_intr_mode_select,
8384
.intr_mode_init = apic_intr_mode_init
8485
},
8586

0 commit comments

Comments
 (0)