
Commit ec6f5e0

Merge tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Thomas Gleixner:
 "A set of x86 and membarrier fixes:

   - Correct a few problems in the x86 and the generic membarrier
     implementation; small corrections for assumptions about visibility
     which have turned out not to be true.

   - Make the PAT bits for memory encryption correct vs 4K and 2M/1G
     page table entries, as they are at a different location.

   - Fix a concurrency issue in the local bandwidth readout of resource
     control that led to incorrect values.

   - Fix the ordering of allocating a vector for an interrupt. The
     ordering failed to respect the provided cpumask when the first
     attempt to allocate node-local within that mask fails: it then
     tried the node instead of the full provided mask, leading to
     erroneous error messages and breaking the (user) supplied affinity
     request. Reorder it.

   - Make the INT3 padding detection in optprobe work correctly"

* tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/kprobes: Fix optprobe to detect INT3 padding correctly
  x86/apic/vector: Fix ordering in vector assignment
  x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled
  x86/mm/mem_encrypt: Fix definition of PMD_FLAGS_DEC_WP
  membarrier: Execute SYNC_CORE on the calling thread
  membarrier: Explicitly sync remote cores when SYNC_CORE is requested
  membarrier: Add an actual barrier before rseq_preempt()
  x86/membarrier: Get rid of a dubious optimization
2 parents d2360a3 + 0d07c0e commit ec6f5e0

File tree: 8 files changed, 111 additions (+), 42 deletions (-)

arch/x86/include/asm/pgtable_types.h

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ enum page_cache_mode {
 #define _PAGE_ENC		(_AT(pteval_t, sme_me_mask))
 
 #define _PAGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
+#define _PAGE_LARGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
 
 #define _PAGE_NOCACHE		(cachemode2protval(_PAGE_CACHE_MODE_UC))
 #define _PAGE_CACHE_WP		(cachemode2protval(_PAGE_CACHE_MODE_WP))
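
On x86, the PAT bit sits at bit 7 in a 4K PTE but moves to bit 12 in 2M/1G entries (where bit 7 is the page-size bit), which is why a separate large-page cache mask is needed. A minimal user-space sketch of the two layouts; the macro names below are illustrative stand-ins, not the kernel's:

#include <stdio.h>

/* x86 paging bits: PWT = bit 3, PCD = bit 4; PAT is bit 7 in 4K PTEs
 * but bit 12 in 2M/1G entries, where bit 7 is the page-size flag. */
#define PAGE_PWT        (1ULL << 3)
#define PAGE_PCD        (1ULL << 4)
#define PAGE_PAT_4K     (1ULL << 7)
#define PAGE_PAT_LARGE  (1ULL << 12)

#define CACHE_MASK_4K     (PAGE_PWT | PAGE_PCD | PAGE_PAT_4K)
#define CACHE_MASK_LARGE  (PAGE_PWT | PAGE_PCD | PAGE_PAT_LARGE)

int main(void)
{
        /* Masking a large-page entry with the 4K layout leaves bit 12
         * (the large-page PAT bit) untouched. */
        printf("4K cache mask:    0x%llx\n", (unsigned long long)CACHE_MASK_4K);
        printf("large cache mask: 0x%llx\n", (unsigned long long)CACHE_MASK_LARGE);
        return 0;
}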

arch/x86/include/asm/sync_core.h

Lines changed: 5 additions & 4 deletions
@@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
 	/* With PTI, we unconditionally serialize before running user code. */
 	if (static_cpu_has(X86_FEATURE_PTI))
 		return;
+
 	/*
-	 * Return from interrupt and NMI is done through iret, which is core
-	 * serializing.
+	 * Even if we're in an interrupt, we might reschedule before returning,
+	 * in which case we could switch to a different thread in the same mm
+	 * and return using SYSRET or SYSEXIT. Instead of trying to keep
+	 * track of our need to sync the core, just sync right away.
 	 */
-	if (in_irq() || in_nmi())
-		return;
 	sync_core();
 }
 

arch/x86/kernel/apic/vector.c

Lines changed: 14 additions & 10 deletions
@@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
 	const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
 	int node = irq_data_get_node(irqd);
 
-	if (node == NUMA_NO_NODE)
-		goto all;
-	/* Try the intersection of @affmsk and node mask */
-	cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
-	if (!assign_vector_locked(irqd, vector_searchmask))
-		return 0;
-	/* Try the node mask */
-	if (!assign_vector_locked(irqd, cpumask_of_node(node)))
-		return 0;
-all:
+	if (node != NUMA_NO_NODE) {
+		/* Try the intersection of @affmsk and node mask */
+		cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+		if (!assign_vector_locked(irqd, vector_searchmask))
+			return 0;
+	}
+
 	/* Try the full affinity mask */
 	cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
 	if (!assign_vector_locked(irqd, vector_searchmask))
 		return 0;
+
+	if (node != NUMA_NO_NODE) {
+		/* Try the node mask */
+		if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+			return 0;
+	}
+
 	/* Try the full online mask */
 	return assign_vector_locked(irqd, cpu_online_mask);
 }
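
The reordered fallback now exhausts the caller-supplied affinity mask before widening to the whole node. A rough user-space sketch of that search order, with a stand-in bitmask type and a fake try_alloc() predicate (none of these are kernel APIs):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long mask_t;    /* stand-in for struct cpumask */

/* Stand-in allocator: pretend only CPUs in 'avail' can take a vector. */
static bool try_alloc(mask_t candidates, mask_t avail)
{
        return (candidates & avail) != 0;
}

/* Corrected order: node-local inside the affinity mask, then the full
 * affinity mask, then the node, then anything online. */
static int assign_any(mask_t affinity, mask_t node_mask, mask_t online, mask_t avail)
{
        if (node_mask && try_alloc(affinity & node_mask, avail))
                return 1;
        if (try_alloc(affinity & online, avail))
                return 2;
        if (node_mask && try_alloc(node_mask, avail))
                return 3;
        return try_alloc(online, avail) ? 4 : 0;
}

int main(void)
{
        /* Affinity = CPUs 4-7, node = CPUs 0-3, only CPU 6 has room: the
         * full affinity mask now wins before falling back to the node. */
        printf("stage = %d\n", assign_any(0xf0, 0x0f, 0xff, 1UL << 6));
        return 0;
}

With the old order, an affinity mask covering only remote CPUs could fail with a spurious error before the full provided mask was ever tried.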

arch/x86/kernel/cpu/resctrl/monitor.c

Lines changed: 2 additions & 4 deletions
@@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 		return;
 
 	chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
-	m->chunks += chunks;
 	cur_bw = (chunks * r->mon_scale) >> 20;
 
 	if (m->delta_comp)
@@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
 
 		/*
 		 * Call the MBA software controller only for the
 		 * control groups and when user has enabled
 		 * the software controller explicitly.
 		 */
-		if (!is_mba_sc(NULL))
-			__mon_event_count(rmid, &rr);
-		else
+		if (is_mba_sc(NULL))
 			mbm_bw_count(rmid, &rr);
 	}
 }
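
The bandwidth readout is a delta: the counter movement since the previous poll, scaled by mon_scale and shifted by 20 bits to convert bytes to MB, while the cumulative chunk count is left to the generic event-count path (now called unconditionally above). A simplified user-space sketch of that split, ignoring counter wrap and using made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Illustrative poll: bandwidth comes only from the delta since the last
 * read; the cumulative chunk count is maintained separately (by the
 * generic event-count path in the real code), so adding the delta to it
 * here as well would double-count. */
struct mbm_state {
        uint64_t chunks;        /* cumulative, owned by the event-count path */
        uint64_t prev_bw_msr;   /* last raw counter value seen by the bw path */
};

static uint64_t poll_bw_mb(struct mbm_state *m, uint64_t msr_val, unsigned int mon_scale)
{
        uint64_t delta = msr_val - m->prev_bw_msr;   /* ignoring counter wrap */

        m->prev_bw_msr = msr_val;
        return (delta * mon_scale) >> 20;            /* bytes -> MB */
}

int main(void)
{
        struct mbm_state m = { .chunks = 0, .prev_bw_msr = 1000 };

        printf("MB since last poll: %llu\n",
               (unsigned long long)poll_bw_mb(&m, 1000 + (32ULL << 20), 64));
        return 0;
}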

arch/x86/kernel/kprobes/opt.c

Lines changed: 20 additions & 2 deletions
@@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn)
 	return ret;
 }
 
+static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
+{
+	unsigned char ops;
+
+	for (; addr < eaddr; addr++) {
+		if (get_kernel_nofault(ops, (void *)addr) < 0 ||
+		    ops != INT3_INSN_OPCODE)
+			return false;
+	}
+
+	return true;
+}
+
 /* Decode whole function to ensure any instructions don't jump into target */
 static int can_optimize(unsigned long paddr)
 {
@@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr)
 			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
-		/* Another subsystem puts a breakpoint */
+		/*
+		 * In the case of detecting unknown breakpoint, this could be
+		 * a padding INT3 between functions. Let's check that all the
+		 * rest of the bytes are also INT3.
+		 */
 		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
-			return 0;
+			return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
+
 		/* Recover address */
 		insn.kaddr = (void *)addr;
 		insn.next_byte = (void *)(addr + insn.length);
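
A user-space analogue of the new check: treat a stray breakpoint as compiler padding only if every remaining byte up to the end of the function's range is 0xCC. The buffer here is illustrative; the real code reads kernel text with get_kernel_nofault():

#include <stdbool.h>
#include <stdio.h>

#define INT3_OPCODE 0xcc

/* Return true only if every byte in [addr, eaddr) is an INT3 (0xCC). */
static bool is_padding_int3(const unsigned char *addr, const unsigned char *eaddr)
{
        for (; addr < eaddr; addr++) {
                if (*addr != INT3_OPCODE)
                        return false;
        }
        return true;
}

int main(void)
{
        /* Pretend these are the bytes from an unknown breakpoint up to the
         * end of the function's address range. */
        unsigned char tail[]  = { 0xcc, 0xcc, 0xcc, 0xcc };
        unsigned char mixed[] = { 0xcc, 0x90, 0xcc };

        printf("all padding: %d\n", is_padding_int3(tail, tail + sizeof(tail)));
        printf("all padding: %d\n", is_padding_int3(mixed, mixed + sizeof(mixed)));
        return 0;
}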

arch/x86/mm/mem_encrypt_identity.c

Lines changed: 2 additions & 2 deletions
@@ -45,8 +45,8 @@
 #define PMD_FLAGS_LARGE		(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
 
 #define PMD_FLAGS_DEC		PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
-				 (_PAGE_PAT | _PAGE_PWT))
+#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+				 (_PAGE_PAT_LARGE | _PAGE_PWT))
 
 #define PMD_FLAGS_ENC		(PMD_FLAGS_LARGE | _PAGE_ENC)
 
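
In a 2M PMD the cache mode is chosen by the (PAT, PCD, PWT) triplet with PAT at bit 12, since bit 7 is the page-size bit, so the old definition never actually selected the write-protect entry. A quick sketch of the index arithmetic; the PAT entry names assume the kernel's default PAT MSR layout (WB, WC, UC-, UC, WB, WP, UC-, WT), which is an assumption of this sketch:

#include <stdio.h>

/* PAT selection in a large-page PMD:
 * index = PAT(bit 12) * 4 + PCD(bit 4) * 2 + PWT(bit 3). */
static const char *pat_entry[8] = {
        "WB", "WC", "UC-", "UC", "WB", "WP", "UC-", "WT"
};

static unsigned int pmd_pat_index(unsigned long long pmd)
{
        unsigned int pwt = (pmd >> 3)  & 1;
        unsigned int pcd = (pmd >> 4)  & 1;
        unsigned int pat = (pmd >> 12) & 1;   /* _PAGE_PAT_LARGE */

        return pat * 4 + pcd * 2 + pwt;
}

int main(void)
{
        unsigned long long old_wp = (1ULL << 7)  | (1ULL << 3);  /* bit 7 is PSE here, not PAT */
        unsigned long long new_wp = (1ULL << 12) | (1ULL << 3);  /* _PAGE_PAT_LARGE | _PAGE_PWT */

        printf("old encoding selects %s\n", pat_entry[pmd_pat_index(old_wp)]);
        printf("new encoding selects %s\n", pat_entry[pmd_pat_index(new_wp)]);
        return 0;
}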

arch/x86/mm/tlb.c

Lines changed: 8 additions & 2 deletions
@@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * The membarrier system call requires a full memory barrier and
 	 * core serialization before returning to user-space, after
-	 * storing to rq->curr. Writing to CR3 provides that full
-	 * memory barrier and core serializing instruction.
+	 * storing to rq->curr, when changing mm. This is because
+	 * membarrier() sends IPIs to all CPUs that are in the target mm
+	 * to make them issue memory barriers. However, if another CPU
+	 * switches to/from the target mm concurrently with
+	 * membarrier(), it can cause that CPU not to receive an IPI
+	 * when it really should issue a memory barrier. Writing to CR3
+	 * provides that full memory barrier and core serializing
+	 * instruction.
 	 */
 	if (real_prev == next) {
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=

kernel/sched/membarrier.c

Lines changed: 59 additions & 18 deletions
@@ -38,8 +38,33 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_core(void *info)
+{
+	/*
+	 * The smp_mb() in membarrier after all the IPIs is supposed to
+	 * ensure that memory on remote CPUs that occur before the IPI
+	 * become visible to membarrier()'s caller -- see scenario B in
+	 * the big comment at the top of this file.
+	 *
+	 * A sync_core() would provide this guarantee, but
+	 * sync_core_before_usermode() might end up being deferred until
+	 * after membarrier()'s smp_mb().
+	 */
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+
+	sync_core_before_usermode();
+}
+
 static void ipi_rseq(void *info)
 {
+	/*
+	 * Ensure that all stores done by the calling thread are visible
+	 * to the current task before the current task resumes. We could
+	 * probably optimize this away on most architectures, but by the
+	 * time we've already sent an IPI, the cost of the extra smp_mb()
+	 * is negligible.
+	 */
+	smp_mb();
 	rseq_preempt(current);
 }
 
@@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+		ipi_func = ipi_sync_core;
 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
 		if (!IS_ENABLED(CONFIG_RSEQ))
 			return -EINVAL;
@@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 			return -EPERM;
 	}
 
-	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
 		return 0;
 
 	/*
@@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 
 		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
 			goto out;
-		if (cpu_id == raw_smp_processor_id())
-			goto out;
 		rcu_read_lock();
 		p = rcu_dereference(cpu_rq(cpu_id)->curr);
 		if (!p || p->mm != mm) {
@@ -203,29 +228,45 @@ static int membarrier_private_expedited(int flags, int cpu_id)
 		for_each_online_cpu(cpu) {
 			struct task_struct *p;
 
-			/*
-			 * Skipping the current CPU is OK even through we can be
-			 * migrated at any point. The current CPU, at the point
-			 * where we read raw_smp_processor_id(), is ensured to
-			 * be in program order with respect to the caller
-			 * thread. Therefore, we can skip this CPU from the
-			 * iteration.
-			 */
-			if (cpu == raw_smp_processor_id())
-				continue;
 			p = rcu_dereference(cpu_rq(cpu)->curr);
 			if (p && p->mm == mm)
 				__cpumask_set_cpu(cpu, tmpmask);
 		}
 		rcu_read_unlock();
 	}
 
-	preempt_disable();
-	if (cpu_id >= 0)
+	if (cpu_id >= 0) {
+		/*
+		 * smp_call_function_single() will call ipi_func() if cpu_id
+		 * is the calling CPU.
+		 */
 		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-	else
-		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
-	preempt_enable();
+	} else {
+		/*
+		 * For regular membarrier, we can save a few cycles by
+		 * skipping the current cpu -- we're about to do smp_mb()
+		 * below, and if we migrate to a different cpu, this cpu
+		 * and the new cpu will execute a full barrier in the
		 * scheduler.
+		 *
+		 * For SYNC_CORE, we do need a barrier on the current cpu --
+		 * otherwise, if we are migrated and replaced by a different
+		 * task in the same mm just before, during, or after
+		 * membarrier, we will end up with some thread in the mm
+		 * running without a core sync.
+		 *
+		 * For RSEQ, don't rseq_preempt() the caller. User code
+		 * is not supposed to issue syscalls at all from inside an
+		 * rseq critical section.
+		 */
+		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+			preempt_disable();
+			smp_call_function_many(tmpmask, ipi_func, NULL, true);
+			preempt_enable();
+		} else {
+			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+		}
+	}
 
 out:
 	if (cpu_id < 0)
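
For context, user space must register for the expedited commands before issuing them. A minimal sketch of how a code-patching user might drive the SYNC_CORE path; error handling is trimmed and the raw syscall is used since glibc provides no wrapper:

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
        return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
        /* Register once, e.g. at JIT start-up. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
                perror("register");

        /* ... patch code that other threads may be executing ... */

        /* Make every thread in this mm core-serialize before it next runs
         * user code, so no thread keeps executing stale instructions. */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
                perror("sync_core");

        return 0;
}

With the fix above, the SYNC_CORE command also serializes the calling CPU, so the patcher's own thread cannot keep running stale instructions either.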
