|
25 | 25 | #include "tcmalloc/internal/mincore.h" |
26 | 26 | #include "tcmalloc/internal/percpu.h" |
27 | 27 |
|
| 28 | +#if defined(PERCPU_USE_RSEQ) |
| 29 | +#if !defined(__clang__) |
| 30 | +#define PERCPU_USE_RSEQ_ASM_GOTO 1 |
| 31 | +#elif __clang_major__ >= 9 && !__has_feature(speculative_load_hardening) |
| 32 | +// asm goto requires the use of Clang 9 or newer: |
| 33 | +// https://releases.llvm.org/9.0.0/tools/clang/docs/ReleaseNotes.html#c-language-changes-in-clang |
| 34 | +// |
| 35 | +// SLH (Speculative Load Hardening) builds do not support asm goto. We can |
| 36 | +// detect these compilation modes since |
| 37 | +// https://github.com/llvm/llvm-project/commit/379e68a763097bed55556c6dc7453e4b732e3d68. |
| 38 | +#define PERCPU_USE_RSEQ_ASM_GOTO 1 |
| 39 | +#else |
| 40 | +#define PERCPU_USE_RSEQ_ASM_GOTO 0 |
| 41 | +#endif |
| 42 | +#else |
| 43 | +#define PERCPU_USE_RSEQ_ASM_GOTO 0 |
| 44 | +#endif |
| 45 | + |
28 | 46 | namespace tcmalloc { |
29 | 47 |
|
30 | 48 | struct PerCPUMetadataState { |
@@ -229,10 +247,12 @@ template <size_t Shift, size_t NumClasses> |
229 | 247 | static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push( |
230 | 248 | typename TcmallocSlab<Shift, NumClasses>::Slabs* slabs, size_t cl, |
231 | 249 | void* item, OverflowHandler f) { |
232 | | - // TODO(b/149467541): Move this to asm goto. |
233 | | - uint64_t scratch, current; |
| 250 | +#if PERCPU_USE_RSEQ_ASM_GOTO |
| 251 | + asm goto( |
| 252 | +#else |
234 | 253 | bool overflow; |
235 | 254 | asm volatile( |
| 255 | +#endif |
236 | 256 | // TODO(b/141629158): __rseq_cs only needs to be writeable to allow for |
237 | 257 | // relocations, but could be read-only for non-PIE builds. |
238 | 258 | ".pushsection __rseq_cs, \"aw?\"\n" |
@@ -267,45 +287,69 @@ static inline ABSL_ATTRIBUTE_ALWAYS_INLINE int TcmallocSlab_Push( |
267 | 287 | "jmp 3f\n" |
268 | 288 | ".popsection\n" |
269 | 289 | // Prepare |
| 290 | + // |
| 291 | + // TODO(b/151503411): Pending widespread availability of LLVM's asm |
| 292 | +  // goto with output constraints |
| 293 | + // (https://github.com/llvm/llvm-project/commit/23c2a5ce33f0), we can |
| 294 | + // return the register allocations to the compiler rather than using |
| 295 | + // explicit clobbers. Prior to this, blocks which use asm goto cannot |
| 296 | + // also specify outputs. |
| 297 | + // |
| 298 | + // r10: Scratch |
| 299 | + // r11: Current |
270 | 300 | "3:\n" |
271 | | - "lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %[scratch]\n" |
272 | | - "mov %[scratch], %c[rseq_cs_offset](%[rseq_abi])\n" |
| 301 | + "lea __rseq_cs_TcmallocSlab_Push_%=(%%rip), %%r10\n" |
| 302 | + "mov %%r10, %c[rseq_cs_offset](%[rseq_abi])\n" |
273 | 303 | // Start |
274 | 304 | "4:\n" |
275 | | - // scratch = __rseq_abi.cpu_id; |
276 | | - "mov %c[rseq_cpu_offset](%[rseq_abi]), %k[scratch]\n" |
277 | | - // scratch = slabs + scratch |
278 | | - "shl %[shift], %[scratch]\n" |
279 | | - "add %[slabs], %[scratch]\n" |
| 305 | + // r10 = __rseq_abi.cpu_id; |
| 306 | + "mov %c[rseq_cpu_offset](%[rseq_abi]), %%r10d\n" |
| 307 | + // r10 = slabs + r10 |
| 308 | + "shl %[shift], %%r10\n" |
| 309 | + "add %[slabs], %%r10\n" |
280 | 310 | // r11 = slabs->current; |
281 | | - "movzwq (%[scratch], %[cl], 8), %[current]\n" |
| 311 | + "movzwq (%%r10, %[cl], 8), %%r11\n" |
282 | 312 | // if (ABSL_PREDICT_FALSE(r11 >= slabs->end)) { goto overflow; } |
283 | | - "cmp 6(%[scratch], %[cl], 8), %w[current]\n" |
| 313 | + "cmp 6(%%r10, %[cl], 8), %%r11w\n" |
| 314 | +#if PERCPU_USE_RSEQ_ASM_GOTO |
| 315 | + "jae %l[overflow_label]\n" |
| 316 | +#else |
284 | 317 | "jae 5f\n" |
285 | | - // Important! code below this must not affect any flags (i.e.: ccae) |
286 | | - // If so, the above code needs to explicitly set a ccae return value. |
287 | | - "mov %[item], (%[scratch], %[current], 8)\n" |
288 | | - "lea 1(%[current]), %[current]\n" |
289 | | - "mov %w[current], (%[scratch], %[cl], 8)\n" |
| 318 | + // Important! code below this must not affect any flags (i.e.: ccae) |
| 319 | + // If so, the above code needs to explicitly set a ccae return value. |
| 320 | +#endif |
| 321 | + "mov %[item], (%%r10, %%r11, 8)\n" |
| 322 | + "lea 1(%%r11), %%r11\n" |
| 323 | + "mov %%r11w, (%%r10, %[cl], 8)\n" |
290 | 324 | // Commit |
291 | 325 | "5:\n" |
292 | | - : [current] "=&r"(current), [scratch] "=&r"(scratch), |
293 | | - [overflow] "=@ccae"(overflow) |
| 326 | + : |
| 327 | +#if !PERCPU_USE_RSEQ_ASM_GOTO |
| 328 | + [overflow] "=@ccae"(overflow) |
| 329 | +#endif |
294 | 330 | : [rseq_abi] "r"(&__rseq_abi), |
295 | 331 | [rseq_cs_offset] "n"(offsetof(kernel_rseq, rseq_cs)), |
296 | 332 | // TODO(b/130894622): When using virtual CPU IDs, this will be dynamic. |
297 | 333 | [rseq_cpu_offset] "n"(offsetof(kernel_rseq, cpu_id)), |
298 | 334 | [rseq_sig] "in"(PERCPU_RSEQ_SIGNATURE), [shift] "in"(Shift), |
299 | 335 | [slabs] "r"(slabs), [cl] "r"(cl), [item] "r"(item) |
300 | | - : "cc", "memory"); |
301 | | - // Undo transformation of cpu_id to the value of scratch. |
302 | | - int cpu = reinterpret_cast<typename TcmallocSlab<Shift, NumClasses>::Slabs*>( |
303 | | - scratch) - |
304 | | - slabs; |
| 336 | + : "cc", "memory", "r10", "r11" |
| 337 | +#if PERCPU_USE_RSEQ_ASM_GOTO |
| 338 | + : overflow_label |
| 339 | +#endif |
| 340 | + ); |
| 341 | +#if !PERCPU_USE_RSEQ_ASM_GOTO |
305 | 342 | if (ABSL_PREDICT_FALSE(overflow)) { |
306 | | - return f(cpu, cl, item); |
| 343 | + goto overflow_label; |
307 | 344 | } |
308 | | - return cpu; |
| 345 | +#endif |
| 346 | + return 0; |
| 347 | +overflow_label: |
| 348 | + // As of 3/2020, LLVM's asm goto (even with output constraints) only provides |
| 349 | + // values for the fallthrough path. The values on the taken branches are |
| 350 | + // undefined. |
| 351 | + int cpu = __rseq_abi.cpu_id; |
| 352 | + return f(cpu, cl, item); |
309 | 353 | } |
310 | 354 | #endif // defined(__x86_64__) |
311 | 355 |
|
|
0 commit comments