Skip to content

clang-cl generates suboptimal code for extern __declspec(thread) thread-local variables #103484

@mcfi

Description

@mcfi

Consider this example (https://godbolt.org/z/fEjrzrW8n):

struct A {
    int a;
};

extern __declspec(thread) struct A *a;

struct A*
getA(void)
{
    return a;
}

clang-cl 19.1.0-rc2 generates the following code when targeting x64:

getA:                                   # @getA
.seh_proc getA
# %bb.0:
	sub	rsp, 40
	.seh_stackalloc 40
	.seh_endprologue
	mov	eax, dword ptr [rip + _tls_index]
	mov	rcx, qword ptr gs:[88]
	mov	rax, qword ptr [rcx + 8*rax]
	cmp	byte ptr [rax + __tls_guard@SECREL32], 0
	jne	.LBB0_2
# %bb.1:
	call	__dyn_tls_on_demand_init
.LBB0_2:
	mov	eax, dword ptr [rip + _tls_index]
	mov	rcx, qword ptr gs:[88]
	mov	rax, qword ptr [rcx + 8*rax]
	mov	rax, qword ptr [rax + a@SECREL32]
	add	rsp, 40
	ret

and the following code when targeting arm64:

getA:                                   // @getA
.seh_proc getA
// %bb.0:
	str	x19, [sp, #-16]!                // 8-byte Folded Spill
	.seh_save_reg_x	x19, 16
	str	x30, [sp, #8]                   // 8-byte Folded Spill
	.seh_save_reg	x30, 8
	.seh_endprologue
	adrp	x19, _tls_index
	ldr	x9, [x18, #88]
	ldr	w8, [x19, :lo12:_tls_index]
	ldr	x8, [x9, x8, lsl #3]
	add	x8, x8, :secrel_hi12:__tls_guard
	ldrb	w8, [x8, :secrel_lo12:__tls_guard]
	cbnz	w8, .LBB0_2
// %bb.1:
	bl	__dyn_tls_on_demand_init
.LBB0_2:
	ldr	w8, [x19, :lo12:_tls_index]
	ldr	x9, [x18, #88]
	ldr	x8, [x9, x8, lsl #3]
	add	x8, x8, :secrel_hi12:a
	ldr	x0, [x8, :secrel_lo12:a]
	.seh_startepilogue
	ldr	x30, [sp, #8]                   // 8-byte Folded Reload
	.seh_save_reg	x30, 8
	ldr	x19, [sp], #16                  // 8-byte Folded Reload
	.seh_save_reg_x	x19, 16
	.seh_endepilogue
	ret

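Compared to the MSVC-generated code below, the code clang-cl generates is suboptimal. In both clang-cl listings above, the function first checks __tls_guard, conditionally calls __dyn_tls_on_demand_init, and then recomputes the thread's TLS base address from _tls_index before loading a. The MSVC output contains no call to __dyn_tls_on_demand_init at all. In general, MSVC does not call __dyn_tls_on_demand_init when a thread-local variable does not have a constructor.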

x64:

getA    PROC                                            ; COMDAT
        mov     ecx, DWORD PTR _tls_index
        mov     rax, QWORD PTR gs:88
        mov     edx, OFFSET FLAT:a
        mov     rax, QWORD PTR [rax+rcx*8]
        mov     rax, QWORD PTR [rdx+rax]
        ret     0
getA    ENDP

arm64:

|getA|  PROC
        adrp        x8,_tls_index
        ldr         w9,[x8,_tls_index]
        ldr         x8,[xpr,#0x58]
        ldr         x8,[x8,w9 uxtw #3]
        add         x8,x8,a,lsl #0xC
        ldr         x0,[x8,a]
        ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions