@@ -3449,5 +3449,268 @@ entry:
34493449 ret void
34503450}
34513451
3452+ ; The first (S0) operand of the v_dot4 is derived from the LHS of the mul chain (that is %op80, %op50).
3453+ ; These correspond to the 0th and 4th bytes starting from %inptr1.
3454+ ; Confirm that we are actually accessing these bytes.
3455+ ;
3456+ ; Previously, we used the dword offset from the corresponding byte in the second (S1) operand.
3457+ ; The result was to access the 0th byte instead of the 4th (i.e. a dword offset of 0 instead of 1).
3458+
3459+ define amdgpu_kernel void @ByteOffsetCorrectness (ptr addrspace (1 ) %inptr1 , i8 %l81 , i8 %l51 ) {
3460+ ; GFX7-LABEL: ByteOffsetCorrectness:
3461+ ; GFX7: ; %bb.0: ; %.entry
3462+ ; GFX7-NEXT: s_load_dword s0, s[2:3], 0xb
3463+ ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
3464+ ; GFX7-NEXT: s_mov_b32 s7, 0xf000
3465+ ; GFX7-NEXT: s_mov_b32 s6, -1
3466+ ; GFX7-NEXT: s_mov_b32 s8, 0
3467+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3468+ ; GFX7-NEXT: s_sext_i32_i8 s2, s0
3469+ ; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80008
3470+ ; GFX7-NEXT: s_mov_b32 s9, s8
3471+ ; GFX7-NEXT: s_mov_b32 s10, s6
3472+ ; GFX7-NEXT: s_mov_b32 s11, s7
3473+ ; GFX7-NEXT: s_and_b64 s[0:1], exec, -1
3474+ ; GFX7-NEXT: .LBB17_1: ; %.lr.ph
3475+ ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
3476+ ; GFX7-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:4
3477+ ; GFX7-NEXT: buffer_load_sbyte v1, off, s[4:7], 0
3478+ ; GFX7-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:1
3479+ ; GFX7-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2
3480+ ; GFX7-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:3
3481+ ; GFX7-NEXT: s_waitcnt vmcnt(4)
3482+ ; GFX7-NEXT: v_mul_lo_u32 v0, v0, s3
3483+ ; GFX7-NEXT: s_waitcnt vmcnt(3)
3484+ ; GFX7-NEXT: v_mul_lo_u32 v1, v1, s2
3485+ ; GFX7-NEXT: s_waitcnt vmcnt(1)
3486+ ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
3487+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
3488+ ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
3489+ ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 8
3490+ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
3491+ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
3492+ ; GFX7-NEXT: s_mov_b64 vcc, s[0:1]
3493+ ; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0
3494+ ; GFX7-NEXT: s_cbranch_vccnz .LBB17_1
3495+ ; GFX7-NEXT: ; %bb.2: ; %DummyReturnBlock
3496+ ; GFX7-NEXT: s_endpgm
3497+ ;
3498+ ; GFX8-LABEL: ByteOffsetCorrectness:
3499+ ; GFX8: ; %bb.0: ; %.entry
3500+ ; GFX8-NEXT: s_load_dword s6, s[2:3], 0x2c
3501+ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3502+ ; GFX8-NEXT: v_mov_b32_e32 v10, 0
3503+ ; GFX8-NEXT: v_mov_b32_e32 v11, 0
3504+ ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3505+ ; GFX8-NEXT: s_sext_i32_i8 s2, s6
3506+ ; GFX8-NEXT: s_add_u32 s4, s0, 4
3507+ ; GFX8-NEXT: s_addc_u32 s5, s1, 0
3508+ ; GFX8-NEXT: s_bfe_i32 s3, s6, 0x80008
3509+ ; GFX8-NEXT: s_add_u32 s6, s0, 3
3510+ ; GFX8-NEXT: s_addc_u32 s7, s1, 0
3511+ ; GFX8-NEXT: s_add_u32 s8, s0, 2
3512+ ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3513+ ; GFX8-NEXT: s_addc_u32 s9, s1, 0
3514+ ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3515+ ; GFX8-NEXT: s_add_u32 s0, s0, 1
3516+ ; GFX8-NEXT: s_addc_u32 s1, s1, 0
3517+ ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3518+ ; GFX8-NEXT: v_mov_b32_e32 v4, s8
3519+ ; GFX8-NEXT: v_mov_b32_e32 v7, s1
3520+ ; GFX8-NEXT: v_mov_b32_e32 v9, s7
3521+ ; GFX8-NEXT: v_mov_b32_e32 v3, s5
3522+ ; GFX8-NEXT: v_mov_b32_e32 v5, s9
3523+ ; GFX8-NEXT: v_mov_b32_e32 v6, s0
3524+ ; GFX8-NEXT: v_mov_b32_e32 v8, s6
3525+ ; GFX8-NEXT: s_and_b64 s[0:1], exec, -1
3526+ ; GFX8-NEXT: .LBB17_1: ; %.lr.ph
3527+ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
3528+ ; GFX8-NEXT: flat_load_sbyte v12, v[0:1]
3529+ ; GFX8-NEXT: flat_load_sbyte v13, v[2:3]
3530+ ; GFX8-NEXT: flat_load_ubyte v14, v[4:5]
3531+ ; GFX8-NEXT: flat_load_ubyte v15, v[6:7]
3532+ ; GFX8-NEXT: flat_load_ubyte v16, v[8:9]
3533+ ; GFX8-NEXT: s_waitcnt vmcnt(4)
3534+ ; GFX8-NEXT: v_mul_lo_u32 v12, v12, s2
3535+ ; GFX8-NEXT: s_waitcnt vmcnt(3)
3536+ ; GFX8-NEXT: v_mul_lo_u32 v13, v13, s3
3537+ ; GFX8-NEXT: s_waitcnt vmcnt(1)
3538+ ; GFX8-NEXT: v_or_b32_e32 v14, v14, v15
3539+ ; GFX8-NEXT: s_waitcnt vmcnt(0)
3540+ ; GFX8-NEXT: v_or_b32_e32 v14, v16, v14
3541+ ; GFX8-NEXT: v_or_b32_sdwa v13, v13, sext(v14) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3542+ ; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
3543+ ; GFX8-NEXT: s_mov_b64 vcc, s[0:1]
3544+ ; GFX8-NEXT: flat_store_dword v[10:11], v12
3545+ ; GFX8-NEXT: s_cbranch_vccnz .LBB17_1
3546+ ; GFX8-NEXT: ; %bb.2: ; %DummyReturnBlock
3547+ ; GFX8-NEXT: s_endpgm
3548+ ;
3549+ ; GFX9-NODL-LABEL: ByteOffsetCorrectness:
3550+ ; GFX9-NODL: ; %bb.0: ; %.entry
3551+ ; GFX9-NODL-NEXT: s_load_dword s4, s[2:3], 0x2c
3552+ ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3553+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
3554+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0
3555+ ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
3556+ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
3557+ ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s4
3558+ ; GFX9-NODL-NEXT: s_bfe_i32 s3, s4, 0x80008
3559+ ; GFX9-NODL-NEXT: s_and_b64 vcc, exec, -1
3560+ ; GFX9-NODL-NEXT: .LBB17_1: ; %.lr.ph
3561+ ; GFX9-NODL-NEXT: ; =>This Inner Loop Header: Depth=1
3562+ ; GFX9-NODL-NEXT: global_load_sbyte v3, v2, s[0:1]
3563+ ; GFX9-NODL-NEXT: global_load_sbyte v4, v2, s[0:1] offset:4
3564+ ; GFX9-NODL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:3
3565+ ; GFX9-NODL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3566+ ; GFX9-NODL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3567+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(4)
3568+ ; GFX9-NODL-NEXT: v_mul_lo_u32 v3, v3, s2
3569+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
3570+ ; GFX9-NODL-NEXT: v_mul_lo_u32 v4, v4, s3
3571+ ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
3572+ ; GFX9-NODL-NEXT: v_or_b32_e32 v6, v6, v7
3573+ ; GFX9-NODL-NEXT: v_or_b32_e32 v5, v5, v6
3574+ ; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3575+ ; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v4
3576+ ; GFX9-NODL-NEXT: global_store_dword v[0:1], v3, off
3577+ ; GFX9-NODL-NEXT: s_mov_b64 vcc, vcc
3578+ ; GFX9-NODL-NEXT: s_cbranch_vccnz .LBB17_1
3579+ ; GFX9-NODL-NEXT: ; %bb.2: ; %DummyReturnBlock
3580+ ; GFX9-NODL-NEXT: s_endpgm
3581+ ;
3582+ ; GFX9-DL-LABEL: ByteOffsetCorrectness:
3583+ ; GFX9-DL: ; %bb.0: ; %.entry
3584+ ; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3585+ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3586+ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0xc0c0400
3587+ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0
3588+ ; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0400
3589+ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
3590+ ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s4
3591+ ; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80008
3592+ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
3593+ ; GFX9-DL-NEXT: v_perm_b32 v3, s3, v0, v1
3594+ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
3595+ ; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
3596+ ; GFX9-DL-NEXT: s_and_b64 vcc, exec, -1
3597+ ; GFX9-DL-NEXT: .LBB17_1: ; %.lr.ph
3598+ ; GFX9-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3599+ ; GFX9-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3600+ ; GFX9-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3601+ ; GFX9-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3602+ ; GFX9-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3603+ ; GFX9-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3604+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
3605+ ; GFX9-DL-NEXT: v_or_b32_e32 v6, v6, v7
3606+ ; GFX9-DL-NEXT: v_or_b32_e32 v4, v4, v6
3607+ ; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
3608+ ; GFX9-DL-NEXT: v_perm_b32 v5, v8, v5, s2
3609+ ; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3610+ ; GFX9-DL-NEXT: v_dot4_i32_i8 v4, v5, v3, v4
3611+ ; GFX9-DL-NEXT: global_store_dword v[0:1], v4, off
3612+ ; GFX9-DL-NEXT: s_mov_b64 vcc, vcc
3613+ ; GFX9-DL-NEXT: s_cbranch_vccnz .LBB17_1
3614+ ; GFX9-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3615+ ; GFX9-DL-NEXT: s_endpgm
3616+ ;
3617+ ; GFX10-DL-LABEL: ByteOffsetCorrectness:
3618+ ; GFX10-DL: ; %bb.0: ; %.entry
3619+ ; GFX10-DL-NEXT: s_clause 0x1
3620+ ; GFX10-DL-NEXT: s_load_dword s4, s[2:3], 0x2c
3621+ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
3622+ ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xc0c0400
3623+ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
3624+ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
3625+ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
3626+ ; GFX10-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3627+ ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
3628+ ; GFX10-DL-NEXT: s_sext_i32_i8 s2, s4
3629+ ; GFX10-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3630+ ; GFX10-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3631+ ; GFX10-DL-NEXT: .LBB17_1: ; %.lr.ph
3632+ ; GFX10-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3633+ ; GFX10-DL-NEXT: s_clause 0x4
3634+ ; GFX10-DL-NEXT: global_load_ubyte v4, v2, s[0:1] offset:3
3635+ ; GFX10-DL-NEXT: global_load_ubyte v5, v2, s[0:1] offset:4
3636+ ; GFX10-DL-NEXT: global_load_ubyte v6, v2, s[0:1] offset:2
3637+ ; GFX10-DL-NEXT: global_load_ubyte v7, v2, s[0:1] offset:1
3638+ ; GFX10-DL-NEXT: global_load_ubyte v8, v2, s[0:1]
3639+ ; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
3640+ ; GFX10-DL-NEXT: v_or_b32_e32 v6, v6, v7
3641+ ; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
3642+ ; GFX10-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3643+ ; GFX10-DL-NEXT: v_or_b32_e32 v4, v4, v6
3644+ ; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3645+ ; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v5, v3
3646+ ; GFX10-DL-NEXT: global_store_dword v[0:1], v4, off
3647+ ; GFX10-DL-NEXT: s_cbranch_vccnz .LBB17_1
3648+ ; GFX10-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3649+ ; GFX10-DL-NEXT: s_endpgm
3650+ ;
3651+ ; GFX11-DL-LABEL: ByteOffsetCorrectness:
3652+ ; GFX11-DL: ; %bb.0: ; %.entry
3653+ ; GFX11-DL-NEXT: s_clause 0x1
3654+ ; GFX11-DL-NEXT: s_load_b32 s4, s[2:3], 0x2c
3655+ ; GFX11-DL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
3656+ ; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0xc0c0400 :: v_dual_mov_b32 v2, 0
3657+ ; GFX11-DL-NEXT: v_mov_b32_e32 v0, 0
3658+ ; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
3659+ ; GFX11-DL-NEXT: s_mov_b32 vcc_lo, exec_lo
3660+ ; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
3661+ ; GFX11-DL-NEXT: s_sext_i32_i8 s2, s4
3662+ ; GFX11-DL-NEXT: s_bfe_i32 s3, s4, 0x80008
3663+ ; GFX11-DL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3664+ ; GFX11-DL-NEXT: v_perm_b32 v3, s2, s3, v3
3665+ ; GFX11-DL-NEXT: .p2align 6
3666+ ; GFX11-DL-NEXT: .LBB17_1: ; %.lr.ph
3667+ ; GFX11-DL-NEXT: ; =>This Inner Loop Header: Depth=1
3668+ ; GFX11-DL-NEXT: s_clause 0x4
3669+ ; GFX11-DL-NEXT: global_load_u8 v4, v2, s[0:1] offset:3
3670+ ; GFX11-DL-NEXT: global_load_u8 v5, v2, s[0:1] offset:4
3671+ ; GFX11-DL-NEXT: global_load_u8 v6, v2, s[0:1] offset:2
3672+ ; GFX11-DL-NEXT: global_load_u8 v7, v2, s[0:1] offset:1
3673+ ; GFX11-DL-NEXT: global_load_u8 v8, v2, s[0:1]
3674+ ; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
3675+ ; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v7
3676+ ; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
3677+ ; GFX11-DL-NEXT: v_perm_b32 v5, v8, v5, 0xc0c0400
3678+ ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3679+ ; GFX11-DL-NEXT: v_or_b32_e32 v4, v4, v6
3680+ ; GFX11-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
3681+ ; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
3682+ ; GFX11-DL-NEXT: v_dot4_i32_iu8 v4, v5, v3, v4 neg_lo:[1,1,0]
3683+ ; GFX11-DL-NEXT: global_store_b32 v[0:1], v4, off
3684+ ; GFX11-DL-NEXT: s_cbranch_vccnz .LBB17_1
3685+ ; GFX11-DL-NEXT: ; %bb.2: ; %DummyReturnBlock
3686+ ; GFX11-DL-NEXT: s_endpgm
3687+ .entry:
3688+ br label %.lr.ph
3689+
3690+ .lr.ph: ; preds = %.lr.ph, %.entry
3691+ %l80 = load i8 , ptr addrspace (1 ) %inptr1 , align 1 ; byte 0 of %inptr1
3692+ %op80 = sext i8 %l80 to i32 ; LHS of the first product
3693+ %op81 = sext i8 %l81 to i32 ; RHS of the first product comes from kernel argument %l81
3694+ %mul8 = mul i32 %op80 , %op81 ; first product: byte 0 * %l81
3695+ %gep50 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 4 ; address of byte 4, i.e. the first byte past the dword at %inptr1
3696+ %l50 = load i8 , ptr addrspace (1 ) %gep50 , align 1 ; byte 4 of %inptr1
3697+ %op50 = sext i8 %l50 to i32 ; LHS of the second product
3698+ %op51 = sext i8 %l51 to i32 ; RHS of the second product comes from kernel argument %l51
3699+ %mul5 = mul i32 %op50 , %op51 ; second product: byte 4 * %l51
3700+ %gep40 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 3
3701+ %l40 = load i8 , ptr addrspace (1 ) %gep40 , align 1 ; byte 3 of %inptr1
3702+ %gep30 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 2
3703+ %l30 = load i8 , ptr addrspace (1 ) %gep30 , align 1 ; byte 2 of %inptr1
3704+ %gep20 = getelementptr i8 , ptr addrspace (1 ) %inptr1 , i64 1
3705+ %l20 = load i8 , ptr addrspace (1 ) %gep20 , align 1 ; byte 1 of %inptr1
3706+ %ivadd31 = or i8 %l30 , %l20 ; bytes 2 and 1 OR'd together at i8 width
3707+ %ivadd42 = or i8 %l40 , %ivadd31 ; byte 3 OR'd in as well
3708+ %ivadd4 = sext i8 %ivadd42 to i32 ; sign-extend the combined byte to i32
3709+ %ivadd5 = or i32 %mul5 , %ivadd4 ; note: this is an 'or', not an 'add', despite the value name
3710+ %ivadd8 = add i32 %mul8 , %ivadd5 ; final value: first product added to the OR'd term
3711+ store i32 %ivadd8 , ptr addrspace (1 ) null , align 4 ; result is written to the null address in addrspace(1)
3712+ br label %.lr.ph ; unconditional back-edge: this block has no exit in the IR
3713+ }
3714+
34523715
34533716declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments