6 changes: 3 additions & 3 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1092,7 +1092,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// is transformed to:
//
// addr2 = gep float, float* p, i64 a ; inbounds removed
- //      addr = gep inbounds float, float* addr2, i64 5
+ //      addr = gep float, float* addr2, i64 5 ; inbounds removed
//
// If a is -4, although the old index b is in bounds, the new index a is
// off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
@@ -1103,7 +1103,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// TODO(jingyue): do some range analysis to keep as many inbounds as
// possible. GEPs with inbounds are more friendly to alias analysis.
// TODO(gep_nowrap): Preserve nuw at least.
- bool GEPWasInBounds = GEP->isInBounds();
+ GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none();
GEP->setNoWrapFlags(GEPNoWrapFlags::none());

// Lowers a GEP to either GEPs with a single index or arithmetic operations.
@@ -1153,7 +1153,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
IRBuilder<> Builder(GEP);
NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
- GEP->getName(), GEPWasInBounds));
+ GEP->getName(), NewGEPFlags));
NewGEP->copyMetadata(*GEP);

GEP->replaceAllUsesWith(NewGEP);
364 changes: 264 additions & 100 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -238,8 +238,8 @@ main_body:
%25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
%26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
- %28 = or disjoint i32 %27, 3
- %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
+ %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27, !amdgpu.uniform !0
+ %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3, !amdgpu.uniform !0
%30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
%32 = extractelement <4 x float> %31, i32 0
@@ -270,8 +270,8 @@ main_body:
%25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
%26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
%27 = shl i32 %23, 2
- %28 = or disjoint i32 %27, 3
- %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
+ %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27
+ %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3
%30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
%31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
%32 = extractelement <4 x float> %31, i32 0
339 changes: 231 additions & 108 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Large diffs are not rendered by default.

159 changes: 159 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -0,0 +1,159 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py

; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s

; This test checks memory addresses with constant offset components that should
; not be folded into memory accesses with immediate offsets.
; SeparateConstOffsetFromGEP transforms the GEPs in a way that can lead to
; out-of-bounds or negative intermediate results in the address computation,
; which are problematic for flat and scratch instructions:
; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3)
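;
; As an illustration (hypothetical IR, not taken from this test), the pass
; rewrites
;   %idx = add nsw i32 %i, 3
;   %a = getelementptr inbounds i32, ptr %p, i32 %idx
; into roughly
;   %b = getelementptr i32, ptr %p, i32 %i ; inbounds dropped
;   %a = getelementptr i8, ptr %b, i64 12  ; split-off constant, in bytes
; so %b may point out of bounds even when the final address %a is fine.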


; FIXME: The offset here should not be folded: if %p points to the beginning of
; scratch or LDS and %i is -1, a folded offset crashes the program.
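; For example, if %i is -1, the IR address %p + 4 * (-1 + 3) = %p + 8 is in
; bounds, but the folded code materializes %p - 4 as the base register first
; and only then applies the immediate offset 12.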
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; GFX90A-LABEL: flat_offset_maybe_oob:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_offset_maybe_oob:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: flat_offset_maybe_oob:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_offset_maybe_oob:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: flat_offset_maybe_oob:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
%l = load i32, ptr %arrayidx
ret i32 %l
}

; For MUBUF and for GFX12, folding the offset is okay.
define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
; GFX90A-MUBUF-LABEL: private_offset_maybe_oob:
; GFX90A-MUBUF: ; %bb.0:
; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX90A-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX90A-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX90A-FLATSCR: ; %bb.0:
; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX90A-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX90A-FLATSCR-NEXT: scratch_load_dword v0, v0, off
; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-MUBUF-LABEL: private_offset_maybe_oob:
; GFX10-MUBUF: ; %bb.0:
; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX10-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX10-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX10-FLATSCR: ; %bb.0:
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v0, off
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: private_offset_maybe_oob:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX942-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX942-NEXT: scratch_load_dword v0, v0, off
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_offset_maybe_oob:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v0, v1, 12
; GFX11-NEXT: scratch_load_b32 v0, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: private_offset_maybe_oob:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshl_add_u32 v0, v1, 2, v0
; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:12
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx
%l = load i32, ptr addrspace(5) %arrayidx
ret i32 %l
}
23 changes: 10 additions & 13 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -225,22 +225,19 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
; GCN-SCRATCH-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
- %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
- %tmp3 = load <4 x i32>, ptr addrspace(5) %tmp2, align 16
- %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
- %tmp5 = add nuw nsw i32 %tmp, 1
- %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp5
+ %base = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp
+ %tmp3 = load <4 x i32>, ptr addrspace(5) %base, align 16
+ %base1 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp
+ %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 1
%tmp7 = load <4 x i32>, ptr addrspace(5) %tmp6, align 16
- %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp5
- %tmp9 = add nuw nsw i32 %tmp, 2
- %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp9
+ %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 1
+ %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 2
%tmp11 = load <4 x i32>, ptr addrspace(5) %tmp10, align 16
- %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp9
- %tmp13 = add nuw nsw i32 %tmp, 3
- %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp13
+ %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 2
+ %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 3
%tmp15 = load <4 x i32>, ptr addrspace(5) %tmp14, align 16
- %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp13
- store <4 x i32> %tmp3, ptr addrspace(5) %tmp4, align 16
+ %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 3
+ store <4 x i32> %tmp3, ptr addrspace(5) %base1, align 16
store <4 x i32> %tmp7, ptr addrspace(5) %tmp8, align 16
store <4 x i32> %tmp11, ptr addrspace(5) %tmp12, align 16
store <4 x i32> %tmp15, ptr addrspace(5) %tmp16, align 16
@@ -0,0 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=separate-const-offset-from-gep -S | FileCheck %s

; The inbounds flags cannot be preserved here: If the pointers point to the
; beginning of an object and %i is 1, the intermediate GEPs are out of bounds.
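; For example, if %i is 1, then %idx is 0 and the original GEP yields %p
; itself, while the split form first computes %p - 4 (out of bounds) and
; only then adds the constant 4 back.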
define ptr @maybe_oob(ptr %p, i64 %i) {
; CHECK-LABEL: @maybe_oob(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDX1:%.*]] = sub i64 0, [[I:%.*]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX1]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4
; CHECK-NEXT: ret ptr [[ARRAYIDX2]]
;
entry:
%idx = sub nsw i64 1, %i
%arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx
ret ptr %arrayidx
}
@@ -11,9 +11,9 @@ define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocaptu
; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64
; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64
; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]]
- ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 4
- ; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 128
- ; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 132
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 4
+ ; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 128
+ ; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 132
; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4
; IR-NEXT: ret void
;
@@ -51,7 +51,7 @@ define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, pt
; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]]
; IR-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 255
; IR-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
- ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 1020
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 1020
; IR-NEXT: [[TMP12:%.*]] = add i32 [[X]], 256
; IR-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]]
@@ -91,13 +91,13 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y
; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]]
; IR-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4
; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00
- ; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 1020
+ ; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1020
; IR-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4
; IR-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]]
- ; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 64512
+ ; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 64512
; IR-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4
; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]]
- ; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 65532
+ ; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
; IR-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4
; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]]
; IR-NEXT: store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4