Skip to content

Commit ae212c4

Browse files
mmereckiigcbot
authored and committed
Improve alignment calculation in constant coalescing
Use `computeKnownBits` to check the minimum alignment of a buffer offset.
1 parent af4fb17 commit ae212c4

File tree

3 files changed

+64
-3
lines changed

3 files changed

+64
-3
lines changed

IGC/Compiler/CISACodeGen/ConstantCoalescing.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ SPDX-License-Identifier: MIT
2020
#include "common/LLVMWarningsPush.hpp"
2121
#include "llvmWrapper/IR/DerivedTypes.h"
2222
#include "llvmWrapper/Support/Alignment.h"
23+
#include "llvm/Analysis/ValueTracking.h"
24+
#include "llvm/Support/KnownBits.h"
2325
#include "common/LLVMWarningsPop.hpp"
2426
#include "Probe/Assertion.h"
2527

@@ -632,6 +634,12 @@ bool ConstantCoalescing::CompareBufferBase(
632634
return false;
633635
}
634636

637+
// Returns true when known-bits analysis can prove that `val` is a multiple of
// 4 (DWORD aligned), i.e. its two least-significant bits are known to be zero.
// A false result only means alignment could not be proven, not that the value
// is misaligned.
bool ConstantCoalescing::IsDwordAligned(Value* val) const
638+
{
639+
KnownBits knownBits = computeKnownBits(val, *dataLayout);
640+
return knownBits.countMinTrailingZeros() >= 2;
641+
}
642+
635643
void ConstantCoalescing::MergeScatterLoad(Instruction* load,
636644
Value* bufIdxV, uint addrSpace,
637645
Value* eltIdxV, uint offsetInBytes,
@@ -648,7 +656,8 @@ void ConstantCoalescing::MergeScatterLoad(Instruction* load,
648656
// Current assumption is that a chunk start needs to be DWORD aligned. In
649657
// the future we can consider adding support for merging 4 bytes or
650658
// 2 i16s/halfs into a single non-aligned DWORD.
651-
const bool isDwordAligned = ((offsetInBytes % 4) == 0 && (eltIdxV == nullptr || alignment >= 4));
659+
const bool isDwordAligned = ((offsetInBytes % 4) == 0 &&
660+
(alignment >= 4 || eltIdxV == nullptr || IsDwordAligned(eltIdxV)));
652661

653662
BufChunk* cov_chunk = nullptr;
654663
for (std::vector<BufChunk*>::reverse_iterator rit = chunk_vec.rbegin(),
@@ -1054,8 +1063,8 @@ void ConstantCoalescing::MergeUniformLoad(Instruction* load,
10541063
// Current assumption is that a chunk start needs to be DWORD aligned. In
10551064
// the future we can consider adding support for merging 4 bytes or
10561065
// 2 i16s/halfs into a single non-aligned DWORD.
1057-
const bool isDwordAligned =
1058-
((offsetInBytes % 4) == 0 && (eltIdxV == nullptr || alignment >= 4));
1066+
const bool isDwordAligned = ((offsetInBytes % 4) == 0 &&
1067+
(alignment >= 4 || eltIdxV == nullptr || IsDwordAligned(eltIdxV)));
10591068

10601069
auto shouldMerge = [&](const BufChunk* cur_chunk)
10611070
{

IGC/Compiler/CISACodeGen/ConstantCoalescing.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ namespace IGC
303303
bool IsSamplerAlignedAddress(Value* addr) const;
304304
Value* GetSamplerAlignedAddress(Value* inst);
305305

306+
bool IsDwordAligned(Value* val) const;
307+
306308
alignment_t GetAlignment(Instruction* load) const;
307309
void SetAlignment(Instruction* load, uint alignment);
308310

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2022-2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: igc_opt --opaque-pointers %s -S -o - -igc-constant-coalescing -instcombine -dce | FileCheck %s
10+
11+
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f80:128:128-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-a:64:64-f80:128:128-n8:16:32:64"
12+
13+
define <4 x half> @f0(i32 %src) {
14+
entry:
15+
%bso = inttoptr i32 %src to ptr addrspace(2490373)
16+
%ox = shl i32 %src, 2
17+
%x = call half @llvm.genx.GenISA.ldraw.indexed.f16.p2490373i8(ptr addrspace(2490373) %bso, i32 %ox, i32 2, i1 false)
18+
%oy = add i32 %ox, 2
19+
%y = call half @llvm.genx.GenISA.ldraw.indexed.f16.p2490373i8(ptr addrspace(2490373) %bso, i32 %oy, i32 2, i1 false)
20+
%oz = add i32 %ox, 4
21+
%z = call half @llvm.genx.GenISA.ldraw.indexed.f16.p2490373i8(ptr addrspace(2490373) %bso, i32 %oz, i32 2, i1 false)
22+
%ow = add i32 %ox, 6
23+
%w = call half @llvm.genx.GenISA.ldraw.indexed.f16.p2490373i8(ptr addrspace(2490373) %bso, i32 %ow, i32 2, i1 false)
24+
%ret.x = insertelement <4 x half> undef, half %x, i32 0
25+
%ret.xy = insertelement <4 x half> %ret.x, half %y, i32 1
26+
%ret.xyz = insertelement <4 x half> %ret.xy, half %z, i32 2
27+
%ret.xyzw = insertelement <4 x half> %ret.xyz, half %w, i32 3
28+
ret <4 x half> %ret.xyzw
29+
}
30+
; CHECK-LABEL: define <4 x half> @f0
31+
; CHECK: [[BSO:%.*]] = inttoptr i32 %src to ptr addrspace(2490373)
32+
; CHECK: [[OFF:%.*]] = shl i32 %src, 2
33+
; CHECK: [[RET:%.*]] = call <4 x half> @llvm.genx.GenISA.ldrawvector.indexed.v4f16.p2490373(ptr addrspace(2490373) [[BSO]], i32 [[OFF]], i32 4, i1 false)
34+
; CHECK: ret <4 x half> [[RET]]
35+
36+
; Function Attrs: argmemonly nounwind readonly
37+
declare float @llvm.genx.GenISA.ldraw.indexed.f32.p2490373i8(ptr addrspace(2490373), i32, i32, i1) argmemonly nounwind readonly
38+
declare half @llvm.genx.GenISA.ldraw.indexed.f16.p2490373i8(ptr addrspace(2490373), i32, i32, i1) argmemonly nounwind readonly
39+
40+
; Function Attrs: argmemonly nounwind writeonly
41+
declare void @llvm.genx.GenISA.storeraw.indexed.p2490368i8.f32(ptr addrspace(2490373), i32, float, i32, i1) argmemonly nounwind writeonly
42+
43+
44+
!igc.functions = !{!0}
45+
46+
!0 = !{<4 x half> (i32)* @f0, !1}
47+
48+
!1 = !{!2}
49+
!2 = !{!"function_type", i32 0}
50+

0 commit comments

Comments
 (0)