Skip to content

Commit 840a25e

Browse files
ichenkaiigcbot
authored andcommitted
Replace EATOMIC_IADD with EATOMIC_INC or EATOMIC_DEC when the
immediate is 1 or -1. When a shader performs typed atomics, untyped atomics through UGM, or untyped atomics on SLM, and the operation is an atomic add whose immediate operand is 1 (increment) or -1 (decrement), we can replace EATOMIC_IADD with EATOMIC_INC(2) or EATOMIC_DEC(3) respectively.
1 parent 148f0bd commit 840a25e

File tree

3 files changed

+190
-1
lines changed

3 files changed

+190
-1
lines changed

IGC/Compiler/CustomSafeOptPass.cpp

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2017-2024 Intel Corporation
3+
Copyright (C) 2017-2025 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -472,6 +472,66 @@ static bool isTruncInvariant(unsigned Opcode) {
472472
}
473473
}
474474

475+
// clang-format off
476+
// In AtomicTyped, convert EATOMIC_IADD(0) to EATOMIC_INC(2) when the added value is the constant 1, or to EATOMIC_DEC(3) when it is the constant -1
477+
// From:
478+
// %7 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 1, i32 0)
479+
// %8 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 -1, i32 0)
480+
// To:
481+
// %7 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 1, i32 2)
482+
// %8 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 -1, i32 3)
483+
// clang-format on
484+
// Rewrites the atomic opcode of a typed integer atomic call.
//
// Operand 4 is the value being added and operand 5 is the atomic opcode.
// When both are integer constants, the opcode is EATOMIC_IADD and the added
// value is exactly 1 or -1, the opcode operand is replaced in place with
// EATOMIC_INC or EATOMIC_DEC respectively. Operand 4 is left untouched.
// Any other call is left unchanged.
//
// I: the call instruction to inspect; ignored if it is not a GenIntrinsicInst.
void CustomSafeOptPass::visitIntAtomicTyped(CallInst *I) {
  GenIntrinsicInst *instr = dyn_cast<GenIntrinsicInst>(I);
  // dyn_cast yields nullptr on a type mismatch; bail out instead of
  // dereferencing a null pointer below.
  if (!instr)
    return;

  // Both the added value and the opcode must be compile-time constants.
  auto *addend = llvm::dyn_cast<llvm::ConstantInt>(instr->getOperand(4));
  if (!addend)
    return;
  auto *atomicOp = llvm::dyn_cast<llvm::ConstantInt>(instr->getOperand(5));
  if (!atomicOp)
    return;

  // Only integer-add atomics are candidates for the rewrite.
  if (atomicOp->getZExtValue() != AtomicOp::EATOMIC_IADD)
    return;

  llvm::Type *opTy = instr->getOperand(5)->getType();
  const auto delta = addend->getSExtValue();
  if (delta == 1) {
    instr->setOperand(5, llvm::ConstantInt::get(opTy, AtomicOp::EATOMIC_INC));
  } else if (delta == -1) {
    instr->setOperand(5, llvm::ConstantInt::get(opTy, AtomicOp::EATOMIC_DEC));
  }
}
501+
502+
// clang-format off
503+
// In AtomicRaw or AtomicRawA64, convert EATOMIC_IADD(0) to EATOMIC_INC(2) when the added value is the constant 1, or to EATOMIC_DEC(3) when it is the constant -1
504+
// From:
505+
// %10 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 1, i32 0)
506+
// %11 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 -1, i32 0)
507+
// or
508+
// %13 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 1, i32 0)
509+
// %14 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 -1, i32 0)
510+
// To:
511+
// %10 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 1, i32 2)
512+
// %11 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 -1, i32 3)
513+
// or
514+
// %13 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 1, i32 2)
515+
// %14 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 -1, i32 3)
516+
// clang-format on
517+
// Rewrites the atomic opcode of a raw (or raw A64) integer atomic call.
//
// Operand 2 is the value being added and operand 3 is the atomic opcode.
// When both are integer constants, the opcode is EATOMIC_IADD and the added
// value is exactly 1 or -1, the opcode operand is replaced in place with
// EATOMIC_INC or EATOMIC_DEC respectively. Operand 2 is left untouched.
// Any other call is left unchanged.
//
// I: the call instruction to inspect; ignored if it is not a GenIntrinsicInst.
void CustomSafeOptPass::visitIntAtomicRawOrRawA64(CallInst *I) {
  GenIntrinsicInst *instr = dyn_cast<GenIntrinsicInst>(I);
  // dyn_cast yields nullptr on a type mismatch; bail out instead of
  // dereferencing a null pointer below.
  if (!instr)
    return;

  // Both the added value and the opcode must be compile-time constants.
  auto *addend = llvm::dyn_cast<llvm::ConstantInt>(instr->getOperand(2));
  if (!addend)
    return;
  auto *atomicOp = llvm::dyn_cast<llvm::ConstantInt>(instr->getOperand(3));
  if (!atomicOp)
    return;

  // Only integer-add atomics are candidates for the rewrite.
  if (atomicOp->getZExtValue() != AtomicOp::EATOMIC_IADD)
    return;

  llvm::Type *opTy = instr->getOperand(3)->getType();
  const auto delta = addend->getSExtValue();
  if (delta == 1) {
    instr->setOperand(3, llvm::ConstantInt::get(opTy, AtomicOp::EATOMIC_INC));
  } else if (delta == -1) {
    instr->setOperand(3, llvm::ConstantInt::get(opTy, AtomicOp::EATOMIC_DEC));
  }
}
534+
475535
// Searches for following pattern:
476536
// %mul = mul i64 %conv, %conv2
477537
// %conv3 = and i64 %mul, 0xFFFFFFFF
@@ -838,6 +898,17 @@ void CustomSafeOptPass::visitCallInst(CallInst &C) {
838898
break;
839899
}
840900

901+
case GenISAIntrinsic::GenISA_intatomictyped: {
902+
visitIntAtomicTyped(inst);
903+
break;
904+
}
905+
906+
case GenISAIntrinsic::GenISA_intatomicraw:
907+
case GenISAIntrinsic::GenISA_intatomicrawA64: {
908+
visitIntAtomicRawOrRawA64(inst);
909+
break;
910+
}
911+
841912
default:
842913
break;
843914
}

IGC/Compiler/CustomSafeOptPass.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ class CustomSafeOptPass : public llvm::FunctionPass, public llvm::InstVisitor<Cu
7272
void visitSelectInst(llvm::SelectInst &S);
7373
void mergeDotAddToDp4a(llvm::CallInst *I);
7474
void visitTruncInst(llvm::TruncInst &I);
75+
void visitIntAtomicTyped(llvm::CallInst *I);
76+
void visitIntAtomicRawOrRawA64(llvm::CallInst *I);
7577

7678
//
7779
// IEEE Floating point arithmetic is not associative. Any pattern
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: llvm-14-plus
10+
; RUN: igc_opt --opaque-pointers --platformdg2 -igc-custom-safe-opt -S < %s --dce | FileCheck %s
11+
12+
target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n8:16:32-S32"
13+
target triple = "dxil-ms-dx"
14+
15+
%__Buffer_Typed_DIM_Resource = type opaque
16+
%"class.RWBuffer<unsigned int>" = type { i32 }
17+
%"class.RWStructuredBuffer<xs>" = type { %struct.xs }
18+
%struct.xs = type { i32 }
19+
20+
@"\01?outputg@@3PAIA" = external addrspace(3) global [32 x i32], align 4
21+
@ThreadGroupSize_X = constant i32 32
22+
@ThreadGroupSize_Y = constant i32 1
23+
@ThreadGroupSize_Z = constant i32 1
24+
25+
; -------------------------------------------------------------
26+
; CustomSafeOptPass: EATOMIC_IADD to EATOMIC_INC or EATOMIC_DEC
27+
; -------------------------------------------------------------
28+
define void @main(<8 x i32> %r0) {
29+
; CHECK-LABEL: @main(
30+
; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 14)
31+
; CHECK-NEXT: [[GROUPX:%.*]] = bitcast float [[TMP1]] to i32
32+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.genx.GenISA.RuntimeValue.i32(i32 0)
33+
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 64
34+
; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
35+
; CHECK-NEXT: [[U0:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(2490369)
36+
; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP2]] to i64
37+
; CHECK-NEXT: [[U01:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(2490368)
38+
; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[GROUPX]], 5
39+
; CHECK-NEXT: [[LOCALIDX:%.*]] = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
40+
; CHECK-NEXT: [[THREADIDX:%.*]] = add i32 [[TMP6]], [[LOCALIDX]]
41+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(ptr addrspace(2490368) [[U01]], i32 [[THREADIDX]], i32 undef, i32 undef, i32 1, i32 2)
42+
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(ptr addrspace(2490368) [[U01]], i32 [[THREADIDX]], i32 undef, i32 undef, i32 -1, i32 3)
43+
; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[THREADIDX]], 2
44+
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(ptr addrspace(2490369) [[U0]], i32 [[TMP9]], i32 1, i32 2)
45+
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(ptr addrspace(2490369) [[U0]], i32 [[TMP9]], i32 -1, i32 3)
46+
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr [32 x i32], ptr addrspace(3) null, i32 0, i32 [[THREADIDX]]
47+
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(ptr addrspace(3) [[TMP12]], ptr addrspace(3) [[TMP12]], i32 1, i32 2)
48+
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(ptr addrspace(3) [[TMP12]], ptr addrspace(3) [[TMP12]], i32 -1, i32 3)
49+
; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(3) [[TMP12]], align 4, !tbaa !18
50+
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <1 x i32> undef, i32 [[TMP15]], i64 0
51+
; CHECK-NEXT: call void @llvm.genx.GenISA.storerawvector.indexed.p2490369v4f32.v1i32(ptr addrspace(2490369) [[U0]], i32 0, <1 x i32> [[TMP16]], i32 4, i1 false)
52+
; CHECK-NEXT: ret void
53+
;
54+
%1 = call fast float @llvm.genx.GenISA.DCL.SystemValue.f32(i32 14)
55+
%GroupID_X = bitcast float %1 to i32
56+
%2 = call i32 @llvm.genx.GenISA.RuntimeValue.i32(i32 0)
57+
%3 = add i32 %2, 64
58+
%4 = zext i32 %3 to i64
59+
%u0 = inttoptr i64 %4 to <4 x float> addrspace(2490369)*
60+
%5 = zext i32 %2 to i64
61+
%u01 = inttoptr i64 %5 to %__Buffer_Typed_DIM_Resource addrspace(2490368)*
62+
%6 = shl i32 %GroupID_X, 5
63+
%LocalID_X = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
64+
%ThreadID_X = add i32 %6, %LocalID_X
65+
%7 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 1, i32 0)
66+
%8 = call i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)* %u01, i32 %ThreadID_X, i32 undef, i32 undef, i32 -1, i32 0)
67+
%9 = shl i32 %ThreadID_X, 2
68+
%10 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 1, i32 0)
69+
%11 = call i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)* %u0, i32 %9, i32 -1, i32 0)
70+
%12 = getelementptr [32 x i32], [32 x i32] addrspace(3)* null, i32 0, i32 %ThreadID_X
71+
%13 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 1, i32 0)
72+
%14 = call i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)* %12, i32 addrspace(3)* %12, i32 -1, i32 0)
73+
%15 = load i32, i32 addrspace(3)* %12, align 4, !tbaa !18
74+
%16 = insertelement <1 x i32> undef, i32 %15, i64 0
75+
call void @llvm.genx.GenISA.storerawvector.indexed.p2490369v4f32.v1i32(<4 x float> addrspace(2490369)* %u0, i32 0, <1 x i32> %16, i32 4, i1 false)
76+
ret void
77+
}
78+
79+
declare i32 @llvm.genx.GenISA.RuntimeValue.i32(i32)
80+
declare i32 @llvm.genx.GenISA.intatomicrawA64.i32.p3i32.p3i32(i32 addrspace(3)*, i32 addrspace(3)*, i32, i32)
81+
declare float @llvm.genx.GenISA.DCL.SystemValue.f32(i32)
82+
declare i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32)
83+
declare i32 @llvm.genx.GenISA.intatomictyped.i32.p2490368__Buffer_Typed_DIM_Resource(%__Buffer_Typed_DIM_Resource addrspace(2490368)*, i32, i32, i32, i32, i32)
84+
declare i32 @llvm.genx.GenISA.intatomicraw.i32.p2490369v4f32(<4 x float> addrspace(2490369)*, i32, i32, i32)
85+
declare void @llvm.genx.GenISA.storerawvector.indexed.p2490369v4f32.v1i32(<4 x float> addrspace(2490369)*, i32, <1 x i32>, i32, i1)
86+
87+
!llvm.ident = !{!0}
88+
!dx.version = !{!1}
89+
!dx.valver = !{!2}
90+
!dx.shaderModel = !{!3}
91+
!dx.resources = !{!4}
92+
!dx.entryPoints = !{!10}
93+
!igc.functions = !{!13}
94+
95+
!0 = !{!"dxcoob 1.8.2502.11 (239921522)"}
96+
!1 = !{i32 1, i32 6}
97+
!2 = !{i32 1, i32 8}
98+
!3 = !{!"cs", i32 6, i32 6}
99+
!4 = !{null, !5, null, null}
100+
!5 = !{!6, !8}
101+
!6 = !{i32 0, %"class.RWBuffer<unsigned int>"* undef, !"", i32 0, i32 0, i32 1, i32 10, i1 false, i1 false, i1 false, !7}
102+
!7 = !{i32 0, i32 5}
103+
!8 = !{i32 1, %"class.RWStructuredBuffer<xs>"* undef, !"", i32 0, i32 1, i32 1, i32 12, i1 false, i1 false, i1 false, !9}
104+
!9 = !{i32 1, i32 4}
105+
!10 = distinct !{null, !"main", null, !4, !11}
106+
!11 = !{i32 0, i64 8388624, i32 4, !12}
107+
!12 = !{i32 32, i32 1, i32 1}
108+
!13 = !{void (<8 x i32>)* @main, !14}
109+
!14 = !{!15, !16}
110+
!15 = !{!"function_type", i32 0}
111+
!16 = !{!"implicit_arg_desc", !17}
112+
!17 = !{i32 0}
113+
!18 = !{!19, !19, i64 0}
114+
!19 = !{!"int", !20, i64 0}
115+
!20 = !{!"omnipotent char", !21, i64 0}
116+
!21 = !{!"Simple C/C++ TBAA"}

0 commit comments

Comments
 (0)