Skip to content

Commit 4604762

Browse files
authored
[AMDGPU] Add builtins for wave reduction intrinsics (#161816)
1 parent 30b1d14 commit 4604762

File tree

3 files changed

+100
-0
lines changed

3 files changed

+100
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
402402
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
403403
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
404404
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
405+
// f32 wave reduction builtins (fadd, fsub, fmin, fmax). They carry the same
// "nc" attribute string as the integer wave_reduce builtins listed above.
// NOTE(review): "ffZi" presumably encodes float(float, int32 strategy) --
// confirm against the builtin type-string documentation.
BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc")
406+
BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc")
407+
BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc")
408+
BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc")
405409

406410
//===----------------------------------------------------------------------===//
407411
// R600-NI only builtins.

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,18 +390,26 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
390390
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
391391
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
392392
return Intrinsic::amdgcn_wave_reduce_add;
393+
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
394+
return Intrinsic::amdgcn_wave_reduce_fadd;
393395
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
394396
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
395397
return Intrinsic::amdgcn_wave_reduce_sub;
398+
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
399+
return Intrinsic::amdgcn_wave_reduce_fsub;
396400
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
397401
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
398402
return Intrinsic::amdgcn_wave_reduce_min;
403+
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
404+
return Intrinsic::amdgcn_wave_reduce_fmin;
399405
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
400406
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
401407
return Intrinsic::amdgcn_wave_reduce_umin;
402408
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
403409
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
404410
return Intrinsic::amdgcn_wave_reduce_max;
411+
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
412+
return Intrinsic::amdgcn_wave_reduce_fmax;
405413
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
406414
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
407415
return Intrinsic::amdgcn_wave_reduce_umax;
@@ -423,11 +431,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
423431
llvm::SyncScope::ID SSID;
424432
switch (BuiltinID) {
425433
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
434+
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
426435
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
436+
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
427437
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
428438
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
439+
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
429440
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
430441
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
442+
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
431443
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
432444
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
433445
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in)
405405
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
406406
}
407407

408+
// CHECK-LABEL: @test_wave_reduce_fadd_f32_default
409+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
410+
// Calls the f32 fadd wave reduction builtin with strategy argument 0 (the
// value every `_default` test in this file passes) and stores the result
// through `out`.
void test_wave_reduce_fadd_f32_default(global float* out, float in)
411+
{
412+
*out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0);
413+
}
414+
408415
// CHECK-LABEL: @test_wave_reduce_add_u32_iterative
409416
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
410417
void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in)
419426
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
420427
}
421428

429+
// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative
430+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
431+
// Calls the f32 fadd wave reduction builtin with strategy argument 1 and
// stores the result through `out`. Every other `_iterative` test in this file
// passes 1; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fadd_f32_iterative(global float* out, float in)
432+
{
433+
*out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 1);
434+
}
435+
422436
// CHECK-LABEL: @test_wave_reduce_add_u32_dpp
423437
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
424438
void test_wave_reduce_add_u32_dpp(global int* out, int in)
@@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in)
433447
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
434448
}
435449

450+
// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp
451+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
452+
// Calls the f32 fadd wave reduction builtin with strategy argument 2 and
// stores the result through `out`. Every other `_dpp` test in this file
// passes 2; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fadd_f32_dpp(global float* out, float in)
453+
{
454+
*out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 2);
455+
}
456+
436457
// CHECK-LABEL: @test_wave_reduce_sub_u32_default
437458
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
438459
void test_wave_reduce_sub_u32_default(global int* out, int in)
@@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in)
447468
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
448469
}
449470

471+
// CHECK-LABEL: @test_wave_reduce_fsub_f32_default
472+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
473+
// Calls the f32 fsub wave reduction builtin with strategy argument 0 (the
// value every `_default` test in this file passes) and stores the result
// through `out`.
void test_wave_reduce_fsub_f32_default(global float* out, float in)
474+
{
475+
*out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0);
476+
}
477+
450478
// CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
451479
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
452480
void test_wave_reduce_sub_u32_iterative(global int* out, int in)
@@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in)
461489
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
462490
}
463491

492+
// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative
493+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
494+
// Calls the f32 fsub wave reduction builtin with strategy argument 1 and
// stores the result through `out`. Every other `_iterative` test in this file
// passes 1; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fsub_f32_iterative(global float* out, float in)
495+
{
496+
*out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 1);
497+
}
498+
464499
// CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
465500
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
466501
void test_wave_reduce_sub_u32_dpp(global int* out, int in)
@@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in)
475510
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
476511
}
477512

513+
// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp
514+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
515+
// Calls the f32 fsub wave reduction builtin with strategy argument 2 and
// stores the result through `out`. Every other `_dpp` test in this file
// passes 2; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fsub_f32_dpp(global float* out, float in)
516+
{
517+
*out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 2);
518+
}
519+
478520
// CHECK-LABEL: @test_wave_reduce_and_b32_default
479521
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
480522
void test_wave_reduce_and_b32_default(global int* out, int in)
@@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in)
615657
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
616658
}
617659

660+
// CHECK-LABEL: @test_wave_reduce_fmin_f32_default
661+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
662+
// Calls the f32 fmin wave reduction builtin with strategy argument 0 (the
// value every `_default` test in this file passes) and stores the result
// through `out`.
void test_wave_reduce_fmin_f32_default(global float* out, float in)
663+
{
664+
*out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0);
665+
}
666+
618667
// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
619668
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
620669
void test_wave_reduce_min_i32_iterative(global int* out, int in)
@@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in)
629678
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
630679
}
631680

681+
// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative
682+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
683+
// Calls the f32 fmin wave reduction builtin with strategy argument 1 and
// stores the result through `out`. Every other `_iterative` test in this file
// passes 1; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fmin_f32_iterative(global float* out, float in)
684+
{
685+
*out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 1);
686+
}
687+
632688
// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
633689
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
634690
void test_wave_reduce_min_i32_dpp(global int* out, int in)
@@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in)
643699
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
644700
}
645701

702+
// CHECK-LABEL: @test_wave_reduce_fmin_f32_dpp
703+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
704+
// Calls the f32 fmin wave reduction builtin with strategy argument 2 and
// stores the result through `out`. Every other `_dpp` test in this file
// passes 2; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fmin_f32_dpp(global float* out, float in)
705+
{
706+
*out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 2);
707+
}
708+
646709
// CHECK-LABEL: @test_wave_reduce_min_u32_default
647710
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
648711
void test_wave_reduce_min_u32_default(global int* out, int in)
@@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in)
699762
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
700763
}
701764

765+
// CHECK-LABEL: @test_wave_reduce_fmax_f32_default
766+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
767+
// Calls the f32 fmax wave reduction builtin with strategy argument 0 (the
// value every `_default` test in this file passes) and stores the result
// through `out`.
void test_wave_reduce_fmax_f32_default(global float* out, float in)
768+
{
769+
*out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0);
770+
}
771+
702772
// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
703773
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
704774
void test_wave_reduce_max_i32_iterative(global int* out, int in)
@@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in)
713783
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
714784
}
715785

786+
// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative
787+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
788+
// Calls the f32 fmax wave reduction builtin with strategy argument 1 and
// stores the result through `out`. Every other `_iterative` test in this file
// passes 1; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fmax_f32_iterative(global float* out, float in)
789+
{
790+
*out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 1);
791+
}
792+
716793
// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
717794
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
718795
void test_wave_reduce_max_i32_dpp(global int* out, int in)
@@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in)
727804
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
728805
}
729806

807+
// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp
808+
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
809+
// Calls the f32 fmax wave reduction builtin with strategy argument 2 and
// stores the result through `out`. Every other `_dpp` test in this file
// passes 2; the previous 0 made this a duplicate of the `_default` test.
void test_wave_reduce_fmax_f32_dpp(global float* out, float in)
810+
{
811+
*out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 2);
812+
}
813+
730814
// CHECK-LABEL: @test_wave_reduce_max_u32_default
731815
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
732816
void test_wave_reduce_max_u32_default(global int* out, int in)

0 commit comments

Comments
 (0)