Skip to content

Commit ecc70b8

Browse files
committed
Merge remote-tracking branch 'origin/main' into lv-vectorize-fmaxnum-without-fmfs
2 parents e441974 + f6641e2 commit ecc70b8

File tree

123 files changed

+28200
-25960
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

123 files changed

+28200
-25960
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
684684
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
685685
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
686686
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
687+
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
688+
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
687689

688690
// GFX1250 WMMA builtins
689691
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
66

77
typedef unsigned int uint;
8+
typedef unsigned short int ushort;
9+
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
810
typedef half __attribute__((ext_vector_type(2))) half2;
911

1012
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -368,6 +370,76 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
368370
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
369371
}
370372

373+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
374+
// CHECK-NEXT: entry:
375+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
376+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
377+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
378+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
379+
// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
380+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
381+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
382+
// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]])
383+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
384+
// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2
385+
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
386+
// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]])
387+
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
388+
// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2
389+
// CHECK-NEXT: ret void
390+
//
391+
void test_sat_pk4_i4_i8(ushort *out, uint src)
392+
{
393+
*out = __builtin_amdgcn_sat_pk4_i4_i8(src);
394+
*out = __builtin_amdgcn_sat_pk4_u4_u8(src);
395+
}
396+
397+
// CHECK-LABEL: @test_permlane16_swap(
398+
// CHECK-NEXT: entry:
399+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
400+
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
401+
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
402+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
403+
// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
404+
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
405+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
406+
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
407+
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
408+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
409+
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
410+
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
411+
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
412+
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
413+
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
414+
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
415+
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
416+
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
417+
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
418+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
419+
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
420+
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
421+
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
422+
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
423+
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
424+
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
425+
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
426+
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
427+
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
428+
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
429+
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
430+
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
431+
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
432+
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
433+
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
434+
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
435+
// CHECK-NEXT: ret void
436+
//
437+
void test_permlane16_swap(global uint2* out, uint old, uint src) {
438+
*out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
439+
*out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
440+
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
441+
}
442+
371443
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
372444
// CHECK-NEXT: entry:
373445
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

flang/lib/Semantics/check-do-forall.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1180,7 +1180,9 @@ void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) {
11801180
void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) {
11811181
const auto &control{std::get<parser::IoImpliedDoControl>(outputImpliedDo.t)};
11821182
const parser::Name &name{control.name.thing.thing};
1183-
context_.CheckIndexVarRedefine(name.source, *name.symbol);
1183+
if (name.symbol) {
1184+
context_.CheckIndexVarRedefine(name.source, *name.symbol);
1185+
}
11841186
}
11851187

11861188
void DoForallChecker::Leave(const parser::StatVariable &statVariable) {

flang/test/Semantics/resolve40.f90

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,10 @@ subroutine s12(x)
9696
!BECAUSE: 'x' is an INTENT(IN) dummy argument
9797
read(*,nml=nl)
9898
end
99+
100+
subroutine s13()
101+
implicit none
102+
!ERROR: No explicit type declared for 'i'
103+
!ERROR: No explicit type declared for 'i'
104+
print *, (i, i = 1, 2)
105+
end

libc/shared/math.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "math/exp.h"
1515
#include "math/exp10.h"
1616
#include "math/exp10f.h"
17+
#include "math/exp10f16.h"
1718
#include "math/expf.h"
1819
#include "math/expf16.h"
1920
#include "math/frexpf.h"

libc/shared/math/exp10f16.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
//===-- Shared exp10f16 function --------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// The guard must be unique to this header. shared/math.h includes
// "math/exp10f.h" immediately before this file; NOTE(review): that header is
// expected to own LLVM_LIBC_SHARED_MATH_EXP10F_H, in which case reusing it here
// would make this whole file preprocess to nothing.
#ifndef LLVM_LIBC_SHARED_MATH_EXP10F16_H
10+
#define LLVM_LIBC_SHARED_MATH_EXP10F16_H
11+
12+
#include "include/llvm-libc-macros/float16-macros.h"
13+
14+
#ifdef LIBC_TYPES_HAS_FLOAT16
15+
16+
#include "shared/libc_common.h"
17+
#include "src/__support/math/exp10f16.h"
18+
19+
namespace LIBC_NAMESPACE_DECL {
20+
namespace shared {
21+
22+
using math::exp10f16; // Expose math::exp10f16 under the shared namespace.
23+
24+
} // namespace shared
25+
} // namespace LIBC_NAMESPACE_DECL
26+
27+
#endif // LIBC_TYPES_HAS_FLOAT16
28+
29+
#endif // LLVM_LIBC_SHARED_MATH_EXP10F16_H

libc/src/__support/math/CMakeLists.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,37 @@ add_header_library(
198198
libc.src.__support.FPUtil.rounding_mode
199199
libc.src.__support.macros.optimization
200200
)
201+
202+
add_header_library(
203+
exp10_float16_constants
204+
HDRS
205+
exp10_float16_constants.h
206+
DEPENDS
207+
libc.src.__support.CPP.array
208+
)
209+
210+
add_header_library(
211+
exp10f16_utils
212+
HDRS
213+
exp10f16_utils.h
214+
DEPENDS
215+
.expf16_utils
216+
.exp10_float16_constants
217+
libc.src.__support.FPUtil.fp_bits
218+
)
219+
220+
# Header-only exp10f16 implementation. Dependencies use the fully qualified
# libc.<path>.<target> spelling used by the other libraries in this file.
add_header_library(
221+
exp10f16
222+
HDRS
223+
exp10f16.h
224+
DEPENDS
225+
.exp10f16_utils
226+
libc.src.__support.FPUtil.fp_bits
227+
libc.src.__support.FPUtil.fenv_impl
228+
libc.src.__support.macros.config
229+
libc.src.__support.FPUtil.cast
230+
libc.src.__support.FPUtil.rounding_mode
231+
libc.src.__support.FPUtil.except_value_utils
232+
libc.src.__support.macros.optimization
233+
libc.src.__support.macros.properties.cpu_features
234+
)
libc/src/__support/math/exp10_float16_constants.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//===-- Constants for exp10f16 function -------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H
10+
#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H
11+
12+
#include "include/llvm-libc-macros/float16-macros.h"
13+
#include <stdint.h>
14+
15+
#ifdef LIBC_TYPES_HAS_FLOAT16
16+
17+
#include "src/__support/CPP/array.h"
18+
19+
namespace LIBC_NAMESPACE_DECL {
20+
21+
// Generated by Sollya with the following commands:
22+
// > display = hexadecimal;
23+
// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN));
24+
static constexpr cpp::array<uint32_t, 8> EXP2_MID_BITS = {
25+
0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U,
26+
0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U,
27+
};
28+
29+
// Generated by Sollya with the following commands:
30+
// > display = hexadecimal;
31+
// > round(log2(10), SG, RN);
32+
static constexpr float LOG2F_10 = 0x1.a934fp+1f;
33+
34+
// Generated by Sollya with the following commands:
35+
// > display = hexadecimal;
36+
// > round(log10(2), SG, RN);
37+
static constexpr float LOG10F_2 = 0x1.344136p-2f;
38+
39+
} // namespace LIBC_NAMESPACE_DECL
40+
41+
#endif // LIBC_TYPES_HAS_FLOAT16
42+
43+
#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H

libc/src/__support/math/exp10f16.h

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
//===-- Implementation header for exp10f16 ----------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H
10+
#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H
11+
12+
#include "include/llvm-libc-macros/float16-macros.h"
13+
14+
#ifdef LIBC_TYPES_HAS_FLOAT16
15+
16+
#include "exp10f16_utils.h"
17+
#include "src/__support/FPUtil/FEnvImpl.h"
18+
#include "src/__support/FPUtil/FPBits.h"
19+
#include "src/__support/FPUtil/cast.h"
20+
#include "src/__support/FPUtil/except_value_utils.h"
21+
#include "src/__support/FPUtil/rounding_mode.h"
22+
#include "src/__support/macros/config.h"
23+
#include "src/__support/macros/optimization.h"
24+
#include "src/__support/macros/properties/cpu_features.h"
25+
26+
namespace LIBC_NAMESPACE_DECL {
27+
28+
namespace math {
29+
30+
// The exceptional-value table is only compiled when the accurate pass is kept.
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
31+
// Size of the table below: five entries are always present, and three more
// are added on targets without LIBC_TARGET_CPU_HAS_FMA_FLOAT.
#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
32+
static constexpr size_t N_EXP10F16_EXCEPTS = 5;
33+
#else
34+
static constexpr size_t N_EXP10F16_EXCEPTS = 8;
35+
#endif
36+
37+
// Inputs whose results exp10f16 looks up instead of computing. Each entry
// starts with the float16 bit pattern of x, followed by the bit pattern of
// exp10f16(x) rounded toward zero (RZ), as spelled out in the per-entry
// comments.
// NOTE(review): the three trailing fields look like per-rounding-mode
// adjustments to the RZ result -- confirm against fputil::ExceptValues.
static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
38+
EXP10F16_EXCEPTS = {{
39+
// x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ)
40+
{0x363dU, 0x40e8U, 1U, 0U, 1U},
41+
// x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ)
42+
{0x3657U, 0x40fbU, 1U, 0U, 0U},
43+
// x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ)
44+
{0xac06U, 0x3aecU, 1U, 0U, 0U},
45+
// x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ)
46+
{0xbf0aU, 0x2473U, 1U, 0U, 0U},
47+
// x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
48+
{0xc387U, 0x09a5U, 1U, 0U, 0U},
49+
// The remaining entries are only needed when no float FMA is available.
#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
50+
// x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
51+
{0x4030U, 0x57c1U, 1U, 0U, 1U},
52+
// x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
53+
{0x406eU, 0x591fU, 1U, 0U, 1U},
54+
// x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ)
55+
{0x446eU, 0x7690U, 1U, 0U, 1U},
56+
#endif
57+
}};
58+
#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
59+
60+
// Computes 10^x for a float16 argument and returns it as float16.
//
// Special inputs are resolved first: NaN, infinities, |x| >= 5, the exact
// powers of ten for x in {1, 2, 3, 4}, and (unless
// LIBC_MATH_HAS_SKIP_ACCURATE_PASS is defined) the inputs tabulated in
// EXP10F16_EXCEPTS. Every other input goes through exp10_range_reduction().
static constexpr float16 exp10f16(float16 x) {
61+
using FPBits = fputil::FPBits<float16>;
62+
FPBits x_bits(x);
63+
64+
// Raw bit pattern of x, and the same pattern with the sign bit cleared.
uint16_t x_u = x_bits.uintval();
65+
uint16_t x_abs = x_u & 0x7fffU;
66+
67+
// When |x| >= 5 (0x4500 is the bit pattern of 5.0f16), or x is inf or NaN.
68+
if (LIBC_UNLIKELY(x_abs >= 0x4500U)) {
69+
// exp10(NaN) = NaN. A signaling NaN raises FE_INVALID and is quieted; a
// quiet NaN is returned unchanged.
70+
if (x_bits.is_nan()) {
71+
if (x_bits.is_signaling_nan()) {
72+
fputil::raise_except_if_required(FE_INVALID);
73+
return FPBits::quiet_nan().get_val();
74+
}
75+
76+
return x;
77+
}
78+
79+
// When x >= 5.
80+
if (x_bits.is_pos()) {
81+
// exp10(+inf) = +inf
82+
if (x_bits.is_inf())
83+
return FPBits::inf().get_val();
84+
85+
// Finite x >= 5: the return value depends on the current rounding mode.
switch (fputil::quick_get_round()) {
86+
case FE_TONEAREST:
87+
case FE_UPWARD:
88+
fputil::set_errno_if_required(ERANGE);
89+
fputil::raise_except_if_required(FE_OVERFLOW);
90+
return FPBits::inf().get_val();
91+
// Any other rounding mode returns the largest finite value; this branch
// does not set errno or raise an exception.
default:
92+
return FPBits::max_normal().get_val();
93+
}
94+
}
95+
96+
// When x <= -8 (0xc800 is the bit pattern of -8.0f16). Negative x with
// 5 <= |x| < 8 skips this branch and continues with the code below.
97+
if (x_u >= 0xc800U) {
98+
// exp10(-inf) = +0
99+
if (x_bits.is_inf())
100+
return FPBits::zero().get_val();
101+
102+
fputil::set_errno_if_required(ERANGE);
103+
fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
104+
105+
// Rounding upward yields the smallest subnormal; otherwise +0.
if (fputil::fenv_is_round_up())
106+
return FPBits::min_subnormal().get_val();
107+
return FPBits::zero().get_val();
108+
}
109+
}
110+
111+
// When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results.
// The mask test is only a cheap pre-filter: any x_u whose set bits all lie
// within 0x7e00 passes it, and values matching none of the cases below fall
// through to the code after the switch.
112+
if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
113+
switch (x_u) {
114+
case 0x3c00U: // x = 1.0f16
115+
return fputil::cast<float16>(10.0);
116+
case 0x4000U: // x = 2.0f16
117+
return fputil::cast<float16>(100.0);
118+
case 0x4200U: // x = 3.0f16
119+
return fputil::cast<float16>(1'000.0);
120+
case 0x4400U: // x = 4.0f16
121+
return fputil::cast<float16>(10'000.0);
122+
}
123+
}
124+
125+
// Return the tabulated result for inputs listed in EXP10F16_EXCEPTS.
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
126+
if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
127+
return r.value();
128+
#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
129+
130+
// 10^x = 2^((hi + mid) * log2(10)) * 10^lo
// The two factors come from exp10_range_reduction(); their product is
// converted to float16 with fputil::cast.
131+
auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
132+
return fputil::cast<float16>(exp2_hi_mid * exp10_lo);
133+
}
134+
135+
} // namespace math
136+
137+
} // namespace LIBC_NAMESPACE_DECL
138+
139+
#endif // LIBC_TYPES_HAS_FLOAT16
140+
141+
#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H

0 commit comments

Comments
 (0)