Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

break;
case Intrinsic::sincos:
LC = RTLIB::getSINCOS(ScalarVT);
LC = RTLIB::getSINCOS(VT);
if (LC == RTLIB::UNKNOWN_LIBCALL)
LC = RTLIB::getSINCOS(ScalarVT);
else if (VT.isVector())
IsVectorCall = true;

break;
default:
return std::nullopt;
Expand Down
13 changes: 13 additions & 0 deletions llvm/include/llvm/IR/RuntimeLibcalls.td
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in {
}

foreach VecTy = ["V4F32", "V2F64", "NXV4F32", "NXV2F64"] in {
def SINCOS_#VecTy : RuntimeLibcall;
def SINCOSPI_#VecTy : RuntimeLibcall;
}

Expand Down Expand Up @@ -1092,6 +1093,11 @@ def __security_check_cookie_arm64ec : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE,
//===----------------------------------------------------------------------===//

defset list<RuntimeLibcallImpl> SleefLibcalls = {
def _ZGVnN2vl8l8_sincos : RuntimeLibcallImpl<SINCOS_V2F64>;
def _ZGVnN4vl4l4_sincosf : RuntimeLibcallImpl<SINCOS_V4F32>;
def _ZGVsNxvl8l8_sincos : RuntimeLibcallImpl<SINCOS_NXV2F64>;
def _ZGVsNxvl4l4_sincosf : RuntimeLibcallImpl<SINCOS_NXV4F32>;

def _ZGVnN4vl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_V4F32>;
def _ZGVnN2vl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_V2F64>;
def _ZGVsNxvl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
Expand All @@ -1103,6 +1109,13 @@ defset list<RuntimeLibcallImpl> SleefLibcalls = {
//===----------------------------------------------------------------------===//

defset list<RuntimeLibcallImpl> ARMPLLibcalls = {
def armpl_vsincosq_f64
: RuntimeLibcallImpl<SINCOS_V2F64>; // CallingConv::AArch64_VectorCall
def armpl_vsincosq_f32
: RuntimeLibcallImpl<SINCOS_V4F32>; // CallingConv::AArch64_VectorCall
def armpl_svsincos_f64_x : RuntimeLibcallImpl<SINCOS_NXV2F64>;
def armpl_svsincos_f32_x : RuntimeLibcallImpl<SINCOS_NXV4F32>;

def armpl_vsincospiq_f32 : RuntimeLibcallImpl<SINCOSPI_V4F32>;
def armpl_vsincospiq_f64 : RuntimeLibcallImpl<SINCOSPI_V2F64>;
def armpl_svsincospi_f32_x : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
Expand Down
14 changes: 4 additions & 10 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1268,10 +1268,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;

break;

case ISD::FSINCOS:
case ISD::FSINCOSPI: {
EVT VT = Node->getValueType(0);
RTLIB::Libcall LC = RTLIB::getSINCOSPI(VT);
RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
? RTLIB::getSINCOS(VT)
: RTLIB::getSINCOSPI(VT);
if (LC != RTLIB::UNKNOWN_LIBCALL &&
DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT))
return;
Expand All @@ -1280,14 +1282,6 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
// scalarizing.
break;
}
case ISD::FSINCOS: {
// FIXME: Try to directly match vector case like fsincospi
EVT VT = Node->getValueType(0).getVectorElementType();
RTLIB::Libcall LC = RTLIB::getSINCOS(VT);
if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT))
return;
break;
}
case ISD::FMODF: {
EVT VT = Node->getValueType(0).getVectorElementType();
RTLIB::Libcall LC = RTLIB::getMODF(VT);
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,24 @@ RTLIB::Libcall RTLIB::getCOS(EVT RetVT) {
}

RTLIB::Libcall RTLIB::getSINCOS(EVT RetVT) {
// TODO: Tablegen should generate this function
if (RetVT.isVector()) {
if (!RetVT.isSimple())
return RTLIB::UNKNOWN_LIBCALL;
switch (RetVT.getSimpleVT().SimpleTy) {
case MVT::v4f32:
return RTLIB::SINCOS_V4F32;
case MVT::v2f64:
return RTLIB::SINCOS_V2F64;
case MVT::nxv4f32:
return RTLIB::SINCOS_NXV4F32;
case MVT::nxv2f64:
return RTLIB::SINCOS_NXV2F64;
default:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this handle SINCOS_V8F64? AMDLIBM has this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could, but I'm only handling cases that are tested.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Andarwinux please raise an issue if you can find AMDLIBM methods llvm doesn't currently handle

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Andarwinux please raise an issue if you can find AMDLIBM methods llvm doesn't currently handle

No problem. It looks like AMDLIBM is still handled by TLI for now. I thought it would also switch to RuntimeLibcalls soon.

But veclib does indeed have issues on x86, see #164642.

return RTLIB::UNKNOWN_LIBCALL;
}
}

return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128,
SINCOS_PPCF128);
}
Expand Down
36 changes: 32 additions & 4 deletions llvm/lib/IR/RuntimeLibcalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,26 @@ RuntimeLibcallsInfo::RuntimeLibcallsInfo(const Triple &TT,
switch (ClVectorLibrary) {
case VectorLibrary::SLEEFGNUABI:
for (RTLIB::LibcallImpl Impl :
{RTLIB::impl__ZGVnN4vl4l4_sincospif, RTLIB::impl__ZGVnN2vl8l8_sincospi,
{RTLIB::impl__ZGVnN2vl8l8_sincos, RTLIB::impl__ZGVnN4vl4l4_sincosf,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems this has somehow broken sincos vectorization from the Clang side. You now have to pass both flags: -fveclib=ArmPL -mllvm --vector-library=ArmPL. Using just -O3 -fno-math-errno -fveclib=ArmPL results in the default sincos expansion rather than a vector call.

Sadly, this means all the LLVM IR tests pass, but it still is broken in Clang.

See: https://godbolt.org/z/63PKzsG48

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is why TargetOptions should go away

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed by #167996

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMDLIBM sincos is still not working.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want that to work, can you add tests for all of the functions in it? We only have test coverage for sleef and armpl. If there's no test, it's not going to work.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want that to work, can you add tests for all of the functions in it? We only have test coverage for sleef and armpl. If there's no test, it's not going to work.

AMDLIBM did have some tests

; Scalar loop over 1000 doubles: loads %a[i] and calls the scalar libm
; @sincos, which writes its two results through the %b[i] and %c[i] pointers.
; The CHECK-VF*-NOT lines assert that no amd_vrd{2,4,8}_sincos vector call
; is emitted for this loop.
define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; CHECK-LABEL: define void @sincos_f64
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
; CHECK-VF2-NOT: call void @amd_vrd2_sincos(<2 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK-VF4-NOT: call void @amd_vrd4_sincos(<4 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK-VF8-NOT: call void @amd_vrd8_sincos(<8 x double> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK: ret void
;
entry:
br label %for.body
for.body:
; %indvars.iv is the induction variable, counting from 0 up to 1000.
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gepa = getelementptr double, ptr %a, i64 %indvars.iv
%num = load double, ptr %gepa, align 8
%gepb = getelementptr double, ptr %b, i64 %indvars.iv
%gepc = getelementptr double, ptr %c, i64 %indvars.iv
call void @sincos(double %num, ptr %gepb, ptr %gepc)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup:
ret void
}
; Scalar loop over 1000 floats: loads %a[i] and calls the scalar libm
; @sincosf, which writes its two results through the %b[i] and %c[i] pointers.
; The CHECK-VF*-NOT lines assert that no amd_vrs{4,8,16}_sincosf vector call
; is emitted for this loop.
define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
; CHECK-LABEL: define void @sincos_f32
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]])
; CHECK-VF4-NOT: call void @amd_vrs4_sincosf(<4 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK-VF8-NOT: call void @amd_vrs8_sincosf(<8 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK-VF16-NOT: call void @amd_vrs16_sincosf(<16 x float> [[WIDE_LOAD:%.*]], ptr [[TMP5:%.*]], ptr [[TMP6:%.*]])
; CHECK: ret void
;
entry:
br label %for.body
for.body:
; %indvars.iv is the induction variable, counting from 0 up to 1000.
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%gepa = getelementptr float, ptr %a, i64 %indvars.iv
; NOTE(review): align 8 on a 4-byte float element looks copied from the f64
; variant; odd indices would only be 4-byte aligned -- confirm intended.
%num = load float, ptr %gepa, align 8
%gepb = getelementptr float, ptr %b, i64 %indvars.iv
%gepc = getelementptr float, ptr %c, i64 %indvars.iv
call void @sincosf(float %num, ptr %gepb, ptr %gepc)
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 1000
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup:
ret void
}
attributes #0 = { nounwind readnone }
declare double @exp10(double) #0
declare float @exp10f(float) #0
declare double @llvm.exp10.f64(double) #0
declare float @llvm.exp10.f32(float) #0
declare void @sincos(double, ptr, ptr)
declare void @sincosf(float, ptr, ptr)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's not the standard codegen usage that's broken. The fact that LoopVectorize directly emits these calls is actually really weird (#162239 (comment)).

What's missing is codegen tests for using these calls for legalization of the intrinsics. e.g., the important bit of SLEEF and ArmPL coverage is here:

https://github.com/llvm/llvm-project/blob/0fa6a67a4200ea1516f56e298df4a671af8a0642/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
https://github.com/llvm/llvm-project/blob/0fa6a67a4200ea1516f56e298df4a671af8a0642/llvm/test/CodeGen/AArch64/veclib-llvm.sincospi.ll
https://github.com/llvm/llvm-project/blob/0fa6a67a4200ea1516f56e298df4a671af8a0642/llvm/test/CodeGen/AArch64/veclib-llvm.modf.ll

RTLIB::impl__ZGVsNxvl8l8_sincos, RTLIB::impl__ZGVsNxvl4l4_sincosf,
RTLIB::impl__ZGVnN4vl4l4_sincospif, RTLIB::impl__ZGVnN2vl8l8_sincospi,
RTLIB::impl__ZGVsNxvl4l4_sincospif,
RTLIB::impl__ZGVsNxvl8l8_sincospi})
setAvailable(Impl);
break;
case VectorLibrary::ArmPL:
for (RTLIB::LibcallImpl Impl :
{RTLIB::impl_armpl_vsincospiq_f32, RTLIB::impl_armpl_vsincospiq_f64,
{RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32,
RTLIB::impl_armpl_svsincos_f64_x, RTLIB::impl_armpl_svsincos_f32_x,
RTLIB::impl_armpl_vsincospiq_f32, RTLIB::impl_armpl_vsincospiq_f64,
RTLIB::impl_armpl_svsincospi_f32_x,
RTLIB::impl_armpl_svsincospi_f64_x})
setAvailable(Impl);

for (RTLIB::LibcallImpl Impl :
{RTLIB::impl_armpl_vsincosq_f64, RTLIB::impl_armpl_vsincosq_f32})
setLibcallImplCallingConv(Impl, CallingConv::AArch64_VectorCall);

break;
default:
break;
Expand Down Expand Up @@ -188,6 +197,14 @@ RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT,
fcNegNormal));
return {FuncTy, Attrs};
}
case RTLIB::impl__ZGVnN2vl8l8_sincos:
case RTLIB::impl__ZGVnN4vl4l4_sincosf:
case RTLIB::impl__ZGVsNxvl8l8_sincos:
case RTLIB::impl__ZGVsNxvl4l4_sincosf:
case RTLIB::impl_armpl_vsincosq_f64:
case RTLIB::impl_armpl_vsincosq_f32:
case RTLIB::impl_armpl_svsincos_f64_x:
case RTLIB::impl_armpl_svsincos_f32_x:
case RTLIB::impl__ZGVnN4vl4l4_sincospif:
case RTLIB::impl__ZGVnN2vl8l8_sincospi:
case RTLIB::impl__ZGVsNxvl4l4_sincospif:
Expand All @@ -201,11 +218,20 @@ RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT,
bool IsF32 = LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincospif ||
LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif ||
LibcallImpl == RTLIB::impl_armpl_vsincospiq_f32 ||
LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x;
LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x ||
LibcallImpl == RTLIB::impl__ZGVnN4vl4l4_sincosf ||
LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf ||
LibcallImpl == RTLIB::impl_armpl_vsincosq_f32 ||
LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x;

Type *ScalarTy = IsF32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx);
unsigned EC = IsF32 ? 4 : 2;

bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif ||
bool IsScalable = LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincos ||
LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincosf ||
LibcallImpl == RTLIB::impl_armpl_svsincos_f32_x ||
LibcallImpl == RTLIB::impl_armpl_svsincos_f64_x ||
LibcallImpl == RTLIB::impl__ZGVsNxvl4l4_sincospif ||
LibcallImpl == RTLIB::impl__ZGVsNxvl8l8_sincospi ||
LibcallImpl == RTLIB::impl_armpl_svsincospi_f32_x ||
LibcallImpl == RTLIB::impl_armpl_svsincospi_f64_x;
Expand Down Expand Up @@ -245,6 +271,8 @@ bool RuntimeLibcallsInfo::hasVectorMaskArgument(RTLIB::LibcallImpl Impl) {
/// FIXME: This should be generated by tablegen and support the argument at an
/// arbitrary position
switch (Impl) {
case RTLIB::impl_armpl_svsincos_f32_x:
case RTLIB::impl_armpl_svsincos_f64_x:
case RTLIB::impl_armpl_svsincospi_f32_x:
case RTLIB::impl_armpl_svsincospi_f64_x:
return true;
Expand Down
13 changes: 11 additions & 2 deletions llvm/test/Transforms/Util/DeclareRuntimeLibcalls/armpl.ll
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
; REQUIRES: aarch64-registered-target
; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s

; CHECK: declare void @armpl_svsincospi_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]]
; CHECK: declare void @armpl_svsincos_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS:#[0-9]+]]

; CHECK: declare void @armpl_svsincospi_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS:#[0-9]+]]
; CHECK: declare void @armpl_svsincos_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]]

; CHECK: declare void @armpl_svsincospi_f32_x(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 4 x i1>) [[ATTRS]]

; CHECK: declare void @armpl_svsincospi_f64_x(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16, <vscale x 2 x i1>) [[ATTRS]]

; CHECK: declare void @armpl_vsincospiq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @armpl_vsincospiq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f32(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare aarch64_vector_pcs void @armpl_vsincosq_f64(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]


; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
14 changes: 11 additions & 3 deletions llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sleef.ll
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
; REQUIRES: aarch64-registered-target
; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=aarch64-unknown-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s

; CHECK: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]]
; CHECK: declare void @_ZGVnN2vl8l8_sincos(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]]

; CHECK: declare void @_ZGVnN2vl8l8_sincospi(<2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVnN4vl4l4_sincosf(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVnN4vl4l4_sincospif(<4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVsNxvl4l4_sincospif(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]]
; CHECK: declare void @_ZGVsNxvl4l4_sincosf(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVsNxvl4l4_sincospif(<vscale x 4 x float>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVsNxvl8l8_sincos(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: declare void @_ZGVsNxvl8l8_sincospi(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS:#[0-9]+]]
; CHECK: declare void @_ZGVsNxvl8l8_sincospi(<vscale x 2 x double>, ptr noalias nonnull writeonly align 16, ptr noalias nonnull writeonly align 16) [[ATTRS]]

; CHECK: attributes [[ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
Loading