Skip to content

Commit d599c7a

Browse files
committed
[AMDGPU] Update log lowering to remove contract for AMDGCN backend
1 parent c6775e2 commit d599c7a

File tree

4 files changed

+660
-9
lines changed

4 files changed

+660
-9
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2766,7 +2766,9 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
27662766
EVT VT = Op.getValueType();
27672767
SDNodeFlags Flags = Op->getFlags();
27682768
SDLoc DL(Op);
2769-
2769+
// Our implementation of LOG is not contract-safe, so clear the allow-contract
2770+
// flag to disable instruction contraction.
2771+
Flags.setAllowContract(false);
27702772
const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
27712773
assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
27722774

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3508,6 +3508,9 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
35083508
MachineRegisterInfo &MRI = *B.getMRI();
35093509
Register Dst = MI.getOperand(0).getReg();
35103510
Register X = MI.getOperand(1).getReg();
3511+
// Our implementation of LOG is not contract-safe, so clear the contract flag
3512+
// on MI before its flags are read below.
3513+
MI.clearFlags(MachineInstr::FmContract);
35113514
unsigned Flags = MI.getFlags();
35123515
const LLT Ty = MRI.getType(X);
35133516
MachineFunction &MF = B.getMF();

llvm/test/CodeGen/AMDGPU/llvm.log.ll

Lines changed: 327 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
316316
ret void
317317
}
318318

319+
define amdgpu_kernel void @s_log_contract_f32(ptr addrspace(1) %out, float %in) {
320+
; SI-SDAG-LABEL: s_log_contract_f32:
321+
; SI-SDAG: ; %bb.0:
322+
; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb
323+
; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
324+
; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
325+
; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
326+
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
327+
; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
328+
; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
329+
; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
330+
; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
331+
; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
332+
; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2
333+
; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
334+
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
335+
; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
336+
; SI-SDAG-NEXT: s_mov_b32 s2, -1
337+
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
338+
; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
339+
; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf
340+
; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3
341+
; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
342+
; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
343+
; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
344+
; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
345+
; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
346+
; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
347+
; SI-SDAG-NEXT: s_endpgm
348+
;
349+
; SI-GISEL-LABEL: s_log_contract_f32:
350+
; SI-GISEL: ; %bb.0:
351+
; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb
352+
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
353+
; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
354+
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
355+
; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf
356+
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
357+
; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
358+
; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
359+
; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
360+
; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0
361+
; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
362+
; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
363+
; SI-GISEL-NEXT: s_mov_b32 s6, -1
364+
; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
365+
; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
366+
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
367+
; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
368+
; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1
369+
; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
370+
; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
371+
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
372+
; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
373+
; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
374+
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
375+
; SI-GISEL-NEXT: s_endpgm
376+
;
377+
; VI-SDAG-LABEL: s_log_contract_f32:
378+
; VI-SDAG: ; %bb.0:
379+
; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
380+
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
381+
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
382+
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
383+
; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
384+
; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
385+
; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
386+
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
387+
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
388+
; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
389+
; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
390+
; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
391+
; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
392+
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
393+
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
394+
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
395+
; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
396+
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
397+
; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
398+
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
399+
; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
400+
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
401+
; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
402+
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
403+
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
404+
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
405+
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
406+
; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
407+
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
408+
; VI-SDAG-NEXT: s_endpgm
409+
;
410+
; VI-GISEL-LABEL: s_log_contract_f32:
411+
; VI-GISEL: ; %bb.0:
412+
; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
413+
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
414+
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
415+
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000
416+
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
417+
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
418+
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
419+
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
420+
; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
421+
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
422+
; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
423+
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
424+
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
425+
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
426+
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
427+
; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
428+
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
429+
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
430+
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
431+
; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1
432+
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
433+
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
434+
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
435+
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
436+
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
437+
; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
438+
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
439+
; VI-GISEL-NEXT: s_endpgm
440+
;
441+
; GFX900-SDAG-LABEL: s_log_contract_f32:
442+
; GFX900-SDAG: ; %bb.0:
443+
; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
444+
; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
445+
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
446+
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
447+
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
448+
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
449+
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
450+
; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
451+
; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
452+
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
453+
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
454+
; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
455+
; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
456+
; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
457+
; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
458+
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
459+
; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
460+
; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
461+
; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
462+
; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
463+
; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
464+
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
465+
; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
466+
; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
467+
; GFX900-SDAG-NEXT: s_endpgm
468+
;
469+
; GFX900-GISEL-LABEL: s_log_contract_f32:
470+
; GFX900-GISEL: ; %bb.0:
471+
; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
472+
; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
473+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
474+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
475+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
476+
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
477+
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
478+
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
479+
; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
480+
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
481+
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
482+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
483+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
484+
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
485+
; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5
486+
; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2
487+
; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
488+
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
489+
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
490+
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218
491+
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
492+
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
493+
; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
494+
; GFX900-GISEL-NEXT: s_endpgm
495+
;
496+
; GFX1100-SDAG-LABEL: s_log_contract_f32:
497+
; GFX1100-SDAG: ; %bb.0:
498+
; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c
499+
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
500+
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
501+
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
502+
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1
503+
; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
504+
; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0
505+
; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1
506+
; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
507+
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
508+
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
509+
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
510+
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
511+
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
512+
; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2
513+
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
514+
; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3
515+
; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
516+
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
517+
; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
518+
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
519+
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
520+
; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
521+
; GFX1100-SDAG-NEXT: s_endpgm
522+
;
523+
; GFX1100-GISEL-LABEL: s_log_contract_f32:
524+
; GFX1100-GISEL: ; %bb.0:
525+
; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
526+
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
527+
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
528+
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
529+
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
530+
; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
531+
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
532+
; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
533+
; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
534+
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
535+
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
536+
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
537+
; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
538+
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
539+
; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
540+
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
541+
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
542+
; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
543+
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
544+
; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
545+
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
546+
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
547+
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
548+
; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1]
549+
; GFX1100-GISEL-NEXT: s_endpgm
550+
;
551+
; R600-LABEL: s_log_contract_f32:
552+
; R600: ; %bb.0:
553+
; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[]
554+
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
555+
; R600-NEXT: CF_END
556+
; R600-NEXT: PAD
557+
; R600-NEXT: ALU clause starting at 4:
558+
; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
559+
; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
560+
; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x,
561+
; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00)
562+
; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W,
563+
; R600-NEXT: LOG_IEEE * T0.X, PV.W,
564+
; R600-NEXT: AND_INT * T1.W, PS, literal.x,
565+
; R600-NEXT: -4096(nan), 0(0.000000e+00)
566+
; R600-NEXT: ADD * T2.W, T0.X, -PV.W,
567+
; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x,
568+
; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
569+
; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W,
570+
; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
571+
; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W,
572+
; R600-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00)
573+
; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
574+
; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|,
575+
; R600-NEXT: 1060204544(6.931152e-01), 2139095040(INF)
576+
; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W,
577+
; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x,
578+
; R600-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00)
579+
; R600-NEXT: ADD T0.X, PV.W, -PS,
580+
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
581+
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
582+
;
583+
; CM-LABEL: s_log_contract_f32:
584+
; CM: ; %bb.0:
585+
; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[]
586+
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
587+
; CM-NEXT: CF_END
588+
; CM-NEXT: PAD
589+
; CM-NEXT: ALU clause starting at 4:
590+
; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
591+
; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
592+
; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x,
593+
; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00)
594+
; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W,
595+
; CM-NEXT: LOG_IEEE T0.X, T1.W,
596+
; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W,
597+
; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W,
598+
; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W,
599+
; CM-NEXT: AND_INT * T1.W, PV.X, literal.x,
600+
; CM-NEXT: -4096(nan), 0(0.000000e+00)
601+
; CM-NEXT: ADD * T2.W, T0.X, -PV.W,
602+
; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x,
603+
; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
604+
; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W,
605+
; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
606+
; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W,
607+
; CM-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00)
608+
; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W,
609+
; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|,
610+
; CM-NEXT: 1060204544(6.931152e-01), 2139095040(INF)
611+
; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z,
612+
; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x,
613+
; CM-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00)
614+
; CM-NEXT: ADD * T0.X, PV.Z, -PV.W,
615+
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
616+
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
617+
%result = call contract float @llvm.log.f32(float %in)
618+
store float %result, ptr addrspace(1) %out
619+
ret void
620+
}
621+
319622
; FIXME: We should be able to merge these packets together on Cayman so we
320623
; have a maximum of 4 instructions.
321624
define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
@@ -6439,6 +6742,8 @@ define half @v_log_f16_fast(half %in) {
64396742
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
64406743
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
64416744
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
6745+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
6746+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
64426747
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
64436748
;
64446749
; SI-GISEL-LABEL: v_log_f16_fast:
@@ -7100,6 +7405,10 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
71007405
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
71017406
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
71027407
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
7408+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7409+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7410+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7411+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
71037412
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
71047413
;
71057414
; SI-GISEL-LABEL: v_log_v2f16_fast:
@@ -7365,6 +7674,12 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
73657674
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
73667675
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
73677676
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
7677+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7678+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7679+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7680+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7681+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7682+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
73687683
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
73697684
;
73707685
; SI-GISEL-LABEL: v_log_v3f16_fast:
@@ -7691,20 +8006,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
76918006
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76928007
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
76938008
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7694-
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
76958009
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8010+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
76968011
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
76978012
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7698-
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
76998013
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
8014+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
77008015
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
77018016
; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
7702-
; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
77038017
; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
8018+
; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
77048019
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
77058020
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
7706-
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
77078021
; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
8022+
; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
8023+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
8024+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8025+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
8026+
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8027+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
8028+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8029+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8030+
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
77088031
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
77098032
;
77108033
; SI-GISEL-LABEL: v_log_v4f16_fast:

0 commit comments

Comments
 (0)