@@ -316,6 +316,309 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
316316 ret void
317317}
318318
; Codegen test for a scalar f32 natural log carrying only the `contract`
; fast-math flag. The kernel receives an addrspace(1) output pointer and a
; float, calls @llvm.log.f32 with `contract`, and stores the result to %out.
;
; The check blocks below give the expected output for each prefix that appears
; here (SI, VI, GFX900 and GFX1100 in both SDAG and GISEL flavours, plus R600
; and CM). Every expected sequence has the same shape:
;   * compare the input against 0x800000 (8388608, printed as 1.175494e-38 in
;     the R600 block) and, when it is smaller, scale it up first (ldexp by 32
;     in the GCN blocks, multiply by 4.294967e+09 in the R600 and CM blocks);
;   * take the hardware log (v_log_f32 / LOG_IEEE);
;   * multiply by a constant expressed either as 0x3f317217 plus an fma
;     correction with 0x3377d1cf (SI, GFX900, GFX1100) or as the split pair
;     0x3f317000 / 0x3805fdf4 (VI, R600, CM);
;   * keep the raw log value when its magnitude is not below 0x7f800000 (INF);
;   * subtract 0x41b17218 (printed as 2.218071e+01) when the input was scaled.
;
; (review) These check lines look machine-generated, presumably by
; update_llc_test_checks.py - regenerate rather than hand-edit; TODO confirm
; against the header of the full test file.
319+ define amdgpu_kernel void @s_log_contract_f32 (ptr addrspace (1 ) %out , float %in ) {
320+ ; SI-SDAG-LABEL: s_log_contract_f32:
321+ ; SI-SDAG: ; %bb.0:
322+ ; SI-SDAG-NEXT: s_load_dword s6, s[4:5], 0xb
323+ ; SI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
324+ ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
325+ ; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
326+ ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217
327+ ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
328+ ; SI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
329+ ; SI-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
330+ ; SI-SDAG-NEXT: s_cselect_b32 s2, 32, 0
331+ ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
332+ ; SI-SDAG-NEXT: v_mov_b32_e32 v1, s2
333+ ; SI-SDAG-NEXT: v_ldexp_f32_e32 v1, s6, v1
334+ ; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
335+ ; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000
336+ ; SI-SDAG-NEXT: s_mov_b32 s2, -1
337+ ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
338+ ; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, -v2
339+ ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3377d1cf
340+ ; SI-SDAG-NEXT: v_fma_f32 v3, v1, s4, v3
341+ ; SI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
342+ ; SI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
343+ ; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
344+ ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
345+ ; SI-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
346+ ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
347+ ; SI-SDAG-NEXT: s_endpgm
348+ ;
349+ ; SI-GISEL-LABEL: s_log_contract_f32:
350+ ; SI-GISEL: ; %bb.0:
351+ ; SI-GISEL-NEXT: s_load_dword s0, s[4:5], 0xb
352+ ; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
353+ ; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
354+ ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217
355+ ; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3377d1cf
356+ ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
357+ ; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
358+ ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
359+ ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
360+ ; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, s0, v0
361+ ; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
362+ ; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7f800000
363+ ; SI-GISEL-NEXT: s_mov_b32 s6, -1
364+ ; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000
365+ ; SI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3f317217, v0
366+ ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v4
367+ ; SI-GISEL-NEXT: v_fma_f32 v1, v0, v2, v1
368+ ; SI-GISEL-NEXT: v_add_f32_e32 v1, v4, v1
369+ ; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v3
370+ ; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
371+ ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
372+ ; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
373+ ; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
374+ ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
375+ ; SI-GISEL-NEXT: s_endpgm
376+ ;
377+ ; VI-SDAG-LABEL: s_log_contract_f32:
378+ ; VI-SDAG: ; %bb.0:
379+ ; VI-SDAG-NEXT: s_load_dword s2, s[4:5], 0x2c
380+ ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
381+ ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
382+ ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
383+ ; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
384+ ; VI-SDAG-NEXT: s_and_b64 s[0:1], vcc, exec
385+ ; VI-SDAG-NEXT: s_cselect_b32 s0, 32, 0
386+ ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
387+ ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s0
388+ ; VI-SDAG-NEXT: v_ldexp_f32 v1, s2, v1
389+ ; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
390+ ; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
391+ ; VI-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
392+ ; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xfffff000, v1
393+ ; VI-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
394+ ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
395+ ; VI-SDAG-NEXT: v_mul_f32_e32 v5, 0x3f317000, v3
396+ ; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v3
397+ ; VI-SDAG-NEXT: v_add_f32_e32 v3, v4, v3
398+ ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
399+ ; VI-SDAG-NEXT: v_add_f32_e32 v3, v5, v3
400+ ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
401+ ; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
402+ ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
403+ ; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
404+ ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
405+ ; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
406+ ; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
407+ ; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
408+ ; VI-SDAG-NEXT: s_endpgm
409+ ;
410+ ; VI-GISEL-LABEL: s_log_contract_f32:
411+ ; VI-GISEL: ; %bb.0:
412+ ; VI-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
413+ ; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
414+ ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
415+ ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7f800000
416+ ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
417+ ; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
418+ ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
419+ ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
420+ ; VI-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
421+ ; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
422+ ; VI-GISEL-NEXT: v_and_b32_e32 v2, 0xfffff000, v0
423+ ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
424+ ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2
425+ ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3805fdf4, v3
426+ ; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3
427+ ; VI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
428+ ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2
429+ ; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4
430+ ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
431+ ; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v1
432+ ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218
433+ ; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
434+ ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
435+ ; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
436+ ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
437+ ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
438+ ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
439+ ; VI-GISEL-NEXT: s_endpgm
440+ ;
441+ ; GFX900-SDAG-LABEL: s_log_contract_f32:
442+ ; GFX900-SDAG: ; %bb.0:
443+ ; GFX900-SDAG-NEXT: s_load_dword s6, s[4:5], 0x2c
444+ ; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
445+ ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
446+ ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x41b17218
447+ ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
448+ ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
449+ ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
450+ ; GFX900-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
451+ ; GFX900-SDAG-NEXT: s_cselect_b32 s2, 32, 0
452+ ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
453+ ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, s2
454+ ; GFX900-SDAG-NEXT: v_ldexp_f32 v1, s6, v1
455+ ; GFX900-SDAG-NEXT: v_log_f32_e32 v1, v1
456+ ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x3f317217
457+ ; GFX900-SDAG-NEXT: s_mov_b32 s3, 0x3377d1cf
458+ ; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317217, v1
459+ ; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s2, -v3
460+ ; GFX900-SDAG-NEXT: v_fma_f32 v4, v1, s3, v4
461+ ; GFX900-SDAG-NEXT: s_mov_b32 s2, 0x7f800000
462+ ; GFX900-SDAG-NEXT: v_add_f32_e32 v3, v3, v4
463+ ; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s2
464+ ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
465+ ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
466+ ; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
467+ ; GFX900-SDAG-NEXT: s_endpgm
468+ ;
469+ ; GFX900-GISEL-LABEL: s_log_contract_f32:
470+ ; GFX900-GISEL: ; %bb.0:
471+ ; GFX900-GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
472+ ; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
473+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
474+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3f317217
475+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf
476+ ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
477+ ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
478+ ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
479+ ; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
480+ ; GFX900-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
481+ ; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
482+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x7f800000
483+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
484+ ; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, 0x3f317217, v0
485+ ; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v2, -v5
486+ ; GFX900-GISEL-NEXT: v_fma_f32 v2, v0, v3, v2
487+ ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v5, v2
488+ ; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, v4
489+ ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
490+ ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x41b17218
491+ ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
492+ ; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
493+ ; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
494+ ; GFX900-GISEL-NEXT: s_endpgm
495+ ;
496+ ; GFX1100-SDAG-LABEL: s_log_contract_f32:
497+ ; GFX1100-SDAG: ; %bb.0:
498+ ; GFX1100-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c
499+ ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
500+ ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s0
501+ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
502+ ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x41b17218, s1
503+ ; GFX1100-SDAG-NEXT: s_and_b32 s1, s1, exec_lo
504+ ; GFX1100-SDAG-NEXT: s_cselect_b32 s1, 32, 0
505+ ; GFX1100-SDAG-NEXT: v_ldexp_f32 v1, s0, s1
506+ ; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
507+ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
508+ ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
509+ ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
510+ ; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317217, v1
511+ ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1|
512+ ; GFX1100-SDAG-NEXT: v_fma_f32 v3, 0x3f317217, v1, -v2
513+ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
514+ ; GFX1100-SDAG-NEXT: v_fmamk_f32 v3, v1, 0x3377d1cf, v3
515+ ; GFX1100-SDAG-NEXT: v_add_f32_e32 v2, v2, v3
516+ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
517+ ; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
518+ ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
519+ ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
520+ ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
521+ ; GFX1100-SDAG-NEXT: s_endpgm
522+ ;
523+ ; GFX1100-GISEL-LABEL: s_log_contract_f32:
524+ ; GFX1100-GISEL: ; %bb.0:
525+ ; GFX1100-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
526+ ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
527+ ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s0
528+ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
529+ ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
530+ ; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v0, 5, v0
531+ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
532+ ; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, s0, v0
533+ ; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
534+ ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
535+ ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
536+ ; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
537+ ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
538+ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
539+ ; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
540+ ; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
541+ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
542+ ; GFX1100-GISEL-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
543+ ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
544+ ; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s2
545+ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
546+ ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
547+ ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
548+ ; GFX1100-GISEL-NEXT: global_store_b32 v2, v0, s[0:1]
549+ ; GFX1100-GISEL-NEXT: s_endpgm
550+ ;
551+ ; R600-LABEL: s_log_contract_f32:
552+ ; R600: ; %bb.0:
553+ ; R600-NEXT: ALU 23, @4, KC0[CB0:0-32], KC1[]
554+ ; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
555+ ; R600-NEXT: CF_END
556+ ; R600-NEXT: PAD
557+ ; R600-NEXT: ALU clause starting at 4:
558+ ; R600-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
559+ ; R600-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
560+ ; R600-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x,
561+ ; R600-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00)
562+ ; R600-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W,
563+ ; R600-NEXT: LOG_IEEE * T0.X, PV.W,
564+ ; R600-NEXT: AND_INT * T1.W, PS, literal.x,
565+ ; R600-NEXT: -4096(nan), 0(0.000000e+00)
566+ ; R600-NEXT: ADD * T2.W, T0.X, -PV.W,
567+ ; R600-NEXT: MUL_IEEE * T3.W, PV.W, literal.x,
568+ ; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
569+ ; R600-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W,
570+ ; R600-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
571+ ; R600-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W,
572+ ; R600-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00)
573+ ; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
574+ ; R600-NEXT: SETGT * T2.W, literal.y, |T0.X|,
575+ ; R600-NEXT: 1060204544(6.931152e-01), 2139095040(INF)
576+ ; R600-NEXT: CNDE T1.W, PS, T0.X, PV.W,
577+ ; R600-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x,
578+ ; R600-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00)
579+ ; R600-NEXT: ADD T0.X, PV.W, -PS,
580+ ; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
581+ ; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
582+ ;
583+ ; CM-LABEL: s_log_contract_f32:
584+ ; CM: ; %bb.0:
585+ ; CM-NEXT: ALU 26, @4, KC0[CB0:0-32], KC1[]
586+ ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
587+ ; CM-NEXT: CF_END
588+ ; CM-NEXT: PAD
589+ ; CM-NEXT: ALU clause starting at 4:
590+ ; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
591+ ; CM-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
592+ ; CM-NEXT: CNDE * T1.W, PV.W, 1.0, literal.x,
593+ ; CM-NEXT: 1333788672(4.294967e+09), 0(0.000000e+00)
594+ ; CM-NEXT: MUL_IEEE * T1.W, KC0[2].Z, PV.W,
595+ ; CM-NEXT: LOG_IEEE T0.X, T1.W,
596+ ; CM-NEXT: LOG_IEEE T0.Y (MASKED), T1.W,
597+ ; CM-NEXT: LOG_IEEE T0.Z (MASKED), T1.W,
598+ ; CM-NEXT: LOG_IEEE * T0.W (MASKED), T1.W,
599+ ; CM-NEXT: AND_INT * T1.W, PV.X, literal.x,
600+ ; CM-NEXT: -4096(nan), 0(0.000000e+00)
601+ ; CM-NEXT: ADD * T2.W, T0.X, -PV.W,
602+ ; CM-NEXT: MUL_IEEE * T3.W, PV.W, literal.x,
603+ ; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
604+ ; CM-NEXT: MULADD_IEEE * T3.W, T1.W, literal.x, PV.W,
605+ ; CM-NEXT: 939916788(3.194618e-05), 0(0.000000e+00)
606+ ; CM-NEXT: MULADD_IEEE * T2.W, T2.W, literal.x, PV.W,
607+ ; CM-NEXT: 1060204544(6.931152e-01), 0(0.000000e+00)
608+ ; CM-NEXT: MULADD_IEEE T0.Z, T1.W, literal.x, PV.W,
609+ ; CM-NEXT: SETGT * T1.W, literal.y, |T0.X|,
610+ ; CM-NEXT: 1060204544(6.931152e-01), 2139095040(INF)
611+ ; CM-NEXT: CNDE T0.Z, PV.W, T0.X, PV.Z,
612+ ; CM-NEXT: CNDE * T0.W, T0.W, 0.0, literal.x,
613+ ; CM-NEXT: 1102148120(2.218071e+01), 0(0.000000e+00)
614+ ; CM-NEXT: ADD * T0.X, PV.Z, -PV.W,
615+ ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
616+ ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; The IR under test: `contract` is the only fast-math flag on the call, and the
; result is written straight to %out.
617+ %result = call contract float @llvm.log.f32 (float %in )
618+ store float %result , ptr addrspace (1 ) %out
619+ ret void
620+ }
621+
319622; FIXME: We should be able to merge these packets together on Cayman so we
320623; have a maximum of 4 instructions.
321624define amdgpu_kernel void @s_log_v2f32 (ptr addrspace (1 ) %out , <2 x float > %in ) {
@@ -6439,6 +6742,8 @@ define half @v_log_f16_fast(half %in) {
64396742; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
64406743; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
64416744; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
6745+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
6746+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
64426747; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
64436748;
64446749; SI-GISEL-LABEL: v_log_f16_fast:
@@ -7100,6 +7405,10 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
71007405; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
71017406; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
71027407; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
7408+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7409+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7410+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7411+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
71037412; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
71047413;
71057414; SI-GISEL-LABEL: v_log_v2f16_fast:
@@ -7365,6 +7674,12 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
73657674; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
73667675; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
73677676; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
7677+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
7678+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7679+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
7680+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
7681+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7682+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
73687683; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
73697684;
73707685; SI-GISEL-LABEL: v_log_v3f16_fast:
@@ -7691,20 +8006,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
76918006; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76928007; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
76938008; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
7694- ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
76958009; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8010+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
76968011; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
76978012; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
7698- ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
76998013; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
8014+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
77008015; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
77018016; SI-SDAG-NEXT: v_log_f32_e32 v1, v1
7702- ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
77038017; SI-SDAG-NEXT: v_log_f32_e32 v3, v3
8018+ ; SI-SDAG-NEXT: v_log_f32_e32 v2, v2
77048019; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
77058020; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317218, v1
7706- ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
77078021; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317218, v3
8022+ ; SI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3f317218, v2
8023+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
8024+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
8025+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
8026+ ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
8027+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
8028+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
8029+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
8030+ ; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
77088031; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
77098032;
77108033; SI-GISEL-LABEL: v_log_v4f16_fast:
0 commit comments