@@ -18,6 +18,8 @@ declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
1818declare <2 x i16 > @llvm.smax.v2i16 (<2 x i16 >, <2 x i16 >)
1919declare <4 x i16 > @llvm.smin.v4i16 (<4 x i16 >, <4 x i16 >)
2020declare <4 x i16 > @llvm.smax.v4i16 (<4 x i16 >, <4 x i16 >)
; <3 x i16> signed min/max intrinsics, called by @basic_smax_smin_vec_v3i16.
21+ declare <3 x i16 > @llvm.smin.v3i16 (<3 x i16 >, <3 x i16 >)
22+ declare <3 x i16 > @llvm.smax.v3i16 (<3 x i16 >, <3 x i16 >)
2123declare <8 x i16 > @llvm.smin.v8i16 (<8 x i16 >, <8 x i16 >)
2224declare <8 x i16 > @llvm.smax.v8i16 (<8 x i16 >, <8 x i16 >)
2325
@@ -2428,3 +2430,181 @@ define i32 @basic_smax_smin_v4i16_input_2(<4 x i16> %src) {
24282430 %cast = bitcast <4 x i8 > %vec.trunc to i32
24292431 ret i32 %cast
24302432}
2433+
; Test: clamp every lane of a <3 x i16> input to the range [0, 255] using
; llvm.smin (against 255) followed by llvm.smax (against 0), truncate each
; lane to i8, and return the three bytes reinterpreted as a single i24.
;
; The check lines below pin the expected instruction sequence for eight check
; prefixes: SDAG-{VI,GFX9,GFX11,GFX12} and GISEL-{VI,GFX9,GFX11,GFX12}
; (presumably the SelectionDAG and GlobalISel selectors on each target
; generation - confirm against the RUN lines at the top of the file).
; NOTE(review): the check lines look tool-generated (likely
; update_llc_test_checks.py); regenerate them rather than editing by hand.
2434+ define i24 @basic_smax_smin_vec_v3i16 (<3 x i16 > %src ) {
2435+ ; SDAG-VI-LABEL: basic_smax_smin_vec_v3i16:
2436+ ; SDAG-VI: ; %bb.0:
2437+ ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2438+ ; SDAG-VI-NEXT: v_mov_b32_e32 v3, s4
2439+ ; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
2440+ ; SDAG-VI-NEXT: v_min_i16_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2441+ ; SDAG-VI-NEXT: v_mov_b32_e32 v4, s4
2442+ ; SDAG-VI-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2443+ ; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0
2444+ ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
2445+ ; SDAG-VI-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2446+ ; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0
2447+ ; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1
2448+ ; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0
2449+ ; SDAG-VI-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2450+ ; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v2
2451+ ; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2452+ ; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2453+ ; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
2454+ ;
2455+ ; SDAG-GFX9-LABEL: basic_smax_smin_vec_v3i16:
2456+ ; SDAG-GFX9: ; %bb.0:
2457+ ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2458+ ; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
2459+ ; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
2460+ ; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
2461+ ; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s4
2462+ ; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2463+ ; SDAG-GFX9-NEXT: v_pk_max_i16 v1, v1, 0
2464+ ; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
2465+ ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v2
2466+ ; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
2467+ ; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
2468+ ; SDAG-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2469+ ; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2470+ ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
2471+ ;
2472+ ; SDAG-GFX11-LABEL: basic_smax_smin_vec_v3i16:
2473+ ; SDAG-GFX11: ; %bb.0:
2474+ ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2475+ ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
2476+ ; SDAG-GFX11-NEXT: v_pk_min_i16 v1, 0xff, v1
2477+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2478+ ; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
2479+ ; SDAG-GFX11-NEXT: v_pk_max_i16 v1, v1, 0
2480+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2481+ ; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2482+ ; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2483+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2484+ ; SDAG-GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
2485+ ; SDAG-GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
2486+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2487+ ; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v2
2488+ ; SDAG-GFX11-NEXT: v_or_b32_e32 v1, v1, v3
2489+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2490+ ; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
2491+ ; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2492+ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2493+ ; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
2494+ ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
2495+ ;
2496+ ; SDAG-GFX12-LABEL: basic_smax_smin_vec_v3i16:
2497+ ; SDAG-GFX12: ; %bb.0:
2498+ ; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
2499+ ; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
2500+ ; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
2501+ ; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
2502+ ; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
2503+ ; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
2504+ ; SDAG-GFX12-NEXT: v_pk_min_i16 v1, 0xff, v1
2505+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2506+ ; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
2507+ ; SDAG-GFX12-NEXT: v_pk_max_i16 v1, v1, 0
2508+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2509+ ; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2510+ ; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2511+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2512+ ; SDAG-GFX12-NEXT: v_lshlrev_b16 v2, 8, v2
2513+ ; SDAG-GFX12-NEXT: v_lshlrev_b16 v3, 8, v3
2514+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2515+ ; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v2
2516+ ; SDAG-GFX12-NEXT: v_or_b32_e32 v1, v1, v3
2517+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2518+ ; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
2519+ ; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2520+ ; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2521+ ; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
2522+ ; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
2523+ ;
2524+ ; GISEL-VI-LABEL: basic_smax_smin_vec_v3i16:
2525+ ; GISEL-VI: ; %bb.0:
2526+ ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2527+ ; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff
2528+ ; GISEL-VI-NEXT: v_min_i16_e32 v3, 0xff, v0
2529+ ; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2530+ ; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0
2531+ ; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0
2532+ ; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
2533+ ; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v3
2534+ ; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
2535+ ; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1
2536+ ; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2537+ ; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xffff
2538+ ; GISEL-VI-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2539+ ; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2540+ ; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
2541+ ;
2542+ ; GISEL-GFX9-LABEL: basic_smax_smin_vec_v3i16:
2543+ ; GISEL-GFX9: ; %bb.0:
2544+ ; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545+ ; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff00ff
2546+ ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v2, v0
2547+ ; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
2548+ ; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0
2549+ ; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
2550+ ; GISEL-GFX9-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2551+ ; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2552+ ; GISEL-GFX9-NEXT: s_mov_b32 s4, 0xffff
2553+ ; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2554+ ; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2555+ ; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2556+ ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
2557+ ;
2558+ ; GISEL-GFX11-LABEL: basic_smax_smin_vec_v3i16:
2559+ ; GISEL-GFX11: ; %bb.0:
2560+ ; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2561+ ; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
2562+ ; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
2563+ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2564+ ; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0
2565+ ; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
2566+ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2567+ ; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2568+ ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
2569+ ; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
2570+ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2571+ ; GISEL-GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
2572+ ; GISEL-GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
2573+ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2574+ ; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v2
2575+ ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
2576+ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2577+ ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2578+ ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
2579+ ;
2580+ ; GISEL-GFX12-LABEL: basic_smax_smin_vec_v3i16:
2581+ ; GISEL-GFX12: ; %bb.0:
2582+ ; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
2583+ ; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
2584+ ; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
2585+ ; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
2586+ ; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
2587+ ; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
2588+ ; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff
2589+ ; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2590+ ; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0
2591+ ; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1
2592+ ; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2593+ ; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
2594+ ; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
2595+ ; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
2596+ ; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2597+ ; GISEL-GFX12-NEXT: v_and_b32_e32 v2, 0xff, v2
2598+ ; GISEL-GFX12-NEXT: v_lshlrev_b16 v2, 8, v2
2599+ ; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2600+ ; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v2
2601+ ; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
2602+ ; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2603+ ; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2604+ ; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
; Upper bound: per-lane signed min against the splat constant 255.
2605+ %smin = call <3 x i16 > @llvm.smin.v3i16 (<3 x i16 > <i16 255 , i16 255 , i16 255 >, <3 x i16 > %src )
; Lower bound: per-lane signed max against zero, so each lane ends up in [0, 255].
2606+ %smed = call <3 x i16 > @llvm.smax.v3i16 (<3 x i16 > <i16 0 , i16 0 , i16 0 >, <3 x i16 > %smin )
; Every clamped lane fits in 8 bits, so this truncation discards no set bits.
2607+ %vec.trunc = trunc <3 x i16 > %smed to <3 x i8 >
; Reinterpret the three i8 lanes as one 24-bit integer return value.
2608+ %cast = bitcast <3 x i8 > %vec.trunc to i24
2609+ ret i24 %cast
2610+ }
0 commit comments