diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs
index 0b04f75a5b23..6a9a57db40bf 100644
--- a/cranelift/codegen/src/isle_prelude.rs
+++ b/cranelift/codegen/src/isle_prelude.rs
@@ -27,12 +27,38 @@ macro_rules! isle_common_prelude_methods {
             self.checked_add_with_type(ty, a, b).is_none()
         }
 
+        #[inline]
+        fn imm64_clz(&mut self, ty: Type, a: Imm64) -> Imm64 {
+            let bits = ty.bits();
+            assert!(bits <= 64);
+            // Narrow `iconst` immediates are masked to their type's width,
+            // so the upper `64 - bits` bits of `a_v` are zero and this
+            // subtraction cannot underflow.
+            let clz_offset = 64 - bits;
+            let a_v: u64 = a.bits().cast_unsigned();
+            let lz = a_v.leading_zeros() - clz_offset;
+            Imm64::new(i64::from(lz))
+        }
+
+        #[inline]
+        fn imm64_ctz(&mut self, ty: Type, a: Imm64) -> Imm64 {
+            let bits = ty.bits();
+            assert!(bits <= 64);
+            let a_v: u64 = a.bits().cast_unsigned();
+            if a_v == 0 {
+                // ctz(0) is defined to be the number of bits in the type.
+                Imm64::new(i64::from(bits))
+            } else {
+                let tz = a_v.trailing_zeros();
+                Imm64::new(i64::from(tz))
+            }
+        }
+
         #[inline]
         fn imm64_sdiv(&mut self, ty: Type, x: Imm64, y: Imm64) -> Option<Imm64> {
             // Sign extend `x` and `y`.
-            let shift = u32::checked_sub(64, ty.bits()).unwrap_or(0);
-            let x = (x.bits() << shift) >> shift;
-            let y = (y.bits() << shift) >> shift;
+            let type_width = ty.bits();
+            assert!(type_width <= 64);
+            let x = x.sign_extend_from_width(type_width).bits();
+            let y = y.sign_extend_from_width(type_width).bits();
+            let shift = 64 - type_width;
 
             // NB: We can't rely on `checked_div` to detect `ty::MIN / -1`
             // (which overflows and should trap) because we are working with
@@ -44,9 +70,23 @@ macro_rules! isle_common_prelude_methods {
                 return None;
             }
 
-            let ty_mask = self.ty_mask(ty) as i64;
-            let result = x.checked_div(y)? & ty_mask;
-            Some(Imm64::new(result))
+            let result = x.checked_div(y)?;
+            Some(Imm64::new(result).mask_to_width(type_width))
+        }
+
+        #[inline]
+        fn imm64_srem(&mut self, ty: Type, x: Imm64, y: Imm64) -> Option<Imm64> {
+            // Sign extend `x` and `y`.
+            let type_width = ty.bits();
+            assert!(type_width <= 64);
+            let x = x.sign_extend_from_width(type_width).bits();
+            let y = y.sign_extend_from_width(type_width).bits();
+
+            // `iN::MIN % -1` is defined to be 0 in wasm, so it must be
+            // folded rather than rejected. Rust's `checked_rem` reports
+            // overflow for `i64::MIN % -1`, so check for division by zero
+            // ourselves and use `wrapping_rem`, which yields the expected 0.
+            if y == 0 {
+                return None;
+            }
+            let result = x.wrapping_rem(y);
+            Some(Imm64::new(result).mask_to_width(type_width))
+        }
 
         #[inline]
diff --git a/cranelift/codegen/src/opts/arithmetic.isle b/cranelift/codegen/src/opts/arithmetic.isle
index 48578730bc07..aba392058def 100644
--- a/cranelift/codegen/src/opts/arithmetic.isle
+++ b/cranelift/codegen/src/opts/arithmetic.isle
@@ -339,5 +339,3 @@
 ;; (x + y) - y --> x
 (rule (simplify (isub ty (iadd ty x y) x)) y)
 (rule (simplify (isub ty (iadd ty x y) y)) x)
-
-
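A quick way to sanity-check the folding arithmetic above is to restate it over plain integers. This is only an illustrative sketch, with `u64`/`u32` values standing in for Cranelift's `Type` and `Imm64`; the real helpers are the ones in the diff above:

```rust
// Sketch of the clz/ctz folding arithmetic, over plain integers.

/// Leading zeros of `a` within a `bits`-wide integer. Assumes the upper
/// `64 - bits` bits of `a` are zero, as Cranelift guarantees for narrow
/// `iconst` immediates.
fn clz(bits: u32, a: u64) -> u32 {
    debug_assert!(bits == 64 || a >> bits == 0);
    a.leading_zeros() - (64 - bits)
}

/// Trailing zeros of `a` within a `bits`-wide integer; ctz(0) is defined
/// as the type's bit width.
fn ctz(bits: u32, a: u64) -> u32 {
    if a == 0 { bits } else { a.trailing_zeros() }
}

fn main() {
    assert_eq!(clz(16, 51), 10); // matches the cprop.clif expectation below
    assert_eq!(ctz(16, 48), 4);
    assert_eq!(ctz(16, 0), 16);
}
```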
diff --git a/cranelift/codegen/src/opts/cprop.isle b/cranelift/codegen/src/opts/cprop.isle
index 2e9ea307ec0f..e6ec5c84db40 100644
--- a/cranelift/codegen/src/opts/cprop.isle
+++ b/cranelift/codegen/src/opts/cprop.isle
@@ -1,5 +1,16 @@
 ;; Constant propagation.
 
+(rule (simplify
+        (clz (fits_in_64 ty)
+             (iconst ty kx)))
+      (subsume (iconst ty (imm64_clz ty kx))))
+
+
+(rule (simplify
+        (ctz (fits_in_64 ty)
+             (iconst ty kx)))
+      (subsume (iconst ty (imm64_ctz ty kx))))
+
 (rule (simplify (iadd (fits_in_64 ty)
                       (iconst ty (u64_from_imm64 k1))
@@ -20,16 +31,28 @@
 (rule (simplify_skeleton
        (sdiv (iconst ty k1)
-             (iconst _ k2)))
+             (iconst ty k2)))
   (if-let d (imm64_sdiv ty k1 k2))
   (iconst ty d))
 
+(rule (simplify_skeleton
+       (srem (iconst ty k1)
+             (iconst ty k2)))
+  (if-let d (imm64_srem ty k1 k2))
+  (iconst ty d))
+
 (rule (simplify_skeleton
        (udiv (iconst_u ty k1)
              (iconst_u ty k2)))
   (if-let d (u64_checked_div k1 k2))
   (iconst ty (imm64_masked ty d)))
 
+(rule (simplify_skeleton
+       (urem (iconst_u ty k1)
+             (iconst_u ty k2)))
+  (if-let d (u64_checked_rem k1 k2))
+  (iconst ty (imm64_masked ty d)))
+
 (rule (simplify (bor (fits_in_64 ty)
                      (iconst ty (u64_from_imm64 k1))
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 20a61cde2e67..f6493c7987f7 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -73,6 +73,9 @@
 (decl pure partial imm64_sdiv (Type Imm64 Imm64) Imm64)
 (extern constructor imm64_sdiv imm64_sdiv)
 
+(decl pure partial imm64_srem (Type Imm64 Imm64) Imm64)
+(extern constructor imm64_srem imm64_srem)
+
 (decl pure imm64_shl (Type Imm64 Imm64) Imm64)
 (extern constructor imm64_shl imm64_shl)
 
@@ -96,6 +99,12 @@
 (decl pure imm64_icmp (Type IntCC Imm64 Imm64) Imm64)
 (extern constructor imm64_icmp imm64_icmp)
 
+(decl pure imm64_clz (Type Imm64) Imm64)
+(extern constructor imm64_clz imm64_clz)
+
+(decl pure imm64_ctz (Type Imm64) Imm64)
+(extern constructor imm64_ctz imm64_ctz)
+
 ;; Each of these extractors tests whether the upper half of the input equals the
 ;; lower half of the input
 (decl u128_replicated_u64 (u64) u128)
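`imm64_srem` is declared `pure partial` (like `imm64_sdiv`) because the fold can fail, in which case the rule leaves the instruction alone; `imm64_clz` and `imm64_ctz` are total and always produce a constant. A minimal sketch of the srem fold's semantics over plain `i64` values, assuming the operands were already sign-extended from the type's width:

```rust
// Sketch of the srem fold semantics, with plain `i64` standing in for
// `Imm64`. The `Option` return mirrors the `pure partial` declaration.
fn srem(x: i64, y: i64) -> Option<i64> {
    if y == 0 {
        return None; // srem by zero traps; don't fold
    }
    // `wrapping_rem` yields 0 for `i64::MIN % -1`, matching wasm's
    // definition, where `checked_rem` would report overflow.
    Some(x.wrapping_rem(y))
}

fn main() {
    assert_eq!(srem(-17, 5), Some(-2)); // remainder takes the dividend's sign
    assert_eq!(srem(17, -5), Some(2));  // ...not the divisor's
    assert_eq!(srem(i64::MIN, -1), Some(0));
    assert_eq!(srem(7, 0), None);
}
```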
diff --git a/cranelift/filetests/filetests/egraph/cprop.clif b/cranelift/filetests/filetests/egraph/cprop.clif
index 87559027a07c..fd3d88cb2202 100644
--- a/cranelift/filetests/filetests/egraph/cprop.clif
+++ b/cranelift/filetests/filetests/egraph/cprop.clif
@@ -22,6 +22,43 @@ block0:
 ; check: v3 = iconst.i16 -2
 ; nextln: return v3
 
+function %f0() -> i8 {
+block0:
+    v1 = iconst.i8 51
+    v2 = clz.i8 v1
+    return v2
+}
+
+; check: v3 = iconst.i8 2
+; nextln: return v3
+
+function %f0() -> i16 {
+block0:
+    v1 = iconst.i16 51
+    v2 = clz.i16 v1
+    return v2
+}
+
+; check: v3 = iconst.i16 10
+; nextln: return v3
+
+function %f0() -> i16 {
+block0:
+    v1 = iconst.i16 48
+    v2 = ctz.i16 v1
+    return v2
+}
+
+; check: v3 = iconst.i16 4
+; nextln: return v3
+
+function %f0() -> i16 {
+block0:
+    v1 = iconst.i16 0
+    v2 = ctz.i16 v1
+    return v2
+}
+
+; check: v3 = iconst.i16 16
+; nextln: return v3
+
 function %ishl() -> i8 {
 block0:
     v0 = iconst.i8 1
diff --git a/cranelift/filetests/filetests/egraph/skeleton.clif b/cranelift/filetests/filetests/egraph/skeleton.clif
index 81289c74e886..a1c3c79f3a75 100644
--- a/cranelift/filetests/filetests/egraph/skeleton.clif
+++ b/cranelift/filetests/filetests/egraph/skeleton.clif
@@ -95,6 +95,21 @@ block0:
 ; return v18  ; v18 = 1
 ; }
 
+function %cprop_urem() -> i32 {
+block0:
+    v0 = iconst.i32 13
+    v1 = iconst.i32 7
+    v2 = urem v0, v1
+    return v2
+}
+
+; function %cprop_urem() -> i32 fast {
+; block0:
+;     v37 = iconst.i32 6
+;     v2 -> v37
+;     return v37  ; v37 = 6
+; }
+
 function %cprop_sdiv() -> i32 {
 block0:
     v0 = iconst.i32 -7
@@ -110,6 +125,68 @@ block0:
 ; return v11  ; v11 = -1
 ; }
 
+function %cprop_sdiv_i8_min() -> i8 {
+block0:
+    v0 = iconst.i8 -128
+    v1 = iconst.i8 -1
+    v2 = sdiv v0, v1
+    return v2
+}
+
+; function %cprop_sdiv_i8_min() -> i8 fast {
+; block0:
+;     v0 = iconst.i8 -128
+;     v1 = iconst.i8 -1
+;     v2 = sdiv v0, v1  ; v0 = -128, v1 = -1
+;     return v2
+; }
+
+function %cprop_srem_i8_min() -> i8 {
+block0:
+    v0 = iconst.i8 -128
+    v1 = iconst.i8 -1
+    v2 = srem v0, v1
+    return v2
+}
+
+; function %cprop_srem_i8_min() -> i8 fast {
+; block0:
+;     v3 = iconst.i8 0
+;     v2 -> v3
+;     return v3  ; v3 = 0
+; }
+
+function %cprop_srem_i64_min() -> i64 {
+block0:
+    v0 = iconst.i64 -9223372036854775808
+    v1 = iconst.i64 -1
+    v2 = srem v0, v1
+    return v2
+}
+
+; function %cprop_srem_i64_min() -> i64 fast {
+; block0:
+;     v3 = iconst.i64 0
+;     v2 -> v3
+;     return v3  ; v3 = 0
+; }
+
+function %cprop_srem() -> i32 {
+block0:
+    v0 = iconst.i32 -17
+    v1 = iconst.i32 7
+    v2 = srem v0, v1
+    return v2
+}
+
+; function %cprop_srem() -> i32 fast {
+; block0:
+;     v28 = iconst.i32 -3
+;     v2 -> v28
+;     return v28  ; v28 = -3
+; }
+
+
 function %udiv_by_one(i32) -> i32 {
 block0(v0: i32):
     v1 = iconst.i32 1
@@ -228,4 +305,3 @@ block0:
 ; v2 = uadd_overflow_trap v0, v1, user42  ; v0 = -1, v1 = 1
 ; return v2
 ; }
-
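The new runtest file below exercises `srem` at every integer width. The narrow-width cases work because the Rust helper sign-extends each `Imm64` from the type's width before taking the remainder and masks the result back afterwards. Here is a sketch of that round-trip, with hypothetical free functions standing in for the `Imm64::sign_extend_from_width` and `mask_to_width` methods the diff relies on:

```rust
// Stand-in for `Imm64::sign_extend_from_width`: same shift trick the old
// `imm64_sdiv` used.
fn sign_extend_from_width(v: i64, width: u32) -> i64 {
    let shift = 64 - width;
    (v << shift) >> shift // arithmetic right shift replicates the sign bit
}

// Stand-in for `Imm64::mask_to_width`: clear bits above the type's width.
fn mask_to_width(v: i64, width: u32) -> i64 {
    if width >= 64 { v } else { v & ((1i64 << width) - 1) }
}

fn main() {
    // i8 -128 is stored masked as 0x80; extend, take the remainder, mask back.
    let x = sign_extend_from_width(0x80, 8); // -128
    let y = sign_extend_from_width(0xFF, 8); // -1
    let r = mask_to_width(x.wrapping_rem(y), 8);
    assert_eq!(r, 0); // %cprop_srem_i8_min above folds to 0
}
```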
diff --git a/cranelift/filetests/filetests/runtests/srem_opts.clif b/cranelift/filetests/filetests/runtests/srem_opts.clif
new file mode 100644
index 000000000000..8ba1aa13a929
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/srem_opts.clif
@@ -0,0 +1,405 @@
+test interpret
+test run
+set opt_level=none
+target aarch64
+target x86_64
+target x86_64 has_avx
+target s390x
+target riscv64
+target riscv64 has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
+
+set opt_level=speed
+target aarch64
+target x86_64
+target x86_64 has_avx
+target s390x
+target riscv64
+target riscv64 has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
+
+
+function %srem_i64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_i64(0, 1) == 0
+; run: %srem_i64(2, 2) == 0
+; run: %srem_i64(1, -1) == 0
+; run: %srem_i64(3, 2) == 1
+; run: %srem_i64(19, 7) == 5
+; run: %srem_i64(3, -2) == 1
+; run: %srem_i64(-19, 7) == -5
+; run: %srem_i64(-57, -5) == -2
+; run: %srem_i64(0, 104857600000) == 0
+; run: %srem_i64(104857600000, 511) == 398
+; run: %srem_i64(0xC0FFEEEE_DECAFFFF, 8) == -1
+; run: %srem_i64(0xC0FFEEEE_DECAFFFF, -8) == -1
+; run: %srem_i64(0x80000000_00000000, -2) == 0
+
+function %srem_i32(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_i32(0, 1) == 0
+; run: %srem_i32(2, 2) == 0
+; run: %srem_i32(1, -1) == 0
+; run: %srem_i32(3, 2) == 1
+; run: %srem_i32(19, 7) == 5
+; run: %srem_i32(3, -2) == 1
+; run: %srem_i32(-19, 7) == -5
+; run: %srem_i32(0, 13) == 0
+; run: %srem_i32(1048576, 8192) == 0
+; run: %srem_i32(-1024, 255) == -4
+; run: %srem_i32(0xC0FFEEEE, 8) == -2
+; run: %srem_i32(0xC0FFEEEE, -8) == -2
+; run: %srem_i32(0x80000000, -2) == 0
+
+function %srem_i16(i16, i16) -> i16 {
+block0(v0: i16, v1: i16):
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_i16(0, 1) == 0
+; run: %srem_i16(2, 2) == 0
+; run: %srem_i16(1, -1) == 0
+; run: %srem_i16(3, 2) == 1
+; run: %srem_i16(19, 7) == 5
+; run: %srem_i16(3, -2) == 1
+; run: %srem_i16(13, 5) == 3
+; run: %srem_i16(0, 42) == 0
+; run: %srem_i16(4, -2) == 0
+; run: %srem_i16(-19, 7) == -5
+; run: %srem_i16(0xC0FF, 8) == -1
+; run: %srem_i16(0xC0FF, -8) == -1
+; run: %srem_i16(0x8000, -2) == 0
+
+function %srem_i8(i8, i8) -> i8 {
+block0(v0: i8, v1: i8):
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_i8(0, 1) == 0
+; run: %srem_i8(2, 2) == 0
+; run: %srem_i8(1, -1) == 0
+; run: %srem_i8(2, 7) == 2
+; run: %srem_i8(3, 2) == 1
+; run: %srem_i8(19, 7) == 5
+; run: %srem_i8(3, -2) == 1
+; run: %srem_i8(-19, 7) == -5
+; run: %srem_i8(0xC0, 8) == 0
+; run: %srem_i8(0xC0, -8) == 0
+; run: %srem_i8(0x80, -2) == 0
+
+
+function %srem_imm_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = srem_imm v0, 3
+    return v1
+}
+; run: %srem_imm_i64(0) == 0
+; run: %srem_imm_i64(1) == 1
+; run: %srem_imm_i64(2) == 2
+; run: %srem_imm_i64(3) == 0
+; run: %srem_imm_i64(19) == 1
+; run: %srem_imm_i64(-19) == -1
+; run: %srem_imm_i64(-57) == 0
+; run: %srem_imm_i64(104857600000) == 1
+; run: %srem_imm_i64(0xC0FFEEEE_DECAFFFF) == -1
+; run: %srem_imm_i64(0x80000000_00000000) == -2
+
+function %srem_imm_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = srem_imm v0, 3
+    return v1
+}
+; run: %srem_imm_i32(0) == 0
+; run: %srem_imm_i32(1) == 1
+; run: %srem_imm_i32(2) == 2
+; run: %srem_imm_i32(3) == 0
+; run: %srem_imm_i32(4) == 1
+; run: %srem_imm_i32(19) == 1
+; run: %srem_imm_i32(-19) == -1
+; run: %srem_imm_i32(-42) == 0
+; run: %srem_imm_i32(1057) == 1
+; run: %srem_imm_i32(0xC0FFEEEE) == -2
+
+function %srem_imm_i16(i16) -> i16 {
+block0(v0: i16):
+    v1 = srem_imm v0, 3
+    return v1
+}
+; run: %srem_imm_i16(0) == 0
+; run: %srem_imm_i16(1) == 1
+; run: %srem_imm_i16(2) == 2
+; run: %srem_imm_i16(3) == 0
+; run: %srem_imm_i16(4) == 1
+; run: %srem_imm_i16(19) == 1
+; run: %srem_imm_i16(-19) == -1
+; run: %srem_imm_i16(0xC0FF) == -1
+; run: %srem_imm_i16(0x8000) == -2
+
+function %srem_imm_i8(i8) -> i8 {
+block0(v0: i8):
+    v1 = srem_imm v0, 3
+    return v1
+}
+; run: %srem_imm_i8(0) == 0
+; run: %srem_imm_i8(1) == 1
+; run: %srem_imm_i8(2) == 2
+; run: %srem_imm_i8(3) == 0
+; run: %srem_imm_i8(19) == 1
+; run: %srem_imm_i8(-19) == -1
+; run: %srem_imm_i8(0xC0) == -1
+; run: %srem_imm_i8(0x80) == -2
+
+function %srem_with_bmask(i64, i8) -> i8 {
+block0(v0: i64, v1: i8):
+    v2 = bmask.i8 v0
+    v3 = srem v2, v1
+    return v3
+}
+; run: %srem_with_bmask(4352, -1) == 0
+; run: %srem_with_bmask(4352, 1) == 0
+
+; === Constant Propagation Tests (Guaranteed Optimization) ===
+
+; Basic constant folding tests - these should be optimized to constants
+function %const_srem_basic_i64() -> i64 {
+block0:
+    v0 = iconst.i64 17
+    v1 = iconst.i64 5
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_basic_i64() == 2
+
+function %const_srem_negative_dividend_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -17
+    v1 = iconst.i64 5
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_negative_dividend_i64() == -2
+
+function %const_srem_negative_divisor_i64() -> i64 {
+block0:
+    v0 = iconst.i64 17
+    v1 = iconst.i64 -5
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_negative_divisor_i64() == 2
+
+function %const_srem_both_negative_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -17
+    v1 = iconst.i64 -5
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_both_negative_i64() == -2
+
+; Power of 2 divisors - these should be optimized to bit operations
+function %const_srem_pow2_8_i64() -> i64 {
+block0:
+    v0 = iconst.i64 100
+    v1 = iconst.i64 8
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_pow2_8_i64() == 4
+
+function %const_srem_pow2_16_i64() -> i64 {
+block0:
+    v0 = iconst.i64 100
+    v1 = iconst.i64 16
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_pow2_16_i64() == 4
+
+function %const_srem_pow2_negative_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -100
+    v1 = iconst.i64 8
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_pow2_negative_i64() == -4
+
+; Large constants that test overflow boundaries
+function %const_srem_large_i64() -> i64 {
+block0:
+    v0 = iconst.i64 0x7FFFFFFFFFFFFFFE ; i64::MAX - 1
+    v1 = iconst.i64 0x7FFFFFFFFFFFFFFF ; i64::MAX
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_large_i64() == 0x7FFFFFFFFFFFFFFE
+
+; Test with 32-bit constants
+function %const_srem_basic_i32() -> i32 {
+block0:
+    v0 = iconst.i32 2147483647 ; i32::MAX
+    v1 = iconst.i32 1000000
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_basic_i32() == 483647
+
+function %const_srem_pow2_i32() -> i32 {
+block0:
+    v0 = iconst.i32 1000
+    v1 = iconst.i32 64
+    v2 = srem v0, v1
+    return v2
+}
+; run: %const_srem_pow2_i32() == 40
+
+; Test immediate forms with guaranteed constant optimization
+function %const_srem_imm_pow2_i64() -> i64 {
+block0:
+    v0 = iconst.i64 1000
+    v1 = srem_imm v0, 16
+    return v1
+}
+; run: %const_srem_imm_pow2_i64() == 8
+
+function %const_srem_imm_non_pow2_i64() -> i64 {
+block0:
+    v0 = iconst.i64 1000
+    v1 = srem_imm v0, 7
+    return v1
+}
+; run: %const_srem_imm_non_pow2_i64() == 6
+
+function %const_srem_imm_negative_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -1000
+    v1 = srem_imm v0, 7
+    return v1
+}
+; run: %const_srem_imm_negative_i64() == -6
+
+; === Additional Edge Cases ===
+
+; Test zero dividend
+function %srem_zero_dividend_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = iconst.i64 0
+    v2 = srem v1, v0
+    return v2
+}
+; run: %srem_zero_dividend_i64(5) == 0
+; run: %srem_zero_dividend_i64(-5) == 0
+; run: %srem_zero_dividend_i64(1) == 0
+; run: %srem_zero_dividend_i64(-1) == 0
+
+; Test remainder that equals dividend (divisor > dividend)
+function %srem_larger_divisor_i64() -> i64 {
+block0:
+    v0 = iconst.i64 5
+    v1 = iconst.i64 10
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_larger_divisor_i64() == 5
+
+function %srem_larger_divisor_negative_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -5
+    v1 = iconst.i64 10
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_larger_divisor_negative_i64() == -5
+
+; Test powers of 2 with various patterns
+function %srem_pow2_pattern_test_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -72340172838076674 ; 0xFEFE_FEFE_FEFE_FEFE
+    v1 = iconst.i64 256
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_pow2_pattern_test_i64() == -2
+
+function %srem_pow2_pattern_test2_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -72340172838076673 ; 0xFEFE_FEFE_FEFE_FEFF
+    v1 = iconst.i64 256
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_pow2_pattern_test2_i64() == -1
+
+; Test mixed sign edge cases
+function %srem_mixed_signs_edge1_i64() -> i64 {
+block0:
+    v0 = iconst.i64 -1
+    v1 = iconst.i64 0x7FFFFFFFFFFFFFFF
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_mixed_signs_edge1_i64() == -1
+
+function %srem_mixed_signs_edge2_i64() -> i64 {
+block0:
+    v0 = iconst.i64 0x7FFFFFFFFFFFFFFF
+    v1 = iconst.i64 -2
+    v2 = srem v0, v1
+    return v2
+}
+; run: %srem_mixed_signs_edge2_i64() == 1
+
+
+function %srem_imm_pow2_more_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = srem_imm v0, 32
+    return v1
+}
+; run: %srem_imm_pow2_more_i64(100) == 4
+; run: %srem_imm_pow2_more_i64(-100) == -4
+; run: %srem_imm_pow2_more_i64(31) == 31
+; run: %srem_imm_pow2_more_i64(-31) == -31
+
+function %srem_imm_pow2_more_i32(i32) -> i32 {
+block0(v0: i32):
+    v1 = srem_imm v0, 64
+    return v1
+}
+; run: %srem_imm_pow2_more_i32(100) == 36
+; run: %srem_imm_pow2_more_i32(-100) == -36
+; run: %srem_imm_pow2_more_i32(63) == 63
+; run: %srem_imm_pow2_more_i32(-63) == -63
+
+; Test larger non-power-of-2 immediates
+function %srem_imm_large_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = srem_imm v0, 1337
+    return v1
+}
+; run: %srem_imm_large_i64(10000) == 641
+; run: %srem_imm_large_i64(-10000) == -641
+; run: %srem_imm_large_i64(1336) == 1336
+; run: %srem_imm_large_i64(-1336) == -1336
+
+function %srem_imm_prime_i64(i64) -> i64 {
+block0(v0: i64):
+    v1 = srem_imm v0, 97
+    return v1
+}
+; run: %srem_imm_prime_i64(1000) == 30
+; run: %srem_imm_prime_i64(-1000) == -30
+; run: %srem_imm_prime_i64(96) == 96
+; run: %srem_imm_prime_i64(-96) == -96
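Because `srem` uses truncated (round-toward-zero) division, the remainder takes the sign of the dividend, which is the same semantics as Rust's `%` operator. A few of the run expectations above can be spot-checked directly:

```rust
// Spot-checking run expectations from srem_opts.clif with Rust's `%`,
// which shares srem's truncated-division semantics.
fn main() {
    assert_eq!(19i64 % 7, 5);
    assert_eq!(-19i64 % 7, -5); // sign follows the dividend...
    assert_eq!(17i64 % -5, 2);  // ...never the divisor
    assert_eq!(10000i64 % 1337, 641);
    assert_eq!(0x7FFFFFFFFFFFFFFEi64 % 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFE);
}
```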