From e01ffb6d5764db23ef30e1f45878316a5460987e Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 21 Sep 2025 19:13:06 -0700 Subject: [PATCH 1/2] Emulate all 128-bit ops I thought we could use the fallback code from compiler-builtins, but that didn't work out: most 128-bit operations were broken when routed through it. The emulation routines here were AI-generated rather than written by hand. This approach inlines a lot of code at every call site; it may be better to generate one helper fn per operation and have every call site call that instead. I added an example and confirmed it builds and passes. --- .github/workflows/ci_windows.yml | 4 +- Cargo.toml | 2 + crates/rustc_codegen_nvvm/libintrinsics.bc | Bin 8388 -> 6196 bytes crates/rustc_codegen_nvvm/libintrinsics.ll | 107 --- crates/rustc_codegen_nvvm/src/builder.rs | 306 +------ .../src/builder/emulate_i128.rs | 800 ++++++++++++++++++ .../rustc_codegen_nvvm/src/ctx_intrinsics.rs | 62 -- crates/rustc_codegen_nvvm/src/intrinsic.rs | 9 + examples/cuda/i128_demo/Cargo.toml | 10 + examples/cuda/i128_demo/build.rs | 18 + examples/cuda/i128_demo/kernels/Cargo.toml | 10 + examples/cuda/i128_demo/kernels/src/lib.rs | 48 ++ examples/cuda/i128_demo/src/main.rs | 293 +++++++ 13 files changed, 1230 insertions(+), 439 deletions(-) create mode 100644 crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs create mode 100644 examples/cuda/i128_demo/Cargo.toml create mode 100644 examples/cuda/i128_demo/build.rs create mode 100644 examples/cuda/i128_demo/kernels/Cargo.toml create mode 100644 examples/cuda/i128_demo/kernels/src/lib.rs create mode 100644 examples/cuda/i128_demo/src/main.rs diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml index c166f025..8917729a 100644 --- a/.github/workflows/ci_windows.yml +++ b/.github/workflows/ci_windows.yml @@ -79,7 +79,7 @@ jobs: run: cargo build --all-features -p cust_raw - name: Build - run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "sha2*" + run: cargo build --workspace --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "sha2*" --exclude "i128*" # Don't currently test because many tests rely on the system having a CUDA GPU # - name: Test @@ -88,7 +88,7 @@ - name: Check documentation env: RUSTDOCFLAGS: -Dwarnings - run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "sha2*" --exclude "cust_raw" + run: cargo doc --workspace --all-features --document-private-items --no-deps --exclude "optix*" --exclude "path-tracer" --exclude "denoiser" --exclude "vecadd*" --exclude "gemm*" --exclude "ex*" --exclude "cudnn*" --exclude "sha2*" --exclude "i128*" --exclude "cust_raw" # Disabled due to dll issues, someone with Windows knowledge needed # - name: Compiletest # run: cargo run -p compiletests --release --no-default-features -- --target-arch compute_61,compute_70,compute_90 diff --git a/Cargo.toml b/Cargo.toml index 18931ef0..56592723 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ members = [ "examples/cuda/path_tracer/kernels", "examples/cuda/sha2_crates_io", "examples/cuda/sha2_crates_io/kernels", + "examples/cuda/i128_demo", + "examples/cuda/i128_demo/kernels", "examples/optix/*", "tests/compiletests", diff --git a/crates/rustc_codegen_nvvm/libintrinsics.bc
b/crates/rustc_codegen_nvvm/libintrinsics.bc index c22e92db132d0ac424c4695c5f309a0d7c924b5e..ec9a764226ad055991744097f1fbb820e4b63ee9 100644 GIT binary patch delta 1238 zcmb7@+iMd+6voePX47rl)=7+Xb3xX$HYu$d*Qx;#lBhMMt<=jyy|iv>8>_ZbQM@5$ zOkxRXk+dsPrAUQXiwI(?qCUk^P!SdMK}8>|BKQZ0_@vHmv-qMy=i&R#`Sv%n%rG+@ zy5rAQusx>1O79B$Mm9V&FPy`&J-~|%7I|wrlC?|g>(bZ2A%{_YX?D;v-5lC-c3>v? zC4;I4&zw5w0sz?)DV93UA=Q&hQTd#@nzh>+GXwKdxkh~Y2vVBaU=)p_yX!Csa>;=2!&VpGnN<+D%~TJRzyelIYTp$vDHH zcqI>np|Cc{;#W@d8-((DMez& zkwv<(@8K!{+|L}P*-DwhB zU8A}xnsR&0GeJ<7TgtgMqhedlsvuM%SD%SdON9#LFp9E5!7ZePHp^8qm1Y;4O!A8QrK*OYoSU#cq`OQT5j>ef5;?8hiLpz9ch4UqL zt|b){T-Vm!#P^mPpJ*dBKe?~A{luR$`;HtWwNK6ZA3N#Q-bJ5`&2-_TlP22hxq;L} zHfPmpS}L4Q!7T)^fRu6-5@*e^Pu0-&1qWi&Tj%lR6BFwI4p w8*cDBY3`=E;Rb)Kz#DGx=LDJcFZta$1@}$#))Dr3LZ|k2v{{R30 delta 3187 zcmbuAUu;uV9LLXXZ*On6ZrpAaI;ctM>ZoSMtpjWnP1}uGbS8EBFbU9Z4D3jOvC;9L zdpatVQlO0Yc;+zmA-=DGP`-pYZb1Zggtc=rG&!0BsE*ed2<}HpNPCTP! znch04DH&tP93yp%JNX&y70xnR^vBIB42o^Z^Y~c770I{$G_jdsSkY!cm{_C>FRxk3 zrCHXa#3(Nq7HRz&WCIXN#FLDJVFUn8ZPQ<&GYIhjEcUU@Xq^YK0F0U6o3R5S4S=J5 zXPu`;K(sO>(x%lhYE;#+UR}p?>N-A9*KsR1R#wN^cW|Jrjvrh4on>`As&j~Dr4YG} znyRH0rR&j|;&@%@dOZD#_E_0^?Ecv6D_f7@?^Sg?>{He8>Z<-msDnQLQUAnJ$KtX& zHdR?GOY1oE1$Vrxj=oLWZ_4U0f9~xmtE2OW&`pFa^ttF|u|JIT@A4p{93o`Q%&pF^ z>41>vQU9mQ`a_7#10%2hkhY_kL9oRh>(6J%>(_#(W*M8jW+Hp62ce1Ex)(IeF%VvD?WqJ^Nj=7?e!dohC{b+#Dr7>n-32Q zqoI7{x(Q8GO$oh1HQB5GD?h)${K*C8Us+)O_yY5D|DC_YAg%vYBF12w#lP9c*2T5t zJ909G5|$ z@~MVf2-R~2vfJ{{wtJ1!VV|%eR0vO_nbl^d>8@b(aFz|_*EX^{ zr*!S924g?K{-z4e%z)a|ux!2FEvTxcV%xTl-UZNkPsb>p)^Y)x=;%loixlymlM6S3n8 z7xqQ4^A0yYy#qI5{hV(fZW`A+TMporU+Tk04r2Fr`tj?B@S30Xb6OF5{?a?!61eRY zAC4c#{tNtgA&EP#@pB_79JpKj)7V1J7%i#1Mmu67N3`_uS~-Mtq~E5*0n|cztl8 zQ1CY?_XP$2OUnI4=s?0+Wdcx0cvG1G6#N?$KPdRODSlA!4=R2|#p0%mDG8vEkR+V$ zVNjUhJmE_S2c?{EMCdBv^sgNh{4<1G2?v$^YF(lGQgs|9=&C!T# sR#bxBgJGE#JAelyGr@94!V!py9SQI09_SAahND}zMzYBzpO91W9~QdwhX4Qo diff --git a/crates/rustc_codegen_nvvm/libintrinsics.ll b/crates/rustc_codegen_nvvm/libintrinsics.ll index d9cb5e2d..5da51b20 100644 --- a/crates/rustc_codegen_nvvm/libintrinsics.ll +++ b/crates/rustc_codegen_nvvm/libintrinsics.ll @@ -239,113 +239,6 @@ start: } declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) #0 -; This is a bit weird, we need to use functions defined in rust crates (compiler_builtins) -; as intrinsics in the codegen, but we can't directly use their name, otherwise we will have -; really odd and incorrect behavior in the crate theyre defined in. So we need to make a wrapper for them that is opaque -; to the codegen, which is what this is doing. 
- -define {<2 x i64>, i1} @__nvvm_i128_addo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_i128_addo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_i128_addo(<2 x i64>, <2 x i64>) #0 - -define {<2 x i64>, i1} @__nvvm_u128_addo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_u128_addo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_u128_addo(<2 x i64>, <2 x i64>) #0 - -define {<2 x i64>, i1} @__nvvm_i128_subo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_i128_subo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_i128_subo(<2 x i64>, <2 x i64>) #0 - -define {<2 x i64>, i1} @__nvvm_u128_subo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_u128_subo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_u128_subo(<2 x i64>, <2 x i64>) #0 - -define {<2 x i64>, i1} @__nvvm_i128_mulo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_i128_mulo(<2 x i64>, <2 x i64>) #0 - -define {<2 x i64>, i1} @__nvvm_u128_mulo(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64> %0, <2 x i64> %1) - ret {<2 x i64>, i1} %2 -} -declare {<2 x i64>, i1} @__rust_u128_mulo(<2 x i64>, <2 x i64>) #0 - -; Division operations from compiler-builtins -define <2 x i64> @__nvvm_divti3(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call <2 x i64> @__divti3(<2 x i64> %0, <2 x i64> %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__divti3(<2 x i64>, <2 x i64>) #0 - -define <2 x i64> @__nvvm_udivti3(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call <2 x i64> @__udivti3(<2 x i64> %0, <2 x i64> %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__udivti3(<2 x i64>, <2 x i64>) #0 - -; Remainder operations from compiler-builtins -define <2 x i64> @__nvvm_modti3(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call <2 x i64> @__modti3(<2 x i64> %0, <2 x i64> %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__modti3(<2 x i64>, <2 x i64>) #0 - -define <2 x i64> @__nvvm_umodti3(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call <2 x i64> @__umodti3(<2 x i64> %0, <2 x i64> %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__umodti3(<2 x i64>, <2 x i64>) #0 - -; Multiplication from compiler-builtins -define <2 x i64> @__nvvm_multi3(<2 x i64>, <2 x i64>) #0 { -start: - %2 = call <2 x i64> @__multi3(<2 x i64> %0, <2 x i64> %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__multi3(<2 x i64>, <2 x i64>) #0 - -; Shift operations from compiler-builtins -define <2 x i64> @__nvvm_ashlti3(<2 x i64>, i32) #0 { -start: - %2 = call <2 x i64> @__ashlti3(<2 x i64> %0, i32 %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__ashlti3(<2 x i64>, i32) #0 - -define <2 x i64> @__nvvm_ashrti3(<2 x i64>, i32) #0 { -start: - %2 = call <2 x i64> @__ashrti3(<2 x i64> %0, i32 %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__ashrti3(<2 x i64>, i32) #0 - -define <2 x i64> @__nvvm_lshrti3(<2 x i64>, i32) #0 { -start: - %2 = call <2 x i64> @__lshrti3(<2 x i64> %0, i32 %1) - ret <2 x i64> %2 -} -declare <2 x i64> @__lshrti3(<2 x i64>, i32) #0 - ; Required because we need to explicitly generate { i32, i1 } for the following intrinsics ; except rustc will not generate them (it will make { i32, i8 }) which libnvvm rejects. 
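Note: the compiler-builtins wrappers removed above are replaced by inline emulation in builder/emulate_i128.rs (added below), which lowers every 128-bit op to arithmetic on two u64 limbs. As a rough mental model of the IR those builder methods emit, here is a minimal host-side sketch in plain Rust; it is illustrative only, and the helper names (add128, mul_u64_wide) are invented for this sketch rather than taken from the codegen.

// Illustrative model of the emulation strategy: an i128 is treated as two u64
// limbs (lo, hi). Addition propagates a carry from the low limb into the high
// limb; multiplication builds the 128-bit product of two u64 values from
// 32-bit partial products, mirroring what mul_u64_to_u128 does in IR.
fn add128(a: (u64, u64), b: (u64, u64)) -> (u64, u64) {
    let (lo, carry) = a.0.overflowing_add(b.0);
    let hi = a.1.wrapping_add(b.1).wrapping_add(carry as u64);
    (lo, hi)
}

fn mul_u64_wide(a: u64, b: u64) -> (u64, u64) {
    // Split each operand into 32-bit halves and combine the partial products,
    // tracking the carries that escape the low 64 bits.
    let (a_lo, a_hi) = (a & 0xFFFF_FFFF, a >> 32);
    let (b_lo, b_hi) = (b & 0xFFFF_FFFF, b >> 32);
    let (p0, p1, p2, p3) = (a_lo * b_lo, a_lo * b_hi, a_hi * b_lo, a_hi * b_hi);
    let (cross, cross_carry) = p1.overflowing_add(p2);
    let (lo, lo_carry) = p0.overflowing_add(cross << 32);
    let hi = p3
        .wrapping_add(cross >> 32)
        .wrapping_add((cross_carry as u64) << 32)
        .wrapping_add(lo_carry as u64);
    (lo, hi)
}

fn main() {
    // Cross-check the sketch against native u128 arithmetic on the host.
    let (a, b) = (0x0123_4567_89ab_cdef_u64, 0xfedc_ba98_7654_3210_u64);
    let (lo, hi) = mul_u64_wide(a, b);
    assert_eq!(((hi as u128) << 64) | lo as u128, a as u128 * b as u128);
    let (slo, shi) = add128((u64::MAX, 1), (1, 2));
    assert_eq!(
        ((shi as u128) << 64) | slo as u128,
        (u64::MAX as u128 + (1u128 << 64)) + (1 + (2u128 << 64))
    );
}

The checked, shift, and division paths in emulate_i128.rs follow the same limb decomposition, with unsigned division implemented as a bit-at-a-time shift-subtract loop and signed division reduced to it via sign handling.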
diff --git a/crates/rustc_codegen_nvvm/src/builder.rs b/crates/rustc_codegen_nvvm/src/builder.rs index bee8d182..05f879ac 100644 --- a/crates/rustc_codegen_nvvm/src/builder.rs +++ b/crates/rustc_codegen_nvvm/src/builder.rs @@ -32,6 +32,8 @@ use crate::int_replace::{get_transformed_type, transmute_llval}; use crate::llvm::{self, BasicBlock, Type, Value}; use crate::ty::LayoutLlvmExt; +mod emulate_i128; + pub(crate) enum CountZerosKind { Leading, Trailing, @@ -133,7 +135,7 @@ impl<'ll, 'tcx> Deref for Builder<'_, 'll, 'tcx> { macro_rules! imath_builder_methods { ($($self_:ident.$name:ident($($arg:ident),*) => $llvm_capi:ident => $op:block)+) => { $(fn $name(&mut $self_, $($arg: &'ll Value),*) -> &'ll Value { - // Dispatch to i128 emulation or `compiler_builtins`-based intrinsic + // Dispatch to i128 emulation when any operand is 128 bits wide. if $($self_.is_i128($arg))||* $op else { @@ -319,40 +321,30 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { self.unchecked_usub(a, b) => LLVMBuildNUWSub => { self.emulate_i128_sub(a, b) } self.unchecked_ssub(a, b) => LLVMBuildNSWSub => { self.emulate_i128_sub(a, b) } - self.mul(a, b) => LLVMBuildMul => { self.call_intrinsic("__nvvm_multi3", &[a, b]) } - self.unchecked_umul(a, b) => LLVMBuildNUWMul => { - self.call_intrinsic("__nvvm_multi3", &[a, b]) - } - self.unchecked_smul(a, b) => LLVMBuildNSWMul => { - self.call_intrinsic("__nvvm_multi3", &[a, b]) - } + self.mul(a, b) => LLVMBuildMul => { self.emulate_i128_mul(a, b) } + self.unchecked_umul(a, b) => LLVMBuildNUWMul => { self.emulate_i128_mul(a, b) } + self.unchecked_smul(a, b) => LLVMBuildNSWMul => { self.emulate_i128_mul(a, b) } - self.udiv(a, b) => LLVMBuildUDiv => { self.call_intrinsic("__nvvm_udivti3", &[a, b]) } - self.exactudiv(a, b) => LLVMBuildExactUDiv => { - self.call_intrinsic("__nvvm_udivti3", &[a, b]) - } - self.sdiv(a, b) => LLVMBuildSDiv => { self.call_intrinsic("__nvvm_divti3", &[a, b]) } - self.exactsdiv(a, b) => LLVMBuildExactSDiv => { - self.call_intrinsic("__nvvm_divti3", &[a, b]) - } - self.urem(a, b) => LLVMBuildURem => { self.call_intrinsic("__nvvm_umodti3", &[a, b]) } - self.srem(a, b) => LLVMBuildSRem => { self.call_intrinsic("__nvvm_modti3", &[a, b]) } + self.udiv(a, b) => LLVMBuildUDiv => { self.emulate_i128_udiv(a, b) } + self.exactudiv(a, b) => LLVMBuildExactUDiv => { self.emulate_i128_udiv(a, b) } + self.sdiv(a, b) => LLVMBuildSDiv => { self.emulate_i128_sdiv(a, b) } + self.exactsdiv(a, b) => LLVMBuildExactSDiv => { self.emulate_i128_sdiv(a, b) } + self.urem(a, b) => LLVMBuildURem => { self.emulate_i128_urem(a, b) } + self.srem(a, b) => LLVMBuildSRem => { self.emulate_i128_srem(a, b) } self.shl(a, b) => LLVMBuildShl => { - // Convert shift amount to i32 for compiler-builtins. let b = self.trunc(b, self.type_i32()); - self.call_intrinsic("__nvvm_ashlti3", &[a, b]) + self.emulate_i128_shl(a, b) } self.lshr(a, b) => LLVMBuildLShr => { - // Convert shift amount to i32 for compiler-builtins. let b = self.trunc(b, self.type_i32()); - self.call_intrinsic("__nvvm_lshrti3", &[a, b]) + self.emulate_i128_lshr(a, b) } self.ashr(a, b) => LLVMBuildAShr => { - // Convert shift amount to i32 for compiler-builtins. 
let b = self.trunc(b, self.type_i32()); - self.call_intrinsic("__nvvm_ashrti3", &[a, b]) + self.emulate_i128_ashr(a, b) } + self.and(a, b) => LLVMBuildAnd => { self.emulate_i128_and(a, b) } self.or(a, b) => LLVMBuildOr => { self.emulate_i128_or(a, b) } self.xor(a, b) => LLVMBuildXor => { self.emulate_i128_xor(a, b) } @@ -404,19 +396,39 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { _ => panic!("tried to get overflow intrinsic for op applied to non-int type"), }; + match (oop, new_kind) { + (OverflowOp::Add, Int(I128)) => { + return self.emulate_i128_add_with_overflow(lhs, rhs, true); + } + (OverflowOp::Add, Uint(U128)) => { + return self.emulate_i128_add_with_overflow(lhs, rhs, false); + } + (OverflowOp::Sub, Int(I128)) => { + return self.emulate_i128_sub_with_overflow(lhs, rhs, true); + } + (OverflowOp::Sub, Uint(U128)) => { + return self.emulate_i128_sub_with_overflow(lhs, rhs, false); + } + (OverflowOp::Mul, Int(I128)) => { + return self.emulate_i128_mul_with_overflow(lhs, rhs, true); + } + (OverflowOp::Mul, Uint(U128)) => { + return self.emulate_i128_mul_with_overflow(lhs, rhs, false); + } + _ => {} + } + let name = match oop { OverflowOp::Add => match new_kind { Int(I8) => "__nvvm_i8_addo", Int(I16) => "llvm.sadd.with.overflow.i16", Int(I32) => "llvm.sadd.with.overflow.i32", Int(I64) => "llvm.sadd.with.overflow.i64", - Int(I128) => "__nvvm_i128_addo", Uint(U8) => "__nvvm_u8_addo", Uint(U16) => "llvm.uadd.with.overflow.i16", Uint(U32) => "llvm.uadd.with.overflow.i32", Uint(U64) => "llvm.uadd.with.overflow.i64", - Uint(U128) => "__nvvm_u128_addo", _ => unreachable!(), }, OverflowOp::Sub => match new_kind { @@ -424,13 +436,11 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { Int(I16) => "llvm.ssub.with.overflow.i16", Int(I32) => "llvm.ssub.with.overflow.i32", Int(I64) => "llvm.ssub.with.overflow.i64", - Int(I128) => "__nvvm_i128_subo", Uint(U8) => "__nvvm_u8_subo", Uint(U16) => "llvm.usub.with.overflow.i16", Uint(U32) => "llvm.usub.with.overflow.i32", Uint(U64) => "llvm.usub.with.overflow.i64", - Uint(U128) => "__nvvm_u128_subo", _ => unreachable!(), }, @@ -439,13 +449,11 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { Int(I16) => "llvm.smul.with.overflow.i16", Int(I32) => "llvm.smul.with.overflow.i32", Int(I64) => "llvm.smul.with.overflow.i64", - Int(I128) => "__nvvm_i128_mulo", Uint(U8) => "__nvvm_u8_mulo", Uint(U16) => "llvm.umul.with.overflow.i16", Uint(U32) => "llvm.umul.with.overflow.i32", Uint(U64) => "llvm.umul.with.overflow.i64", - Uint(U128) => "__nvvm_u128_mulo", _ => unreachable!(), }, @@ -1333,244 +1341,6 @@ impl<'ll> StaticBuilderMethods for Builder<'_, 'll, '_> { } impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { - // Helper function to check if a value is 128-bit integer - fn is_i128(&self, val: &'ll Value) -> bool { - let ty = self.val_ty(val); - if unsafe { llvm::LLVMRustGetTypeKind(ty) == llvm::TypeKind::Integer } { - unsafe { llvm::LLVMGetIntTypeWidth(ty) == 128 } - } else { - false - } - } - - // Helper to split i128 into low and high u64 parts - fn split_i128(&mut self, val: &'ll Value) -> (&'ll Value, &'ll Value) { - let i64_ty = self.type_i64(); - let const_64 = self.const_u128(64); - - let lo = self.trunc(val, i64_ty); - let shifted = unsafe { llvm::LLVMBuildLShr(self.llbuilder, val, const_64, UNNAMED) }; - let hi = self.trunc(shifted, i64_ty); - - (lo, hi) - } - - // Helper to combine two u64 values into i128 - fn combine_i128(&mut self, lo: &'ll Value, hi: &'ll Value) -> &'ll 
Value { - let i128_ty = self.type_i128(); - let const_64 = self.const_u128(64); - - let lo_ext = self.zext(lo, i128_ty); - let hi_ext = self.zext(hi, i128_ty); - let hi_shifted = unsafe { llvm::LLVMBuildShl(self.llbuilder, hi_ext, const_64, UNNAMED) }; - unsafe { llvm::LLVMBuildOr(self.llbuilder, lo_ext, hi_shifted, UNNAMED) } - } - - // Emulate 128-bit addition using two 64-bit additions with carry - fn emulate_i128_add(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let i64_ty = self.type_i64(); - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - // Add low parts - let sum_lo = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - - // Check for carry from low addition - let carry = self.icmp(IntPredicate::IntULT, sum_lo, lhs_lo); - let carry_ext = self.zext(carry, i64_ty); - - // Add high parts with carry - let sum_hi_temp = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - let sum_hi = unsafe { llvm::LLVMBuildAdd(self.llbuilder, sum_hi_temp, carry_ext, UNNAMED) }; - - self.combine_i128(sum_lo, sum_hi) - } - - // Emulate 128-bit subtraction - fn emulate_i128_sub(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let i64_ty = self.type_i64(); - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - // Subtract low parts - let diff_lo = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - - // Check for borrow - let borrow = self.icmp(IntPredicate::IntUGT, rhs_lo, lhs_lo); - let borrow_ext = self.zext(borrow, i64_ty); - - // Subtract high parts with borrow - let diff_hi_temp = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - let diff_hi = - unsafe { llvm::LLVMBuildSub(self.llbuilder, diff_hi_temp, borrow_ext, UNNAMED) }; - - self.combine_i128(diff_lo, diff_hi) - } - - // Emulate 128-bit bitwise AND - fn emulate_i128_and(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - let and_lo = unsafe { llvm::LLVMBuildAnd(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - let and_hi = unsafe { llvm::LLVMBuildAnd(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - - self.combine_i128(and_lo, and_hi) - } - - // Emulate 128-bit bitwise OR - fn emulate_i128_or(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - let or_lo = unsafe { llvm::LLVMBuildOr(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - let or_hi = unsafe { llvm::LLVMBuildOr(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - - self.combine_i128(or_lo, or_hi) - } - - // Emulate 128-bit bitwise XOR - fn emulate_i128_xor(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - let xor_lo = unsafe { llvm::LLVMBuildXor(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - let xor_hi = unsafe { llvm::LLVMBuildXor(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - - self.combine_i128(xor_lo, xor_hi) - } - - // Emulate 128-bit bitwise NOT - fn emulate_i128_not(&mut self, val: &'ll Value) -> &'ll Value { - let (lo, hi) = self.split_i128(val); - - let not_lo = unsafe { llvm::LLVMBuildNot(self.llbuilder, lo, UNNAMED) }; - let not_hi = unsafe { llvm::LLVMBuildNot(self.llbuilder, hi, UNNAMED) }; - - self.combine_i128(not_lo, not_hi) - } - - // Emulate 128-bit negation (two's 
complement) - fn emulate_i128_neg(&mut self, val: &'ll Value) -> &'ll Value { - // Two's complement: ~val + 1 - let not_val = self.emulate_i128_not(val); - let one = self.const_u128(1); - self.emulate_i128_add(not_val, one) - } - - pub(crate) fn emulate_i128_bswap(&mut self, val: &'ll Value) -> &'ll Value { - // Split the 128-bit value into two 64-bit halves - let (lo, hi) = self.split_i128(val); - - // Byte-swap each 64-bit half using the LLVM intrinsic (which exists in LLVM 7.1) - let swapped_lo = self.call_intrinsic("llvm.bswap.i64", &[lo]); - let swapped_hi = self.call_intrinsic("llvm.bswap.i64", &[hi]); - - // Swap the halves: the high part becomes low and vice versa - self.combine_i128(swapped_hi, swapped_lo) - } - - pub(crate) fn emulate_i128_count_zeros( - &mut self, - val: &'ll Value, - kind: CountZerosKind, - is_nonzero: bool, - ) -> &'ll Value { - // Split the 128-bit value into two 64-bit halves - let (lo, hi) = self.split_i128(val); - - match kind { - CountZerosKind::Leading => { - // Count leading zeros: check high part first - let hi_is_zero = self.icmp(IntPredicate::IntEQ, hi, self.const_u64(0)); - let hi_ctlz = - self.call_intrinsic("llvm.ctlz.i64", &[hi, self.const_bool(is_nonzero)]); - let lo_ctlz = - self.call_intrinsic("llvm.ctlz.i64", &[lo, self.const_bool(is_nonzero)]); - - // If high part is zero, result is 64 + ctlz(lo), otherwise ctlz(hi) - let lo_ctlz_plus_64 = self.add(lo_ctlz, self.const_u64(64)); - let result_64 = self.select(hi_is_zero, lo_ctlz_plus_64, hi_ctlz); - - // Zero-extend to i128 - self.zext(result_64, self.type_i128()) - } - CountZerosKind::Trailing => { - // Count trailing zeros: check low part first - let lo_is_zero = self.icmp(IntPredicate::IntEQ, lo, self.const_u64(0)); - let lo_cttz = - self.call_intrinsic("llvm.cttz.i64", &[lo, self.const_bool(is_nonzero)]); - let hi_cttz = - self.call_intrinsic("llvm.cttz.i64", &[hi, self.const_bool(is_nonzero)]); - - // If low part is zero, result is 64 + cttz(hi), otherwise cttz(lo) - let hi_cttz_plus_64 = self.add(hi_cttz, self.const_u64(64)); - let result_64 = self.select(lo_is_zero, hi_cttz_plus_64, lo_cttz); - - // Zero-extend to i128 - self.zext(result_64, self.type_i128()) - } - } - } - - pub(crate) fn emulate_i128_ctpop(&mut self, val: &'ll Value) -> &'ll Value { - // Split the 128-bit value into two 64-bit halves - let (lo, hi) = self.split_i128(val); - - // Count population (number of 1 bits) in each half - let lo_popcount = self.call_intrinsic("llvm.ctpop.i64", &[lo]); - let hi_popcount = self.call_intrinsic("llvm.ctpop.i64", &[hi]); - - // Add the two counts - let total_64 = self.add(lo_popcount, hi_popcount); - - // Zero-extend to i128 - self.zext(total_64, self.type_i128()) - } - - pub(crate) fn emulate_i128_rotate( - &mut self, - val: &'ll Value, - shift: &'ll Value, - is_left: bool, - ) -> &'ll Value { - // Rotate is implemented as: (val << shift) | (val >> (128 - shift)) - // For rotate right: (val >> shift) | (val << (128 - shift)) - - // Ensure shift is i128 - let shift_128 = if self.val_ty(shift) == self.type_i128() { - shift - } else { - self.zext(shift, self.type_i128()) - }; - - // Calculate 128 - shift for the complementary shift - let bits_128 = self.const_u128(128); - let shift_complement = self.sub(bits_128, shift_128); - - // Perform the two shifts - let (first_shift, second_shift) = if is_left { - (self.shl(val, shift_128), self.lshr(val, shift_complement)) - } else { - (self.lshr(val, shift_128), self.shl(val, shift_complement)) - }; - - // Combine with OR - 
self.or(first_shift, second_shift) - } - - pub(crate) fn emulate_i128_bitreverse(&mut self, val: &'ll Value) -> &'ll Value { - // Split the 128-bit value into two 64-bit halves - let (lo, hi) = self.split_i128(val); - - // Reverse bits in each half using the 64-bit intrinsic - let reversed_lo = self.call_intrinsic("llvm.bitreverse.i64", &[lo]); - let reversed_hi = self.call_intrinsic("llvm.bitreverse.i64", &[hi]); - - // Swap the halves: reversed high becomes low and vice versa - self.combine_i128(reversed_hi, reversed_lo) - } - fn with_cx(cx: &'a CodegenCx<'ll, 'tcx>) -> Self { // Create a fresh builder from the crate context. let llbuilder = unsafe { llvm::LLVMCreateBuilderInContext(cx.llcx) }; diff --git a/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs b/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs new file mode 100644 index 00000000..1ba12717 --- /dev/null +++ b/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs @@ -0,0 +1,800 @@ +use rustc_codegen_ssa::common::IntPredicate; +use rustc_codegen_ssa::traits::*; + +use crate::llvm::{self, Value}; + +use super::{Builder, CountZerosKind, UNNAMED}; + +impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { + pub(super) fn is_i128(&self, val: &'ll Value) -> bool { + let ty = self.val_ty(val); + if ty == self.type_i128() { + return true; + } + if unsafe { llvm::LLVMRustGetTypeKind(ty) == llvm::TypeKind::Integer } { + unsafe { llvm::LLVMGetIntTypeWidth(ty) == 128 } + } else { + false + } + } + + // Helper to split i128 into low and high u64 parts + fn split_i128(&mut self, val: &'ll Value) -> (&'ll Value, &'ll Value) { + let vec_ty = self.type_vector(self.type_i64(), 2); + let bitcast = unsafe { llvm::LLVMBuildBitCast(self.llbuilder, val, vec_ty, UNNAMED) }; + let lo = unsafe { + llvm::LLVMBuildExtractElement(self.llbuilder, bitcast, self.cx.const_i32(0), UNNAMED) + }; + let hi = unsafe { + llvm::LLVMBuildExtractElement(self.llbuilder, bitcast, self.cx.const_i32(1), UNNAMED) + }; + + (lo, hi) + } + + fn uadd_with_overflow_i64( + &mut self, + lhs: &'ll Value, + rhs: &'ll Value, + ) -> (&'ll Value, &'ll Value) { + let call = self.call_intrinsic("llvm.uadd.with.overflow.i64", &[lhs, rhs]); + (self.extract_value(call, 0), self.extract_value(call, 1)) + } + + fn usub_with_overflow_i64( + &mut self, + lhs: &'ll Value, + rhs: &'ll Value, + ) -> (&'ll Value, &'ll Value) { + let call = self.call_intrinsic("llvm.usub.with.overflow.i64", &[lhs, rhs]); + (self.extract_value(call, 0), self.extract_value(call, 1)) + } + + // Helper to combine two u64 values into i128 + fn combine_i128(&mut self, lo: &'ll Value, hi: &'ll Value) -> &'ll Value { + let vec_ty = self.type_vector(self.type_i64(), 2); + let mut vec = self.const_undef(vec_ty); + vec = unsafe { + llvm::LLVMBuildInsertElement(self.llbuilder, vec, lo, self.cx.const_i32(0), UNNAMED) + }; + vec = unsafe { + llvm::LLVMBuildInsertElement(self.llbuilder, vec, hi, self.cx.const_i32(1), UNNAMED) + }; + unsafe { llvm::LLVMBuildBitCast(self.llbuilder, vec, self.type_i128(), UNNAMED) } + } + + // Multiply two u64 values and return the full 128-bit product as (lo, hi) + fn mul_u64_to_u128(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> (&'ll Value, &'ll Value) { + let i64_ty = self.type_i64(); + let i32_ty = self.type_i32(); + let shift_32 = self.const_u64(32); + let mask_32 = self.const_u64(0xFFFF_FFFF); + + // Split the operands into 32-bit halves + let lhs_lo32 = self.trunc(lhs, i32_ty); + let lhs_shifted = self.lshr(lhs, shift_32); + let lhs_hi32 = self.trunc(lhs_shifted, i32_ty); + let rhs_lo32 = 
self.trunc(rhs, i32_ty); + let rhs_shifted = self.lshr(rhs, shift_32); + let rhs_hi32 = self.trunc(rhs_shifted, i32_ty); + + // Extend halves to 64 bits for the partial products + let lhs_lo64 = self.zext(lhs_lo32, i64_ty); + let lhs_hi64 = self.zext(lhs_hi32, i64_ty); + let rhs_lo64 = self.zext(rhs_lo32, i64_ty); + let rhs_hi64 = self.zext(rhs_hi32, i64_ty); + + // Compute partial products (32-bit x 32-bit -> 64-bit) + let p0 = unsafe { llvm::LLVMBuildMul(self.llbuilder, lhs_lo64, rhs_lo64, UNNAMED) }; + let p1 = unsafe { llvm::LLVMBuildMul(self.llbuilder, lhs_lo64, rhs_hi64, UNNAMED) }; + let p2 = unsafe { llvm::LLVMBuildMul(self.llbuilder, lhs_hi64, rhs_lo64, UNNAMED) }; + let p3 = unsafe { llvm::LLVMBuildMul(self.llbuilder, lhs_hi64, rhs_hi64, UNNAMED) }; + + // Sum cross terms and track the carry that escapes the low 64 bits + let (cross, cross_carry_bit) = self.uadd_with_overflow_i64(p1, p2); + + let cross_low = self.and(cross, mask_32); + let cross_low_shifted = self.shl(cross_low, shift_32); + let (lo, lo_carry_bit) = self.uadd_with_overflow_i64(p0, cross_low_shifted); + + let cross_high = self.lshr(cross, shift_32); + let cross_carry_ext = self.zext(cross_carry_bit, i64_ty); + let cross_carry_high = self.shl(cross_carry_ext, shift_32); + let cross_total_high = self.add(cross_high, cross_carry_high); + + let (hi_temp, _) = self.uadd_with_overflow_i64(p3, cross_total_high); + let lo_carry = self.zext(lo_carry_bit, i64_ty); + let (hi, _) = self.uadd_with_overflow_i64(hi_temp, lo_carry); + + (lo, hi) + } + + // Emulate 128-bit addition using two 64-bit additions with carry + pub(super) fn emulate_i128_add(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let i64_ty = self.type_i64(); + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + // Add low parts + let sum_lo = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; + + // Check for carry from low addition + let carry = self.icmp(IntPredicate::IntULT, sum_lo, lhs_lo); + let carry_ext = self.zext(carry, i64_ty); + + // Add high parts with carry + let sum_hi_temp = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; + let sum_hi = unsafe { llvm::LLVMBuildAdd(self.llbuilder, sum_hi_temp, carry_ext, UNNAMED) }; + + self.combine_i128(sum_lo, sum_hi) + } + + // Emulate 128-bit subtraction + pub(super) fn emulate_i128_sub(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let i64_ty = self.type_i64(); + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + // Subtract low parts + let diff_lo = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; + + // Check for borrow + let borrow = self.icmp(IntPredicate::IntUGT, rhs_lo, lhs_lo); + let borrow_ext = self.zext(borrow, i64_ty); + + // Subtract high parts with borrow + let diff_hi_temp = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; + let diff_hi = + unsafe { llvm::LLVMBuildSub(self.llbuilder, diff_hi_temp, borrow_ext, UNNAMED) }; + + self.combine_i128(diff_lo, diff_hi) + } + + // Emulate 128-bit bitwise AND + pub(super) fn emulate_i128_and(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let and_lo = unsafe { llvm::LLVMBuildAnd(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; + let and_hi = unsafe { llvm::LLVMBuildAnd(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; + + self.combine_i128(and_lo, 
and_hi) + } + + // Emulate 128-bit bitwise OR + pub(super) fn emulate_i128_or(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let or_lo = unsafe { llvm::LLVMBuildOr(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; + let or_hi = unsafe { llvm::LLVMBuildOr(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; + + self.combine_i128(or_lo, or_hi) + } + + // Emulate 128-bit bitwise XOR + pub(super) fn emulate_i128_xor(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let xor_lo = unsafe { llvm::LLVMBuildXor(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; + let xor_hi = unsafe { llvm::LLVMBuildXor(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; + + self.combine_i128(xor_lo, xor_hi) + } + + // Emulate 128-bit multiplication using 64-bit partial products + pub(super) fn emulate_i128_mul(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + // Compute partial products + let (prod_ll_lo, prod_ll_hi) = self.mul_u64_to_u128(lhs_lo, rhs_lo); + let partial_ll = self.combine_i128(prod_ll_lo, prod_ll_hi); + + let (prod_lh_lo, prod_lh_hi) = self.mul_u64_to_u128(lhs_lo, rhs_hi); + let partial_lh = self.combine_i128(prod_lh_lo, prod_lh_hi); + + let (prod_hl_lo, prod_hl_hi) = self.mul_u64_to_u128(lhs_hi, rhs_lo); + let partial_hl = self.combine_i128(prod_hl_lo, prod_hl_hi); + + let shift_64 = self.cx.const_i32(64); + + // Shift cross terms by 64 bits to align them within the 128-bit space + let partial_lh_shifted = self.emulate_i128_shl(partial_lh, shift_64); + let partial_hl_shifted = self.emulate_i128_shl(partial_hl, shift_64); + + // Sum the partial products modulo 2^128 + let acc = self.emulate_i128_add(partial_ll, partial_lh_shifted); + self.emulate_i128_add(acc, partial_hl_shifted) + } + + fn emulate_i128_udivrem( + &mut self, + num: &'ll Value, + den: &'ll Value, + ) -> (&'ll Value, &'ll Value) { + let zero_i128 = self.const_u128(0); + let one_i128 = self.const_u128(1); + let one_i32 = self.cx.const_i32(1); + let i32_ty = self.cx.type_i32(); + let i128_ty = self.type_i128(); + + let trap_bb = self.append_sibling_block("i128_udiv_trap"); + let main_bb = self.append_sibling_block("i128_udiv_main"); + let lt_bb = self.append_sibling_block("i128_udiv_lt"); + let loop_header_bb = self.append_sibling_block("i128_udiv_loop_header"); + let loop_body_bb = self.append_sibling_block("i128_udiv_loop_body"); + let exit_bb = self.append_sibling_block("i128_udiv_exit"); + + let mut exit_bx = Self::build(self.cx, exit_bb); + let quot_res = exit_bx.phi(i128_ty, &[], &[]); + let rem_res = exit_bx.phi(i128_ty, &[], &[]); + + let den_is_zero = self.icmp(IntPredicate::IntEQ, den, zero_i128); + self.cond_br(den_is_zero, trap_bb, main_bb); + + let mut trap_bx = Self::build(self.cx, trap_bb); + trap_bx.call_intrinsic("llvm.trap", &[]); + trap_bx.unreachable(); + + let mut main_bx = Self::build(self.cx, main_bb); + let num_lt_den = main_bx.icmp(IntPredicate::IntULT, num, den); + let bit_init = main_bx.cx.const_int(i32_ty, 128); + main_bx.cond_br(num_lt_den, lt_bb, loop_header_bb); + + let mut lt_bx = Self::build(self.cx, lt_bb); + lt_bx.br(exit_bb); + exit_bx.add_incoming_to_phi(quot_res, zero_i128, lt_bb); + exit_bx.add_incoming_to_phi(rem_res, num, lt_bb); + + let mut header_bx = Self::build(self.cx, loop_header_bb); + let 
bit_phi = header_bx.phi(i32_ty, &[bit_init], &[main_bb]); + let quot_phi = header_bx.phi(i128_ty, &[zero_i128], &[main_bb]); + let rem_phi = header_bx.phi(i128_ty, &[zero_i128], &[main_bb]); + let bit_zero = header_bx.icmp(IntPredicate::IntEQ, bit_phi, header_bx.cx.const_i32(0)); + header_bx.cond_br(bit_zero, exit_bb, loop_body_bb); + exit_bx.add_incoming_to_phi(quot_res, quot_phi, loop_header_bb); + exit_bx.add_incoming_to_phi(rem_res, rem_phi, loop_header_bb); + + let mut body_bx = Self::build(self.cx, loop_body_bb); + let bit_next = body_bx.sub(bit_phi, one_i32); + let num_shifted = body_bx.emulate_i128_lshr(num, bit_next); + let bit_val = body_bx.and(num_shifted, one_i128); + let rem_shift = body_bx.emulate_i128_shl(rem_phi, one_i32); + let rem_candidate = body_bx.or(rem_shift, bit_val); + let rem_ge = body_bx.icmp(IntPredicate::IntUGE, rem_candidate, den); + let rem_sub = body_bx.emulate_i128_sub(rem_candidate, den); + let rem_new = body_bx.select(rem_ge, rem_sub, rem_candidate); + let one_shifted = body_bx.emulate_i128_shl(one_i128, bit_next); + let quot_candidate = body_bx.or(quot_phi, one_shifted); + let quot_new = body_bx.select(rem_ge, quot_candidate, quot_phi); + let loop_body_exit_bb = body_bx.llbb(); + body_bx.br(loop_header_bb); + header_bx.add_incoming_to_phi(bit_phi, bit_next, loop_body_exit_bb); + header_bx.add_incoming_to_phi(quot_phi, quot_new, loop_body_exit_bb); + header_bx.add_incoming_to_phi(rem_phi, rem_new, loop_body_exit_bb); + + *self = exit_bx; + (quot_res, rem_res) + } + + pub(super) fn emulate_i128_udiv(&mut self, num: &'ll Value, den: &'ll Value) -> &'ll Value { + self.emulate_i128_udivrem(num, den).0 + } + + pub(super) fn emulate_i128_urem(&mut self, num: &'ll Value, den: &'ll Value) -> &'ll Value { + self.emulate_i128_udivrem(num, den).1 + } + + fn emulate_i128_sdivrem( + &mut self, + num: &'ll Value, + den: &'ll Value, + ) -> (&'ll Value, &'ll Value) { + let min_i128 = self.const_u128(1u128 << 127); + let neg_one_i128 = self.const_u128(u128::MAX); + let num_is_min = self.icmp(IntPredicate::IntEQ, num, min_i128); + let den_is_neg_one = self.icmp(IntPredicate::IntEQ, den, neg_one_i128); + let overflow_case = self.and(num_is_min, den_is_neg_one); + + let trap_bb = self.append_sibling_block("i128_sdiv_overflow_trap"); + let cont_bb = self.append_sibling_block("i128_sdiv_overflow_cont"); + + self.cond_br(overflow_case, trap_bb, cont_bb); + + let mut trap_bx = Self::build(self.cx, trap_bb); + trap_bx.call_intrinsic("llvm.trap", &[]); + trap_bx.unreachable(); + + let mut bx = Self::build(self.cx, cont_bb); + + let zero_u64 = bx.const_u64(0); + let (_, num_hi) = bx.split_i128(num); + let (_, den_hi) = bx.split_i128(den); + let num_neg = bx.icmp(IntPredicate::IntSLT, num_hi, zero_u64); + let den_neg = bx.icmp(IntPredicate::IntSLT, den_hi, zero_u64); + let num_negated = bx.emulate_i128_neg(num); + let num_abs = bx.select(num_neg, num_negated, num); + let den_negated = bx.emulate_i128_neg(den); + let den_abs = bx.select(den_neg, den_negated, den); + let (quot_raw, rem_raw) = bx.emulate_i128_udivrem(num_abs, den_abs); + let sign_diff = bx.xor(num_neg, den_neg); + let quot_negated = bx.emulate_i128_neg(quot_raw); + let quot = bx.select(sign_diff, quot_negated, quot_raw); + let rem_negated = bx.emulate_i128_neg(rem_raw); + let rem = bx.select(num_neg, rem_negated, rem_raw); + + *self = bx; + (quot, rem) + } + + pub(super) fn emulate_i128_sdiv(&mut self, num: &'ll Value, den: &'ll Value) -> &'ll Value { + self.emulate_i128_sdivrem(num, den).0 + } + + pub(super) fn 
emulate_i128_srem(&mut self, num: &'ll Value, den: &'ll Value) -> &'ll Value { + self.emulate_i128_sdivrem(num, den).1 + } + + pub(super) fn emulate_i128_add_with_overflow( + &mut self, + lhs: &'ll Value, + rhs: &'ll Value, + signed: bool, + ) -> (&'ll Value, &'ll Value) { + let i64_ty = self.type_i64(); + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let (sum_lo, carry_lo) = self.uadd_with_overflow_i64(lhs_lo, rhs_lo); + let carry_lo_ext = self.zext(carry_lo, i64_ty); + + let (sum_hi_temp, carry_hi1) = self.uadd_with_overflow_i64(lhs_hi, rhs_hi); + let (sum_hi, carry_hi2) = self.uadd_with_overflow_i64(sum_hi_temp, carry_lo_ext); + + let unsigned_overflow = self.or(carry_hi1, carry_hi2); + + let overflow_flag = if signed { + let zero = self.const_u64(0); + let lhs_neg = self.icmp(IntPredicate::IntSLT, lhs_hi, zero); + let rhs_neg = self.icmp(IntPredicate::IntSLT, rhs_hi, zero); + let res_neg = self.icmp(IntPredicate::IntSLT, sum_hi, zero); + let same_sign = self.icmp(IntPredicate::IntEQ, lhs_neg, rhs_neg); + let sign_diff = self.icmp(IntPredicate::IntNE, lhs_neg, res_neg); + self.and(same_sign, sign_diff) + } else { + unsigned_overflow + }; + + let result = self.combine_i128(sum_lo, sum_hi); + (result, overflow_flag) + } + + pub(super) fn emulate_i128_sub_with_overflow( + &mut self, + lhs: &'ll Value, + rhs: &'ll Value, + signed: bool, + ) -> (&'ll Value, &'ll Value) { + let i64_ty = self.type_i64(); + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let (diff_lo, borrow_lo) = self.usub_with_overflow_i64(lhs_lo, rhs_lo); + let borrow_lo_ext = self.zext(borrow_lo, i64_ty); + + let (diff_hi_temp, borrow_hi1) = self.usub_with_overflow_i64(lhs_hi, rhs_hi); + let (diff_hi, borrow_hi2) = self.usub_with_overflow_i64(diff_hi_temp, borrow_lo_ext); + + let unsigned_overflow = self.or(borrow_hi1, borrow_hi2); + + let overflow_flag = if signed { + let zero = self.const_u64(0); + let lhs_neg = self.icmp(IntPredicate::IntSLT, lhs_hi, zero); + let rhs_neg = self.icmp(IntPredicate::IntSLT, rhs_hi, zero); + let res_neg = self.icmp(IntPredicate::IntSLT, diff_hi, zero); + let lhs_diff_rhs = self.icmp(IntPredicate::IntNE, lhs_neg, rhs_neg); + let res_diff_lhs = self.icmp(IntPredicate::IntNE, res_neg, lhs_neg); + self.and(lhs_diff_rhs, res_diff_lhs) + } else { + unsigned_overflow + }; + + let result = self.combine_i128(diff_lo, diff_hi); + (result, overflow_flag) + } + + pub(super) fn emulate_i128_mul_with_overflow( + &mut self, + lhs: &'ll Value, + rhs: &'ll Value, + signed: bool, + ) -> (&'ll Value, &'ll Value) { + let i64_ty = self.type_i64(); + let (lhs_lo, lhs_hi) = self.split_i128(lhs); + let (rhs_lo, rhs_hi) = self.split_i128(rhs); + + let (ll_lo, ll_hi) = self.mul_u64_to_u128(lhs_lo, rhs_lo); + let (lh_lo, lh_hi) = self.mul_u64_to_u128(lhs_lo, rhs_hi); + let (hl_lo, hl_hi) = self.mul_u64_to_u128(lhs_hi, rhs_lo); + let (hh_lo, hh_hi) = self.mul_u64_to_u128(lhs_hi, rhs_hi); + + let (mid_temp, carry_mid1) = self.uadd_with_overflow_i64(ll_hi, lh_lo); + let (mid_sum, carry_mid2) = self.uadd_with_overflow_i64(mid_temp, hl_lo); + let carry_mid1_ext = self.zext(carry_mid1, i64_ty); + let carry_mid2_ext = self.zext(carry_mid2, i64_ty); + let mid_carry_sum = self.add(carry_mid1_ext, carry_mid2_ext); + + let result = self.combine_i128(ll_lo, mid_sum); + + let (upper_temp0, carry_upper1) = self.uadd_with_overflow_i64(lh_hi, hl_hi); + let (upper_temp1, carry_upper2) = self.uadd_with_overflow_i64(upper_temp0, hh_lo); + 
let (upper_low, carry_upper3) = self.uadd_with_overflow_i64(upper_temp1, mid_carry_sum); + + let carry_upper1_ext = self.zext(carry_upper1, i64_ty); + let carry_upper2_ext = self.zext(carry_upper2, i64_ty); + let carry_upper3_ext = self.zext(carry_upper3, i64_ty); + let carries_sum = self.add(carry_upper1_ext, carry_upper2_ext); + let carries_sum = self.add(carries_sum, carry_upper3_ext); + + let (upper_high, carry_upper4) = self.uadd_with_overflow_i64(hh_hi, carries_sum); + + let zero = self.const_u64(0); + let upper_low_nonzero = self.icmp(IntPredicate::IntNE, upper_low, zero); + let upper_high_nonzero = self.icmp(IntPredicate::IntNE, upper_high, zero); + let upper_nonzero = self.or(upper_low_nonzero, upper_high_nonzero); + let unsigned_overflow = self.or(upper_nonzero, carry_upper4); + + let overflow_flag = if signed { + let max = self.const_u64(u64::MAX); + let (_, res_hi) = self.split_i128(result); + let res_neg = self.icmp(IntPredicate::IntSLT, res_hi, zero); + + let upper_low_zero = self.icmp(IntPredicate::IntEQ, upper_low, zero); + let upper_high_zero = self.icmp(IntPredicate::IntEQ, upper_high, zero); + let high_all_zero = self.and(upper_low_zero, upper_high_zero); + + let upper_low_ones = self.icmp(IntPredicate::IntEQ, upper_low, max); + let upper_high_ones = self.icmp(IntPredicate::IntEQ, upper_high, max); + let high_all_ones = self.and(upper_low_ones, upper_high_ones); + + let not_high_all_zero = self.not(high_all_zero); + let overflow_pos = self.or(not_high_all_zero, carry_upper4); + let not_high_all_ones = self.not(high_all_ones); + let overflow_neg = self.or(not_high_all_ones, carry_upper4); + + self.select(res_neg, overflow_neg, overflow_pos) + } else { + unsigned_overflow + }; + + (result, overflow_flag) + } + + pub(super) fn emulate_i128_shl(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { + let (lo, hi) = self.split_i128(val); + let zero_i32 = self.cx.const_i32(0); + let sixty_four = self.cx.const_i32(64); + let one_twenty_eight = self.cx.const_i32(128); + + let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); + let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); + let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, one_twenty_eight); + + let ge128_bb = self.append_sibling_block("i128_shl_ge128"); + let lt128_bb = self.append_sibling_block("i128_shl_lt128"); + let ge64_bb = self.append_sibling_block("i128_shl_ge64"); + let lt64_bb = self.append_sibling_block("i128_shl_lt64"); + let zero_bb = self.append_sibling_block("i128_shl_zero"); + let nz_bb = self.append_sibling_block("i128_shl_lt64_nz"); + let merge_bb = self.append_sibling_block("i128_shl_merge"); + + self.cond_br(shift_ge128, ge128_bb, lt128_bb); + + let mut ge128_bx = Self::build(self.cx, ge128_bb); + let ge128_res = ge128_bx.const_u128(0); + ge128_bx.br(merge_bb); + + let mut lt128_bx = Self::build(self.cx, lt128_bb); + lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); + + let mut ge64_bx = Self::build(self.cx, ge64_bb); + let shift_minus64 = ge64_bx.sub(shift, sixty_four); + let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); + let hi_from_lo = ge64_bx.shl(lo, shift_minus64_i64); + let ge64_res = ge64_bx.combine_i128(ge64_bx.const_u64(0), hi_from_lo); + ge64_bx.br(merge_bb); + + let mut lt64_bx = Self::build(self.cx, lt64_bb); + lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); + + let mut zero_bx = Self::build(self.cx, zero_bb); + zero_bx.br(merge_bb); + + let mut nz_bx = Self::build(self.cx, nz_bb); + let shift_i64 = nz_bx.zext(shift, 
nz_bx.type_i64()); + let new_lo = nz_bx.shl(lo, shift_i64); + let hi_shifted = nz_bx.shl(hi, shift_i64); + let inv = nz_bx.sub(sixty_four, shift); + let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); + let carry = nz_bx.lshr(lo, inv_i64); + let new_hi = nz_bx.or(hi_shifted, carry); + let nz_res = nz_bx.combine_i128(new_lo, new_hi); + nz_bx.br(merge_bb); + + let mut merge_bx = Self::build(self.cx, merge_bb); + let result = merge_bx.phi(self.type_i128(), &[], &[]); + merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); + merge_bx.add_incoming_to_phi(result, ge64_res, ge64_bb); + merge_bx.add_incoming_to_phi(result, val, zero_bb); + merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); + + *self = merge_bx; + result + } + + pub(super) fn emulate_i128_lshr(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { + let (lo, hi) = self.split_i128(val); + let zero_i32 = self.cx.const_i32(0); + let sixty_four = self.cx.const_i32(64); + let one_twenty_eight = self.cx.const_i32(128); + + let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); + let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); + let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, one_twenty_eight); + + let ge128_bb = self.append_sibling_block("i128_lshr_ge128"); + let lt128_bb = self.append_sibling_block("i128_lshr_lt128"); + let ge64_bb = self.append_sibling_block("i128_lshr_ge64"); + let lt64_bb = self.append_sibling_block("i128_lshr_lt64"); + let zero_bb = self.append_sibling_block("i128_lshr_zero"); + let nz_bb = self.append_sibling_block("i128_lshr_lt64_nz"); + let merge_bb = self.append_sibling_block("i128_lshr_merge"); + + self.cond_br(shift_ge128, ge128_bb, lt128_bb); + + let mut ge128_bx = Self::build(self.cx, ge128_bb); + let ge128_res = ge128_bx.const_u128(0); + ge128_bx.br(merge_bb); + + let mut lt128_bx = Self::build(self.cx, lt128_bb); + lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); + + let mut ge64_bx = Self::build(self.cx, ge64_bb); + let shift_minus64 = ge64_bx.sub(shift, sixty_four); + let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); + let new_lo = ge64_bx.lshr(hi, shift_minus64_i64); + let ge64_res = ge64_bx.combine_i128(new_lo, ge64_bx.const_u64(0)); + ge64_bx.br(merge_bb); + + let mut lt64_bx = Self::build(self.cx, lt64_bb); + lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); + + let mut zero_bx = Self::build(self.cx, zero_bb); + zero_bx.br(merge_bb); + + let mut nz_bx = Self::build(self.cx, nz_bb); + let shift_i64 = nz_bx.zext(shift, nz_bx.type_i64()); + let new_lo = nz_bx.lshr(lo, shift_i64); + let inv = nz_bx.sub(sixty_four, shift); + let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); + let carry = nz_bx.shl(hi, inv_i64); + let new_lo = nz_bx.or(new_lo, carry); + let new_hi = nz_bx.lshr(hi, shift_i64); + let nz_res = nz_bx.combine_i128(new_lo, new_hi); + nz_bx.br(merge_bb); + + let mut merge_bx = Self::build(self.cx, merge_bb); + let result = merge_bx.phi(self.type_i128(), &[], &[]); + merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); + merge_bx.add_incoming_to_phi(result, ge64_res, ge64_bb); + merge_bx.add_incoming_to_phi(result, val, zero_bb); + merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); + + *self = merge_bx; + result + } + + pub(super) fn emulate_i128_ashr(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { + let (lo, hi) = self.split_i128(val); + let zero_i32 = self.cx.const_i32(0); + let sixty_four = self.cx.const_i32(64); + let one_twenty_eight = self.cx.const_i32(128); + let zero_u64 = self.const_u64(0); + 
+ let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); + let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); + let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, one_twenty_eight); + let is_negative = self.icmp(IntPredicate::IntSLT, hi, zero_u64); + + let ge128_bb = self.append_sibling_block("i128_ashr_ge128"); + let lt128_bb = self.append_sibling_block("i128_ashr_lt128"); + let ge64_bb = self.append_sibling_block("i128_ashr_ge64"); + let lt64_bb = self.append_sibling_block("i128_ashr_lt64"); + let zero_bb = self.append_sibling_block("i128_ashr_zero"); + let nz_bb = self.append_sibling_block("i128_ashr_lt64_nz"); + let merge_bb = self.append_sibling_block("i128_ashr_merge"); + + self.cond_br(shift_ge128, ge128_bb, lt128_bb); + + let mut ge128_bx = Self::build(self.cx, ge128_bb); + let all_ones = ge128_bx.const_u128(u128::MAX); + let zero = ge128_bx.const_u128(0); + let ge128_res = ge128_bx.select(is_negative, all_ones, zero); + ge128_bx.br(merge_bb); + + let mut lt128_bx = Self::build(self.cx, lt128_bb); + lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); + + let mut ge64_bx = Self::build(self.cx, ge64_bb); + let shift_minus64 = ge64_bx.sub(shift, sixty_four); + let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); + let new_lo = ge64_bx.ashr(hi, shift_minus64_i64); + let fill_hi = ge64_bx.select( + is_negative, + ge64_bx.const_u64(u64::MAX), + ge64_bx.const_u64(0), + ); + let ge64_res = ge64_bx.combine_i128(new_lo, fill_hi); + ge64_bx.br(merge_bb); + + let mut lt64_bx = Self::build(self.cx, lt64_bb); + lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); + + let mut zero_bx = Self::build(self.cx, zero_bb); + zero_bx.br(merge_bb); + + let mut nz_bx = Self::build(self.cx, nz_bb); + let shift_i64 = nz_bx.zext(shift, nz_bx.type_i64()); + let new_lo_shift = nz_bx.lshr(lo, shift_i64); + let inv = nz_bx.sub(sixty_four, shift); + let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); + let carry = nz_bx.shl(hi, inv_i64); + let new_lo = nz_bx.or(new_lo_shift, carry); + let new_hi = nz_bx.ashr(hi, shift_i64); + let nz_res = nz_bx.combine_i128(new_lo, new_hi); + nz_bx.br(merge_bb); + + let mut merge_bx = Self::build(self.cx, merge_bb); + let result = merge_bx.phi(self.type_i128(), &[], &[]); + merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); + merge_bx.add_incoming_to_phi(result, ge64_res, ge64_bb); + merge_bx.add_incoming_to_phi(result, val, zero_bb); + merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); + + *self = merge_bx; + result + } + + // Emulate 128-bit bitwise NOT + pub(super) fn emulate_i128_not(&mut self, val: &'ll Value) -> &'ll Value { + let (lo, hi) = self.split_i128(val); + + let not_lo = unsafe { llvm::LLVMBuildNot(self.llbuilder, lo, UNNAMED) }; + let not_hi = unsafe { llvm::LLVMBuildNot(self.llbuilder, hi, UNNAMED) }; + + self.combine_i128(not_lo, not_hi) + } + + // Emulate 128-bit negation (two's complement) + pub(super) fn emulate_i128_neg(&mut self, val: &'ll Value) -> &'ll Value { + // Two's complement: ~val + 1 + let not_val = self.emulate_i128_not(val); + let one = self.const_u128(1); + self.emulate_i128_add(not_val, one) + } + + pub(crate) fn emulate_i128_bswap(&mut self, val: &'ll Value) -> &'ll Value { + // Split the 128-bit value into two 64-bit halves + let (lo, hi) = self.split_i128(val); + + // Byte-swap each 64-bit half using the LLVM intrinsic (which exists in LLVM 7.1) + let swapped_lo = self.call_intrinsic("llvm.bswap.i64", &[lo]); + let swapped_hi = self.call_intrinsic("llvm.bswap.i64", &[hi]); + + // Swap 
the halves: the high part becomes low and vice versa + self.combine_i128(swapped_hi, swapped_lo) + } + + pub(crate) fn emulate_i128_count_zeros( + &mut self, + val: &'ll Value, + kind: CountZerosKind, + is_nonzero: bool, + ) -> &'ll Value { + // Split the 128-bit value into two 64-bit halves + let (lo, hi) = self.split_i128(val); + + match kind { + CountZerosKind::Leading => { + // Count leading zeros: check high part first + let hi_is_zero = self.icmp(IntPredicate::IntEQ, hi, self.const_u64(0)); + let hi_ctlz = + self.call_intrinsic("llvm.ctlz.i64", &[hi, self.const_bool(is_nonzero)]); + let lo_ctlz = + self.call_intrinsic("llvm.ctlz.i64", &[lo, self.const_bool(is_nonzero)]); + + // If high part is zero, result is 64 + ctlz(lo), otherwise ctlz(hi) + let lo_ctlz_plus_64 = self.add(lo_ctlz, self.const_u64(64)); + let result_64 = self.select(hi_is_zero, lo_ctlz_plus_64, hi_ctlz); + + // Zero-extend to i128 + self.zext(result_64, self.type_i128()) + } + CountZerosKind::Trailing => { + // Count trailing zeros: check low part first + let lo_is_zero = self.icmp(IntPredicate::IntEQ, lo, self.const_u64(0)); + let lo_cttz = + self.call_intrinsic("llvm.cttz.i64", &[lo, self.const_bool(is_nonzero)]); + let hi_cttz = + self.call_intrinsic("llvm.cttz.i64", &[hi, self.const_bool(is_nonzero)]); + + // If low part is zero, result is 64 + cttz(hi), otherwise cttz(lo) + let hi_cttz_plus_64 = self.add(hi_cttz, self.const_u64(64)); + let result_64 = self.select(lo_is_zero, hi_cttz_plus_64, lo_cttz); + + // Zero-extend to i128 + self.zext(result_64, self.type_i128()) + } + } + } + + pub(crate) fn emulate_i128_ctpop(&mut self, val: &'ll Value) -> &'ll Value { + // Split the 128-bit value into two 64-bit halves + let (lo, hi) = self.split_i128(val); + + // Count population (number of 1 bits) in each half + let lo_popcount = self.call_intrinsic("llvm.ctpop.i64", &[lo]); + let hi_popcount = self.call_intrinsic("llvm.ctpop.i64", &[hi]); + + // Add the two counts + let total_64 = self.add(lo_popcount, hi_popcount); + + // Zero-extend to i128 + self.zext(total_64, self.type_i128()) + } + + pub(crate) fn emulate_i128_rotate( + &mut self, + val: &'ll Value, + shift: &'ll Value, + is_left: bool, + ) -> &'ll Value { + // Rotate is implemented as: (val << shift) | (val >> (128 - shift)) + // For rotate right: (val >> shift) | (val << (128 - shift)) + + // Ensure shift is i128 + let shift_128 = if self.val_ty(shift) == self.type_i128() { + shift + } else { + self.zext(shift, self.type_i128()) + }; + + // Calculate 128 - shift for the complementary shift + let bits_128 = self.const_u128(128); + let shift_complement = self.sub(bits_128, shift_128); + + // Perform the two shifts + let (first_shift, second_shift) = if is_left { + (self.shl(val, shift_128), self.lshr(val, shift_complement)) + } else { + (self.lshr(val, shift_128), self.shl(val, shift_complement)) + }; + + // Combine with OR + self.or(first_shift, second_shift) + } + + pub(crate) fn emulate_i128_bitreverse(&mut self, val: &'ll Value) -> &'ll Value { + // Split the 128-bit value into two 64-bit halves + let (lo, hi) = self.split_i128(val); + + // Reverse bits in each half using the 64-bit intrinsic + let reversed_lo = self.call_intrinsic("llvm.bitreverse.i64", &[lo]); + let reversed_hi = self.call_intrinsic("llvm.bitreverse.i64", &[hi]); + + // Swap the halves: reversed high becomes low and vice versa + self.combine_i128(reversed_hi, reversed_lo) + } +} diff --git a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs 
index 56e45053..d0464212 100644 --- a/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs +++ b/crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs @@ -14,7 +14,6 @@ impl<'ll> CodegenCx<'ll, '_> { #[rustfmt::skip] // stop rustfmt from making this 2k lines pub(crate) fn build_intrinsics_map(&mut self) { let mut map = self.intrinsics_map.borrow_mut(); - let mut remapped = self.remapped_integer_args.borrow_mut(); macro_rules! ifn { ($map:expr, $($name:literal)|*, fn($($arg:expr),*) -> $ret:expr) => { @@ -24,9 +23,6 @@ impl<'ll> CodegenCx<'ll, '_> { }; } - let real_t_i128 = self.type_i128(); - let real_t_i128_i1 = self.type_struct(&[real_t_i128, self.type_i1()], false); - let i8p = self.type_i8p(); let void = self.type_void(); let i1 = self.type_i1(); @@ -34,7 +30,6 @@ impl<'ll> CodegenCx<'ll, '_> { let t_i16 = self.type_i16(); let t_i32 = self.type_i32(); let t_i64 = self.type_i64(); - let t_i128 = self.type_vector(t_i64, 2); let t_f32 = self.type_f32(); let t_f64 = self.type_f64(); let t_isize = self.type_isize(); @@ -43,7 +38,6 @@ impl<'ll> CodegenCx<'ll, '_> { let t_i16_i1 = self.type_struct(&[t_i16, i1], false); let t_i32_i1 = self.type_struct(&[t_i32, i1], false); let t_i64_i1 = self.type_struct(&[t_i64, i1], false); - let t_i128_i1 = self.type_struct(&[t_i128, i1], false); let voidp = self.voidp(); @@ -75,34 +69,6 @@ impl<'ll> CodegenCx<'ll, '_> { ifn!(map, "llvm.umul.with.overflow.i32", fn(t_i32, t_i32) -> t_i32_i1); ifn!(map, "llvm.umul.with.overflow.i64", fn(t_i64, t_i64) -> t_i64_i1); - let i128_checked_binops = [ - "__nvvm_i128_addo", - "__nvvm_u128_addo", - "__nvvm_i128_subo", - "__nvvm_u128_subo", - "__nvvm_i128_mulo", - "__nvvm_u128_mulo" - ]; - - for binop in i128_checked_binops { - map.insert(binop, (vec![t_i128, t_i128], t_i128_i1)); - let llfn_ty = self.type_func(&[t_i128, t_i128], t_i128_i1); - remapped.insert(llfn_ty, (Some(real_t_i128_i1), vec![(0, real_t_i128), (1, real_t_i128)])); - } - - let i128_saturating_ops = [ - "llvm.sadd.sat.i128", - "llvm.uadd.sat.i128", - "llvm.ssub.sat.i128", - "llvm.usub.sat.i128", - ]; - - for binop in i128_saturating_ops { - map.insert(binop, (vec![t_i128, t_i128], t_i128)); - let llfn_ty = self.type_func(&[t_i128, t_i128], t_i128); - remapped.insert(llfn_ty, (Some(real_t_i128), vec![(0, real_t_i128), (1, real_t_i128)])); - } - // for some very strange reason, they arent supported for i8 either, but that case // is easy to handle and we declare our own functions for that which just // zext to i16, use the i16 intrinsic, then trunc back to i8 @@ -115,34 +81,6 @@ impl<'ll> CodegenCx<'ll, '_> { ifn!(map, "__nvvm_i8_mulo", fn(t_i8, t_i8) -> t_i8_i1); ifn!(map, "__nvvm_u8_mulo", fn(t_i8, t_i8) -> t_i8_i1); - // i128 arithmetic operations from compiler-builtins - // Division and remainder - ifn!(map, "__nvvm_divti3", fn(t_i128, t_i128) -> t_i128); - ifn!(map, "__nvvm_udivti3", fn(t_i128, t_i128) -> t_i128); - ifn!(map, "__nvvm_modti3", fn(t_i128, t_i128) -> t_i128); - ifn!(map, "__nvvm_umodti3", fn(t_i128, t_i128) -> t_i128); - - // Multiplication - ifn!(map, "__nvvm_multi3", fn(t_i128, t_i128) -> t_i128); - - // Shift operations - ifn!(map, "__nvvm_ashlti3", fn(t_i128, t_i32) -> t_i128); - ifn!(map, "__nvvm_ashrti3", fn(t_i128, t_i32) -> t_i128); - ifn!(map, "__nvvm_lshrti3", fn(t_i128, t_i32) -> t_i128); - - // Add remapping for i128 binary operations (division, remainder, multiplication) - // All have the same signature: (i128, i128) -> i128 - let i128_binary_llfn_ty = self.type_func(&[t_i128, t_i128], t_i128); - remapped.insert(i128_binary_llfn_ty, 
(Some(real_t_i128), vec![(0, real_t_i128), (1, real_t_i128)])); - - // Add remapping for i128 shift operations - // All have the same signature: (i128, i32) -> i128 - let i128_shift_llfn_ty = self.type_func(&[t_i128, t_i32], t_i128); - remapped.insert(i128_shift_llfn_ty, (Some(real_t_i128), vec![(0, real_t_i128), (1, t_i32)])); - - // see comment in libintrinsics.ll - // ifn!(map, "__nvvm_i128_trap", fn(t_i128, t_i128) -> t_i128); - ifn!(map, "llvm.sadd.sat.i8", fn(t_i8, t_i8) -> t_i8); ifn!(map, "llvm.sadd.sat.i16", fn(t_i16, t_i16) -> t_i16); ifn!(map, "llvm.sadd.sat.i32", fn(t_i32, t_i32) -> t_i32); diff --git a/crates/rustc_codegen_nvvm/src/intrinsic.rs b/crates/rustc_codegen_nvvm/src/intrinsic.rs index 0a805b89..7718d072 100644 --- a/crates/rustc_codegen_nvvm/src/intrinsic.rs +++ b/crates/rustc_codegen_nvvm/src/intrinsic.rs @@ -428,6 +428,15 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { let pair = self.insert_value(pair, low, 0); self.insert_value(pair, high, 1) } + sym::unchecked_shl => self.shl(args[0].immediate(), args[1].immediate()), + sym::unchecked_shr => { + let lhs = args[0].immediate(); + let rhs = args[1].immediate(); + match arg_tys[0].kind() { + ty::Int(_) => self.ashr(lhs, rhs), + _ => self.lshr(lhs, rhs), + } + } sym::ctlz | sym::ctlz_nonzero | sym::cttz diff --git a/examples/cuda/i128_demo/Cargo.toml b/examples/cuda/i128_demo/Cargo.toml new file mode 100644 index 00000000..4e4f373b --- /dev/null +++ b/examples/cuda/i128_demo/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "i128_demo" +version = "0.1.0" +edition = "2024" + +[dependencies] +cust = { path = "../../../crates/cust" } + +[build-dependencies] +cuda_builder = { workspace = true, default-features = false } diff --git a/examples/cuda/i128_demo/build.rs b/examples/cuda/i128_demo/build.rs new file mode 100644 index 00000000..c87b560c --- /dev/null +++ b/examples/cuda/i128_demo/build.rs @@ -0,0 +1,18 @@ +use std::env; +use std::path::PathBuf; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=kernels"); + + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + + CudaBuilder::new(manifest_dir.join("kernels")) + .copy_to(out_path.join("kernels.ptx")) + .final_module_path(out_path.join("final_module.ll")) + .build() + .unwrap(); +} diff --git a/examples/cuda/i128_demo/kernels/Cargo.toml b/examples/cuda/i128_demo/kernels/Cargo.toml new file mode 100644 index 00000000..58edca9d --- /dev/null +++ b/examples/cuda/i128_demo/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "i128-demo-kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/examples/cuda/i128_demo/kernels/src/lib.rs b/examples/cuda/i128_demo/kernels/src/lib.rs new file mode 100644 index 00000000..027b649b --- /dev/null +++ b/examples/cuda/i128_demo/kernels/src/lib.rs @@ -0,0 +1,48 @@ +#![no_std] + +use cuda_std::prelude::*; + +#[kernel] +#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] +pub unsafe fn i128_ops( + a: &[u128], + b: &[u128], + add_out: *mut u128, + sub_out: *mut u128, + mul_out: *mut u128, + and_out: *mut u128, + xor_out: *mut u128, + shl_out: *mut u128, + lshr_out: *mut u128, + ashr_out: *mut u128, + udiv_out: *mut u128, + sdiv_out: *mut u128, + urem_out: *mut u128, + srem_out: *mut u128, +) { + let idx = 
thread::index_1d() as usize; + if idx >= a.len() || idx >= b.len() { + return; + } + + let av = a[idx]; + let bv = b[idx]; + let shift = (bv & 127) as u32; + let signed = av as i128; + let signed_b = bv as i128; + + unsafe { + *add_out.add(idx) = av.wrapping_add(bv); + *sub_out.add(idx) = av.wrapping_sub(bv); + *mul_out.add(idx) = av.wrapping_mul(bv); + *and_out.add(idx) = av & bv; + *xor_out.add(idx) = av ^ bv; + *shl_out.add(idx) = av.wrapping_shl(shift); + *lshr_out.add(idx) = av.wrapping_shr(shift); + *ashr_out.add(idx) = (signed.wrapping_shr(shift)) as u128; + *udiv_out.add(idx) = av / bv; + *sdiv_out.add(idx) = (signed / signed_b) as u128; + *urem_out.add(idx) = av % bv; + *srem_out.add(idx) = (signed % signed_b) as u128; + } +} diff --git a/examples/cuda/i128_demo/src/main.rs b/examples/cuda/i128_demo/src/main.rs new file mode 100644 index 00000000..3ed9f92d --- /dev/null +++ b/examples/cuda/i128_demo/src/main.rs @@ -0,0 +1,293 @@ +use cust::prelude::*; +use std::error::Error; + +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); + +const INPUT_PAIRS: [(u128, u128); 23] = [ + // Basic non-zero divisor sanity check. + (0, 3), + // Simple add/sub with small + patterned mask divisor. + (1, 0x1111_1111_1111_1111_1111_1111_1111_1111), + // Max magnitude dividend exercising unsigned wraparound with patterned divisor. + (u128::MAX, 0x2222_2222_2222_2222_2222_2222_2222_2222), + // Near max positive signed plus odd offset to stress signed division. + ( + (1u128 << 127) + 123_456_789, + 0x3333_3333_3333_3333_3333_3333_3333_3333, + ), + // Mixed hex patterns to ensure bitwise ops and shifts propagate carries between halves. + ( + 0x0123_4567_89ab_cdef_0123_4567_89ab_cdef, + 0x4444_4444_4444_4444_4444_4444_4444_4444, + ), + // Alternating pattern stressing xor/and/or combinations across words. + ( + 0xfedc_ba98_7654_3210_fedc_ba98_7654_3210, + 0x5555_5555_5555_5555_5555_5555_5555_5555, + ), + // Low-half mask to hit low→high shift carries. + ( + 0x0000_ffff_0000_ffff_0000_ffff_0000_ffff, + 0x6666_6666_6666_6666_6666_6666_6666_6666, + ), + // Random-looking pattern to detect misplaced limb ordering. + ( + 0xabcd_ef12_3456_789a_bcde_f012_3456_789a, + 0x7777_7777_7777_7777_7777_7777_7777_7777, + ), + // Pure power-of-two vs small divisor for shifts and div edge cases. + (1u128 << 127, 5), + // Signed overflow boundary vs max unsigned divisor. + (i128::MAX as u128, u128::MAX), + // Distinct power-of-two limbs to check cross-term multiplications. + (1u128 << 64, (1u128 << 63) + 1), + // Odd division with high-bit divisor to stress udiv/sdiv paths. + (u128::MAX / 3, 0x8000_0000_0000_0000_0000_0000_0000_0001), + // Near-overflow positive dividends paired with non power-of-two divisors. + (0x7fff_ffff_ffff_ffff_0000_0000_0000_0001, u128::MAX / 5), + // Signed negative boundary mixed with patterned divisor. + ( + 0x8000_0000_0000_0000_8000_0000_0000_0000, + 0xffff_ffff_0000_0000_ffff_ffff_0000_0001, + ), + // Arbitrary large magnitudes to sanity check arithmetic stability. + ( + 123_456_789_012_345_678_901_234_567_890u128, + (1u128 << 127) - 3, + ), + // Values near u128::MAX to ensure carry/borrow propagation. + (u128::MAX - 999_999_999, u128::MAX - 2), + // Exercises the lower-half cross-carry path in the mul emulation. + ( + 0x0000_0000_0000_0000_ffff_ffff_ffff_ffff, + 0x0000_0000_0000_0000_ffff_ffff_ffff_ffff, + ), + // Check emulated mul path edgecase. + (u128::MAX, u128::MAX), + // Shift exactly 64 bits with positive divisor. 
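+    // (The divisor doubles as the shift operand in these entries: the kernel only uses
+    // its low 7 bits as the shift amount via `bv & 127`, so e.g. `(1u128 << 120) | 64`
+    // requests a shift of exactly 64, while the high bits just set the divisor's size
+    // and sign for the div/rem paths.)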
+ ( + 0x0123_4567_89ab_cdef_fedc_ba98_7654_3210, + (1u128 << 120) | 64, + ), + // Shift exactly 64 bits with negative divisor to stress signed paths. + ( + 0xfedc_ba98_7654_3210_0123_4567_89ab_cdef, + (1u128 << 127) | 64, + ), + // Shift just below the limb boundary. + ( + 0xaaaa_aaaa_5555_5555_ffff_ffff_0000_0000, + (1u128 << 96) | 63, + ), + // Shift just above the limb boundary. + ( + 0x0001_0203_0405_0607_0809_0a0b_0c0d_0e0f, + (1u128 << 80) | 65, + ), + // Maximum masked shift amount with high-bit divisor. + ( + 0xffff_0000_0000_0000_ffff_0000_0000_0001, + (1u128 << 127) | 127, + ), +]; + +fn main() -> Result<(), Box<dyn Error>> { + let _ctx = cust::quick_init()?; + + let module = Module::from_ptx(PTX, &[])?; + let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; + let kernel = module.get_function("i128_ops")?; + + let (host_a, host_b): (Vec<u128>, Vec<u128>) = INPUT_PAIRS.iter().copied().unzip(); + + let len = host_a.len(); + assert_eq!(len, host_b.len()); + + let a_gpu = DeviceBuffer::from_slice(&host_a)?; + let b_gpu = DeviceBuffer::from_slice(&host_b)?; + + let add_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let sub_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let mul_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let and_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let xor_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let shl_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let lshr_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let ashr_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let udiv_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let sdiv_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let urem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + let srem_gpu = DeviceBuffer::from_slice(&vec![0u128; len])?; + + let block_size = 128u32; + let grid_size = (len as u32).div_ceil(block_size); + + unsafe { + launch!( + kernel<<<grid_size, block_size, 0, stream>>>( + a_gpu.as_device_ptr(), + a_gpu.len(), + b_gpu.as_device_ptr(), + b_gpu.len(), + add_gpu.as_device_ptr(), + sub_gpu.as_device_ptr(), + mul_gpu.as_device_ptr(), + and_gpu.as_device_ptr(), + xor_gpu.as_device_ptr(), + shl_gpu.as_device_ptr(), + lshr_gpu.as_device_ptr(), + ashr_gpu.as_device_ptr(), + udiv_gpu.as_device_ptr(), + sdiv_gpu.as_device_ptr(), + urem_gpu.as_device_ptr(), + srem_gpu.as_device_ptr() + ) + )?; + } + + stream.synchronize()?; + + let mut gpu_add = vec![0u128; len]; + let mut gpu_sub = vec![0u128; len]; + let mut gpu_mul = vec![0u128; len]; + let mut gpu_and = vec![0u128; len]; + let mut gpu_xor = vec![0u128; len]; + let mut gpu_shl = vec![0u128; len]; + let mut gpu_lshr = vec![0u128; len]; + let mut gpu_ashr = vec![0u128; len]; + let mut gpu_udiv = vec![0u128; len]; + let mut gpu_sdiv = vec![0u128; len]; + let mut gpu_urem = vec![0u128; len]; + let mut gpu_srem = vec![0u128; len]; + + add_gpu.copy_to(&mut gpu_add)?; + sub_gpu.copy_to(&mut gpu_sub)?; + mul_gpu.copy_to(&mut gpu_mul)?; + and_gpu.copy_to(&mut gpu_and)?; + xor_gpu.copy_to(&mut gpu_xor)?; + shl_gpu.copy_to(&mut gpu_shl)?; + lshr_gpu.copy_to(&mut gpu_lshr)?; + ashr_gpu.copy_to(&mut gpu_ashr)?; + udiv_gpu.copy_to(&mut gpu_udiv)?; + sdiv_gpu.copy_to(&mut gpu_sdiv)?; + urem_gpu.copy_to(&mut gpu_urem)?; + srem_gpu.copy_to(&mut gpu_srem)?; + + let mut cpu_add = vec![0u128; len]; + let mut cpu_sub = vec![0u128; len]; + let mut cpu_mul = vec![0u128; len]; + let mut cpu_and = vec![0u128; len]; + let mut cpu_xor = vec![0u128; len]; + let mut cpu_shl = vec![0u128; len]; + let mut cpu_lshr = vec![0u128; len]; + let mut 
cpu_ashr = vec![0u128; len]; + let mut cpu_udiv = vec![0u128; len]; + let mut cpu_sdiv = vec![0u128; len]; + let mut cpu_urem = vec![0u128; len]; + let mut cpu_srem = vec![0u128; len]; + + for (i, (&av, &bv)) in host_a.iter().zip(host_b.iter()).enumerate() { + let shift = (bv & 127) as u32; + let signed = av as i128; + let signed_b = bv as i128; + cpu_add[i] = av.wrapping_add(bv); + cpu_sub[i] = av.wrapping_sub(bv); + cpu_mul[i] = av.wrapping_mul(bv); + cpu_and[i] = av & bv; + cpu_xor[i] = av ^ bv; + cpu_shl[i] = av.wrapping_shl(shift); + cpu_lshr[i] = av.wrapping_shr(shift); + cpu_ashr[i] = (signed.wrapping_shr(shift)) as u128; + cpu_udiv[i] = av / bv; + cpu_sdiv[i] = (signed / signed_b) as u128; + cpu_urem[i] = av % bv; + cpu_srem[i] = (signed % signed_b) as u128; + } + + let mut all_ok = true; + all_ok &= compare_results("add", &gpu_add, &cpu_add); + all_ok &= compare_results("sub", &gpu_sub, &cpu_sub); + all_ok &= compare_results("mul", &gpu_mul, &cpu_mul); + all_ok &= compare_results("and", &gpu_and, &cpu_and); + all_ok &= compare_results("xor", &gpu_xor, &cpu_xor); + all_ok &= compare_results("shl", &gpu_shl, &cpu_shl); + all_ok &= compare_results("lshr", &gpu_lshr, &cpu_lshr); + all_ok &= compare_results("ashr", &gpu_ashr, &cpu_ashr); + all_ok &= compare_results("udiv", &gpu_udiv, &cpu_udiv); + all_ok &= compare_results("sdiv", &gpu_sdiv, &cpu_sdiv); + all_ok &= compare_results("urem", &gpu_urem, &cpu_urem); + all_ok &= compare_results("srem", &gpu_srem, &cpu_srem); + + if !all_ok { + return Err("Mismatch between GPU and CPU i128 results".into()); + } + + // Ensure signed overflow (`i128::MIN / -1`) traps on the device. + let trap_stream = Stream::new(StreamFlags::NON_BLOCKING, None)?; + let trap_a = DeviceBuffer::from_slice(&[i128::MIN as u128])?; + let trap_b = DeviceBuffer::from_slice(&[u128::MAX])?; + let trap_add = DeviceBuffer::from_slice(&[0u128])?; + let trap_sub = DeviceBuffer::from_slice(&[0u128])?; + let trap_mul = DeviceBuffer::from_slice(&[0u128])?; + let trap_and = DeviceBuffer::from_slice(&[0u128])?; + let trap_xor = DeviceBuffer::from_slice(&[0u128])?; + let trap_shl = DeviceBuffer::from_slice(&[0u128])?; + let trap_lshr = DeviceBuffer::from_slice(&[0u128])?; + let trap_ashr = DeviceBuffer::from_slice(&[0u128])?; + let trap_udiv = DeviceBuffer::from_slice(&[0u128])?; + let trap_sdiv = DeviceBuffer::from_slice(&[0u128])?; + let trap_urem = DeviceBuffer::from_slice(&[0u128])?; + let trap_srem = DeviceBuffer::from_slice(&[0u128])?; + + let trap_launch = unsafe { + launch!( + kernel<<<1u32, 1u32, 0, trap_stream>>>( + trap_a.as_device_ptr(), + trap_a.len(), + trap_b.as_device_ptr(), + trap_b.len(), + trap_add.as_device_ptr(), + trap_sub.as_device_ptr(), + trap_mul.as_device_ptr(), + trap_and.as_device_ptr(), + trap_xor.as_device_ptr(), + trap_shl.as_device_ptr(), + trap_lshr.as_device_ptr(), + trap_ashr.as_device_ptr(), + trap_udiv.as_device_ptr(), + trap_sdiv.as_device_ptr(), + trap_urem.as_device_ptr(), + trap_srem.as_device_ptr() + ) + ) + }; + + let trap_result = match trap_launch { + Ok(()) => trap_stream.synchronize(), + Err(e) => Err(e), + }; + + match trap_result { + Err(e) => println!("Correctly got expected trap for i128::MIN / -1: {e}"), + Ok(()) => return Err("Expected trap for i128::MIN / -1 not triggered".into()), + } + + println!("All i128 GPU results match CPU computations."); + Ok(()) +} + +fn compare_results(name: &str, gpu: &[u128], cpu: &[u128]) -> bool { + let mut ok = true; + for (idx, (g, c)) in gpu.iter().zip(cpu.iter()).enumerate() { + if g != c { 
+ println!("[{name}] mismatch at index {idx}: gpu={g:#034x}, cpu={c:#034x}"); + ok = false; + } + } + + if ok { + println!("[{name}] results match"); + } + + ok +} From 75cb2ba46037b8c233015c72df12808ed9bcd892 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Wed, 24 Sep 2025 10:41:42 -0700 Subject: [PATCH 2/2] Switch to compiler-builtins Ok, figured out how to do this and now I reuse whatever is in `compiler-builtins`. It doesn't have every operation though, so we still need to emulate some parts of some ops. --- .../src/builder/emulate_i128.rs | 423 ++++-------------- 1 file changed, 90 insertions(+), 333 deletions(-) diff --git a/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs b/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs index 1ba12717..a836c50a 100644 --- a/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs +++ b/crates/rustc_codegen_nvvm/src/builder/emulate_i128.rs @@ -1,7 +1,8 @@ +use rustc_abi::{Align, Size}; use rustc_codegen_ssa::common::IntPredicate; use rustc_codegen_ssa::traits::*; -use crate::llvm::{self, Value}; +use crate::llvm::{self, Type, Value}; use super::{Builder, CountZerosKind, UNNAMED}; @@ -32,6 +33,42 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { (lo, hi) } + fn ensure_i128(&mut self, val: &'ll Value) -> &'ll Value { + if self.val_ty(val) == self.type_i128() { + val + } else { + unsafe { llvm::LLVMBuildBitCast(self.llbuilder, val, self.type_i128(), UNNAMED) } + } + } + + fn call_compiler_builtin( + &mut self, + name: &str, + ret_ty: &'ll Type, + args: &[&'ll Value], + ) -> &'ll Value { + let arg_tys: Vec<_> = args.iter().map(|&arg| self.val_ty(arg)).collect(); + let fn_ty = self.type_func(&arg_tys, ret_ty); + let llfn = self.cx.declare_fn(name, fn_ty, None); + self.call(fn_ty, None, None, llfn, args, None, None) + } + + fn trap_if(&mut self, cond: &'ll Value, label: &str) { + let trap_label = format!("{label}_trap"); + let cont_label = format!("{label}_cont"); + let trap_bb = self.append_sibling_block(&trap_label); + let cont_bb = self.append_sibling_block(&cont_label); + + self.cond_br(cond, trap_bb, cont_bb); + + let mut trap_bx = Self::build(self.cx, trap_bb); + trap_bx.call_intrinsic("llvm.trap", &[]); + trap_bx.unreachable(); + + let cont_bx = Self::build(self.cx, cont_bb); + *self = cont_bx; + } + fn uadd_with_overflow_i64( &mut self, lhs: &'ll Value, @@ -109,45 +146,20 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { (lo, hi) } - // Emulate 128-bit addition using two 64-bit additions with carry + // Emulate 128-bit addition using compiler-builtins pub(super) fn emulate_i128_add(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let i64_ty = self.type_i64(); - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - // Add low parts - let sum_lo = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - - // Check for carry from low addition - let carry = self.icmp(IntPredicate::IntULT, sum_lo, lhs_lo); - let carry_ext = self.zext(carry, i64_ty); - - // Add high parts with carry - let sum_hi_temp = unsafe { llvm::LLVMBuildAdd(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - let sum_hi = unsafe { llvm::LLVMBuildAdd(self.llbuilder, sum_hi_temp, carry_ext, UNNAMED) }; - - self.combine_i128(sum_lo, sum_hi) + let lhs = self.ensure_i128(lhs); + let rhs = self.ensure_i128(rhs); + let args = [lhs, rhs]; + self.call_compiler_builtin("__rust_i128_add", self.type_i128(), &args) } - // Emulate 128-bit subtraction + // Emulate 128-bit subtraction using compiler-builtins pub(super) 
fn emulate_i128_sub(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let i64_ty = self.type_i64(); - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - // Subtract low parts - let diff_lo = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_lo, rhs_lo, UNNAMED) }; - - // Check for borrow - let borrow = self.icmp(IntPredicate::IntUGT, rhs_lo, lhs_lo); - let borrow_ext = self.zext(borrow, i64_ty); - - // Subtract high parts with borrow - let diff_hi_temp = unsafe { llvm::LLVMBuildSub(self.llbuilder, lhs_hi, rhs_hi, UNNAMED) }; - let diff_hi = - unsafe { llvm::LLVMBuildSub(self.llbuilder, diff_hi_temp, borrow_ext, UNNAMED) }; - - self.combine_i128(diff_lo, diff_hi) + let lhs = self.ensure_i128(lhs); + let rhs = self.ensure_i128(rhs); + let args = [lhs, rhs]; + self.call_compiler_builtin("__rust_i128_sub", self.type_i128(), &args) } // Emulate 128-bit bitwise AND @@ -183,30 +195,12 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { self.combine_i128(xor_lo, xor_hi) } - // Emulate 128-bit multiplication using 64-bit partial products + // Emulate 128-bit multiplication using compiler-builtins pub(super) fn emulate_i128_mul(&mut self, lhs: &'ll Value, rhs: &'ll Value) -> &'ll Value { - let (lhs_lo, lhs_hi) = self.split_i128(lhs); - let (rhs_lo, rhs_hi) = self.split_i128(rhs); - - // Compute partial products - let (prod_ll_lo, prod_ll_hi) = self.mul_u64_to_u128(lhs_lo, rhs_lo); - let partial_ll = self.combine_i128(prod_ll_lo, prod_ll_hi); - - let (prod_lh_lo, prod_lh_hi) = self.mul_u64_to_u128(lhs_lo, rhs_hi); - let partial_lh = self.combine_i128(prod_lh_lo, prod_lh_hi); - - let (prod_hl_lo, prod_hl_hi) = self.mul_u64_to_u128(lhs_hi, rhs_lo); - let partial_hl = self.combine_i128(prod_hl_lo, prod_hl_hi); - - let shift_64 = self.cx.const_i32(64); - - // Shift cross terms by 64 bits to align them within the 128-bit space - let partial_lh_shifted = self.emulate_i128_shl(partial_lh, shift_64); - let partial_hl_shifted = self.emulate_i128_shl(partial_hl, shift_64); - - // Sum the partial products modulo 2^128 - let acc = self.emulate_i128_add(partial_ll, partial_lh_shifted); - self.emulate_i128_add(acc, partial_hl_shifted) + let lhs = self.ensure_i128(lhs); + let rhs = self.ensure_i128(rhs); + let args = [lhs, rhs]; + self.call_compiler_builtin("__multi3", self.type_i128(), &args) } fn emulate_i128_udivrem( @@ -214,69 +208,22 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { num: &'ll Value, den: &'ll Value, ) -> (&'ll Value, &'ll Value) { - let zero_i128 = self.const_u128(0); - let one_i128 = self.const_u128(1); - let one_i32 = self.cx.const_i32(1); - let i32_ty = self.cx.type_i32(); - let i128_ty = self.type_i128(); - - let trap_bb = self.append_sibling_block("i128_udiv_trap"); - let main_bb = self.append_sibling_block("i128_udiv_main"); - let lt_bb = self.append_sibling_block("i128_udiv_lt"); - let loop_header_bb = self.append_sibling_block("i128_udiv_loop_header"); - let loop_body_bb = self.append_sibling_block("i128_udiv_loop_body"); - let exit_bb = self.append_sibling_block("i128_udiv_exit"); - - let mut exit_bx = Self::build(self.cx, exit_bb); - let quot_res = exit_bx.phi(i128_ty, &[], &[]); - let rem_res = exit_bx.phi(i128_ty, &[], &[]); - - let den_is_zero = self.icmp(IntPredicate::IntEQ, den, zero_i128); - self.cond_br(den_is_zero, trap_bb, main_bb); + let num = self.ensure_i128(num); + let den = self.ensure_i128(den); - let mut trap_bx = Self::build(self.cx, trap_bb); - trap_bx.call_intrinsic("llvm.trap", &[]); - trap_bx.unreachable(); + let 
zero = self.const_u128(0); + let denom_is_zero = self.icmp(IntPredicate::IntEQ, den, zero); + self.trap_if(denom_is_zero, "i128_udiv_zero"); - let mut main_bx = Self::build(self.cx, main_bb); - let num_lt_den = main_bx.icmp(IntPredicate::IntULT, num, den); - let bit_init = main_bx.cx.const_int(i32_ty, 128); - main_bx.cond_br(num_lt_den, lt_bb, loop_header_bb); - - let mut lt_bx = Self::build(self.cx, lt_bb); - lt_bx.br(exit_bb); - exit_bx.add_incoming_to_phi(quot_res, zero_i128, lt_bb); - exit_bx.add_incoming_to_phi(rem_res, num, lt_bb); - - let mut header_bx = Self::build(self.cx, loop_header_bb); - let bit_phi = header_bx.phi(i32_ty, &[bit_init], &[main_bb]); - let quot_phi = header_bx.phi(i128_ty, &[zero_i128], &[main_bb]); - let rem_phi = header_bx.phi(i128_ty, &[zero_i128], &[main_bb]); - let bit_zero = header_bx.icmp(IntPredicate::IntEQ, bit_phi, header_bx.cx.const_i32(0)); - header_bx.cond_br(bit_zero, exit_bb, loop_body_bb); - exit_bx.add_incoming_to_phi(quot_res, quot_phi, loop_header_bb); - exit_bx.add_incoming_to_phi(rem_res, rem_phi, loop_header_bb); - - let mut body_bx = Self::build(self.cx, loop_body_bb); - let bit_next = body_bx.sub(bit_phi, one_i32); - let num_shifted = body_bx.emulate_i128_lshr(num, bit_next); - let bit_val = body_bx.and(num_shifted, one_i128); - let rem_shift = body_bx.emulate_i128_shl(rem_phi, one_i32); - let rem_candidate = body_bx.or(rem_shift, bit_val); - let rem_ge = body_bx.icmp(IntPredicate::IntUGE, rem_candidate, den); - let rem_sub = body_bx.emulate_i128_sub(rem_candidate, den); - let rem_new = body_bx.select(rem_ge, rem_sub, rem_candidate); - let one_shifted = body_bx.emulate_i128_shl(one_i128, bit_next); - let quot_candidate = body_bx.or(quot_phi, one_shifted); - let quot_new = body_bx.select(rem_ge, quot_candidate, quot_phi); - let loop_body_exit_bb = body_bx.llbb(); - body_bx.br(loop_header_bb); - header_bx.add_incoming_to_phi(bit_phi, bit_next, loop_body_exit_bb); - header_bx.add_incoming_to_phi(quot_phi, quot_new, loop_body_exit_bb); - header_bx.add_incoming_to_phi(rem_phi, rem_new, loop_body_exit_bb); - - *self = exit_bx; - (quot_res, rem_res) + let i128_align = Align::from_bytes(16).expect("align 16"); + let rem_slot = self.alloca(Size::from_bits(128), i128_align); + let rem_ptr = self.pointercast(rem_slot, self.cx.type_ptr_to(self.type_i128())); + + let args = [num, den, rem_ptr]; + let quot = self.call_compiler_builtin("__udivmodti4", self.type_i128(), &args); + let rem = self.load(self.type_i128(), rem_ptr, i128_align); + + (quot, rem) } pub(super) fn emulate_i128_udiv(&mut self, num: &'ll Value, den: &'ll Value) -> &'ll Value { @@ -292,40 +239,24 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { num: &'ll Value, den: &'ll Value, ) -> (&'ll Value, &'ll Value) { + let num = self.ensure_i128(num); + let den = self.ensure_i128(den); + + let zero = self.const_u128(0); + let denom_is_zero = self.icmp(IntPredicate::IntEQ, den, zero); + self.trap_if(denom_is_zero, "i128_sdiv_zero"); + let min_i128 = self.const_u128(1u128 << 127); let neg_one_i128 = self.const_u128(u128::MAX); let num_is_min = self.icmp(IntPredicate::IntEQ, num, min_i128); let den_is_neg_one = self.icmp(IntPredicate::IntEQ, den, neg_one_i128); let overflow_case = self.and(num_is_min, den_is_neg_one); + self.trap_if(overflow_case, "i128_sdiv_overflow"); - let trap_bb = self.append_sibling_block("i128_sdiv_overflow_trap"); - let cont_bb = self.append_sibling_block("i128_sdiv_overflow_cont"); - - self.cond_br(overflow_case, trap_bb, cont_bb); + let args = [num, den]; + let quot = 
self.call_compiler_builtin("__divti3", self.type_i128(), &args); + let rem = self.call_compiler_builtin("__modti3", self.type_i128(), &args); - let mut trap_bx = Self::build(self.cx, trap_bb); - trap_bx.call_intrinsic("llvm.trap", &[]); - trap_bx.unreachable(); - - let mut bx = Self::build(self.cx, cont_bb); - - let zero_u64 = bx.const_u64(0); - let (_, num_hi) = bx.split_i128(num); - let (_, den_hi) = bx.split_i128(den); - let num_neg = bx.icmp(IntPredicate::IntSLT, num_hi, zero_u64); - let den_neg = bx.icmp(IntPredicate::IntSLT, den_hi, zero_u64); - let num_negated = bx.emulate_i128_neg(num); - let num_abs = bx.select(num_neg, num_negated, num); - let den_negated = bx.emulate_i128_neg(den); - let den_abs = bx.select(den_neg, den_negated, den); - let (quot_raw, rem_raw) = bx.emulate_i128_udivrem(num_abs, den_abs); - let sign_diff = bx.xor(num_neg, den_neg); - let quot_negated = bx.emulate_i128_neg(quot_raw); - let quot = bx.select(sign_diff, quot_negated, quot_raw); - let rem_negated = bx.emulate_i128_neg(rem_raw); - let rem = bx.select(num_neg, rem_negated, rem_raw); - - *self = bx; (quot, rem) } @@ -473,198 +404,24 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> { } pub(super) fn emulate_i128_shl(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { - let (lo, hi) = self.split_i128(val); - let zero_i32 = self.cx.const_i32(0); - let sixty_four = self.cx.const_i32(64); - let one_twenty_eight = self.cx.const_i32(128); - - let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); - let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); - let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, one_twenty_eight); - - let ge128_bb = self.append_sibling_block("i128_shl_ge128"); - let lt128_bb = self.append_sibling_block("i128_shl_lt128"); - let ge64_bb = self.append_sibling_block("i128_shl_ge64"); - let lt64_bb = self.append_sibling_block("i128_shl_lt64"); - let zero_bb = self.append_sibling_block("i128_shl_zero"); - let nz_bb = self.append_sibling_block("i128_shl_lt64_nz"); - let merge_bb = self.append_sibling_block("i128_shl_merge"); - - self.cond_br(shift_ge128, ge128_bb, lt128_bb); - - let mut ge128_bx = Self::build(self.cx, ge128_bb); - let ge128_res = ge128_bx.const_u128(0); - ge128_bx.br(merge_bb); - - let mut lt128_bx = Self::build(self.cx, lt128_bb); - lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); - - let mut ge64_bx = Self::build(self.cx, ge64_bb); - let shift_minus64 = ge64_bx.sub(shift, sixty_four); - let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); - let hi_from_lo = ge64_bx.shl(lo, shift_minus64_i64); - let ge64_res = ge64_bx.combine_i128(ge64_bx.const_u64(0), hi_from_lo); - ge64_bx.br(merge_bb); - - let mut lt64_bx = Self::build(self.cx, lt64_bb); - lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); - - let mut zero_bx = Self::build(self.cx, zero_bb); - zero_bx.br(merge_bb); - - let mut nz_bx = Self::build(self.cx, nz_bb); - let shift_i64 = nz_bx.zext(shift, nz_bx.type_i64()); - let new_lo = nz_bx.shl(lo, shift_i64); - let hi_shifted = nz_bx.shl(hi, shift_i64); - let inv = nz_bx.sub(sixty_four, shift); - let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); - let carry = nz_bx.lshr(lo, inv_i64); - let new_hi = nz_bx.or(hi_shifted, carry); - let nz_res = nz_bx.combine_i128(new_lo, new_hi); - nz_bx.br(merge_bb); - - let mut merge_bx = Self::build(self.cx, merge_bb); - let result = merge_bx.phi(self.type_i128(), &[], &[]); - merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); - merge_bx.add_incoming_to_phi(result, ge64_res, 
ge64_bb); - merge_bx.add_incoming_to_phi(result, val, zero_bb); - merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); - - *self = merge_bx; - result + let val = self.ensure_i128(val); + let shift = self.intcast(shift, self.cx.type_i32(), false); + let args = [val, shift]; + self.call_compiler_builtin("__ashlti3", self.type_i128(), &args) } pub(super) fn emulate_i128_lshr(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { - let (lo, hi) = self.split_i128(val); - let zero_i32 = self.cx.const_i32(0); - let sixty_four = self.cx.const_i32(64); - let one_twenty_eight = self.cx.const_i32(128); - - let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); - let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); - let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, one_twenty_eight); - - let ge128_bb = self.append_sibling_block("i128_lshr_ge128"); - let lt128_bb = self.append_sibling_block("i128_lshr_lt128"); - let ge64_bb = self.append_sibling_block("i128_lshr_ge64"); - let lt64_bb = self.append_sibling_block("i128_lshr_lt64"); - let zero_bb = self.append_sibling_block("i128_lshr_zero"); - let nz_bb = self.append_sibling_block("i128_lshr_lt64_nz"); - let merge_bb = self.append_sibling_block("i128_lshr_merge"); - - self.cond_br(shift_ge128, ge128_bb, lt128_bb); - - let mut ge128_bx = Self::build(self.cx, ge128_bb); - let ge128_res = ge128_bx.const_u128(0); - ge128_bx.br(merge_bb); - - let mut lt128_bx = Self::build(self.cx, lt128_bb); - lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); - - let mut ge64_bx = Self::build(self.cx, ge64_bb); - let shift_minus64 = ge64_bx.sub(shift, sixty_four); - let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); - let new_lo = ge64_bx.lshr(hi, shift_minus64_i64); - let ge64_res = ge64_bx.combine_i128(new_lo, ge64_bx.const_u64(0)); - ge64_bx.br(merge_bb); - - let mut lt64_bx = Self::build(self.cx, lt64_bb); - lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); - - let mut zero_bx = Self::build(self.cx, zero_bb); - zero_bx.br(merge_bb); - - let mut nz_bx = Self::build(self.cx, nz_bb); - let shift_i64 = nz_bx.zext(shift, nz_bx.type_i64()); - let new_lo = nz_bx.lshr(lo, shift_i64); - let inv = nz_bx.sub(sixty_four, shift); - let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); - let carry = nz_bx.shl(hi, inv_i64); - let new_lo = nz_bx.or(new_lo, carry); - let new_hi = nz_bx.lshr(hi, shift_i64); - let nz_res = nz_bx.combine_i128(new_lo, new_hi); - nz_bx.br(merge_bb); - - let mut merge_bx = Self::build(self.cx, merge_bb); - let result = merge_bx.phi(self.type_i128(), &[], &[]); - merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); - merge_bx.add_incoming_to_phi(result, ge64_res, ge64_bb); - merge_bx.add_incoming_to_phi(result, val, zero_bb); - merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); - - *self = merge_bx; - result + let val = self.ensure_i128(val); + let shift = self.intcast(shift, self.cx.type_i32(), false); + let args = [val, shift]; + self.call_compiler_builtin("__lshrti3", self.type_i128(), &args) } pub(super) fn emulate_i128_ashr(&mut self, val: &'ll Value, shift: &'ll Value) -> &'ll Value { - let (lo, hi) = self.split_i128(val); - let zero_i32 = self.cx.const_i32(0); - let sixty_four = self.cx.const_i32(64); - let one_twenty_eight = self.cx.const_i32(128); - let zero_u64 = self.const_u64(0); - - let shift_is_zero = self.icmp(IntPredicate::IntEQ, shift, zero_i32); - let shift_ge64 = self.icmp(IntPredicate::IntUGE, shift, sixty_four); - let shift_ge128 = self.icmp(IntPredicate::IntUGE, shift, 
one_twenty_eight); - let is_negative = self.icmp(IntPredicate::IntSLT, hi, zero_u64); - - let ge128_bb = self.append_sibling_block("i128_ashr_ge128"); - let lt128_bb = self.append_sibling_block("i128_ashr_lt128"); - let ge64_bb = self.append_sibling_block("i128_ashr_ge64"); - let lt64_bb = self.append_sibling_block("i128_ashr_lt64"); - let zero_bb = self.append_sibling_block("i128_ashr_zero"); - let nz_bb = self.append_sibling_block("i128_ashr_lt64_nz"); - let merge_bb = self.append_sibling_block("i128_ashr_merge"); - - self.cond_br(shift_ge128, ge128_bb, lt128_bb); - - let mut ge128_bx = Self::build(self.cx, ge128_bb); - let all_ones = ge128_bx.const_u128(u128::MAX); - let zero = ge128_bx.const_u128(0); - let ge128_res = ge128_bx.select(is_negative, all_ones, zero); - ge128_bx.br(merge_bb); - - let mut lt128_bx = Self::build(self.cx, lt128_bb); - lt128_bx.cond_br(shift_ge64, ge64_bb, lt64_bb); - - let mut ge64_bx = Self::build(self.cx, ge64_bb); - let shift_minus64 = ge64_bx.sub(shift, sixty_four); - let shift_minus64_i64 = ge64_bx.zext(shift_minus64, ge64_bx.type_i64()); - let new_lo = ge64_bx.ashr(hi, shift_minus64_i64); - let fill_hi = ge64_bx.select( - is_negative, - ge64_bx.const_u64(u64::MAX), - ge64_bx.const_u64(0), - ); - let ge64_res = ge64_bx.combine_i128(new_lo, fill_hi); - ge64_bx.br(merge_bb); - - let mut lt64_bx = Self::build(self.cx, lt64_bb); - lt64_bx.cond_br(shift_is_zero, zero_bb, nz_bb); - - let mut zero_bx = Self::build(self.cx, zero_bb); - zero_bx.br(merge_bb); - - let mut nz_bx = Self::build(self.cx, nz_bb); - let shift_i64 = nz_bx.zext(shift, nz_bx.type_i64()); - let new_lo_shift = nz_bx.lshr(lo, shift_i64); - let inv = nz_bx.sub(sixty_four, shift); - let inv_i64 = nz_bx.zext(inv, nz_bx.type_i64()); - let carry = nz_bx.shl(hi, inv_i64); - let new_lo = nz_bx.or(new_lo_shift, carry); - let new_hi = nz_bx.ashr(hi, shift_i64); - let nz_res = nz_bx.combine_i128(new_lo, new_hi); - nz_bx.br(merge_bb); - - let mut merge_bx = Self::build(self.cx, merge_bb); - let result = merge_bx.phi(self.type_i128(), &[], &[]); - merge_bx.add_incoming_to_phi(result, ge128_res, ge128_bb); - merge_bx.add_incoming_to_phi(result, ge64_res, ge64_bb); - merge_bx.add_incoming_to_phi(result, val, zero_bb); - merge_bx.add_incoming_to_phi(result, nz_res, nz_bb); - - *self = merge_bx; - result + let val = self.ensure_i128(val); + let shift = self.intcast(shift, self.cx.type_i32(), false); + let args = [val, shift]; + self.call_compiler_builtin("__ashrti3", self.type_i128(), &args) } // Emulate 128-bit bitwise NOT
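
For reference, the contracts the new lowering assumes from the `compiler-builtins` symbols it calls (`__rust_i128_add`, `__rust_i128_sub`, `__multi3`, `__udivmodti4`, `__divti3`, `__modti3`, `__ashlti3`, `__lshrti3`, `__ashrti3`) look like this. The functions below are plain-Rust stand-ins written for illustration only: the underscore-free names are made up for this sketch, the bodies are not the real builtins, and shift counts of 128 or more are left out because the real routines do not define them.

// Host-side stand-ins for the compiler-builtins contracts assumed above.

// __rust_i128_add / __rust_i128_sub: wrapping 128-bit add and subtract.
fn rust_i128_add(a: u128, b: u128) -> u128 {
    a.wrapping_add(b)
}

fn rust_i128_sub(a: u128, b: u128) -> u128 {
    a.wrapping_sub(b)
}

// __multi3: wrapping 128-bit multiply (the bit pattern is the same for signed and unsigned).
fn multi3(a: u128, b: u128) -> u128 {
    a.wrapping_mul(b)
}

// __udivmodti4: unsigned 128-bit divide; the quotient is returned and the remainder is
// written through the out pointer (the codegen traps on a zero divisor before calling it).
fn udivmodti4(a: u128, b: u128, rem: &mut u128) -> u128 {
    *rem = a % b;
    a / b
}

// __divti3 / __modti3: signed 128-bit divide and remainder (the codegen traps on a zero
// divisor and on i128::MIN / -1 before calling these).
fn divti3(a: i128, b: i128) -> i128 {
    a / b
}

fn modti3(a: i128, b: i128) -> i128 {
    a % b
}

// __ashlti3 / __lshrti3 / __ashrti3: 128-bit shifts with the count passed as i32,
// valid for counts 0..=127 only.
fn ashlti3(a: u128, n: i32) -> u128 {
    a << (n as u32)
}

fn lshrti3(a: u128, n: i32) -> u128 {
    a >> (n as u32)
}

fn ashrti3(a: i128, n: i32) -> i128 {
    a >> (n as u32)
}

fn main() {
    // Quick host-side sanity check of the assumed contracts.
    assert_eq!(rust_i128_add(u128::MAX, 1), 0);
    assert_eq!(rust_i128_sub(0, 1), u128::MAX);
    assert_eq!(multi3(u128::MAX, u128::MAX), 1);
    let mut rem = 0u128;
    let quot = udivmodti4(u128::MAX, 7, &mut rem);
    assert_eq!((quot, rem), (u128::MAX / 7, u128::MAX % 7));
    assert_eq!(divti3(-7, 2), -3);
    assert_eq!(modti3(-7, 2), -1);
    assert_eq!(ashlti3(1, 64), 1u128 << 64);
    assert_eq!(lshrti3(1u128 << 127, 127), 1);
    assert_eq!(ashrti3(-1, 127), -1);
    println!("stand-in builtin semantics hold");
}

This mirrors the split described in the commit message above: add/sub, mul, div/rem, and the shifts go through the builtins, while the bitwise ops and the zero/overflow trap checks are still emulated inline in the builder.
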