Conversation

@jwnrt jwnrt commented Mar 18, 2025

This was previously an AS0 (i32) pointer, which trips an assertion later in getCopyToParts (called by LowerCall) when a capability is copied into that position.

I was hitting this assertion with smulo.i128 so I think it's the same issue filed here: CTSRD-CHERI#743. Should I have submitted this to the CHERI repo instead? (I will submit it there afterwards anyway if the patch looks sensible here).
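To illustrate the shape of the problem (a sketch only, assuming the libcall involved is compiler-rt's __muloti4 and that the argument in question is its overflow-flag pointer): under the purecap CHERIoT ABI that pointer has to be an addrspace(200) capability, whereas the lowering was building it as an AS0 i32 pointer, so copying a capability into that slot is what getCopyToParts rejects.

; Hypothetical illustration, not part of the patch: the i128 overflow libcall
; as the purecap lowering needs to see it. The final pointer parameter (where
; the overflow flag is written) must be an addrspace(200) capability; an AS0
; (i32) pointer there is what trips the "Unknown mismatch!" assertion.
declare i128 @__muloti4(i128, i128, ptr addrspace(200))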

@jwnrt jwnrt force-pushed the smulo-cap branch 2 times, most recently from e6746ec to 5d96f3a on March 18, 2025 at 20:50
This was previously a pointer from AS0 (i32), which trips an assertion
later in `getCopyToParts` (called by `LowerCall`) when a capability is
copied into that position.
@davidchisnall

Thanks, do you have a test case that this fixes?


jwnrt commented Mar 19, 2025

To be honest, I haven't tested that the resulting assembly is correct, just that the assertion goes away.
I can make a proper LLVM test for this, but I haven't really worked on LLVM before, so it will take me some time (a rough sketch of what that could look like is at the end of this comment).

Here's a snippet for reproducing the assertion:

target datalayout = "e-m:e-pf200:64:64:64:32-p:32:32-i64:64-n32-S128-A200-P200-G200"

declare {i128, i1} @llvm.smul.with.overflow.i128(i128, i128)

define zeroext i1 @smulo.i128(i128 signext %v1, i128 signext %v2, ptr %res) {
entry:
  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
  %val = extractvalue {i128, i1} %t, 0
  %obit = extractvalue {i128, i1} %t, 1
  store i128 %val, ptr %res
  ret i1 %obit
}

Before:

$ llc --filetype=asm --mcpu=cheriot --mtriple=riscv32-unknown-unknown -target-abi cheriot -mattr=+xcheri,+cap-mode < smulo_i128.ll                                               
	.text
	.attribute	4, 16
	.attribute	5, "rv32e2p0_m2p0_c2p0_xcheri0p0"
	.file	"<stdin>"
llc: /build/source/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:564: void getCopyToParts(llvm::SelectionDAG&, const llvm::SDLoc&, llvm::SDValue, llvm::SDValue*, unsigned int, llvm::MVT, const llvm::Value*, std::optional<unsigned int>, llvm::ISD::NodeType): Assertion `(PartVT.isInteger() || PartVT == MVT::x86mmx) && ValueVT.isInteger() && "Unknown mismatch!"' failed.
PLEASE submit a bug report to https://github.com/CTSRD-CHERI/llvm-project/issues and include the crash backtrace.
Stack dump:
0.	Program arguments: llc --filetype=asm --mcpu=cheriot --mtriple=riscv32-unknown-unknown -target-abi cheriot -mattr=+xcheri,+cap-mode
1.	Running pass 'Function Pass Manager' on module '<stdin>'.
2.	Running pass 'RISC-V DAG->DAG Pattern Instruction Selection' on function '@smulo.i128'
#0 0x00007f1a31f5626e llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /build/source/llvm/lib/Support/Unix/Signals.inc:723:22
zsh: abort (core dumped)  llc --filetype=asm --mcpu=cheriot --mtriple=riscv32-unknown-unknown  cheriot

After:

$ llc --filetype=asm --mcpu=cheriot --mtriple=riscv32-unknown-unknown -target-abi cheriot -mattr=+xcheri,+cap-mode < smulo_i128.ll
$ echo $?
0

Here's the assembly if you're interested:

	.text
	.attribute	4, 16
	.attribute	5, "rv32e2p0_m2p0_c2p0_xcheri0p0"
	.file	"<stdin>"
	.globl	smulo.i128                      # -- Begin function smulo.i128
	.p2align	1
	.type	smulo.i128,@function
smulo.i128:                             # @smulo.i128
	.cfi_startproc
# %bb.0:                                # %entry
	cincoffset	csp, csp, -80
	.cfi_def_cfa_offset 80
	csc	cra, 72(csp)                    # 8-byte Folded Spill
	csc	cs0, 64(csp)                    # 8-byte Folded Spill
	.cfi_offset ra, -8
	.cfi_offset s0, -16
	clw	t0, 0(ca0)
	clw	t1, 4(ca0)
	clw	t2, 8(ca0)
	clw	a0, 12(ca0)
	clw	a4, 0(ca1)
	clw	a3, 4(ca1)
	clw	a5, 8(ca1)
	clw	a1, 12(ca1)
	mv	s0, a2
	csw	zero, 60(csp)
	csw	a1, 20(csp)
	csw	a5, 16(csp)
	csw	a3, 12(csp)
	csw	a4, 8(csp)
	csw	a0, 36(csp)
	csw	t2, 32(csp)
	csw	t1, 28(csp)
	cincoffset	ca0, csp, 40
	cincoffset	ca1, csp, 24
	cincoffset	ca2, csp, 8
	cincoffset	ca3, csp, 60
	csw	t0, 24(csp)
.LBB0_1:                                # %entry
                                        # Label of block must be emitted
	auipcc	ct2, %cheriot_compartment_hi(__library_import_libcalls___muloti4)
	clc	ct2, %cheriot_compartment_lo_i(.LBB0_1)(ct2)
	cjalr	ct2
	clw	a0, 60(csp)
	snez	a0, a0
	clw	a1, 40(csp)
	clw	a2, 52(csp)
	clw	a3, 44(csp)
	clw	a4, 48(csp)
	addi	a5, s0, 12
	sw.ddc	a2, (a5)
	addi	a2, s0, 8
	sw.ddc	a4, (a2)
	addi	a2, s0, 4
	sw.ddc	a3, (a2)
	sw.ddc	a1, (s0)
	clc	cra, 72(csp)                    # 8-byte Folded Reload
	clc	cs0, 64(csp)                    # 8-byte Folded Reload
	cincoffset	csp, csp, 80
	cret
.Lfunc_end0:
	.size	smulo.i128, .Lfunc_end0-smulo.i128
	.cfi_endproc
                                        # -- End function
	.section	".note.GNU-stack","",@progbits
	.section	.compartment_imports,"aG",@progbits,__library_import_libcalls___muloti4,comdat
	.type	__library_import_libcalls___muloti4,@object
	.weak	__library_import_libcalls___muloti4
	.p2align	3, 0x0
__library_import_libcalls___muloti4:
	.word	__library_export_libcalls___muloti4+1
	.word	0
	.size	__library_import_libcalls___muloti4, 8
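
As for a proper test, this is roughly what the snippet above would look like as a lit test (a sketch only; the test's location in the tree and the CHECK lines still need to be worked out against output we agree is correct):

; RUN: llc --filetype=asm --mcpu=cheriot --mtriple=riscv32-unknown-unknown \
; RUN:   -target-abi cheriot -mattr=+xcheri,+cap-mode < %s | FileCheck %s

target datalayout = "e-m:e-pf200:64:64:64:32-p:32:32-i64:64-n32-S128-A200-P200-G200"

declare {i128, i1} @llvm.smul.with.overflow.i128(i128, i128)

; CHECK-LABEL: smulo.i128:
define zeroext i1 @smulo.i128(i128 signext %v1, i128 signext %v2, ptr %res) {
entry:
  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
  %val = extractvalue {i128, i1} %t, 0
  %obit = extractvalue {i128, i1} %t, 1
  store i128 %val, ptr %res
  ret i1 %obit
}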

@davidchisnall

Thanks. It looks as if __uint128 is being passed in a strange way here. It’s on the stack, but each value is passed as an explicit capability argument in the corresponding argument register, rather than as stack offsets, which is unusual. The output then generates some sw.ddc instructions, which don’t exist in CHERIoT and, if they did, would not be correct here. I’d like to understand what the correct behaviour is here: replacing an assertion failure with generating code that will crash at run time is not ideal.
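
One possible explanation for the sw.ddc stores (a guess from the reproducer, not verified against the backend): %res is declared as a plain ptr, i.e. an address-space-0 pointer, and it is AS0 accesses that get lowered to the DDC-relative forms. A purecap frontend would normally give it the capability pointer type, roughly:

; Sketch only: the same reproducer with %res as an addrspace(200) capability,
; which should lower the store to a capability-relative csw rather than sw.ddc.
declare {i128, i1} @llvm.smul.with.overflow.i128(i128, i128)

define zeroext i1 @smulo.i128(i128 signext %v1, i128 signext %v2, ptr addrspace(200) %res) {
entry:
  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
  %val = extractvalue {i128, i1} %t, 0
  %obit = extractvalue {i128, i1} %t, 1
  store i128 %val, ptr addrspace(200) %res
  ret i1 %obit
}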

@resistor

I'm not able to reproduce this with our clang-20 based compiler. Do you have a reduced test case I could try?

@resistor

With the clang-20 based compiler, we generate the following code. I haven't tested it but it looks plausible-ish to me:

	.attribute	4, 16
	.attribute	5, "rv32e2p0_m2p0_c2p0_zmmul1p0_xcheri0p0_xcheriot1p0"
	.file	"<stdin>"
	.text
	.globl	smulo.i128                      # -- Begin function smulo.i128
	.p2align	1
	.type	smulo.i128,@function
smulo.i128:                             # @smulo.i128
	.cfi_startproc
# %bb.0:                                # %entry
	ct.cincoffset	csp, csp, -96
	.cfi_def_cfa_offset 96
	ct.csc	cra, 88(csp)                    # 8-byte Folded Spill
	ct.csc	cs0, 80(csp)                    # 8-byte Folded Spill
	ct.csc	cs1, 72(csp)                    # 8-byte Folded Spill
	.cfi_offset ra, -8
	.cfi_offset s0, -16
	.cfi_offset s1, -24
	ct.csw	a2, 68(csp)                     # 4-byte Folded Spill
	ct.cmove	ct0, ca1
	ct.clw	a1, 4(ca0)
	ct.clw	t2, 0(ct0)
	ct.clw	ra, 8(ca0)
	ct.clw	t1, 12(ca0)
	ct.clw	tp, 0(ca0)
	ct.clw	a0, 4(ct0)
	mulhu	a2, ra, t2
	mul	a4, t1, t2
	mulhu	a5, t1, t2
	srai	a3, t1, 31
	add	a2, a2, a4
	mul	s1, t2, a3
	ct.csw	s1, 64(csp)                     # 4-byte Folded Spill
	mulhu	s0, t2, a3
	sltu	a4, a2, a4
	add	a5, a5, a4
	ct.csw	a3, 44(csp)                     # 4-byte Folded Spill
	mul	a4, a0, a3
	add	s0, s0, s1
	add	a4, a4, s0
	ct.csw	a4, 52(csp)                     # 4-byte Folded Spill
	mulhu	a4, tp, t2
	mul	s0, a1, t2
	add	a4, a4, s0
	sltu	s0, a4, s0
	mulhu	a3, a1, t2
	add	a3, a3, s0
	mul	s0, ra, a0
	add	s1, s0, a2
	sltu	a2, s1, s0
	mulhu	s0, ra, a0
	add	a2, a2, s0
	mul	s0, tp, a0
	add	a4, a4, s0
	ct.csw	a4, 40(csp)                     # 4-byte Folded Spill
	sltu	a4, a4, s0
	ct.csw	tp, 56(csp)                     # 4-byte Folded Spill
	mulhu	s0, tp, a0
	add	a4, a4, s0
	add	a2, a2, a5
	sltu	a5, a2, a5
	mulhu	s0, t1, a0
	add	a5, a5, s0
	add	a4, a4, a3
	sltu	a3, a4, a3
	mulhu	s0, a1, a0
	add	a3, a3, s0
	ct.csw	t1, 60(csp)                     # 4-byte Folded Spill
	mul	s0, t1, a0
	add	a2, a2, s0
	sltu	s0, a2, s0
	add	a5, a5, s0
	mul	a0, a1, a0
	add	a4, a4, a0
	sltu	a0, a4, a0
	add	a3, a3, a0
	ct.clw	s0, 52(csp)                     # 4-byte Folded Reload
	add	s0, s0, a5
	ct.clw	t1, 12(ct0)
	ct.clw	tp, 8(ct0)
	ct.clw	t0, 64(csp)                     # 4-byte Folded Reload
	add	t0, t0, a2
	ct.csw	ra, 64(csp)                     # 4-byte Folded Spill
	ct.csw	t2, 28(csp)                     # 4-byte Folded Spill
	mul	a5, ra, t2
	add	a4, a4, a5
	sltu	a2, t0, a2
	add	a3, a3, s1
	ct.csw	a4, 36(csp)                     # 4-byte Folded Spill
	sltu	a5, a4, a5
	add	ra, a3, a5
	add	a2, a2, s0
	ct.csw	a2, 52(csp)                     # 4-byte Folded Spill
	beq	ra, s1, .LBB0_2
# %bb.1:                                # %entry
	sltu	a5, ra, s1
.LBB0_2:                                # %entry
	add	a4, t0, a5
	ct.clw	s0, 56(csp)                     # 4-byte Folded Reload
	mulhu	t2, s0, tp
	mul	a2, a1, tp
	mulhu	a3, a1, tp
	mul	a5, s0, t1
	srai	a0, t1, 31
	ct.csw	a4, 48(csp)                     # 4-byte Folded Spill
	sltu	t0, a4, t0
	mul	a4, a0, a1
	ct.clw	s1, 52(csp)                     # 4-byte Folded Reload
	add	s1, s1, t0
	ct.csw	s1, 52(csp)                     # 4-byte Folded Spill
	mulhu	s1, a0, s0
	add	a4, a4, s1
	ct.csw	a4, 24(csp)                     # 4-byte Folded Spill
	mulhu	s1, s0, t1
	add	a4, a2, t2
	sltu	a2, a4, a2
	add	a3, a3, a2
	mv	t2, a1
	mul	a1, s0, tp
	ct.clw	a2, 36(csp)                     # 4-byte Folded Reload
	add	a2, a2, a1
	add	a4, a4, a5
	ct.csw	a2, 12(csp)                     # 4-byte Folded Spill
	sltu	t0, a2, a1
	sltu	a5, a4, a5
	add	a5, a5, s1
	add	a2, a4, t0
	add	ra, ra, a2
	mulhu	a2, t2, t1
	add	a5, a5, a3
	sltu	a3, a5, a3
	add	a2, a2, a3
	mul	a3, t2, t1
	add	a5, a5, a3
	sltu	a3, a5, a3
	add	a2, a2, a3
	ct.csw	a0, 32(csp)                     # 4-byte Folded Spill
	mul	s1, a0, s0
	ct.clw	a3, 24(csp)                     # 4-byte Folded Reload
	add	a3, a3, s1
	add	a2, a2, a3
	add	s1, s1, a5
	sltu	a3, s1, a5
	add	a0, a2, a3
	beq	ra, a4, .LBB0_4
# %bb.3:                                # %entry
	sltu	t0, ra, a4
.LBB0_4:                                # %entry
	add	t0, t0, s1
	ct.clw	a3, 64(csp)                     # 4-byte Folded Reload
	mulhu	a2, a3, tp
	ct.clw	a4, 60(csp)                     # 4-byte Folded Reload
	mul	a5, a4, tp
	mul	s0, a3, t1
	mul	a3, a3, tp
	sltu	a4, t0, s1
	ct.clw	a1, 48(csp)                     # 4-byte Folded Reload
	add	t0, t0, a1
	add	a2, a2, a5
	add	s1, a0, a4
	sltu	t2, t0, a1
	add	a4, s0, a2
	add	t0, t0, a3
	ct.clw	a0, 52(csp)                     # 4-byte Folded Reload
	ct.csw	s1, 16(csp)                     # 4-byte Folded Spill
	add	s1, s1, a0
	ct.csw	t0, 20(csp)                     # 4-byte Folded Spill
	sltu	a1, t0, a3
	ct.csw	t2, 48(csp)                     # 4-byte Folded Spill
	add	s1, s1, t2
	add	t0, a4, a1
	add	t0, t0, s1
	beq	t0, a4, .LBB0_6
# %bb.5:                                # %entry
	sltu	a1, t0, a4
.LBB0_6:                                # %entry
	ct.csw	a1, 24(csp)                     # 4-byte Folded Spill
	ct.csw	ra, 36(csp)                     # 4-byte Folded Spill
	beq	s1, a0, .LBB0_8
# %bb.7:                                # %entry
	sltu	a1, s1, a0
	ct.csw	a1, 48(csp)                     # 4-byte Folded Spill
.LBB0_8:                                # %entry
	sltu	a1, a2, a5
	ct.csw	a1, 8(csp)                      # 4-byte Folded Spill
	sltu	a1, a4, s0
	ct.csw	a1, 4(csp)                      # 4-byte Folded Spill
	ct.clw	a1, 56(csp)                     # 4-byte Folded Reload
	ct.clw	a0, 28(csp)                     # 4-byte Folded Reload
	mul	a2, a1, a0
	ct.clw	a0, 68(csp)                     # 4-byte Folded Reload
	sw.ddc	a2, (a0)
	addi	a2, a0, 4
	ct.clw	a3, 40(csp)                     # 4-byte Folded Reload
	sw.ddc	a3, (a2)
	addi	a2, a0, 8
	ct.clw	a0, 12(csp)                     # 4-byte Folded Reload
	sw.ddc	a0, (a2)
	ct.clw	s0, 60(csp)                     # 4-byte Folded Reload
	mulhu	a2, s0, tp
	ct.clw	a5, 44(csp)                     # 4-byte Folded Reload
	mul	a3, tp, a5
	mulhu	tp, tp, a5
	mul	a5, t1, a5
	ct.clw	a1, 64(csp)                     # 4-byte Folded Reload
	mulhu	s1, a1, t1
	mul	ra, s0, t1
	mulhu	t2, s0, t1
	ct.clw	a4, 32(csp)                     # 4-byte Folded Reload
	mul	s0, a4, s0
	mul	a0, a4, a1
	mulhu	a1, a4, a1
	ct.clw	a4, 8(csp)                      # 4-byte Folded Reload
	add	a2, a2, a4
	ct.clw	a4, 4(csp)                      # 4-byte Folded Reload
	add	s1, s1, a4
	add	a1, a1, s0
	add	a4, tp, a3
	add	a4, a4, a5
	add	a1, a1, a0
	add	a1, a1, a4
	ct.clw	a4, 52(csp)                     # 4-byte Folded Reload
	srai	a4, a4, 31
	ct.clw	a5, 16(csp)                     # 4-byte Folded Reload
	srai	a5, a5, 31
	add	a5, a5, a4
	ct.clw	tp, 48(csp)                     # 4-byte Folded Reload
	add	tp, tp, a5
	sltu	a4, a5, a4
	add	a4, a4, a5
	sltu	a5, tp, a5
	add	a4, a4, a5
	ct.csw	a4, 64(csp)                     # 4-byte Folded Spill
	add	s1, s1, a2
	sltu	a2, s1, a2
	add	a2, a2, t2
	ct.clw	a4, 36(csp)                     # 4-byte Folded Reload
	srai	a5, a4, 31
	add	a3, a3, a0
	xor	t0, t0, a5
	ct.clw	s0, 20(csp)                     # 4-byte Folded Reload
	xor	t2, s0, a5
	sltu	a0, a3, a0
	add	s1, s1, ra
	add	a3, a3, s1
	sltu	s0, s1, ra
	add	a0, a0, a1
	add	tp, tp, a3
	sltu	a1, a3, s1
	add	a2, a2, s0
	ct.clw	t1, 24(csp)                     # 4-byte Folded Reload
	add	t1, t1, tp
	add	a0, a0, a2
	sltu	a2, tp, a3
	sltu	a3, t1, tp
	add	s1, a0, a1
	xor	a1, t1, a5
	ct.clw	a0, 64(csp)                     # 4-byte Folded Reload
	add	a0, a0, s1
	or	a1, t2, a1
	add	a0, a0, a2
	add	a0, a0, a3
	xor	a0, a0, a5
	or	a0, t0, a0
	or	a0, a0, a1
	snez	a0, a0
	ct.clw	a1, 68(csp)                     # 4-byte Folded Reload
	addi	a1, a1, 12
	sw.ddc	a4, (a1)
	ct.clc	cra, 88(csp)                    # 8-byte Folded Reload
	ct.clc	cs0, 80(csp)                    # 8-byte Folded Reload
	ct.clc	cs1, 72(csp)                    # 8-byte Folded Reload
	.cfi_restore ra
	.cfi_restore s0
	.cfi_restore s1
	ct.cincoffset	csp, csp, 96
	.cfi_def_cfa_offset 0
	ct.cret
.Lfunc_end0:
	.size	smulo.i128, .Lfunc_end0-smulo.i128
	.cfi_endproc
                                        # -- End function
	.section	".note.GNU-stack","",@progbits

@resistor

Closing unless we find another reproducer.

@resistor resistor closed this May 21, 2025