Skip to content

Commit 989d9aa

Browse files
committed
Add initial f16 and f128 support to the s390x backend
1 parent a045eaa commit 989d9aa

26 files changed

+1808
-64
lines changed

cranelift/codegen/src/isa/s390x/abi.rs

Lines changed: 28 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ fn in_int_reg(ty: Type) -> bool {
166166

167167
fn in_flt_reg(ty: Type) -> bool {
168168
match ty {
169-
types::F32 | types::F64 => true,
169+
types::F16 | types::F32 | types::F64 => true,
170170
_ => false,
171171
}
172172
}
@@ -387,13 +387,13 @@ impl ABIMachineSpec for S390xMachineDeps {
387387
}
388388
};
389389

390-
let slot = if let Some(reg) = candidate {
390+
let slots = if let Some(reg) = candidate {
391391
*next_reg += 1;
392-
ABIArgSlot::Reg {
392+
smallvec![ABIArgSlot::Reg {
393393
reg: reg.to_real_reg().unwrap(),
394394
ty: param.value_type,
395395
extension: param.extension,
396-
}
396+
}]
397397
} else {
398398
if args_or_rets == ArgsOrRets::Rets && !flags.enable_multi_ret_implicit_sret() {
399399
return Err(crate::CodegenError::Unsupported(
@@ -420,13 +420,29 @@ impl ABIMachineSpec for S390xMachineDeps {
420420
} else {
421421
0
422422
};
423-
let offset = (next_stack + offset) as i64;
423+
let mut offset = (next_stack + offset) as i64;
424424
next_stack += slot_size;
425-
ABIArgSlot::Stack {
426-
offset,
427-
ty: param.value_type,
428-
extension: param.extension,
429-
}
425+
let types = Inst::rc_for_type(param.value_type)?.1;
426+
// If the type is held in a single register, use the original type. This is required
427+
// by `copy_reg_to_arg_slot` to ensure that the lanes are reversed correctly if the
428+
// calling convention requires the lanes to be reversed.
429+
let types = if types.len() == 1 {
430+
&[param.value_type]
431+
} else {
432+
types
433+
};
434+
types
435+
.iter()
436+
.map(|&ty| {
437+
let this_offset = offset;
438+
offset += i64::from(ty.bytes());
439+
ABIArgSlot::Stack {
440+
offset: this_offset,
441+
ty,
442+
extension: param.extension,
443+
}
444+
})
445+
.collect()
430446
};
431447

432448
if let Some(ty) = implicit_ref {
@@ -435,14 +451,14 @@ impl ABIMachineSpec for S390xMachineDeps {
435451
"implicit argument size is not properly aligned"
436452
);
437453
args.push(ABIArg::ImplicitPtrArg {
438-
pointer: slot,
454+
pointer: slots[0],
439455
offset: 0, // Will be filled in later
440456
ty,
441457
purpose: param.purpose,
442458
});
443459
} else {
444460
args.push(ABIArg::Slots {
445-
slots: smallvec![slot],
461+
slots,
446462
purpose: param.purpose,
447463
});
448464
}

cranelift/codegen/src/isa/s390x/inst.isle

Lines changed: 48 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -514,6 +514,16 @@
514514
(ri Reg)
515515
(rm Reg))
516516

517+
;; Two 64-bit conditional move FPU instructions, possibly as vector instructions.
518+
(FpuCMov6464
519+
(rd1 WritableReg)
520+
(rd2 WritableReg)
521+
(cond Cond)
522+
(ri1 Reg)
523+
(ri2 Reg)
524+
(rm1 Reg)
525+
(rm2 Reg))
526+
517527
;; 1-op FPU instruction implemented as vector instruction with the W bit.
518528
(FpuRR
519529
(fpu_op FPUOp1)
@@ -552,6 +562,11 @@
552562
(rn Reg)
553563
(rm Reg))
554564

565+
;; Load floating-point constant, half-precision (16 bit).
566+
(LoadFpuConst16
567+
(rd WritableReg)
568+
(const_data u16))
569+
555570
;; Load floating-point constant, single-precision (32 bit).
556571
(LoadFpuConst32
557572
(rd WritableReg)
@@ -2836,6 +2851,7 @@
28362851
(rule (arg_store $I16 reg mem) (store16 reg mem))
28372852
(rule (arg_store $I32 reg mem) (store32 reg mem))
28382853
(rule (arg_store $I64 reg mem) (store64 reg mem))
2854+
(rule (arg_store $F16 reg mem) (vec_store_lane $F16X8 reg mem 0))
28392855
(rule (arg_store $F32 reg mem) (vec_store_lane $F32X4 reg mem 0))
28402856
(rule (arg_store $F64 reg mem) (vec_store_lane $F64X2 reg mem 0))
28412857
(rule -1 (arg_store (vr128_ty ty) reg mem) (vec_store reg mem))
@@ -2872,6 +2888,12 @@
28722888
;; Prepare a stack copy of a single (oversized) argument.
28732889
(decl copy_to_buffer (MemArg ABIArg Value) InstOutput)
28742890
(rule 2 (copy_to_buffer base (abi_arg_only_slot slot) _) (output_none))
2891+
(rule 1 (copy_to_buffer base (abi_arg_implicit_pointer _ offset ty)
2892+
val @ (value_type $F128))
2893+
(let ((mem MemArg (memarg_offset base offset)))
2894+
(side_effect (side_effect_concat
2895+
(vec_store_lane $F64X2 (value_regs_get val 0) mem 0)
2896+
(vec_store_lane $F64X2 (value_regs_get val 1) (memarg_offset mem 8) 0)))))
28752897
(rule 0 (copy_to_buffer base (abi_arg_implicit_pointer _ offset ty)
28762898
val @ (value_type ty))
28772899
(side_effect (arg_store ty val (memarg_offset base offset))))
@@ -2925,7 +2947,7 @@
29252947
(decl imm (Type u64) Reg)
29262948

29272949
;; 16-bit (or smaller) result type, any value
2928-
(rule 7 (imm (fits_in_16 ty) n)
2950+
(rule 7 (imm (fits_in_16 (ty_int ty)) n)
29292951
(let ((dst WritableReg (temp_writable_reg ty))
29302952
(_ Unit (emit (MInst.Mov32SImm16 dst (u64_as_i16 n)))))
29312953
dst))
@@ -2986,6 +3008,13 @@
29863008
(_ Unit (emit (MInst.Insert64UImm32Shifted dst src n))))
29873009
dst))
29883010

3011+
;; 16-bit floating-point type, any value. Loaded from literal pool.
3012+
;; TODO: use LZER to load 0.0
3013+
(rule 8 (imm $F16 n)
3014+
(let ((dst WritableReg (temp_writable_reg $F16))
3015+
(_ Unit (emit (MInst.LoadFpuConst16 dst (u64_as_u16 n)))))
3016+
dst))
3017+
29893018
;; 32-bit floating-point type, any value. Loaded from literal pool.
29903019
;; TODO: use LZER to load 0.0
29913020
(rule 8 (imm $F32 n)
@@ -3222,6 +3251,10 @@
32223251
(let ((dst WritableReg (temp_writable_reg ty))
32233252
(inst MInst (MInst.CMov64 dst cond reg_false reg_true)))
32243253
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
3254+
(rule 3 (cmov_reg_reg $F16 cond reg_true reg_false)
3255+
(let ((dst WritableReg (temp_writable_reg $F16))
3256+
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))
3257+
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
32253258
(rule 3 (cmov_reg_reg $F32 cond reg_true reg_false)
32263259
(let ((dst WritableReg (temp_writable_reg $F32))
32273260
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))
@@ -3235,6 +3268,15 @@
32353268
(inst MInst (MInst.VecCMov dst cond reg_false reg_true)))
32363269
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
32373270

3271+
(decl cmov_value_regs (Type Cond ValueRegs ValueRegs) ConsumesFlags)
3272+
(rule (cmov_value_regs $F128 cond val_true val_false)
3273+
(let ((dst1 WritableReg (temp_writable_reg $F64))
3274+
(dst2 WritableReg (temp_writable_reg $F64))
3275+
(inst MInst (MInst.FpuCMov6464 dst1 dst2 cond
3276+
(value_regs_get val_false 0) (value_regs_get val_false 1)
3277+
(value_regs_get val_true 0) (value_regs_get val_true 1))))
3278+
(ConsumesFlags.ConsumesFlagsReturnsValueRegs inst (value_regs dst1 dst2))))
3279+
32383280

32393281
;; Helpers for generating conditional traps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32403282

@@ -3300,6 +3342,11 @@
33003342
(rule (select_bool_reg ty (ProducesBool.ProducesBool producer cond) reg_true reg_false)
33013343
(with_flags_reg producer (cmov_reg_reg ty cond reg_true reg_false)))
33023344

3345+
;; Use a boolean condition to select between two pairs of registers.
3346+
(decl select_bool_value_regs (Type ProducesBool ValueRegs ValueRegs) ValueRegs)
3347+
(rule (select_bool_value_regs ty (ProducesBool.ProducesBool producer cond) val_true val_false)
3348+
(with_flags producer (cmov_value_regs ty cond val_true val_false)))
3349+
33033350
;; Use a boolean condition to select between two immediate values.
33043351
(decl select_bool_imm (Type ProducesBool i16 i16) Reg)
33053352
(rule (select_bool_imm ty (ProducesBool.ProducesBool producer cond) imm_true imm_false)

cranelift/codegen/src/isa/s390x/inst/emit.rs

Lines changed: 63 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,11 @@ macro_rules! debug_assert_valid_regpair {
3838
};
3939
}
4040

41+
const OPCODE_BRAS: u16 = 0xa75;
42+
const OPCODE_BCR: u16 = 0xa74;
43+
const OPCODE_LDR: u16 = 0x28;
44+
const OPCODE_VLR: u16 = 0xe756;
45+
4146
/// Type(s) of memory instructions available for mem_finalize.
4247
pub struct MemInstType {
4348
/// True if 12-bit unsigned displacement is supported.
@@ -2298,9 +2303,8 @@ impl Inst {
22982303
rd,
22992304
ref symbol_reloc,
23002305
} => {
2301-
let opcode = 0xa75; // BRAS
23022306
let reg = writable_spilltmp_reg().to_reg();
2303-
put(sink, &enc_ri_b(opcode, reg, 12));
2307+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
23042308
let (reloc, name, offset) = match &**symbol_reloc {
23052309
SymbolReloc::Absolute { name, offset } => (Reloc::Abs8, name, *offset),
23062310
SymbolReloc::TlsGd { name } => (Reloc::S390xTlsGd64, name, 0),
@@ -2319,53 +2323,81 @@ impl Inst {
23192323
let opcode = 0x38; // LER
23202324
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
23212325
} else {
2322-
let opcode = 0xe756; // VLR
2323-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2326+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
23242327
}
23252328
}
23262329
&Inst::FpuMove64 { rd, rn } => {
23272330
if is_fpr(rd.to_reg()) && is_fpr(rn) {
2328-
let opcode = 0x28; // LDR
2329-
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
2331+
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rn));
23302332
} else {
2331-
let opcode = 0xe756; // VLR
2332-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2333+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
23332334
}
23342335
}
23352336
&Inst::FpuCMov32 { rd, cond, ri, rm } => {
23362337
debug_assert_eq!(rd.to_reg(), ri);
23372338

23382339
if is_fpr(rd.to_reg()) && is_fpr(rm) {
2339-
let opcode = 0xa74; // BCR
2340-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
2340+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
23412341
let opcode = 0x38; // LER
23422342
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
23432343
} else {
2344-
let opcode = 0xa74; // BCR
2345-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2346-
let opcode = 0xe756; // VLR
2347-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2344+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2345+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
23482346
}
23492347
}
23502348
&Inst::FpuCMov64 { rd, cond, ri, rm } => {
23512349
debug_assert_eq!(rd.to_reg(), ri);
23522350

23532351
if is_fpr(rd.to_reg()) && is_fpr(rm) {
2354-
let opcode = 0xa74; // BCR
2355-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
2356-
let opcode = 0x28; // LDR
2357-
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
2352+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
2353+
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rm));
2354+
} else {
2355+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2356+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
2357+
}
2358+
}
2359+
&Inst::FpuCMov6464 {
2360+
rd1,
2361+
rd2,
2362+
cond,
2363+
ri1,
2364+
ri2,
2365+
rm1,
2366+
rm2,
2367+
} => {
2368+
debug_assert_eq!(rd1.to_reg(), ri1);
2369+
debug_assert_eq!(rd2.to_reg(), ri2);
2370+
2371+
let is_fpr_1 = is_fpr(rd1.to_reg()) && is_fpr(rm1);
2372+
let is_fpr_2 = is_fpr(rd2.to_reg()) && is_fpr(rm2);
2373+
let offset = 4 + if is_fpr_1 { 2 } else { 6 } + if is_fpr_2 { 2 } else { 6 };
2374+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), offset));
2375+
if is_fpr_1 {
2376+
put(sink, &enc_rr(OPCODE_LDR, rd1.to_reg(), rm1));
23582377
} else {
2359-
let opcode = 0xa74; // BCR
2360-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2361-
let opcode = 0xe756; // VLR
2362-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2378+
put(sink, &enc_vrr_a(OPCODE_VLR, rd1.to_reg(), rm1, 0, 0, 0));
23632379
}
2380+
if is_fpr_2 {
2381+
put(sink, &enc_rr(OPCODE_LDR, rd2.to_reg(), rm2));
2382+
} else {
2383+
put(sink, &enc_vrr_a(OPCODE_VLR, rd2.to_reg(), rm2, 0, 0, 0));
2384+
}
2385+
}
2386+
&Inst::LoadFpuConst16 { rd, const_data } => {
2387+
let reg = writable_spilltmp_reg().to_reg();
2388+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 6));
2389+
sink.put2(const_data.swap_bytes());
2390+
let inst = Inst::VecLoadLaneUndef {
2391+
size: 16,
2392+
rd,
2393+
mem: MemArg::reg(reg, MemFlags::trusted()),
2394+
lane_imm: 0,
2395+
};
2396+
inst.emit(sink, emit_info, state);
23642397
}
23652398
&Inst::LoadFpuConst32 { rd, const_data } => {
2366-
let opcode = 0xa75; // BRAS
23672399
let reg = writable_spilltmp_reg().to_reg();
2368-
put(sink, &enc_ri_b(opcode, reg, 8));
2400+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 8));
23692401
sink.put4(const_data.swap_bytes());
23702402
let inst = Inst::VecLoadLaneUndef {
23712403
size: 32,
@@ -2376,9 +2408,8 @@ impl Inst {
23762408
inst.emit(sink, emit_info, state);
23772409
}
23782410
&Inst::LoadFpuConst64 { rd, const_data } => {
2379-
let opcode = 0xa75; // BRAS
23802411
let reg = writable_spilltmp_reg().to_reg();
2381-
put(sink, &enc_ri_b(opcode, reg, 12));
2412+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
23822413
sink.put8(const_data.swap_bytes());
23832414
let inst = Inst::VecLoadLaneUndef {
23842415
size: 64,
@@ -2780,8 +2811,7 @@ impl Inst {
27802811
put(sink, &enc_vrr_a(opcode, rm, rn, m3, 0, 0));
27812812

27822813
// If CC != 0, we're done, so jump over the next instruction.
2783-
let opcode = 0xa74; // BCR
2784-
put(sink, &enc_ri_c(opcode, 7, 4 + 6));
2814+
put(sink, &enc_ri_c(OPCODE_BCR, 7, 4 + 6));
27852815

27862816
// Otherwise, use VECTOR COMPARE HIGH LOGICAL.
27872817
// Since we already know the high parts are equal, the CC
@@ -2864,25 +2894,21 @@ impl Inst {
28642894
}
28652895

28662896
&Inst::VecMov { rd, rn } => {
2867-
let opcode = 0xe756; // VLR
2868-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2897+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
28692898
}
28702899
&Inst::VecCMov { rd, cond, ri, rm } => {
28712900
debug_assert_eq!(rd.to_reg(), ri);
28722901

2873-
let opcode = 0xa74; // BCR
2874-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2875-
let opcode = 0xe756; // VLR
2876-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2902+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2903+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
28772904
}
28782905
&Inst::MovToVec128 { rd, rn, rm } => {
28792906
let opcode = 0xe762; // VLVGP
28802907
put(sink, &enc_vrr_f(opcode, rd.to_reg(), rn, rm));
28812908
}
28822909
&Inst::VecLoadConst { rd, const_data } => {
2883-
let opcode = 0xa75; // BRAS
28842910
let reg = writable_spilltmp_reg().to_reg();
2885-
put(sink, &enc_ri_b(opcode, reg, 20));
2911+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 20));
28862912
for i in const_data.to_be_bytes().iter() {
28872913
sink.put1(*i);
28882914
}
@@ -2897,9 +2923,8 @@ impl Inst {
28972923
rd,
28982924
const_data,
28992925
} => {
2900-
let opcode = 0xa75; // BRAS
29012926
let reg = writable_spilltmp_reg().to_reg();
2902-
put(sink, &enc_ri_b(opcode, reg, (4 + size / 8) as i32));
2927+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, (4 + size / 8) as i32));
29032928
for i in 0..size / 8 {
29042929
sink.put1((const_data >> (size - 8 - 8 * i)) as u8);
29052930
}

0 commit comments

Comments
 (0)