Skip to content

Commit 578707c

Browse files
committed
Add initial f16 and f128 support to the s390x backend
1 parent a045eaa commit 578707c

24 files changed

+1417
-48
lines changed

cranelift/codegen/src/isa/s390x/abi.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ fn in_int_reg(ty: Type) -> bool {
166166

167167
fn in_flt_reg(ty: Type) -> bool {
168168
match ty {
169-
types::F32 | types::F64 => true,
169+
types::F16 | types::F32 | types::F64 => true,
170170
_ => false,
171171
}
172172
}

cranelift/codegen/src/isa/s390x/inst.isle

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,11 @@
552552
(rn Reg)
553553
(rm Reg))
554554

555+
;; Load floating-point constant, half-precision (16 bit).
556+
(LoadFpuConst16
557+
(rd WritableReg)
558+
(const_data u16))
559+
555560
;; Load floating-point constant, single-precision (32 bit).
556561
(LoadFpuConst32
557562
(rd WritableReg)
@@ -2836,6 +2841,7 @@
28362841
(rule (arg_store $I16 reg mem) (store16 reg mem))
28372842
(rule (arg_store $I32 reg mem) (store32 reg mem))
28382843
(rule (arg_store $I64 reg mem) (store64 reg mem))
2844+
(rule (arg_store $F16 reg mem) (vec_store_lane $F16X8 reg mem 0))
28392845
(rule (arg_store $F32 reg mem) (vec_store_lane $F32X4 reg mem 0))
28402846
(rule (arg_store $F64 reg mem) (vec_store_lane $F64X2 reg mem 0))
28412847
(rule -1 (arg_store (vr128_ty ty) reg mem) (vec_store reg mem))
@@ -2861,6 +2867,7 @@
28612867
(rule 5 (abi_vec_elt_rev _ (gpr32_ty ty) reg) reg)
28622868
(rule 4 (abi_vec_elt_rev _ (gpr64_ty ty) reg) reg)
28632869
(rule 3 (abi_vec_elt_rev _ $I128 reg) reg)
2870+
(rule 3 (abi_vec_elt_rev _ $F128 reg) reg)
28642871
(rule 2 (abi_vec_elt_rev _ (ty_scalar_float ty) reg) reg)
28652872
(rule 0 (abi_vec_elt_rev callee_lane_order _ reg)
28662873
(if-let true (lane_order_equal callee_lane_order (lane_order)))
@@ -2925,7 +2932,7 @@
29252932
(decl imm (Type u64) Reg)
29262933

29272934
;; 16-bit (or smaller) integer result type, any value
2928-
(rule 7 (imm (fits_in_16 ty) n)
2935+
(rule 7 (imm (fits_in_16 (ty_int ty)) n)
29292936
(let ((dst WritableReg (temp_writable_reg ty))
29302937
(_ Unit (emit (MInst.Mov32SImm16 dst (u64_as_i16 n)))))
29312938
dst))
@@ -2986,6 +2993,13 @@
29862993
(_ Unit (emit (MInst.Insert64UImm32Shifted dst src n))))
29872994
dst))
29882995

2996+
;; 16-bit floating-point type, any value. Loaded from literal pool.
2997+
;; TODO: use LZER to load 0.0
2998+
(rule 8 (imm $F16 n)
2999+
(let ((dst WritableReg (temp_writable_reg $F16))
3000+
(_ Unit (emit (MInst.LoadFpuConst16 dst (u64_as_u16 n)))))
3001+
dst))
3002+
29893003
;; 32-bit floating-point type, any value. Loaded from literal pool.
29903004
;; TODO: use LZER to load 0.0
29913005
(rule 8 (imm $F32 n)
@@ -3222,6 +3236,10 @@
32223236
(let ((dst WritableReg (temp_writable_reg ty))
32233237
(inst MInst (MInst.CMov64 dst cond reg_false reg_true)))
32243238
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
3239+
(rule 3 (cmov_reg_reg $F16 cond reg_true reg_false)
3240+
(let ((dst WritableReg (temp_writable_reg $F16))
3241+
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))
3242+
(ConsumesFlags.ConsumesFlagsReturnsReg inst dst)))
32253243
(rule 3 (cmov_reg_reg $F32 cond reg_true reg_false)
32263244
(let ((dst WritableReg (temp_writable_reg $F32))
32273245
(inst MInst (MInst.FpuCMov32 dst cond reg_false reg_true)))

cranelift/codegen/src/isa/s390x/inst/emit.rs

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ macro_rules! debug_assert_valid_regpair {
3838
};
3939
}
4040

41+
const OPCODE_BRAS: u16 = 0xa75;
42+
const OPCODE_BCR: u16 = 0xa74;
43+
const OPCODE_LDR: u16 = 0x28;
44+
const OPCODE_VLR: u16 = 0xe756;
45+
4146
/// Type(s) of memory instructions available for mem_finalize.
4247
pub struct MemInstType {
4348
/// True if 12-bit unsigned displacement is supported.
@@ -2298,9 +2303,8 @@ impl Inst {
22982303
rd,
22992304
ref symbol_reloc,
23002305
} => {
2301-
let opcode = 0xa75; // BRAS
23022306
let reg = writable_spilltmp_reg().to_reg();
2303-
put(sink, &enc_ri_b(opcode, reg, 12));
2307+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
23042308
let (reloc, name, offset) = match &**symbol_reloc {
23052309
SymbolReloc::Absolute { name, offset } => (Reloc::Abs8, name, *offset),
23062310
SymbolReloc::TlsGd { name } => (Reloc::S390xTlsGd64, name, 0),
@@ -2319,53 +2323,54 @@ impl Inst {
23192323
let opcode = 0x38; // LER
23202324
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
23212325
} else {
2322-
let opcode = 0xe756; // VLR
2323-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2326+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
23242327
}
23252328
}
23262329
&Inst::FpuMove64 { rd, rn } => {
23272330
if is_fpr(rd.to_reg()) && is_fpr(rn) {
2328-
let opcode = 0x28; // LDR
2329-
put(sink, &enc_rr(opcode, rd.to_reg(), rn));
2331+
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rn));
23302332
} else {
2331-
let opcode = 0xe756; // VLR
2332-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2333+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
23332334
}
23342335
}
23352336
&Inst::FpuCMov32 { rd, cond, ri, rm } => {
23362337
debug_assert_eq!(rd.to_reg(), ri);
23372338

23382339
if is_fpr(rd.to_reg()) && is_fpr(rm) {
2339-
let opcode = 0xa74; // BCR
2340-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
2340+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
23412341
let opcode = 0x38; // LER
23422342
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
23432343
} else {
2344-
let opcode = 0xa74; // BCR
2345-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2346-
let opcode = 0xe756; // VLR
2347-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2344+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2345+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
23482346
}
23492347
}
23502348
&Inst::FpuCMov64 { rd, cond, ri, rm } => {
23512349
debug_assert_eq!(rd.to_reg(), ri);
23522350

23532351
if is_fpr(rd.to_reg()) && is_fpr(rm) {
2354-
let opcode = 0xa74; // BCR
2355-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 2));
2356-
let opcode = 0x28; // LDR
2357-
put(sink, &enc_rr(opcode, rd.to_reg(), rm));
2352+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 2));
2353+
put(sink, &enc_rr(OPCODE_LDR, rd.to_reg(), rm));
23582354
} else {
2359-
let opcode = 0xa74; // BCR
2360-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2361-
let opcode = 0xe756; // VLR
2362-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2355+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2356+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
23632357
}
23642358
}
2359+
&Inst::LoadFpuConst16 { rd, const_data } => {
2360+
let reg = writable_spilltmp_reg().to_reg();
2361+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 6));
2362+
sink.put2(const_data.swap_bytes());
2363+
let inst = Inst::VecLoadLaneUndef {
2364+
size: 16,
2365+
rd,
2366+
mem: MemArg::reg(reg, MemFlags::trusted()),
2367+
lane_imm: 0,
2368+
};
2369+
inst.emit(sink, emit_info, state);
2370+
}
23652371
&Inst::LoadFpuConst32 { rd, const_data } => {
2366-
let opcode = 0xa75; // BRAS
23672372
let reg = writable_spilltmp_reg().to_reg();
2368-
put(sink, &enc_ri_b(opcode, reg, 8));
2373+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 8));
23692374
sink.put4(const_data.swap_bytes());
23702375
let inst = Inst::VecLoadLaneUndef {
23712376
size: 32,
@@ -2376,9 +2381,8 @@ impl Inst {
23762381
inst.emit(sink, emit_info, state);
23772382
}
23782383
&Inst::LoadFpuConst64 { rd, const_data } => {
2379-
let opcode = 0xa75; // BRAS
23802384
let reg = writable_spilltmp_reg().to_reg();
2381-
put(sink, &enc_ri_b(opcode, reg, 12));
2385+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 12));
23822386
sink.put8(const_data.swap_bytes());
23832387
let inst = Inst::VecLoadLaneUndef {
23842388
size: 64,
@@ -2780,8 +2784,7 @@ impl Inst {
27802784
put(sink, &enc_vrr_a(opcode, rm, rn, m3, 0, 0));
27812785

27822786
// If CC != 0, we're done, so jump over the next instruction.
2783-
let opcode = 0xa74; // BCR
2784-
put(sink, &enc_ri_c(opcode, 7, 4 + 6));
2787+
put(sink, &enc_ri_c(OPCODE_BCR, 7, 4 + 6));
27852788

27862789
// Otherwise, use VECTOR COMPARE HIGH LOGICAL.
27872790
// Since we already know the high parts are equal, the CC
@@ -2864,25 +2867,21 @@ impl Inst {
28642867
}
28652868

28662869
&Inst::VecMov { rd, rn } => {
2867-
let opcode = 0xe756; // VLR
2868-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, 0, 0, 0));
2870+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rn, 0, 0, 0));
28692871
}
28702872
&Inst::VecCMov { rd, cond, ri, rm } => {
28712873
debug_assert_eq!(rd.to_reg(), ri);
28722874

2873-
let opcode = 0xa74; // BCR
2874-
put(sink, &enc_ri_c(opcode, cond.invert().bits(), 4 + 6));
2875-
let opcode = 0xe756; // VLR
2876-
put(sink, &enc_vrr_a(opcode, rd.to_reg(), rm, 0, 0, 0));
2875+
put(sink, &enc_ri_c(OPCODE_BCR, cond.invert().bits(), 4 + 6));
2876+
put(sink, &enc_vrr_a(OPCODE_VLR, rd.to_reg(), rm, 0, 0, 0));
28772877
}
28782878
&Inst::MovToVec128 { rd, rn, rm } => {
28792879
let opcode = 0xe762; // VLVGP
28802880
put(sink, &enc_vrr_f(opcode, rd.to_reg(), rn, rm));
28812881
}
28822882
&Inst::VecLoadConst { rd, const_data } => {
2883-
let opcode = 0xa75; // BRAS
28842883
let reg = writable_spilltmp_reg().to_reg();
2885-
put(sink, &enc_ri_b(opcode, reg, 20));
2884+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, 20));
28862885
for i in const_data.to_be_bytes().iter() {
28872886
sink.put1(*i);
28882887
}
@@ -2897,9 +2896,8 @@ impl Inst {
28972896
rd,
28982897
const_data,
28992898
} => {
2900-
let opcode = 0xa75; // BRAS
29012899
let reg = writable_spilltmp_reg().to_reg();
2902-
put(sink, &enc_ri_b(opcode, reg, (4 + size / 8) as i32));
2900+
put(sink, &enc_ri_b(OPCODE_BRAS, reg, (4 + size / 8) as i32));
29032901
for i in 0..size / 8 {
29042902
sink.put1((const_data >> (size - 8 - 8 * i)) as u8);
29052903
}

cranelift/codegen/src/isa/s390x/inst/emit_tests.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7817,6 +7817,24 @@ fn test_s390x_binemit() {
78177817
"wfcdb %v24, %f12",
78187818
));
78197819

7820+
// FIXME(#8312): Use `1.0_f16.to_bits()` once `f16` is stabilised.
7821+
let f16_1_0 = 0x3c00;
7822+
insns.push((
7823+
Inst::LoadFpuConst16 {
7824+
rd: writable_vr(8),
7825+
const_data: f16_1_0,
7826+
},
7827+
"A71500033C00E78010000001",
7828+
"bras %r1, 8 ; data.f16 0x1.000p0 ; vleh %v8, 0(%r1), 0",
7829+
));
7830+
insns.push((
7831+
Inst::LoadFpuConst16 {
7832+
rd: writable_vr(24),
7833+
const_data: f16_1_0,
7834+
},
7835+
"A71500033C00E78010000801",
7836+
"bras %r1, 8 ; data.f16 0x1.000p0 ; vleh %v24, 0(%r1), 0",
7837+
));
78207838
insns.push((
78217839
Inst::LoadFpuConst32 {
78227840
rd: writable_vr(8),

cranelift/codegen/src/isa/s390x/inst/mod.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
//! This module defines s390x-specific machine instruction types.
22
33
use crate::binemit::{Addend, CodeOffset, Reloc};
4+
use crate::ir::immediates::Ieee16;
45
use crate::ir::{types, ExternalName, Type};
56
use crate::isa::s390x::abi::S390xMachineDeps;
67
use crate::isa::{CallConv, FunctionAlignment};
@@ -177,6 +178,7 @@ impl Inst {
177178
| Inst::FpuRRRR { .. }
178179
| Inst::FpuCmp32 { .. }
179180
| Inst::FpuCmp64 { .. }
181+
| Inst::LoadFpuConst16 { .. }
180182
| Inst::LoadFpuConst32 { .. }
181183
| Inst::LoadFpuConst64 { .. }
182184
| Inst::VecRRR { .. }
@@ -324,6 +326,12 @@ impl Inst {
324326
types::I16 => Inst::Load64ZExt16 { rd: into_reg, mem },
325327
types::I32 => Inst::Load64ZExt32 { rd: into_reg, mem },
326328
types::I64 => Inst::Load64 { rd: into_reg, mem },
329+
types::F16 => Inst::VecLoadLaneUndef {
330+
size: 16,
331+
rd: into_reg,
332+
mem,
333+
lane_imm: 0,
334+
},
327335
types::F32 => Inst::VecLoadLaneUndef {
328336
size: 32,
329337
rd: into_reg,
@@ -336,8 +344,7 @@ impl Inst {
336344
mem,
337345
lane_imm: 0,
338346
},
339-
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem },
340-
types::I128 => Inst::VecLoad { rd: into_reg, mem },
347+
_ if ty.bits() == 128 => Inst::VecLoad { rd: into_reg, mem },
341348
_ => unimplemented!("gen_load({})", ty),
342349
}
343350
}
@@ -349,6 +356,12 @@ impl Inst {
349356
types::I16 => Inst::Store16 { rd: from_reg, mem },
350357
types::I32 => Inst::Store32 { rd: from_reg, mem },
351358
types::I64 => Inst::Store64 { rd: from_reg, mem },
359+
types::F16 => Inst::VecStoreLane {
360+
size: 16,
361+
rd: from_reg,
362+
mem,
363+
lane_imm: 0,
364+
},
352365
types::F32 => Inst::VecStoreLane {
353366
size: 32,
354367
rd: from_reg,
@@ -361,8 +374,7 @@ impl Inst {
361374
mem,
362375
lane_imm: 0,
363376
},
364-
_ if ty.is_vector() && ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem },
365-
types::I128 => Inst::VecStore { rd: from_reg, mem },
377+
_ if ty.bits() == 128 => Inst::VecStore { rd: from_reg, mem },
366378
_ => unimplemented!("gen_store({})", ty),
367379
}
368380
}
@@ -646,7 +658,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor<impl Ope
646658
collector.reg_use(rn);
647659
collector.reg_use(rm);
648660
}
649-
Inst::LoadFpuConst32 { rd, .. } | Inst::LoadFpuConst64 { rd, .. } => {
661+
Inst::LoadFpuConst16 { rd, .. }
662+
| Inst::LoadFpuConst32 { rd, .. }
663+
| Inst::LoadFpuConst64 { rd, .. } => {
650664
collector.reg_def(rd);
651665
collector.reg_fixed_nonallocatable(gpr_preg(1));
652666
}
@@ -1119,8 +1133,10 @@ impl MachInst for Inst {
11191133
types::I16 => Ok((&[RegClass::Int], &[types::I16])),
11201134
types::I32 => Ok((&[RegClass::Int], &[types::I32])),
11211135
types::I64 => Ok((&[RegClass::Int], &[types::I64])),
1136+
types::F16 => Ok((&[RegClass::Float], &[types::F16])),
11221137
types::F32 => Ok((&[RegClass::Float], &[types::F32])),
11231138
types::F64 => Ok((&[RegClass::Float], &[types::F64])),
1139+
types::F128 => Ok((&[RegClass::Float], &[types::F128])),
11241140
types::I128 => Ok((&[RegClass::Float], &[types::I128])),
11251141
_ if ty.is_vector() && ty.bits() == 128 => Ok((&[RegClass::Float], &[types::I8X16])),
11261142
_ => Err(CodegenError::Unsupported(format!(
@@ -2267,6 +2283,18 @@ impl Inst {
22672283
format!("wfcdb {}, {}", rn_fpr.unwrap_or(rn), rm_fpr.unwrap_or(rm))
22682284
}
22692285
}
2286+
&Inst::LoadFpuConst16 { rd, const_data } => {
2287+
let (rd, _rd_fpr) = pretty_print_fpr(rd.to_reg());
2288+
let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg());
2289+
// FIXME(#8312): Use `f16::from_bits` once it is stabilised.
2290+
format!(
2291+
"bras {}, 8 ; data.f16 {} ; vleh {}, 0({}), 0",
2292+
tmp,
2293+
Ieee16::with_bits(const_data),
2294+
rd,
2295+
tmp
2296+
)
2297+
}
22702298
&Inst::LoadFpuConst32 { rd, const_data } => {
22712299
let (rd, rd_fpr) = pretty_print_fpr(rd.to_reg());
22722300
let tmp = pretty_print_reg(writable_spilltmp_reg().to_reg());

0 commit comments

Comments
 (0)