Skip to content

Commit d44749b

Browse files
sophie-zhao authored and abner-chenc committed
cmd/internal/obj/loong64: add [X]VLDREPL.{B/H/W/D} instructions support
Go asm syntax:
    VMOVQ offset(Rj), Vd.<T>
    XVMOVQ offset(Rj), Xd.<T>

<T> can have the following values:
    B16, H8, W4, V2, B32, H16, W8, V4

Change-Id: I44af51d58bb62649d3fe360b3abb771565e78a8a
Reviewed-on: https://go-review.googlesource.com/c/go/+/682895
Reviewed-by: abner chenc <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Reviewed-by: Meidan Li <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Mark Freeman <[email protected]>
1 parent d6beda8 commit d44749b

File tree

3 files changed

+104
-34
lines changed

3 files changed

+104
-34
lines changed

src/cmd/asm/internal/asm/testdata/loong64enc1.s

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,16 @@ lable2:
510510
VMOVQ V3.W[1], V7.W4 // 67e4f772
511511
VMOVQ V4.V[0], V6.V2 // 86f0f772
512512

513+
// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
514+
VMOVQ (R4), V0.B16 // 80008030
515+
VMOVQ 1(R4), V1.H8 // 81044030
516+
VMOVQ 2(R4), V2.W4 // 82082030
517+
VMOVQ 3(R4), V3.V2 // 830c1030
518+
XVMOVQ (R4), X0.B32 // 80008032
519+
XVMOVQ 1(R4), X1.H16 // 81044032
520+
XVMOVQ 2(R4), X2.W8 // 82082032
521+
XVMOVQ 3(R4), X3.V4 // 830c1032
522+
513523
// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
514524
VSEQB V1, V2, V3 // 43040070
515525
VSEQH V1, V2, V3 // 43840070

src/cmd/internal/obj/loong64/asm.go

Lines changed: 77 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,11 @@ var optab = []Optab{
416416

417417
{AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0},
418418

419-
{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 46, 4, 0, 0},
420-
{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 47, 20, 0, 0},
419+
{AVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
420+
{AXVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
421+
422+
{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 47, 4, 0, 0},
423+
{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 48, 20, 0, 0},
421424

422425
{obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0},
423426
{obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0},
@@ -2395,7 +2398,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
23952398
o1 = uint32(c.regoff(&p.From))
23962399

23972400
case 39: // vmov Rn, Vd.<T>[index]
2398-
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2401+
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
23992402
if v == 0 {
24002403
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24012404
}
@@ -2407,7 +2410,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24072410
o1 = v | (index << 10) | (Rj << 5) | Vd
24082411

24092412
case 40: // vmov Vd.<T>[index], Rn
2410-
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2413+
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24112414
if v == 0 {
24122415
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24132416
}
@@ -2419,7 +2422,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24192422
o1 = v | (index << 10) | (Vj << 5) | Rd
24202423

24212424
case 41: // vmov Rn, Vd.<T>
2422-
v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2425+
v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24232426
if v == 0 {
24242427
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24252428
}
@@ -2429,7 +2432,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24292432
o1 = v | (Rj << 5) | Vd
24302433

24312434
case 42: // vmov xj, xd.<T>
2432-
v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2435+
v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24332436
if v == 0 {
24342437
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24352438
}
@@ -2439,7 +2442,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24392442
o1 = v | (Xj << 5) | Xd
24402443

24412444
case 43: // vmov xj, xd.<T>[index]
2442-
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2445+
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24432446
if v == 0 {
24442447
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24452448
}
@@ -2451,7 +2454,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24512454
o1 = v | (index << 10) | (Xj << 5) | Xd
24522455

24532456
case 44: // vmov xj.<T>[index], xd
2454-
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2457+
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24552458
if v == 0 {
24562459
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24572460
}
@@ -2463,7 +2466,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24632466
o1 = v | (index << 10) | (Xj << 5) | Xd
24642467

24652468
case 45: // vmov vj.<T>[index], vd.<T>
2466-
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
2469+
v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
24672470
if v == 0 {
24682471
c.ctxt.Diag("illegal arng type combination: %v\n", p)
24692472
}
@@ -2474,12 +2477,23 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
24742477
c.checkindex(p, index, m)
24752478
o1 = v | (index << 10) | (vj << 5) | vd
24762479

2477-
case 46: // preld offset(Rbase), $hint
2480+
case 46: // vmov offset(Rj), vd.<T>
2481+
v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, true)
2482+
if v == 0 {
2483+
c.ctxt.Diag("illegal arng type combination: %v\n", p)
2484+
}
2485+
2486+
si := c.regoff(&p.From)
2487+
Rj := uint32(p.From.Reg & EXT_REG_MASK)
2488+
Vd := uint32(p.To.Reg & EXT_REG_MASK)
2489+
o1 = v | uint32(si<<10) | (Rj << 5) | Vd
2490+
2491+
case 47: // preld offset(Rbase), $hint
24782492
offs := c.regoff(&p.From)
24792493
hint := p.GetFrom3().Offset
24802494
o1 = OP_12IR_5I(c.opiir(p.As), uint32(offs), uint32(p.From.Reg), uint32(hint))
24812495

2482-
case 47: // preldx offset(Rbase), $n, $hint
2496+
case 48: // preldx offset(Rbase), $n, $hint
24832497
offs := c.regoff(&p.From)
24842498
hint := p.RestArgs[1].Offset
24852499
n := uint64(p.GetFrom3().Offset)
@@ -4192,7 +4206,7 @@ func (c *ctxt0) specialFpMovInst(a obj.As, fclass int, tclass int) uint32 {
41924206
return 0
41934207
}
41944208

4195-
func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_mask uint32) {
4209+
func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16, offset_flag bool) (op_code, index_mask uint32) {
41964210
farng := (fReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
41974211
tarng := (tReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
41984212
fclass := c.rclass(fReg)
@@ -4258,29 +4272,58 @@ func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_ma
42584272
}
42594273

42604274
case C_REG | (C_ARNG << 16):
4261-
// vmov Rn, Vd.<T>
4262-
switch a {
4263-
case AVMOVQ:
4264-
switch tarng {
4265-
case ARNG_16B:
4266-
return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
4267-
case ARNG_8H:
4268-
return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
4269-
case ARNG_4W:
4270-
return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
4271-
case ARNG_2V:
4272-
return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
4275+
switch {
4276+
case offset_flag:
4277+
// vmov offset(Rj), vd.<T>
4278+
switch a {
4279+
case AVMOVQ:
4280+
switch tarng {
4281+
case ARNG_16B:
4282+
return (0xC2 << 22), 0x0 // vldrepl.b
4283+
case ARNG_8H:
4284+
return (0x182 << 21), 0x0 // vldrepl.h
4285+
case ARNG_4W:
4286+
return (0x302 << 20), 0x0 // vldrepl.w
4287+
case ARNG_2V:
4288+
return (0x602 << 19), 0x0 // vldrepl.d
4289+
}
4290+
case AXVMOVQ:
4291+
switch tarng {
4292+
case ARNG_32B:
4293+
return (0xCA << 22), 0x0 // xvldrepl.b
4294+
case ARNG_16H:
4295+
return (0x192 << 21), 0x0 // xvldrepl.h
4296+
case ARNG_8W:
4297+
return (0x322 << 20), 0x0 // xvldrepl.w
4298+
case ARNG_4V:
4299+
return (0x642 << 19), 0x0 // xvldrepl.d
4300+
}
42734301
}
4274-
case AXVMOVQ:
4275-
switch tarng {
4276-
case ARNG_32B:
4277-
return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
4278-
case ARNG_16H:
4279-
return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
4280-
case ARNG_8W:
4281-
return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
4282-
case ARNG_4V:
4283-
return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
4302+
default:
4303+
// vmov Rn, Vd.<T>
4304+
switch a {
4305+
case AVMOVQ:
4306+
switch tarng {
4307+
case ARNG_16B:
4308+
return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
4309+
case ARNG_8H:
4310+
return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
4311+
case ARNG_4W:
4312+
return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
4313+
case ARNG_2V:
4314+
return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
4315+
}
4316+
case AXVMOVQ:
4317+
switch tarng {
4318+
case ARNG_32B:
4319+
return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
4320+
case ARNG_16H:
4321+
return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
4322+
case ARNG_8W:
4323+
return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
4324+
case ARNG_4V:
4325+
return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
4326+
}
42844327
}
42854328
}
42864329

src/cmd/internal/obj/loong64/doc.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,23 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
203203
VMOVQ Vj.W[index], Vd.W4 | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
204204
VMOVQ Vj.V[index], Vd.V2 | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
205205
206+
3.7 Load data from memory and broadcast to each element of a vector register.
207+
208+
Instruction format:
209+
VMOVQ offset(Rj), <Vd>.<T>
210+
211+
Mapping between Go and platform assembly:
212+
Go assembly | platform assembly | semantics
213+
-------------------------------------------------------------------------------------------------------------------------------------------------------
214+
VMOVQ offset(Rj), Vd.B16 | vldrepl.b Vd, Rj, si12 | for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
215+
VMOVQ offset(Rj), Vd.H8 | vldrepl.h Vd, Rj, si11 | for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
216+
VMOVQ offset(Rj), Vd.W4 | vldrepl.w Vd, Rj, si10 | for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
217+
VMOVQ offset(Rj), Vd.V2 | vldrepl.d Vd, Rj, si9 | for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
218+
XVMOVQ offset(Rj), Xd.B32 | xvldrepl.b Xd, Rj, si12 | for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
219+
XVMOVQ offset(Rj), Xd.H16 | xvldrepl.h Xd, Rj, si11 | for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
220+
XVMOVQ offset(Rj), Xd.W8 | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
221+
XVMOVQ offset(Rj), Xd.V4 | xvldrepl.d Xd, Rj, si9 | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
222+
206223
# Special instruction encoding definition and description on LoongArch
207224
208225
1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased

0 commit comments

Comments
 (0)