diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
index bfff555782e9f7..8363996683cad9 100644
--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
@@ -510,6 +510,16 @@ lable2:
 	VMOVQ	V3.W[1], V7.W4  // 67e4f772
 	VMOVQ	V4.V[0], V6.V2  // 86f0f772
 
+	// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
+	VMOVQ	(R4), V0.B16    // 80008030
+	VMOVQ	1(R4), V1.H8    // 81044030
+	VMOVQ	2(R4), V2.W4    // 82082030
+	VMOVQ	3(R4), V3.V2    // 830c1030
+	XVMOVQ	(R4), X0.B32    // 80008032
+	XVMOVQ	1(R4), X1.H16   // 81044032
+	XVMOVQ	2(R4), X2.W8    // 82082032
+	XVMOVQ	3(R4), X3.V4    // 830c1032
+
 	// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
 	VSEQB	V1, V2, V3      // 43040070
 	VSEQH	V1, V2, V3      // 43840070
@@ -1035,3 +1045,53 @@ lable2:
 	PRELD	(R4), $0        // 8000c02a
 	PRELD	-1(R4), $8      // 88fcff2a
 	PRELD	8(R4), $31      // 9f20c02a
+
+	// [X]{VBITCLR/VBITSET/VBITREV}{B,H,W,V} instructions
+	VBITCLRB	V1, V2, V3  // 43040c71
+	VBITCLRH	V1, V2, V3  // 43840c71
+	VBITCLRW	V1, V2, V3  // 43040d71
+	VBITCLRV	V1, V2, V3  // 43840d71
+	VBITSETB	V1, V2, V3  // 43040e71
+	VBITSETH	V1, V2, V3  // 43840e71
+	VBITSETW	V1, V2, V3  // 43040f71
+	VBITSETV	V1, V2, V3  // 43840f71
+	VBITREVB	V1, V2, V3  // 43041071
+	VBITREVH	V1, V2, V3  // 43841071
+	VBITREVW	V1, V2, V3  // 43041171
+	VBITREVV	V1, V2, V3  // 43841171
+	XVBITCLRB	X3, X2, X1  // 410c0c75
+	XVBITCLRH	X3, X2, X1  // 418c0c75
+	XVBITCLRW	X3, X2, X1  // 410c0d75
+	XVBITCLRV	X3, X2, X1  // 418c0d75
+	XVBITSETB	X3, X2, X1  // 410c0e75
+	XVBITSETH	X3, X2, X1  // 418c0e75
+	XVBITSETW	X3, X2, X1  // 410c0f75
+	XVBITSETV	X3, X2, X1  // 418c0f75
+	XVBITREVB	X3, X2, X1  // 410c1075
+	XVBITREVH	X3, X2, X1  // 418c1075
+	XVBITREVW	X3, X2, X1  // 410c1175
+	XVBITREVV	X3, X2, X1  // 418c1175
+	VBITCLRB	$7, V2, V3  // 433c1073
+	VBITCLRH	$15, V2, V3 // 437c1073
+	VBITCLRW	$31, V2, V3 // 43fc1073
+	VBITCLRV	$63, V2, V3 // 43fc1173
+	VBITSETB	$7, V2, V3  // 433c1473
+	VBITSETH	$15, V2, V3 // 437c1473
+	VBITSETW	$31, V2, V3 // 43fc1473
+	VBITSETV	$63, V2, V3 // 43fc1573
+	VBITREVB	$7, V2, V3  // 433c1873
+	VBITREVH	$15, V2, V3 // 437c1873
+	VBITREVW	$31, V2, V3 // 43fc1873
+	VBITREVV	$63, V2, V3 // 43fc1973
+	XVBITCLRB	$7, X2, X1  // 413c1077
+	XVBITCLRH	$15, X2, X1 // 417c1077
+	XVBITCLRW	$31, X2, X1 // 41fc1077
+	XVBITCLRV	$63, X2, X1 // 41fc1177
+	XVBITSETB	$7, X2, X1  // 413c1477
+	XVBITSETH	$15, X2, X1 // 417c1477
+	XVBITSETW	$31, X2, X1 // 41fc1477
+	XVBITSETV	$63, X2, X1 // 41fc1577
+	XVBITREVB	$7, X2, X1  // 413c1877
+	XVBITREVH	$15, X2, X1 // 417c1877
+	XVBITREVW	$31, X2, X1 // 41fc1877
+	XVBITREVV	$63, X2, X1 // 41fc1977
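Note: the 3R vector encodings above can be sanity-checked by hand. The opcode returned by oprrr occupies bits 15 and up, and the three register numbers pack below it as vk<<10 | vj<<5 | vd. A minimal sketch, not part of the change, that reproduces the expected bytes from the test comments:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// vbitclr.b: opcode 0xe218<<15, as added to oprrr in this change.
	// Go operand order "VBITCLRB V1, V2, V3" maps to vd=3, vj=2, vk=1.
	op := uint32(0xe218) << 15
	vd, vj, vk := uint32(3), uint32(2), uint32(1)
	word := op | vk<<10 | vj<<5 | vd

	var b [4]byte
	binary.LittleEndian.PutUint32(b[:], word) // test comments list bytes in memory order
	fmt.Printf("% x\n", b)                    // 43 04 0c 71, matching the first VBITCLRB line
}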
diff --git a/src/cmd/cgo/out.go b/src/cmd/cgo/out.go
index dfa54e41d33399..155fff4a268bb8 100644
--- a/src/cmd/cgo/out.go
+++ b/src/cmd/cgo/out.go
@@ -1051,7 +1051,7 @@ func (p *Package) writeExports(fgo2, fm, fgcc, fgcch io.Writer) {
 		// string.h for memset, and is also robust to C++
 		// types with constructors. Both GCC and LLVM optimize
 		// this into just zeroing _cgo_a.
-		fmt.Fprintf(fgcc, "\ttypedef %s %v _cgo_argtype;\n", ctype.String(), p.packedAttribute())
+		fmt.Fprintf(fgcc, "\ttypedef %s %v __attribute__((aligned(%d))) _cgo_argtype;\n", ctype.String(), p.packedAttribute(), p.PtrSize)
 		fmt.Fprintf(fgcc, "\tstatic _cgo_argtype _cgo_zero;\n")
 		fmt.Fprintf(fgcc, "\t_cgo_argtype _cgo_a = _cgo_zero;\n")
 		if gccResult != "void" && (len(fntype.Results.List) > 1 || len(fntype.Results.List[0].Names) > 1) {
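For reference, the updated Fprintf emits a typedef along these lines into the generated C file; a packed struct has alignment 1, so the extra aligned(PtrSize) attribute keeps the argument frame pointer-aligned. The ctype and packed strings below are illustrative stand-ins, not literal cgo output:

package main

import "fmt"

func main() {
	// Hypothetical stand-ins for the values cgo would supply:
	// ctype.String() for an exported function's argument struct,
	// p.packedAttribute() and p.PtrSize on a 64-bit target.
	ctype := "struct { GoInt p0; GoInt r0; }"
	packed := "__attribute__((__packed__, gcc_struct))"
	ptrSize := 8

	// Mirrors the updated Fprintf: the alignment attribute ensures the
	// zeroing of _cgo_a is not done through an underaligned type.
	fmt.Printf("\ttypedef %s %v __attribute__((aligned(%d))) _cgo_argtype;\n", ctype, packed, ptrSize)
}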
diff --git a/src/cmd/compile/internal/test/mulconst_test.go b/src/cmd/compile/internal/test/mulconst_test.go
index c4aed84432de50..1d1b351af19640 100644
--- a/src/cmd/compile/internal/test/mulconst_test.go
+++ b/src/cmd/compile/internal/test/mulconst_test.go
@@ -143,7 +143,7 @@ func BenchmarkMulconstI32(b *testing.B) {
 		}
 		mulSinkI32 = x
 	})
-	// -120x = 8x - 120x
+	// -120x = 8x - 128x
 	b.Run("-120", func(b *testing.B) {
 		x := int32(1)
 		for i := 0; i < b.N; i++ {
@@ -202,7 +202,7 @@ func BenchmarkMulconstI64(b *testing.B) {
 		}
 		mulSinkI64 = x
 	})
-	// -120x = 8x - 120x
+	// -120x = 8x - 128x
 	b.Run("-120", func(b *testing.B) {
 		x := int64(1)
 		for i := 0; i < b.N; i++ {
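The corrected comments now state the identity the benchmark exercises: the compiler can lower this multiply to two shifts and a subtract. Easy to spot-check:

package main

import "fmt"

func main() {
	// -120x = 8x - 128x, i.e. (x<<3) - (x<<7): the shift/subtract
	// form the compiler can use instead of a general multiply.
	x := int64(7)
	fmt.Println(-120*x == (x<<3)-(x<<7)) // true
}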
diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go
index 193993ec4d9b4c..162e36be8cb6a2 100644
--- a/src/cmd/internal/obj/loong64/a.out.go
+++ b/src/cmd/internal/obj/loong64/a.out.go
@@ -816,6 +816,31 @@ const (
 	AXVPCNTW
 	AXVPCNTV
 
+	AVBITCLRB
+	AVBITCLRH
+	AVBITCLRW
+	AVBITCLRV
+	AVBITSETB
+	AVBITSETH
+	AVBITSETW
+	AVBITSETV
+	AVBITREVB
+	AVBITREVH
+	AVBITREVW
+	AVBITREVV
+	AXVBITCLRB
+	AXVBITCLRH
+	AXVBITCLRW
+	AXVBITCLRV
+	AXVBITSETB
+	AXVBITSETH
+	AXVBITSETW
+	AXVBITSETV
+	AXVBITREVB
+	AXVBITREVH
+	AXVBITREVW
+	AXVBITREVV
+
 	// LSX and LASX integer comparison instruction
 	AVSEQB
 	AXVSEQB
diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go
index bf9b0722cc39d7..d9ff3b7bc94521 100644
--- a/src/cmd/internal/obj/loong64/anames.go
+++ b/src/cmd/internal/obj/loong64/anames.go
@@ -327,6 +327,30 @@ var Anames = []string{
 	"XVPCNTH",
 	"XVPCNTW",
 	"XVPCNTV",
+	"VBITCLRB",
+	"VBITCLRH",
+	"VBITCLRW",
+	"VBITCLRV",
+	"VBITSETB",
+	"VBITSETH",
+	"VBITSETW",
+	"VBITSETV",
+	"VBITREVB",
+	"VBITREVH",
+	"VBITREVW",
+	"VBITREVV",
+	"XVBITCLRB",
+	"XVBITCLRH",
+	"XVBITCLRW",
+	"XVBITCLRV",
+	"XVBITSETB",
+	"XVBITSETH",
+	"XVBITSETW",
+	"XVBITSETV",
+	"XVBITREVB",
+	"XVBITREVH",
+	"XVBITREVW",
+	"XVBITREVV",
 	"VSEQB",
 	"XVSEQB",
 	"VSEQH",
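a.out.go and anames.go must stay index-aligned: an opcode's name is recovered by indexing Anames with the constant's offset from the architecture's base value, so the two insertions above keep the same relative order. A stand-in sketch of the invariant (the values here are hypothetical, not the real constants):

package main

import "fmt"

// Stand-ins for the generated tables; in the real package the A*
// constants start at obj.ABaseLoong64, so both files must list the
// new opcodes in identical order.
const (
	AXVPCNTV = iota // last pre-existing constant before the new block
	AVBITCLRB
	AVBITCLRH
	// ... remaining new opcodes in a.out.go order ...
)

var Anames = []string{
	"XVPCNTV",
	"VBITCLRB",
	"VBITCLRH",
	// ... same order as the constants above ...
}

func main() {
	// Name lookup is a plain slice index, which is why ordering matters.
	fmt.Println(Anames[AVBITCLRB]) // VBITCLRB
}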
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 6e09930183383c..2ed12698e6fd98 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -416,8 +416,11 @@ var optab = []Optab{
 	{AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0},
 
-	{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 46, 4, 0, 0},
-	{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 47, 20, 0, 0},
+	{AVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+	{AXVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+
+	{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 47, 4, 0, 0},
+	{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 48, 20, 0, 0},
 
 	{obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0},
 	{obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0},
@@ -1830,21 +1833,33 @@ func buildop(ctxt *obj.Link) {
 		opset(AVSRLB, r0)
 		opset(AVSRAB, r0)
 		opset(AVROTRB, r0)
+		opset(AVBITCLRB, r0)
+		opset(AVBITSETB, r0)
+		opset(AVBITREVB, r0)
 
 	case AXVSLLB:
 		opset(AXVSRLB, r0)
 		opset(AXVSRAB, r0)
 		opset(AXVROTRB, r0)
+		opset(AXVBITCLRB, r0)
+		opset(AXVBITSETB, r0)
+		opset(AXVBITREVB, r0)
 
 	case AVSLLH:
 		opset(AVSRLH, r0)
 		opset(AVSRAH, r0)
 		opset(AVROTRH, r0)
+		opset(AVBITCLRH, r0)
+		opset(AVBITSETH, r0)
+		opset(AVBITREVH, r0)
 
 	case AXVSLLH:
 		opset(AXVSRLH, r0)
 		opset(AXVSRAH, r0)
 		opset(AXVROTRH, r0)
+		opset(AXVBITCLRH, r0)
+		opset(AXVBITSETH, r0)
+		opset(AXVBITREVH, r0)
 
 	case AVSLLW:
 		opset(AVSRLW, r0)
@@ -1858,6 +1873,9 @@ func buildop(ctxt *obj.Link) {
 		opset(AVSUBHU, r0)
 		opset(AVSUBWU, r0)
 		opset(AVSUBVU, r0)
+		opset(AVBITCLRW, r0)
+		opset(AVBITSETW, r0)
+		opset(AVBITREVW, r0)
 
 	case AXVSLLW:
 		opset(AXVSRLW, r0)
@@ -1871,16 +1889,25 @@ func buildop(ctxt *obj.Link) {
 		opset(AXVSUBHU, r0)
 		opset(AXVSUBWU, r0)
 		opset(AXVSUBVU, r0)
+		opset(AXVBITCLRW, r0)
+		opset(AXVBITSETW, r0)
+		opset(AXVBITREVW, r0)
 
 	case AVSLLV:
 		opset(AVSRLV, r0)
 		opset(AVSRAV, r0)
 		opset(AVROTRV, r0)
+		opset(AVBITCLRV, r0)
+		opset(AVBITSETV, r0)
+		opset(AVBITREVV, r0)
 
 	case AXVSLLV:
 		opset(AXVSRLV, r0)
 		opset(AXVSRAV, r0)
 		opset(AXVROTRV, r0)
+		opset(AXVBITCLRV, r0)
+		opset(AXVBITSETV, r0)
+		opset(AXVBITREVV, r0)
 
 	case AVSETEQV:
 		opset(AVSETNEV, r0)
@@ -2395,7 +2422,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = uint32(c.regoff(&p.From))
 
 	case 39: // vmov Rn, Vd.<T>[index]
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2407,7 +2434,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Rj << 5) | Vd
 
 	case 40: // vmov Vd.<T>[index], Rn
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2419,7 +2446,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Vj << 5) | Rd
 
 	case 41: // vmov Rn, Vd.<T>
-		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2429,7 +2456,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (Rj << 5) | Vd
 
 	case 42: // vmov xj, xd.<T>
-		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2439,7 +2466,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (Xj << 5) | Xd
 
 	case 43: // vmov xj, xd.<T>[index]
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2451,7 +2478,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Xj << 5) | Xd
 
 	case 44: // vmov xj.<T>[index], xd
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2463,7 +2490,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Xj << 5) | Xd
 
 	case 45: // vmov vj.<T>[index], vd.<T>
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2474,12 +2501,23 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		c.checkindex(p, index, m)
 		o1 = v | (index << 10) | (vj << 5) | vd
 
-	case 46: // preld offset(Rbase), $hint
+	case 46: // vmov offset(vj), vd.<T>
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, true)
+		if v == 0 {
+			c.ctxt.Diag("illegal arng type combination: %v\n", p)
+		}
+
+		si := c.regoff(&p.From)
+		Rj := uint32(p.From.Reg & EXT_REG_MASK)
+		Vd := uint32(p.To.Reg & EXT_REG_MASK)
+		o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+
+	case 47: // preld offset(Rbase), $hint
 		offs := c.regoff(&p.From)
 		hint := p.GetFrom3().Offset
 		o1 = OP_12IR_5I(c.opiir(p.As), uint32(offs), uint32(p.From.Reg), uint32(hint))
 
-	case 47: // preldx offset(Rbase), $n, $hint
+	case 48: // preldx offset(Rbase), $n, $hint
 		offs := c.regoff(&p.From)
 		hint := p.RestArgs[1].Offset
 		n := uint64(p.GetFrom3().Offset)
@@ -3504,6 +3542,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 {
 		return 0xea75 << 15 // xvfdiv.s
 	case AXVDIVD:
 		return 0xea76 << 15 // xvfdiv.d
+	case AVBITCLRB:
+		return 0xe218 << 15 // vbitclr.b
+	case AVBITCLRH:
+		return 0xe219 << 15 // vbitclr.h
+	case AVBITCLRW:
+		return 0xe21a << 15 // vbitclr.w
+	case AVBITCLRV:
+		return 0xe21b << 15 // vbitclr.d
+	case AVBITSETB:
+		return 0xe21c << 15 // vbitset.b
+	case AVBITSETH:
+		return 0xe21d << 15 // vbitset.h
+	case AVBITSETW:
+		return 0xe21e << 15 // vbitset.w
+	case AVBITSETV:
+		return 0xe21f << 15 // vbitset.d
+	case AVBITREVB:
+		return 0xe220 << 15 // vbitrev.b
+	case AVBITREVH:
+		return 0xe221 << 15 // vbitrev.h
+	case AVBITREVW:
+		return 0xe222 << 15 // vbitrev.w
+	case AVBITREVV:
+		return 0xe223 << 15 // vbitrev.d
+	case AXVBITCLRB:
+		return 0xea18 << 15 // xvbitclr.b
+	case AXVBITCLRH:
+		return 0xea19 << 15 // xvbitclr.h
+	case AXVBITCLRW:
+		return 0xea1a << 15 // xvbitclr.w
+	case AXVBITCLRV:
+		return 0xea1b << 15 // xvbitclr.d
+	case AXVBITSETB:
+		return 0xea1c << 15 // xvbitset.b
+	case AXVBITSETH:
+		return 0xea1d << 15 // xvbitset.h
+	case AXVBITSETW:
+		return 0xea1e << 15 // xvbitset.w
+	case AXVBITSETV:
+		return 0xea1f << 15 // xvbitset.d
+	case AXVBITREVB:
+		return 0xea20 << 15 // xvbitrev.b
+	case AXVBITREVH:
+		return 0xea21 << 15 // xvbitrev.h
+	case AXVBITREVW:
+		return 0xea22 << 15 // xvbitrev.w
+	case AXVBITREVV:
+		return 0xea23 << 15 // xvbitrev.d
 	}
 
 	if a < 0 {
@@ -4104,6 +4190,54 @@ func (c *ctxt0) opirr(a obj.As) uint32 {
 		return 0x1de6 << 18 // xvshuf4i.w
 	case AXVSHUF4IV:
 		return 0x1de7 << 18 // xvshuf4i.d
+	case AVBITCLRB:
+		return 0x1CC4<<18 | 0x1<<13 // vbitclri.b
+	case AVBITCLRH:
+		return 0x1CC4<<18 | 0x1<<14 // vbitclri.h
+	case AVBITCLRW:
+		return 0x1CC4<<18 | 0x1<<15 // vbitclri.w
+	case AVBITCLRV:
+		return 0x1CC4<<18 | 0x1<<16 // vbitclri.d
+	case AVBITSETB:
+		return 0x1CC5<<18 | 0x1<<13 // vbitseti.b
+	case AVBITSETH:
+		return 0x1CC5<<18 | 0x1<<14 // vbitseti.h
+	case AVBITSETW:
+		return 0x1CC5<<18 | 0x1<<15 // vbitseti.w
+	case AVBITSETV:
+		return 0x1CC5<<18 | 0x1<<16 // vbitseti.d
+	case AVBITREVB:
+		return 0x1CC6<<18 | 0x1<<13 // vbitrevi.b
+	case AVBITREVH:
+		return 0x1CC6<<18 | 0x1<<14 // vbitrevi.h
+	case AVBITREVW:
+		return 0x1CC6<<18 | 0x1<<15 // vbitrevi.w
+	case AVBITREVV:
+		return 0x1CC6<<18 | 0x1<<16 // vbitrevi.d
+	case AXVBITCLRB:
+		return 0x1DC4<<18 | 0x1<<13 // xvbitclri.b
+	case AXVBITCLRH:
+		return 0x1DC4<<18 | 0x1<<14 // xvbitclri.h
+	case AXVBITCLRW:
+		return 0x1DC4<<18 | 0x1<<15 // xvbitclri.w
+	case AXVBITCLRV:
+		return 0x1DC4<<18 | 0x1<<16 // xvbitclri.d
+	case AXVBITSETB:
+		return 0x1DC5<<18 | 0x1<<13 // xvbitseti.b
+	case AXVBITSETH:
+		return 0x1DC5<<18 | 0x1<<14 // xvbitseti.h
+	case AXVBITSETW:
+		return 0x1DC5<<18 | 0x1<<15 // xvbitseti.w
+	case AXVBITSETV:
+		return 0x1DC5<<18 | 0x1<<16 // xvbitseti.d
+	case AXVBITREVB:
+		return 0x1DC6<<18 | 0x1<<13 // xvbitrevi.b
+	case AXVBITREVH:
+		return 0x1DC6<<18 | 0x1<<14 // xvbitrevi.h
+	case AXVBITREVW:
+		return 0x1DC6<<18 | 0x1<<15 // xvbitrevi.w
+	case AXVBITREVV:
+		return 0x1DC6<<18 | 0x1<<16 // xvbitrevi.d
 	}
 
 	if a < 0 {
@@ -4192,7 +4326,7 @@ func (c *ctxt0) specialFpMovInst(a obj.As, fclass int, tclass int) uint32 {
 	return 0
 }
 
-func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_mask uint32) {
+func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16, offset_flag bool) (op_code, index_mask uint32) {
 	farng := (fReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
 	tarng := (tReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
 	fclass := c.rclass(fReg)
@@ -4258,29 +4392,58 @@ func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_ma
 	}
 
 	case C_REG | (C_ARNG << 16):
-		// vmov Rn, Vd.<T>
-		switch a {
-		case AVMOVQ:
-			switch tarng {
-			case ARNG_16B:
-				return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
-			case ARNG_8H:
-				return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
-			case ARNG_4W:
-				return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
-			case ARNG_2V:
-				return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+		switch {
+		case offset_flag:
+			// vmov offset(vj), vd.<T>
+			switch a {
+			case AVMOVQ:
+				switch tarng {
+				case ARNG_16B:
+					return (0xC2 << 22), 0x0 // vldrepl.b
+				case ARNG_8H:
+					return (0x182 << 21), 0x0 // vldrepl.h
+				case ARNG_4W:
+					return (0x302 << 20), 0x0 // vldrepl.w
+				case ARNG_2V:
+					return (0x602 << 19), 0x0 // vldrepl.d
+				}
+			case AXVMOVQ:
+				switch tarng {
+				case ARNG_32B:
+					return (0xCA << 22), 0x0 // xvldrepl.b
+				case ARNG_16H:
+					return (0x192 << 21), 0x0 // xvldrepl.h
+				case ARNG_8W:
+					return (0x322 << 20), 0x0 // xvldrepl.w
+				case ARNG_4V:
+					return (0x642 << 19), 0x0 // xvldrepl.d
+				}
+			}
 			}
-		case AXVMOVQ:
-			switch tarng {
-			case ARNG_32B:
-				return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
-			case ARNG_16H:
-				return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
-			case ARNG_8W:
-				return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
-			case ARNG_4V:
-				return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+		default:
+			// vmov Rn, Vd.<T>
+			switch a {
+			case AVMOVQ:
+				switch tarng {
+				case ARNG_16B:
+					return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
+				case ARNG_8H:
+					return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
+				case ARNG_4W:
+					return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
+				case ARNG_2V:
+					return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+				}
+			case AXVMOVQ:
+				switch tarng {
+				case ARNG_32B:
+					return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
+				case ARNG_16H:
+					return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
+				case ARNG_8W:
+					return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
+				case ARNG_4V:
+					return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+				}
+			}
 		}
 	}
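Case 46 can be checked the same way against the vldrepl test vectors at the top of this change: specialLsxMovInst returns the opcode (0x182<<21 for vldrepl.h) and asmout ORs in si<<10 | Rj<<5 | Vd. A minimal sketch, assuming the register numbers are already masked with EXT_REG_MASK (R4 is 4, V1 is 1):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	op := uint32(0x182) << 21 // vldrepl.h, as returned for ARNG_8H
	si, rj, vd := uint32(1), uint32(4), uint32(1)
	word := op | si<<10 | rj<<5 | vd

	var b [4]byte
	binary.LittleEndian.PutUint32(b[:], word)
	fmt.Printf("% x\n", b) // 81 04 40 30, matching "VMOVQ 1(R4), V1.H8"
}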
diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go
index 0818389c8d9366..a990b230892623 100644
--- a/src/cmd/internal/obj/loong64/doc.go
+++ b/src/cmd/internal/obj/loong64/doc.go
@@ -203,6 +203,23 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
 	VMOVQ	Vj.W[index], Vd.W4 | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
 	VMOVQ	Vj.V[index], Vd.V2 | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
 
+3.7 Load data from memory and broadcast to each element of a vector register.
+
+	Instruction format:
+		VMOVQ	offset(Rj), <Vd>.<T>
+
+	Mapping between Go and platform assembly:
+	       Go assembly        |    platform assembly    |                              semantics
+	-------------------------------------------------------------------------------------------------------------------------------------------------------
+	VMOVQ  offset(Rj), Vd.B16 | vldrepl.b  Vd, Rj, si12 | for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+	VMOVQ  offset(Rj), Vd.H8  | vldrepl.h  Vd, Rj, si11 | for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+	VMOVQ  offset(Rj), Vd.W4  | vldrepl.w  Vd, Rj, si10 | for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+	VMOVQ  offset(Rj), Vd.V2  | vldrepl.d  Vd, Rj, si9  | for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+	XVMOVQ offset(Rj), Xd.B32 | xvldrepl.b Xd, Rj, si12 | for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+	XVMOVQ offset(Rj), Xd.H16 | xvldrepl.h Xd, Rj, si11 | for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+	XVMOVQ offset(Rj), Xd.W8  | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+	XVMOVQ offset(Rj), Xd.V4  | xvldrepl.d Xd, Rj, si9  | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+
 # Special instruction encoding definition and description on LoongArch
 
 1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
diff --git a/src/runtime/asm_loong64.s b/src/runtime/asm_loong64.s
index 46ef00bab8aa35..ee7f825e1f6681 100644
--- a/src/runtime/asm_loong64.s
+++ b/src/runtime/asm_loong64.s
@@ -70,8 +70,9 @@ nocgo:
 	// start this M
 	JAL	runtime·mstart(SB)
 
-	// Prevent dead-code elimination of debugCallV2, which is
+	// Prevent dead-code elimination of debugCallV2 and debugPinnerV1, which are
 	// intended to be called by debuggers.
+	MOVV	$runtime·debugPinnerV1(SB), R0
 	MOVV	$runtime·debugCallV2(SB), R0
 	MOVV	R0, 1(R0)
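The MOVV of the symbol address into R0 exists only to create a reference: the linker's dead-code pass keeps any symbol that reachable code mentions. A rough Go-level analogue of the same trick, with a hypothetical helper standing in for the runtime functions:

package main

import "fmt"

//go:noinline
func debugHelper() { fmt.Println("kept alive for debuggers") }

// Referencing the function from reachable code creates a symbol
// reference, so the linker cannot discard debugHelper as dead code;
// the added MOVV of $runtime·debugPinnerV1(SB) has the same effect.
var keepAlive = debugHelper

func main() {
	_ = keepAlive
}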