diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
index bfff555782e9f7..8363996683cad9 100644
--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
@@ -510,6 +510,16 @@ lable2:
 	VMOVQ	V3.W[1], V7.W4  // 67e4f772
 	VMOVQ	V4.V[0], V6.V2  // 86f0f772
 
+	// Load data from memory and broadcast to each element of a vector register: VMOVQ offset(Rj), <Vd>.<T>
+	VMOVQ	(R4), V0.B16    // 80008030
+	VMOVQ	1(R4), V1.H8    // 81044030
+	VMOVQ	2(R4), V2.W4    // 82082030
+	VMOVQ	3(R4), V3.V2    // 830c1030
+	XVMOVQ	(R4), X0.B32    // 80008032
+	XVMOVQ	1(R4), X1.H16   // 81044032
+	XVMOVQ	2(R4), X2.W8    // 82082032
+	XVMOVQ	3(R4), X3.V4    // 830c1032
+
 	// VSEQ{B,H,W,V}, XVSEQ{B,H,W,V} instruction
 	VSEQB	V1, V2, V3      // 43040070
 	VSEQH	V1, V2, V3      // 43840070
@@ -1035,3 +1045,53 @@ lable2:
 	PRELD	(R4), $0        // 8000c02a
 	PRELD	-1(R4), $8      // 88fcff2a
 	PRELD	8(R4), $31      // 9f20c02a
+
+	// [X]{VBITCLR/VBITSET/VBITREV}{B,H,W,V} instructions
+	VBITCLRB	V1, V2, V3  // 43040c71
+	VBITCLRH	V1, V2, V3  // 43840c71
+	VBITCLRW	V1, V2, V3  // 43040d71
+	VBITCLRV	V1, V2, V3  // 43840d71
+	VBITSETB	V1, V2, V3  // 43040e71
+	VBITSETH	V1, V2, V3  // 43840e71
+	VBITSETW	V1, V2, V3  // 43040f71
+	VBITSETV	V1, V2, V3  // 43840f71
+	VBITREVB	V1, V2, V3  // 43041071
+	VBITREVH	V1, V2, V3  // 43841071
+	VBITREVW	V1, V2, V3  // 43041171
+	VBITREVV	V1, V2, V3  // 43841171
+	XVBITCLRB	X3, X2, X1  // 410c0c75
+	XVBITCLRH	X3, X2, X1  // 418c0c75
+	XVBITCLRW	X3, X2, X1  // 410c0d75
+	XVBITCLRV	X3, X2, X1  // 418c0d75
+	XVBITSETB	X3, X2, X1  // 410c0e75
+	XVBITSETH	X3, X2, X1  // 418c0e75
+	XVBITSETW	X3, X2, X1  // 410c0f75
+	XVBITSETV	X3, X2, X1  // 418c0f75
+	XVBITREVB	X3, X2, X1  // 410c1075
+	XVBITREVH	X3, X2, X1  // 418c1075
+	XVBITREVW	X3, X2, X1  // 410c1175
+	XVBITREVV	X3, X2, X1  // 418c1175
+	VBITCLRB	$7, V2, V3  // 433c1073
+	VBITCLRH	$15, V2, V3 // 437c1073
+	VBITCLRW	$31, V2, V3 // 43fc1073
+	VBITCLRV	$63, V2, V3 // 43fc1173
+	VBITSETB	$7, V2, V3  // 433c1473
+	VBITSETH	$15, V2, V3 // 437c1473
+	VBITSETW	$31, V2, V3 // 43fc1473
+	VBITSETV	$63, V2, V3 // 43fc1573
+	VBITREVB	$7, V2, V3  // 433c1873
+	VBITREVH	$15, V2, V3 // 437c1873
+	VBITREVW	$31, V2, V3 // 43fc1873
+	VBITREVV	$63, V2, V3 // 43fc1973
+	XVBITCLRB	$7, X2, X1  // 413c1077
+	XVBITCLRH	$15, X2, X1 // 417c1077
+	XVBITCLRW	$31, X2, X1 // 41fc1077
+	XVBITCLRV	$63, X2, X1 // 41fc1177
+	XVBITSETB	$7, X2, X1  // 413c1477
+	XVBITSETH	$15, X2, X1 // 417c1477
+	XVBITSETW	$31, X2, X1 // 41fc1477
+	XVBITSETV	$63, X2, X1 // 41fc1577
+	XVBITREVB	$7, X2, X1  // 413c1877
+	XVBITREVH	$15, X2, X1 // 417c1877
+	XVBITREVW	$31, X2, X1 // 41fc1877
+	XVBITREVV	$63, X2, X1 // 41fc1977
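Note: the 3R vector encodings above can be sanity-checked by hand. The opcode returned by oprrr occupies bits 15 and up, and the three register numbers pack below it as vk<<10 | vj<<5 | vd. A minimal sketch, not part of the change, that reproduces the expected bytes from the test comments:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// vbitclr.b: opcode 0xe218<<15, as added to oprrr in this change.
	// Go operand order "VBITCLRB V1, V2, V3" maps to vd=3, vj=2, vk=1.
	op := uint32(0xe218) << 15
	vd, vj, vk := uint32(3), uint32(2), uint32(1)
	word := op | vk<<10 | vj<<5 | vd

	var b [4]byte
	binary.LittleEndian.PutUint32(b[:], word) // test comments list bytes in memory order
	fmt.Printf("% x\n", b)                    // 43 04 0c 71, matching the first VBITCLRB line
}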
diff --git a/src/cmd/cgo/out.go b/src/cmd/cgo/out.go
index dfa54e41d33399..155fff4a268bb8 100644
--- a/src/cmd/cgo/out.go
+++ b/src/cmd/cgo/out.go
@@ -1051,7 +1051,7 @@ func (p *Package) writeExports(fgo2, fm, fgcc, fgcch io.Writer) {
 		// string.h for memset, and is also robust to C++
 		// types with constructors. Both GCC and LLVM optimize
 		// this into just zeroing _cgo_a.
-		fmt.Fprintf(fgcc, "\ttypedef %s %v _cgo_argtype;\n", ctype.String(), p.packedAttribute())
+		fmt.Fprintf(fgcc, "\ttypedef %s %v __attribute__((aligned(%d))) _cgo_argtype;\n", ctype.String(), p.packedAttribute(), p.PtrSize)
 		fmt.Fprintf(fgcc, "\tstatic _cgo_argtype _cgo_zero;\n")
 		fmt.Fprintf(fgcc, "\t_cgo_argtype _cgo_a = _cgo_zero;\n")
 		if gccResult != "void" && (len(fntype.Results.List) > 1 || len(fntype.Results.List[0].Names) > 1) {
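For reference, the updated Fprintf emits a typedef along these lines into the generated C file; a packed struct has alignment 1, so the extra aligned(PtrSize) attribute keeps the argument frame pointer-aligned. The ctype and packed strings below are illustrative stand-ins, not literal cgo output:

package main

import "fmt"

func main() {
	// Hypothetical stand-ins for the values cgo would supply:
	// ctype.String() for an exported function's argument struct,
	// p.packedAttribute() and p.PtrSize on a 64-bit target.
	ctype := "struct { GoInt p0; GoInt r0; }"
	packed := "__attribute__((__packed__, gcc_struct))"
	ptrSize := 8

	// Mirrors the updated Fprintf: the alignment attribute ensures the
	// zeroing of _cgo_a is not done through an underaligned type.
	fmt.Printf("\ttypedef %s %v __attribute__((aligned(%d))) _cgo_argtype;\n", ctype, packed, ptrSize)
}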
diff --git a/src/cmd/compile/internal/test/mulconst_test.go b/src/cmd/compile/internal/test/mulconst_test.go
index c4aed84432de50..1d1b351af19640 100644
--- a/src/cmd/compile/internal/test/mulconst_test.go
+++ b/src/cmd/compile/internal/test/mulconst_test.go
@@ -143,7 +143,7 @@ func BenchmarkMulconstI32(b *testing.B) {
 		}
 		mulSinkI32 = x
 	})
-	// -120x = 8x - 120x
+	// -120x = 8x - 128x
 	b.Run("-120", func(b *testing.B) {
 		x := int32(1)
 		for i := 0; i < b.N; i++ {
@@ -202,7 +202,7 @@ func BenchmarkMulconstI64(b *testing.B) {
 		}
 		mulSinkI64 = x
 	})
-	// -120x = 8x - 120x
+	// -120x = 8x - 128x
 	b.Run("-120", func(b *testing.B) {
 		x := int64(1)
 		for i := 0; i < b.N; i++ {
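The corrected comments now state the identity the benchmark exercises: the compiler can lower this multiply to two shifts and a subtract. Easy to spot-check:

package main

import "fmt"

func main() {
	// -120x = 8x - 128x, i.e. (x<<3) - (x<<7): the shift/subtract
	// form the compiler can use instead of a general multiply.
	x := int64(7)
	fmt.Println(-120*x == (x<<3)-(x<<7)) // true
}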
diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go
index 193993ec4d9b4c..162e36be8cb6a2 100644
--- a/src/cmd/internal/obj/loong64/a.out.go
+++ b/src/cmd/internal/obj/loong64/a.out.go
@@ -816,6 +816,31 @@ const (
 	AXVPCNTW
 	AXVPCNTV
 
+	AVBITCLRB
+	AVBITCLRH
+	AVBITCLRW
+	AVBITCLRV
+	AVBITSETB
+	AVBITSETH
+	AVBITSETW
+	AVBITSETV
+	AVBITREVB
+	AVBITREVH
+	AVBITREVW
+	AVBITREVV
+	AXVBITCLRB
+	AXVBITCLRH
+	AXVBITCLRW
+	AXVBITCLRV
+	AXVBITSETB
+	AXVBITSETH
+	AXVBITSETW
+	AXVBITSETV
+	AXVBITREVB
+	AXVBITREVH
+	AXVBITREVW
+	AXVBITREVV
+
 	// LSX and LASX integer comparison instruction
 	AVSEQB
 	AXVSEQB
diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go
index bf9b0722cc39d7..d9ff3b7bc94521 100644
--- a/src/cmd/internal/obj/loong64/anames.go
+++ b/src/cmd/internal/obj/loong64/anames.go
@@ -327,6 +327,30 @@ var Anames = []string{
 	"XVPCNTH",
 	"XVPCNTW",
 	"XVPCNTV",
+	"VBITCLRB",
+	"VBITCLRH",
+	"VBITCLRW",
+	"VBITCLRV",
+	"VBITSETB",
+	"VBITSETH",
+	"VBITSETW",
+	"VBITSETV",
+	"VBITREVB",
+	"VBITREVH",
+	"VBITREVW",
+	"VBITREVV",
+	"XVBITCLRB",
+	"XVBITCLRH",
+	"XVBITCLRW",
+	"XVBITCLRV",
+	"XVBITSETB",
+	"XVBITSETH",
+	"XVBITSETW",
+	"XVBITSETV",
+	"XVBITREVB",
+	"XVBITREVH",
+	"XVBITREVW",
+	"XVBITREVV",
 	"VSEQB",
 	"XVSEQB",
 	"VSEQH",
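a.out.go and anames.go must stay index-aligned: an opcode's name is recovered by indexing Anames with the constant's offset from the architecture's base value, so the two insertions above keep the same relative order. A stand-in sketch of the invariant (the values here are hypothetical, not the real constants):

package main

import "fmt"

// Stand-ins for the generated tables; in the real package the A*
// constants start at obj.ABaseLoong64, so both files must list the
// new opcodes in identical order.
const (
	AXVPCNTV = iota // last pre-existing constant before the new block
	AVBITCLRB
	AVBITCLRH
	// ... remaining new opcodes in a.out.go order ...
)

var Anames = []string{
	"XVPCNTV",
	"VBITCLRB",
	"VBITCLRH",
	// ... same order as the constants above ...
}

func main() {
	// Name lookup is a plain slice index, which is why ordering matters.
	fmt.Println(Anames[AVBITCLRB]) // VBITCLRB
}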
diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go
index 6e09930183383c..2ed12698e6fd98 100644
--- a/src/cmd/internal/obj/loong64/asm.go
+++ b/src/cmd/internal/obj/loong64/asm.go
@@ -416,8 +416,11 @@ var optab = []Optab{
 	{AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0},
 
-	{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 46, 4, 0, 0},
-	{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 47, 20, 0, 0},
+	{AVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+	{AXVMOVQ, C_SOREG, C_NONE, C_NONE, C_ARNG, C_NONE, 46, 4, 0, 0},
+
+	{APRELD, C_SOREG, C_U5CON, C_NONE, C_NONE, C_NONE, 47, 4, 0, 0},
+	{APRELDX, C_SOREG, C_DCON, C_U5CON, C_NONE, C_NONE, 48, 20, 0, 0},
 
 	{obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0},
 	{obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0},
@@ -1830,21 +1833,33 @@ func buildop(ctxt *obj.Link) {
 		opset(AVSRLB, r0)
 		opset(AVSRAB, r0)
 		opset(AVROTRB, r0)
+		opset(AVBITCLRB, r0)
+		opset(AVBITSETB, r0)
+		opset(AVBITREVB, r0)
 
 	case AXVSLLB:
 		opset(AXVSRLB, r0)
 		opset(AXVSRAB, r0)
 		opset(AXVROTRB, r0)
+		opset(AXVBITCLRB, r0)
+		opset(AXVBITSETB, r0)
+		opset(AXVBITREVB, r0)
 
 	case AVSLLH:
 		opset(AVSRLH, r0)
 		opset(AVSRAH, r0)
 		opset(AVROTRH, r0)
+		opset(AVBITCLRH, r0)
+		opset(AVBITSETH, r0)
+		opset(AVBITREVH, r0)
 
 	case AXVSLLH:
 		opset(AXVSRLH, r0)
 		opset(AXVSRAH, r0)
 		opset(AXVROTRH, r0)
+		opset(AXVBITCLRH, r0)
+		opset(AXVBITSETH, r0)
+		opset(AXVBITREVH, r0)
 
 	case AVSLLW:
 		opset(AVSRLW, r0)
@@ -1858,6 +1873,9 @@ func buildop(ctxt *obj.Link) {
 		opset(AVSUBHU, r0)
 		opset(AVSUBWU, r0)
 		opset(AVSUBVU, r0)
+		opset(AVBITCLRW, r0)
+		opset(AVBITSETW, r0)
+		opset(AVBITREVW, r0)
 
 	case AXVSLLW:
 		opset(AXVSRLW, r0)
@@ -1871,16 +1889,25 @@ func buildop(ctxt *obj.Link) {
 		opset(AXVSUBHU, r0)
 		opset(AXVSUBWU, r0)
 		opset(AXVSUBVU, r0)
+		opset(AXVBITCLRW, r0)
+		opset(AXVBITSETW, r0)
+		opset(AXVBITREVW, r0)
 
 	case AVSLLV:
 		opset(AVSRLV, r0)
 		opset(AVSRAV, r0)
 		opset(AVROTRV, r0)
+		opset(AVBITCLRV, r0)
+		opset(AVBITSETV, r0)
+		opset(AVBITREVV, r0)
 
 	case AXVSLLV:
 		opset(AXVSRLV, r0)
 		opset(AXVSRAV, r0)
 		opset(AXVROTRV, r0)
+		opset(AXVBITCLRV, r0)
+		opset(AXVBITSETV, r0)
+		opset(AXVBITREVV, r0)
 
 	case AVSETEQV:
 		opset(AVSETNEV, r0)
@@ -2395,7 +2422,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = uint32(c.regoff(&p.From))
 
 	case 39: // vmov Rn, Vd.<T>[index]
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2407,7 +2434,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Rj << 5) | Vd
 
 	case 40: // vmov Vd.<T>[index], Rn
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2419,7 +2446,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Vj << 5) | Rd
 
 	case 41: // vmov Rn, Vd.<T>
-		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2429,7 +2456,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (Rj << 5) | Vd
 
 	case 42: // vmov xj, xd.<T>
-		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2439,7 +2466,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (Xj << 5) | Xd
 
 	case 43: // vmov xj, xd.<T>[index]
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2451,7 +2478,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Xj << 5) | Xd
 
 	case 44: // vmov xj.<T>[index], xd
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2463,7 +2490,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		o1 = v | (index << 10) | (Xj << 5) | Xd
 
 	case 45: // vmov vj.<T>[index], vd.<T>
-		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg)
+		v, m := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, false)
 		if v == 0 {
 			c.ctxt.Diag("illegal arng type combination: %v\n", p)
 		}
@@ -2474,12 +2501,23 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
 		c.checkindex(p, index, m)
 		o1 = v | (index << 10) | (vj << 5) | vd
 
-	case 46: // preld offset(Rbase), $hint
+	case 46: // vmov offset(vj), vd.<T>
+		v, _ := c.specialLsxMovInst(p.As, p.From.Reg, p.To.Reg, true)
+		if v == 0 {
+			c.ctxt.Diag("illegal arng type combination: %v\n", p)
+		}
+
+		si := c.regoff(&p.From)
+		Rj := uint32(p.From.Reg & EXT_REG_MASK)
+		Vd := uint32(p.To.Reg & EXT_REG_MASK)
+		o1 = v | uint32(si<<10) | (Rj << 5) | Vd
+
+	case 47: // preld offset(Rbase), $hint
 		offs := c.regoff(&p.From)
 		hint := p.GetFrom3().Offset
 		o1 = OP_12IR_5I(c.opiir(p.As), uint32(offs), uint32(p.From.Reg), uint32(hint))
 
-	case 47: // preldx offset(Rbase), $n, $hint
+	case 48: // preldx offset(Rbase), $n, $hint
 		offs := c.regoff(&p.From)
 		hint := p.RestArgs[1].Offset
 		n := uint64(p.GetFrom3().Offset)
@@ -3504,6 +3542,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 {
 		return 0xea75 << 15 // xvfdiv.s
 	case AXVDIVD:
 		return 0xea76 << 15 // xvfdiv.d
+	case AVBITCLRB:
+		return 0xe218 << 15 // vbitclr.b
+	case AVBITCLRH:
+		return 0xe219 << 15 // vbitclr.h
+	case AVBITCLRW:
+		return 0xe21a << 15 // vbitclr.w
+	case AVBITCLRV:
+		return 0xe21b << 15 // vbitclr.d
+	case AVBITSETB:
+		return 0xe21c << 15 // vbitset.b
+	case AVBITSETH:
+		return 0xe21d << 15 // vbitset.h
+	case AVBITSETW:
+		return 0xe21e << 15 // vbitset.w
+	case AVBITSETV:
+		return 0xe21f << 15 // vbitset.d
+	case AVBITREVB:
+		return 0xe220 << 15 // vbitrev.b
+	case AVBITREVH:
+		return 0xe221 << 15 // vbitrev.h
+	case AVBITREVW:
+		return 0xe222 << 15 // vbitrev.w
+	case AVBITREVV:
+		return 0xe223 << 15 // vbitrev.d
+	case AXVBITCLRB:
+		return 0xea18 << 15 // xvbitclr.b
+	case AXVBITCLRH:
+		return 0xea19 << 15 // xvbitclr.h
+	case AXVBITCLRW:
+		return 0xea1a << 15 // xvbitclr.w
+	case AXVBITCLRV:
+		return 0xea1b << 15 // xvbitclr.d
+	case AXVBITSETB:
+		return 0xea1c << 15 // xvbitset.b
+	case AXVBITSETH:
+		return 0xea1d << 15 // xvbitset.h
+	case AXVBITSETW:
+		return 0xea1e << 15 // xvbitset.w
+	case AXVBITSETV:
+		return 0xea1f << 15 // xvbitset.d
+	case AXVBITREVB:
+		return 0xea20 << 15 // xvbitrev.b
+	case AXVBITREVH:
+		return 0xea21 << 15 // xvbitrev.h
+	case AXVBITREVW:
+		return 0xea22 << 15 // xvbitrev.w
+	case AXVBITREVV:
+		return 0xea23 << 15 // xvbitrev.d
 	}
 
 	if a < 0 {
@@ -4104,6 +4190,54 @@ func (c *ctxt0) opirr(a obj.As) uint32 {
 		return 0x1de6 << 18 // xvshuf4i.w
 	case AXVSHUF4IV:
 		return 0x1de7 << 18 // xvshuf4i.d
+	case AVBITCLRB:
+		return 0x1CC4<<18 | 0x1<<13 // vbitclri.b
+	case AVBITCLRH:
+		return 0x1CC4<<18 | 0x1<<14 // vbitclri.h
+	case AVBITCLRW:
+		return 0x1CC4<<18 | 0x1<<15 // vbitclri.w
+	case AVBITCLRV:
+		return 0x1CC4<<18 | 0x1<<16 // vbitclri.d
+	case AVBITSETB:
+		return 0x1CC5<<18 | 0x1<<13 // vbitseti.b
+	case AVBITSETH:
+		return 0x1CC5<<18 | 0x1<<14 // vbitseti.h
+	case AVBITSETW:
+		return 0x1CC5<<18 | 0x1<<15 // vbitseti.w
+	case AVBITSETV:
+		return 0x1CC5<<18 | 0x1<<16 // vbitseti.d
+	case AVBITREVB:
+		return 0x1CC6<<18 | 0x1<<13 // vbitrevi.b
+	case AVBITREVH:
+		return 0x1CC6<<18 | 0x1<<14 // vbitrevi.h
+	case AVBITREVW:
+		return 0x1CC6<<18 | 0x1<<15 // vbitrevi.w
+	case AVBITREVV:
+		return 0x1CC6<<18 | 0x1<<16 // vbitrevi.d
+	case AXVBITCLRB:
+		return 0x1DC4<<18 | 0x1<<13 // xvbitclri.b
+	case AXVBITCLRH:
+		return 0x1DC4<<18 | 0x1<<14 // xvbitclri.h
+	case AXVBITCLRW:
+		return 0x1DC4<<18 | 0x1<<15 // xvbitclri.w
+	case AXVBITCLRV:
+		return 0x1DC4<<18 | 0x1<<16 // xvbitclri.d
+	case AXVBITSETB:
+		return 0x1DC5<<18 | 0x1<<13 // xvbitseti.b
+	case AXVBITSETH:
+		return 0x1DC5<<18 | 0x1<<14 // xvbitseti.h
+	case AXVBITSETW:
+		return 0x1DC5<<18 | 0x1<<15 // xvbitseti.w
+	case AXVBITSETV:
+		return 0x1DC5<<18 | 0x1<<16 // xvbitseti.d
+	case AXVBITREVB:
+		return 0x1DC6<<18 | 0x1<<13 // xvbitrevi.b
+	case AXVBITREVH:
+		return 0x1DC6<<18 | 0x1<<14 // xvbitrevi.h
+	case AXVBITREVW:
+		return 0x1DC6<<18 | 0x1<<15 // xvbitrevi.w
+	case AXVBITREVV:
+		return 0x1DC6<<18 | 0x1<<16 // xvbitrevi.d
 	}
 
 	if a < 0 {
@@ -4192,7 +4326,7 @@ func (c *ctxt0) specialFpMovInst(a obj.As, fclass int, tclass int) uint32 {
 	return 0
 }
 
-func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_mask uint32) {
+func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16, offset_flag bool) (op_code, index_mask uint32) {
 	farng := (fReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
 	tarng := (tReg >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK
 	fclass := c.rclass(fReg)
@@ -4258,29 +4392,58 @@ func (c *ctxt0) specialLsxMovInst(a obj.As, fReg, tReg int16) (op_code, index_ma
 	}
 
 	case C_REG | (C_ARNG << 16):
-		// vmov Rn, Vd.<T>
-		switch a {
-		case AVMOVQ:
-			switch tarng {
-			case ARNG_16B:
-				return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
-			case ARNG_8H:
-				return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
-			case ARNG_4W:
-				return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
-			case ARNG_2V:
-				return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+		switch {
+		case offset_flag:
+			// vmov offset(vj), vd.<T>
+			switch a {
+			case AVMOVQ:
+				switch tarng {
+				case ARNG_16B:
+					return (0xC2 << 22), 0x0 // vldrepl.b
+				case ARNG_8H:
+					return (0x182 << 21), 0x0 // vldrepl.h
+				case ARNG_4W:
+					return (0x302 << 20), 0x0 // vldrepl.w
+				case ARNG_2V:
+					return (0x602 << 19), 0x0 // vldrepl.d
+				}
+			case AXVMOVQ:
+				switch tarng {
+				case ARNG_32B:
+					return (0xCA << 22), 0x0 // xvldrepl.b
+				case ARNG_16H:
+					return (0x192 << 21), 0x0 // xvldrepl.h
+				case ARNG_8W:
+					return (0x322 << 20), 0x0 // xvldrepl.w
+				case ARNG_4V:
+					return (0x642 << 19), 0x0 // xvldrepl.d
+				}
+			}
 			}
-		case AXVMOVQ:
-			switch tarng {
-			case ARNG_32B:
-				return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
-			case ARNG_16H:
-				return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
-			case ARNG_8W:
-				return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
-			case ARNG_4V:
-				return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+		default:
+			// vmov Rn, Vd.<T>
+			switch a {
+			case AVMOVQ:
+				switch tarng {
+				case ARNG_16B:
+					return (0x1CA7C0 << 10), 0x0 // vreplgr2vr.b
+				case ARNG_8H:
+					return (0x1CA7C1 << 10), 0x0 // vreplgr2vr.h
+				case ARNG_4W:
+					return (0x1CA7C2 << 10), 0x0 // vreplgr2vr.w
+				case ARNG_2V:
+					return (0x1CA7C3 << 10), 0x0 // vreplgr2vr.d
+				}
+			case AXVMOVQ:
+				switch tarng {
+				case ARNG_32B:
+					return (0x1DA7C0 << 10), 0x0 // xvreplgr2vr.b
+				case ARNG_16H:
+					return (0x1DA7C1 << 10), 0x0 // xvreplgr2vr.h
+				case ARNG_8W:
+					return (0x1DA7C2 << 10), 0x0 // xvreplgr2vr.w
+				case ARNG_4V:
+					return (0x1DA7C3 << 10), 0x0 // xvreplgr2vr.d
+				}
+			}
 		}
 	}
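Case 46 can be checked the same way against the vldrepl test vectors at the top of this change: specialLsxMovInst returns the opcode (0x182<<21 for vldrepl.h) and asmout ORs in si<<10 | Rj<<5 | Vd. A minimal sketch, assuming the register numbers are already masked with EXT_REG_MASK (R4 is 4, V1 is 1):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	op := uint32(0x182) << 21 // vldrepl.h, as returned for ARNG_8H
	si, rj, vd := uint32(1), uint32(4), uint32(1)
	word := op | si<<10 | rj<<5 | vd

	var b [4]byte
	binary.LittleEndian.PutUint32(b[:], word)
	fmt.Printf("% x\n", b) // 81 04 40 30, matching "VMOVQ 1(R4), V1.H8"
}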
diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go
index 0818389c8d9366..a990b230892623 100644
--- a/src/cmd/internal/obj/loong64/doc.go
+++ b/src/cmd/internal/obj/loong64/doc.go
@@ -203,6 +203,23 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate)
 	VMOVQ	Vj.W[index], Vd.W4 | vreplvei.w vd, vj, ui2 | for i in range(4) : VR[vd].w[i] = VR[vj].w[ui2]
 	VMOVQ	Vj.V[index], Vd.V2 | vreplvei.d vd, vj, ui1 | for i in range(2) : VR[vd].d[i] = VR[vj].d[ui1]
 
+3.7 Load data from memory and broadcast to each element of a vector register.
+
+	Instruction format:
+		VMOVQ	offset(Rj), <Vd>.<T>
+
+	Mapping between Go and platform assembly:
+	       Go assembly        |    platform assembly    |                              semantics
+	-------------------------------------------------------------------------------------------------------------------------------------------------------
+	VMOVQ  offset(Rj), Vd.B16 | vldrepl.b  Vd, Rj, si12 | for i in range(16): VR[vd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+	VMOVQ  offset(Rj), Vd.H8  | vldrepl.h  Vd, Rj, si11 | for i in range(8) : VR[vd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+	VMOVQ  offset(Rj), Vd.W4  | vldrepl.w  Vd, Rj, si10 | for i in range(4) : VR[vd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+	VMOVQ  offset(Rj), Vd.V2  | vldrepl.d  Vd, Rj, si9  | for i in range(2) : VR[vd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+	XVMOVQ offset(Rj), Xd.B32 | xvldrepl.b Xd, Rj, si12 | for i in range(32): XR[xd].b[i] = load 8 bit memory data from (GR[rj]+SignExtend(si12))
+	XVMOVQ offset(Rj), Xd.H16 | xvldrepl.h Xd, Rj, si11 | for i in range(16): XR[xd].h[i] = load 16 bit memory data from (GR[rj]+SignExtend(si11<<1))
+	XVMOVQ offset(Rj), Xd.W8  | xvldrepl.w Xd, Rj, si10 | for i in range(8) : XR[xd].w[i] = load 32 bit memory data from (GR[rj]+SignExtend(si10<<2))
+	XVMOVQ offset(Rj), Xd.V4  | xvldrepl.d Xd, Rj, si9  | for i in range(4) : XR[xd].d[i] = load 64 bit memory data from (GR[rj]+SignExtend(si9<<3))
+
 # Special instruction encoding definition and description on LoongArch
 
 1. DBAR hint encoding for LA664(Loongson 3A6000) and later micro-architectures, paraphrased
diff --git a/src/runtime/asm_loong64.s b/src/runtime/asm_loong64.s
index 46ef00bab8aa35..ee7f825e1f6681 100644
--- a/src/runtime/asm_loong64.s
+++ b/src/runtime/asm_loong64.s
@@ -70,8 +70,9 @@ nocgo:
 	// start this M
 	JAL	runtime·mstart(SB)
 
-	// Prevent dead-code elimination of debugCallV2, which is
+	// Prevent dead-code elimination of debugCallV2 and debugPinnerV1, which are
 	// intended to be called by debuggers.
+	MOVV	$runtime·debugPinnerV1(SB), R0
 	MOVV	$runtime·debugCallV2(SB), R0
 	MOVV	R0, 1(R0)
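The MOVV of the symbol address into R0 exists only to create a reference: the linker's dead-code pass keeps any symbol that reachable code mentions. A rough Go-level analogue of the same trick, with a hypothetical helper standing in for the runtime functions:

package main

import "fmt"

//go:noinline
func debugHelper() { fmt.Println("kept alive for debuggers") }

// Referencing the function from reachable code creates a symbol
// reference, so the linker cannot discard debugHelper as dead code;
// the added MOVV of $runtime·debugPinnerV1(SB) has the same effect.
var keepAlive = debugHelper

func main() {
	_ = keepAlive
}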