Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,25 @@ bool ArchitectureAArch64::ReconfigureRegisterInfo(DynamicRegisterInfo &reg_info,
if (reg_value != fail_value && reg_value <= 32)
svg_reg_value = reg_value;
}
if (!svg_reg_value) {
const RegisterInfo *darwin_svg_reg_info = reg_info.GetRegisterInfo("svl");
if (darwin_svg_reg_info) {
uint32_t svg_reg_num = darwin_svg_reg_info->kinds[eRegisterKindLLDB];
uint64_t reg_value =
reg_context.ReadRegisterAsUnsigned(svg_reg_num, fail_value);
// UpdateARM64SVERegistersInfos and UpdateARM64SMERegistersInfos
// expect the number of 8-byte granules; darwin provides number of
// bytes.
if (reg_value != fail_value && reg_value <= 256) {
svg_reg_value = reg_value / 8;
// Apple hardware only implements Streaming SVE mode, so
// the non-streaming Vector Length is not reported by the
// kernel. Set both svg and vg to this svl value.
if (!vg_reg_value)
vg_reg_value = reg_value / 8;
Comment on lines +112 to +118
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: lots of magic values here but in all fairness that's consistent with the surrounding code. The comment covers the 8 byte granule so I'm not too concerned, though some constants might make this easier to read.

}
}
}

if (!vg_reg_value && !svg_reg_value)
return false;
Expand Down
5 changes: 5 additions & 0 deletions lldb/test/API/macosx/sme-registers/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
C_SOURCES := main.c

CFLAGS_EXTRAS := -mcpu=apple-m4

include Makefile.rules
217 changes: 217 additions & 0 deletions lldb/test/API/macosx/sme-registers/TestSMERegistersDarwin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
import lldb
from lldbsuite.test.lldbtest import *
from lldbsuite.test.decorators import *
import lldbsuite.test.lldbutil as lldbutil
import os


class TestSMERegistersDarwin(TestBase):
    """Check that lldb can read and write the SME/SVE registers on Darwin.

    The inferior (main.c) stops three times: before SME mode is enabled,
    while it is enabled, and after it has been disabled again.  The SME
    registers must only be readable at the middle stop, where they are also
    modified and read back after an instruction step.
    """

    NO_DEBUG_INFO_TESTCASE = True
    mydir = TestBase.compute_mydir(__file__)

    # Registers that must fail to read whenever the thread is not in SME mode.
    _SME_ONLY_REGISTERS = ("svl", "z0", "p0", "za", "zt0")
    # zt0 is checked as a fixed 64 children, regardless of the svl value.
    _ZT0_NUM_BYTES = 64

    def _assert_sme_registers_readable(self, frame, readable):
        """Assert that every SME-only register reads (or fails to read).

        frame    -- the SBFrame whose registers are inspected.
        readable -- True to require successful reads, False to require errors.
        """
        for name in self._SME_ONLY_REGISTERS:
            error = frame.register[name].GetError()
            if readable:
                self.assertTrue(error.Success())
            else:
                self.assertTrue(error.Fail())

    def _assert_register_ends(self, reg, num_children, first, last):
        """Assert the child count of `reg` and its first and last child values."""
        self.assertEqual(reg.GetNumChildren(), num_children)
        self.assertEqual(reg.GetChildAtIndex(0).GetValueAsUnsigned(), first)
        self.assertEqual(
            reg.GetChildAtIndex(num_children - 1).GetValueAsUnsigned(), last
        )

    def _write_offset_values(self, name, reg, count, delta):
        """Write `reg` back with `delta` added to each of its `count` children.

        Returns the list of values that were written so the caller can compare
        them against a later read of the register.
        """
        new_values = [
            reg.GetChildAtIndex(i).GetValueAsUnsigned() + delta
            for i in range(count)
        ]
        # Same textual form the command has always been given: every element
        # is followed by a space, including the last one before the brace.
        value_str = '"{' + "".join("0x%02x " % v for v in new_values) + '}"'
        self.runCmd("reg write %s %s" % (name, value_str))
        return new_values

    def _assert_register_values(self, frame, name, expected):
        """Assert that each child of register `name` equals `expected[i]`."""
        reg = frame.register[name]
        for i in range(reg.GetNumChildren()):
            self.assertEqual(expected[i], reg.GetChildAtIndex(i).GetValueAsUnsigned())

    @skipIfRemote
    @skipUnlessDarwin
    @skipUnlessFeature("hw.optional.arm.FEAT_SME")
    @skipUnlessFeature("hw.optional.arm.FEAT_SME2")
    # thread_set_state/thread_get_state only available in macOS 15.4+
    @skipIf(macos_version=["<", "15.4"])
    def test(self):
        """Test that we can read the contents of the SME/SVE registers on Darwin"""
        self.build()
        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
            self, "break before sme", lldb.SBFileSpec("main.c")
        )
        frame = thread.GetFrameAtIndex(0)
        self.assertTrue(frame.IsValid())

        # Set the two remaining stops up front: inside and after the SME region.
        for pattern in ("break while sme", "break after sme"):
            self.assertTrue(
                target.BreakpointCreateBySourceRegex(
                    pattern, lldb.SBFileSpec("main.c")
                ).IsValid()
            )

        if self.TraceOn():
            self.runCmd("reg read -a")

        # Not yet in SME mode: none of the SME registers should be readable.
        self._assert_sme_registers_readable(frame, False)

        process.Continue()
        frame = thread.GetFrameAtIndex(0)
        self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint)

        # Now in SME enabled mode
        self._assert_sme_registers_readable(frame, True)

        # SSVE and SME modes should be enabled (reflecting PSTATE.SM and PSTATE.ZA)
        svcr = frame.register["svcr"]
        self.assertEqual(svcr.GetValueAsUnsigned(), 3)

        svl = frame.register["svl"].GetValueAsUnsigned()
        # Integer division keeps this an int, so it can be used directly both
        # as an expected child count and as a range() bound.
        pred_len = svl // 8

        # Expected contents are the patterns main.c loads before this stop.
        z0 = frame.register["z0"]
        self._assert_register_ends(z0, svl, 0x1, 0x1)

        z31 = frame.register["z31"]
        self._assert_register_ends(z31, svl, 32, 32)

        p0 = frame.register["p0"]
        self._assert_register_ends(p0, pred_len, 0xFF, 0xFF)

        p15 = frame.register["p15"]
        self._assert_register_ends(p15, pred_len, 0xFF, 0xFF)

        za = frame.register["za"]
        self._assert_register_ends(za, svl * svl, 4, 67)

        zt0 = frame.register["zt0"]
        self._assert_register_ends(zt0, self._ZT0_NUM_BYTES, 0, 63)

        # Modify all of the registers, instruction step, confirm that the
        # registers have the new values. Without the instruction step, it's
        # possible debugserver or lldb could lie about the write succeeding.
        writes = (
            # (name, register value, number of children, delta per child)
            ("z0", z0, svl, 5),
            ("z31", z31, svl, 3),
            ("p0", p0, pred_len, -5),
            ("p15", p15, pred_len, -8),
            ("za", za, svl * svl, 7),
            ("zt0", zt0, self._ZT0_NUM_BYTES, 2),
        )
        expected = {}
        for name, reg, count, delta in writes:
            expected[name] = self._write_offset_values(name, reg, count, delta)

        thread.StepInstruction(False)
        frame = thread.GetFrameAtIndex(0)

        if self.TraceOn():
            self.runCmd("reg read -a")

        for name, new_values in expected.items():
            self._assert_register_values(frame, name, new_values)

        process.Continue()
        frame = thread.GetFrameAtIndex(0)
        self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint)

        # SME mode has been left again: the registers must be unreadable.
        self._assert_sme_registers_readable(frame, False)
113 changes: 113 additions & 0 deletions lldb/test/API/macosx/sme-registers/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/// Build with
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: s/built/build/. But also this is covered by the makefile so maybe something like "Requires -mcpu=apple-m4" would be more to the point.

/// xcrun -sdk macosx.internal clang -mcpu=apple-m4 -g sme.c -o sme

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Load a known pattern into all sixteen predicate registers (p0-p15) and all
// thirty-two vector registers (z0-z31) so a debugger can recognize the values.
// NOTE(review): none of these asm statements declare clobbers, so the compiler
// is not told that the registers change - presumably fine for this small test
// program, but worth confirming.
void write_sve_regs() {
// Predicates follow a repeating five-register pattern: ptrue with .b, .h, .s
// and .d element sizes, then pfalse. p15 restarts the pattern with ptrue .b.
asm volatile("ptrue p0.b\n\t");
asm volatile("ptrue p1.h\n\t");
asm volatile("ptrue p2.s\n\t");
asm volatile("ptrue p3.d\n\t");
asm volatile("pfalse p4.b\n\t");
asm volatile("ptrue p5.b\n\t");
asm volatile("ptrue p6.h\n\t");
asm volatile("ptrue p7.s\n\t");
asm volatile("ptrue p8.d\n\t");
asm volatile("pfalse p9.b\n\t");
asm volatile("ptrue p10.b\n\t");
asm volatile("ptrue p11.h\n\t");
asm volatile("ptrue p12.s\n\t");
asm volatile("ptrue p13.d\n\t");
asm volatile("pfalse p14.b\n\t");
asm volatile("ptrue p15.b\n\t");

// Vector register zN receives the immediate N + 1 in its byte elements. The
// governing predicate rotates through p0, p5, p10 and p15, which are exactly
// the four predicates set with "ptrue pN.b" above.
asm volatile("cpy z0.b, p0/z, #1\n\t");
asm volatile("cpy z1.b, p5/z, #2\n\t");
asm volatile("cpy z2.b, p10/z, #3\n\t");
asm volatile("cpy z3.b, p15/z, #4\n\t");
asm volatile("cpy z4.b, p0/z, #5\n\t");
asm volatile("cpy z5.b, p5/z, #6\n\t");
asm volatile("cpy z6.b, p10/z, #7\n\t");
asm volatile("cpy z7.b, p15/z, #8\n\t");
asm volatile("cpy z8.b, p0/z, #9\n\t");
asm volatile("cpy z9.b, p5/z, #10\n\t");
asm volatile("cpy z10.b, p10/z, #11\n\t");
asm volatile("cpy z11.b, p15/z, #12\n\t");
asm volatile("cpy z12.b, p0/z, #13\n\t");
asm volatile("cpy z13.b, p5/z, #14\n\t");
asm volatile("cpy z14.b, p10/z, #15\n\t");
asm volatile("cpy z15.b, p15/z, #16\n\t");
asm volatile("cpy z16.b, p0/z, #17\n\t");
asm volatile("cpy z17.b, p5/z, #18\n\t");
asm volatile("cpy z18.b, p10/z, #19\n\t");
asm volatile("cpy z19.b, p15/z, #20\n\t");
asm volatile("cpy z20.b, p0/z, #21\n\t");
asm volatile("cpy z21.b, p5/z, #22\n\t");
asm volatile("cpy z22.b, p10/z, #23\n\t");
asm volatile("cpy z23.b, p15/z, #24\n\t");
asm volatile("cpy z24.b, p0/z, #25\n\t");
asm volatile("cpy z25.b, p5/z, #26\n\t");
asm volatile("cpy z26.b, p10/z, #27\n\t");
asm volatile("cpy z27.b, p15/z, #28\n\t");
asm volatile("cpy z28.b, p0/z, #29\n\t");
asm volatile("cpy z29.b, p5/z, #30\n\t");
asm volatile("cpy z30.b, p10/z, #31\n\t");
asm volatile("cpy z31.b, p15/z, #32\n\t");
}

// Size of the staging buffer used to load one row of ZA.
#define MAX_VL_BYTES 256

// Fill the ZA array one row at a time: row i has every byte set to
// (i + value_offset).
//
//   svl           number of ZA rows to load (the streaming vector length).
//   value_offset  value added to the row index to form the fill byte.
void set_za_register(int svl, int value_offset) {
  uint8_t data[MAX_VL_BYTES];

  // ldr za will actually wrap the selected vector row, by the number of rows
  // you have. So setting one that didn't exist would actually set one that did.
  // That's why we need the streaming vector length here.
  for (int i = 0; i < svl; ++i) {
    // This may involve instructions that require the smefa64 extension.
    for (int j = 0; j < MAX_VL_BYTES; j++)
      data[j] = i + value_offset;
    // Each one of these loads a VL sized row of ZA.
    // The "memory" clobber tells the compiler that the asm reads the bytes
    // behind the pointer operand, so the stores to data above must be
    // completed before it and cannot be treated as dead.
    asm volatile("mov w12, %w0\n\t"
                 "ldr za[w12, 0], [%1]\n\t" ::"r"(i),
                 "r"(&data)
                 : "w12", "memory");
  }
}

// Return the value produced by "rdsvl ..., #1" (the streaming vector length
// in bytes), truncated to 16 bits.
static uint16_t arm_sme_svl_b(void) {
  uint64_t svl_bytes = 0;
  asm volatile("rdsvl %[len], #1" : [len] "=r"(svl_bytes));
  return (uint16_t)svl_bytes;
}

// Load the ZT0 register with the byte sequence 0, 1, 2, ..., 63.
void arm_sme2_set_zt0() {
// ZT0 is loaded from a 512-bit (64-byte) buffer. The macro is scoped to this
// function: the #define and #undef below must spell the name identically.
#define ZT0_LEN (512 / 8)
  uint8_t data[ZT0_LEN];
  for (unsigned i = 0; i < ZT0_LEN; ++i)
    data[i] = i + 0;

  // The "memory" clobber tells the compiler that the asm reads the buffer
  // behind the pointer operand, so the stores above cannot be dropped.
  asm volatile("ldr zt0, [%0]" ::"r"(&data) : "memory");
#undef ZT0_LEN
}

// Test driver: enables SME mode with smstart, loads known values into the
// SVE, ZA and ZT0 registers, then disables SME mode with smstop.
//
// The trailing comments on the two printf lines and on the "int c" line are
// the markers the debugger test searches for when it sets its source-regex
// stops, so they must remain on exactly those statements and the marker text
// must not be repeated anywhere else in this file.
int main() {
printf("Enable SME mode\n"); // break before sme

asm volatile("smstart");

write_sve_regs();

// Fill ZA so that row i holds the byte value i + 4.
set_za_register(arm_sme_svl_b(), 4);

arm_sme2_set_zt0();

// Plain integer statements that give the debugger somewhere to stop and to
// instruction-step while SME mode is still enabled.
int c = 10; // break while sme
c += 5;
c += 5;

asm volatile("smstop");

printf("SME mode disabled\n"); // break after sme
}
25 changes: 15 additions & 10 deletions lldb/tools/debugserver/source/DNBDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,16 +312,21 @@ struct DNBRegisterValue {
uint64_t uint64;
float float32;
double float64;
int8_t v_sint8[64];
int16_t v_sint16[32];
int32_t v_sint32[16];
int64_t v_sint64[8];
uint8_t v_uint8[64];
uint16_t v_uint16[32];
uint32_t v_uint32[16];
uint64_t v_uint64[8];
float v_float32[16];
double v_float64[8];
// AArch64 SME's ZA register max size is 64k, this object must be
// large enough to hold that much data. The current Apple cores
// have a much smaller maximum ZA reg size, but there are not
// multiple copies of this object so increase the static size to
// maximum possible.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Linux I remember heap allocating the object that represented the array register, because of the potential size. Perhaps that just uses a buffer in the background though.

The problem you have with this is that even x0 will take up 64k, right? Or is this object used as an overlay to a buffer and doesn't actually get allocated?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this object is allocated to read/write a single register, so a read of x0 will be a 64k object. But looking at the debugserver sources, we don't store an array of them anywhere - we read / write individual registers one at a time with this object for a short time period, so I don't think the memory increase is a problem. It might be better to have a dynamically allocated size here though, as you did. I did that for the DNBArm64ArchImpl register contexts stored for each thread, where we will have one for each thread when stopped, that memory use made me more nervous.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for Linux we were also stack allocating the register value and I didn't want 64k stack frames everywhere we used one. d99d9d8 in case any of the concerns apply to debugserver also.

(I am also very aware of these issues because in a previous job when we added MIPS MSA support we accidentally turned every register object into 512 bits, even the 8 and 16 bit ones we read from non-MIPS DSP chips)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at the DNBRegisterValue use a little more, and I think I want to change it to a heap allocated object, but it's going to touch all of the arch plugins in debugserver, so I will do it as a separate change from this one. On the macOS environment, the single 64k register on the stack isn't blowing anything, but it's not ideal and could cause a problem in our more constrained environments.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on changing this to a heap object. This seems unnecessarily wasteful when not in SME mode, which I expect to remain the majority of the time.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking of restructuring the internals of the object to heap-allocate the value space, which would require touching all of the DNBArchImpl back-ends, but actually just heap-allocating the object in RNBRemote (the main place this object is created) would be much easier than changing it at all.

int8_t v_sint8[65536];
int16_t v_sint16[32768];
int32_t v_sint32[16384];
int64_t v_sint64[8192];
uint8_t v_uint8[65536];
uint16_t v_uint16[32768];
uint32_t v_uint32[16384];
uint64_t v_uint64[8192];
float v_float32[16384];
double v_float64[8192];
void *pointer;
char *c_str;
} value;
Expand Down
Loading
Loading