llvm · jasonmolenda · Dec 19, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
@@ -100,6 +100,25 @@ bool ArchitectureAArch64::ReconfigureRegisterInfo(DynamicRegisterInfo &reg_info,
     if (reg_value != fail_value && reg_value <= 32)
       svg_reg_value = reg_value;
   }
+  if (!svg_reg_value) {
+    const RegisterInfo *darwin_svg_reg_info = reg_info.GetRegisterInfo("svl");
+    if (darwin_svg_reg_info) {
+      uint32_t svg_reg_num = darwin_svg_reg_info->kinds[eRegisterKindLLDB];
+      uint64_t reg_value =
+          reg_context.ReadRegisterAsUnsigned(svg_reg_num, fail_value);
+      // UpdateARM64SVERegistersInfos and UpdateARM64SMERegistersInfos
+      // expect the number of 8-byte granules; darwin provides number of
+      // bytes.
+      if (reg_value != fail_value && reg_value <= 256) {
+        svg_reg_value = reg_value / 8;
+        // Apple hardware only implements Streaming SVE mode, so
+        // the non-streaming Vector Length is not reported by the
+        // kernel. Set both svg and vg to this svl value.
+        if (!vg_reg_value)
+          vg_reg_value = reg_value / 8;
+      }
+    }
+  }
 
   if (!vg_reg_value && !svg_reg_value)
     return false;

@@ -0,0 +1,5 @@
+# Build the test inferior from its single C source file.
+C_SOURCES := main.c
+
+# Target the Apple M4 CPU.  Presumably required so the compiler accepts the
+# SME/SME2 instructions used by the inferior -- TODO confirm.
+CFLAGS_EXTRAS := -mcpu=apple-m4
+
+include Makefile.rules
@@ -0,0 +1,217 @@
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+import lldbsuite.test.lldbutil as lldbutil
+import os
+
+
+class TestSMERegistersDarwin(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+    mydir = TestBase.compute_mydir(__file__)
+
+    @skipIfRemote
+    @skipUnlessDarwin
+    @skipUnlessFeature("hw.optional.arm.FEAT_SME")
+    @skipUnlessFeature("hw.optional.arm.FEAT_SME2")
+    # thread_set_state/thread_get_state only avail in macOS 15.4+
+    @skipIf(macos_version=["<", "15.4"])
+    def test(self):
+        """Test that we can read the contents of the SME/SVE registers on Darwin"""
+        self.build()
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "break before sme", lldb.SBFileSpec("main.c")
+        )
+        frame = thread.GetFrameAtIndex(0)
+        self.assertTrue(frame.IsValid())
+
+        self.assertTrue(
+            target.BreakpointCreateBySourceRegex(
+                "break while sme", lldb.SBFileSpec("main.c")
+            ).IsValid()
+        )
+        self.assertTrue(
+            target.BreakpointCreateBySourceRegex(
+                "break after sme", lldb.SBFileSpec("main.c")
+            ).IsValid()
+        )
+
+        if self.TraceOn():
+            self.runCmd("reg read -a")
+
+        self.assertTrue(frame.register["svl"].GetError().Fail())
+        self.assertTrue(frame.register["z0"].GetError().Fail())
+        self.assertTrue(frame.register["p0"].GetError().Fail())
+        self.assertTrue(frame.register["za"].GetError().Fail())
+        self.assertTrue(frame.register["zt0"].GetError().Fail())
+
+        process.Continue()
+        frame = thread.GetFrameAtIndex(0)
+        self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint)
+
+        # Now in SME enabled mode
+        self.assertTrue(frame.register["svl"].GetError().Success())
+        self.assertTrue(frame.register["z0"].GetError().Success())
+        self.assertTrue(frame.register["p0"].GetError().Success())
+        self.assertTrue(frame.register["za"].GetError().Success())
+        self.assertTrue(frame.register["zt0"].GetError().Success())
+
+        # SSVE and SME modes should be enabled (reflecting PSTATE.SM and PSTATE.ZA)
+        svcr = frame.register["svcr"]
+        self.assertEqual(svcr.GetValueAsUnsigned(), 3)
+
+        svl_reg = frame.register["svl"]
+        svl = svl_reg.GetValueAsUnsigned()
+
+        z0 = frame.register["z0"]
+        self.assertEqual(z0.GetNumChildren(), svl)
+        self.assertEqual(z0.GetChildAtIndex(0).GetValueAsUnsigned(), 0x1)
+        self.assertEqual(z0.GetChildAtIndex(svl - 1).GetValueAsUnsigned(), 0x1)
+
+        z31 = frame.register["z31"]
+        self.assertEqual(z31.GetNumChildren(), svl)
+        self.assertEqual(z31.GetChildAtIndex(0).GetValueAsUnsigned(), 32)
+        self.assertEqual(z31.GetChildAtIndex(svl - 1).GetValueAsUnsigned(), 32)
+
+        p0 = frame.register["p0"]
+        self.assertEqual(p0.GetNumChildren(), svl / 8)
+        self.assertEqual(p0.GetChildAtIndex(0).GetValueAsUnsigned(), 0xFF)
+        self.assertEqual(
+            p0.GetChildAtIndex(p0.GetNumChildren() - 1).GetValueAsUnsigned(), 0xFF
+        )
+
+        p15 = frame.register["p15"]
+        self.assertEqual(p15.GetNumChildren(), svl / 8)
+        self.assertEqual(p15.GetChildAtIndex(0).GetValueAsUnsigned(), 0xFF)
+        self.assertEqual(
+            p15.GetChildAtIndex(p15.GetNumChildren() - 1).GetValueAsUnsigned(), 0xFF
+        )
+
+        za = frame.register["za"]
+        self.assertEqual(za.GetNumChildren(), (svl * svl))
+        za_0 = za.GetChildAtIndex(0)
+        self.assertEqual(za_0.GetValueAsUnsigned(), 4)
+        za_final = za.GetChildAtIndex(za.GetNumChildren() - 1)
+        self.assertEqual(za_final.GetValueAsUnsigned(), 67)
+
+        zt0 = frame.register["zt0"]
+        self.assertEqual(zt0.GetNumChildren(), 64)
+        zt0_0 = zt0.GetChildAtIndex(0)
+        self.assertEqual(zt0_0.GetValueAsUnsigned(), 0)
+        zt0_final = zt0.GetChildAtIndex(63)
+        self.assertEqual(zt0_final.GetValueAsUnsigned(), 63)
+
+        # Modify all of the registers, instruction step, confirm that the
+        # registers have the new values.  Without the instruction step, it's
+        # possible debugserver or lldb could lie about the write succeeding.
+
+        z0_old_values = []
+        z0_new_values = []
+        z0_new_str = '"{'
+        for i in range(svl):
+            z0_old_values.append(z0.GetChildAtIndex(i).GetValueAsUnsigned())
+            z0_new_values.append(z0_old_values[i] + 5)
+            z0_new_str = z0_new_str + ("0x%02x " % z0_new_values[i])
+        z0_new_str = z0_new_str + '}"'
+        self.runCmd("reg write z0 %s" % z0_new_str)
+
+        z31_old_values = []
+        z31_new_values = []
+        z31_new_str = '"{'
+        for i in range(svl):
+            z31_old_values.append(z31.GetChildAtIndex(i).GetValueAsUnsigned())
+            z31_new_values.append(z31_old_values[i] + 3)
+            z31_new_str = z31_new_str + ("0x%02x " % z31_new_values[i])
+        z31_new_str = z31_new_str + '}"'
+        self.runCmd("reg write z31 %s" % z31_new_str)
+
+        p0_old_values = []
+        p0_new_values = []
+        p0_new_str = '"{'
+        for i in range(int(svl / 8)):
+            p0_old_values.append(p0.GetChildAtIndex(i).GetValueAsUnsigned())
+            p0_new_values.append(p0_old_values[i] - 5)
+            p0_new_str = p0_new_str + ("0x%02x " % p0_new_values[i])
+        p0_new_str = p0_new_str + '}"'
+        self.runCmd("reg write p0 %s" % p0_new_str)
+
+        p15_old_values = []
+        p15_new_values = []
+        p15_new_str = '"{'
+        for i in range(int(svl / 8)):
+            p15_old_values.append(p15.GetChildAtIndex(i).GetValueAsUnsigned())
+            p15_new_values.append(p15_old_values[i] - 8)
+            p15_new_str = p15_new_str + ("0x%02x " % p15_new_values[i])
+        p15_new_str = p15_new_str + '}"'
+        self.runCmd("reg write p15 %s" % p15_new_str)
+
+        za_old_values = []
+        za_new_values = []
+        za_new_str = '"{'
+        for i in range(svl * svl):
+            za_old_values.append(za.GetChildAtIndex(i).GetValueAsUnsigned())
+            za_new_values.append(za_old_values[i] + 7)
+            za_new_str = za_new_str + ("0x%02x " % za_new_values[i])
+        za_new_str = za_new_str + '}"'
+        self.runCmd("reg write za %s" % za_new_str)
+
+        zt0_old_values = []
+        zt0_new_values = []
+        zt0_new_str = '"{'
+        for i in range(64):
+            zt0_old_values.append(zt0.GetChildAtIndex(i).GetValueAsUnsigned())
+            zt0_new_values.append(zt0_old_values[i] + 2)
+            zt0_new_str = zt0_new_str + ("0x%02x " % zt0_new_values[i])
+        zt0_new_str = zt0_new_str + '}"'
+        self.runCmd("reg write zt0 %s" % zt0_new_str)
+
+        thread.StepInstruction(False)
+        frame = thread.GetFrameAtIndex(0)
+
+        if self.TraceOn():
+            self.runCmd("reg read -a")
+
+        z0 = frame.register["z0"]
+        for i in range(z0.GetNumChildren()):
+            self.assertEqual(
+                z0_new_values[i], z0.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        z31 = frame.register["z31"]
+        for i in range(z31.GetNumChildren()):
+            self.assertEqual(
+                z31_new_values[i], z31.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        p0 = frame.register["p0"]
+        for i in range(p0.GetNumChildren()):
+            self.assertEqual(
+                p0_new_values[i], p0.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        p15 = frame.register["p15"]
+        for i in range(p15.GetNumChildren()):
+            self.assertEqual(
+                p15_new_values[i], p15.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        za = frame.register["za"]
+        for i in range(za.GetNumChildren()):
+            self.assertEqual(
+                za_new_values[i], za.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        zt0 = frame.register["zt0"]
+        for i in range(zt0.GetNumChildren()):
+            self.assertEqual(
+                zt0_new_values[i], zt0.GetChildAtIndex(i).GetValueAsUnsigned()
+            )
+
+        process.Continue()
+        frame = thread.GetFrameAtIndex(0)
+        self.assertEqual(thread.GetStopReason(), lldb.eStopReasonBreakpoint)
+
+        self.assertTrue(frame.register["svl"].GetError().Fail())
+        self.assertTrue(frame.register["z0"].GetError().Fail())
+        self.assertTrue(frame.register["p0"].GetError().Fail())
+        self.assertTrue(frame.register["za"].GetError().Fail())
+        self.assertTrue(frame.register["zt0"].GetError().Fail())
@@ -0,0 +1,113 @@
+///  BUILT with
+///    xcrun -sdk macosx.internal clang -mcpu=apple-m4 -g sme.c -o sme
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/// Load a known pattern into every SVE predicate and vector register.
+///
+/// p0..p15 repeat the sequence ptrue .b, .h, .s, .d, pfalse.  Each zN is
+/// then filled, under zeroing predication by one of the all-true byte
+/// predicates (p0, p5, p10, p15), with the byte value N + 1.
+void write_sve_regs() {
+  // Predicate registers.
+  asm volatile("ptrue p0.b\n\t"
+               "ptrue p1.h\n\t"
+               "ptrue p2.s\n\t"
+               "ptrue p3.d\n\t"
+               "pfalse p4.b\n\t"
+               "ptrue p5.b\n\t"
+               "ptrue p6.h\n\t"
+               "ptrue p7.s\n\t"
+               "ptrue p8.d\n\t"
+               "pfalse p9.b\n\t"
+               "ptrue p10.b\n\t"
+               "ptrue p11.h\n\t"
+               "ptrue p12.s\n\t"
+               "ptrue p13.d\n\t"
+               "pfalse p14.b\n\t"
+               "ptrue p15.b\n\t");
+
+  // Vector registers: zN <- N + 1 in every byte lane.
+  asm volatile("cpy  z0.b, p0/z, #1\n\t"
+               "cpy  z1.b, p5/z, #2\n\t"
+               "cpy  z2.b, p10/z, #3\n\t"
+               "cpy  z3.b, p15/z, #4\n\t"
+               "cpy  z4.b, p0/z, #5\n\t"
+               "cpy  z5.b, p5/z, #6\n\t"
+               "cpy  z6.b, p10/z, #7\n\t"
+               "cpy  z7.b, p15/z, #8\n\t"
+               "cpy  z8.b, p0/z, #9\n\t"
+               "cpy  z9.b, p5/z, #10\n\t"
+               "cpy  z10.b, p10/z, #11\n\t"
+               "cpy  z11.b, p15/z, #12\n\t"
+               "cpy  z12.b, p0/z, #13\n\t"
+               "cpy  z13.b, p5/z, #14\n\t"
+               "cpy  z14.b, p10/z, #15\n\t"
+               "cpy  z15.b, p15/z, #16\n\t"
+               "cpy  z16.b, p0/z, #17\n\t"
+               "cpy  z17.b, p5/z, #18\n\t"
+               "cpy  z18.b, p10/z, #19\n\t"
+               "cpy  z19.b, p15/z, #20\n\t"
+               "cpy  z20.b, p0/z, #21\n\t"
+               "cpy  z21.b, p5/z, #22\n\t"
+               "cpy  z22.b, p10/z, #23\n\t"
+               "cpy  z23.b, p15/z, #24\n\t"
+               "cpy  z24.b, p0/z, #25\n\t"
+               "cpy  z25.b, p5/z, #26\n\t"
+               "cpy  z26.b, p10/z, #27\n\t"
+               "cpy  z27.b, p15/z, #28\n\t"
+               "cpy  z28.b, p0/z, #29\n\t"
+               "cpy  z29.b, p5/z, #30\n\t"
+               "cpy  z30.b, p10/z, #31\n\t"
+               "cpy  z31.b, p15/z, #32\n\t");
+}
+
+#define MAX_VL_BYTES 256
+/// Fill the first \p svl rows of ZA, one row at a time.
+///
+/// Every byte of row i is set to (uint8_t)(i + value_offset).
+///
+/// \param svl          Number of ZA rows to write; the caller passes the
+///                     streaming vector length in bytes.
+/// \param value_offset Added to the row index to form that row's fill byte.
+void set_za_register(int svl, int value_offset) {
+  uint8_t data[MAX_VL_BYTES];
+
+  // ldr za will actually wrap the selected vector row, by the number of rows
+  // you have. So setting one that didn't exist would actually set one that did.
+  // That's why we need the streaming vector length here.
+  for (int i = 0; i < svl; ++i) {
+    // This may involve instructions that require the smefa64 extension.
+    for (int j = 0; j < MAX_VL_BYTES; j++)
+      data[j] = i + value_offset;
+    // Each one of these loads a VL sized row of ZA.  The asm reads the
+    // buffer through a pointer operand, so a "memory" clobber is needed to
+    // keep the compiler from dropping or reordering the stores above.
+    asm volatile("mov w12, %w0\n\t"
+                 "ldr za[w12, 0], [%1]\n\t" ::"r"(i),
+                 "r"(data)
+                 : "w12", "memory");
+  }
+}
+
+/// Return the value produced by `rdsvl <reg>, #1` (the streaming vector
+/// length in bytes), truncated to 16 bits.
+static uint16_t arm_sme_svl_b(void) {
+  uint64_t svl_bytes = 0;
+  asm volatile("rdsvl  %[ret], #1" : [ret] "=r"(svl_bytes));
+  return (uint16_t)svl_bytes;
+}
+
+/// Load ZT0 with the 64-byte sequence 0, 1, ..., 63.
+void arm_sme2_set_zt0() {
+// ZT0 is 512 bits wide.  The macro is local to this function.
+#define ZT0_LEN (512 / 8)
+  uint8_t data[ZT0_LEN];
+  for (unsigned i = 0; i < ZT0_LEN; ++i)
+    data[i] = i + 0;
+
+  // The asm reads the buffer through a pointer operand; the "memory"
+  // clobber keeps the compiler from dropping or reordering the stores above.
+  asm volatile("ldr zt0, [%0]" ::"r"(data) : "memory");
+#undef ZT0_LEN
+}
+
+/// Test inferior entry point.
+///
+/// Runs smstart, loads known values into the Z/P registers, ZA and ZT0,
+/// executes a few filler statements, then runs smstop.  The trailing
+/// comments on three statements below are the markers the debugger test
+/// sets its source-regex breakpoints on; keep each one on its statement and
+/// do not repeat the marker text anywhere else in this file.
+int main() {
+  printf("Enable SME mode\n"); // break before sme
+
+  asm volatile("smstart");
+
+  write_sve_regs();
+
+  // Row i of ZA is filled with the byte value i + 4.
+  set_za_register(arm_sme_svl_b(), 4);
+
+  arm_sme2_set_zt0();
+
+  // Filler statements: they give the debugger a place to stop and to
+  // instruction-step between smstart and smstop.
+  int c = 10; // break while sme
+  c += 5;
+  c += 5;
+
+  asm volatile("smstop");
+
+  printf("SME mode disabled\n"); // break after sme
+}
@@ -312,16 +312,21 @@ struct DNBRegisterValue {
     uint64_t uint64;
     float float32;
     double float64;
-    int8_t v_sint8[64];
-    int16_t v_sint16[32];
-    int32_t v_sint32[16];
-    int64_t v_sint64[8];
-    uint8_t v_uint8[64];
-    uint16_t v_uint16[32];
-    uint32_t v_uint32[16];
-    uint64_t v_uint64[8];
-    float v_float32[16];
-    double v_float64[8];
+    // AArch64 SME's ZA register max size is 64k, this object must be
+    // large enough to hold that much data.  The current Apple cores
+    // have a much smaller maximum ZA reg size, but there are not
+    // multiple copies of this object so increase the static size to
+    // maximum possible.
+    int8_t v_sint8[65536];
+    int16_t v_sint16[32768];
+    int32_t v_sint32[16384];
+    int64_t v_sint64[8192];
+    uint8_t v_uint8[65536];
+    uint16_t v_uint16[32768];
+    uint32_t v_uint32[16384];
+    uint64_t v_uint64[8192];
+    float v_float32[16384];
+    double v_float64[8192];
     void *pointer;
     char *c_str;
   } value;