llvm
diff --git a/‎.ci/generate_test_report_github.py‎
Lines changed: 10 additions & 5 deletions b/‎.ci/generate_test_report_github.py‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎.ci/utils.sh‎
Lines changed: 2 additions & 1 deletion b/‎.ci/utils.sh‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/build-ci-container-tooling.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/build-ci-container-tooling.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/build-ci-container.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/build-ci-container.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/premerge.yaml‎
Lines changed: 41 additions & 8 deletions b/‎.github/workflows/premerge.yaml‎
Lines changed: 41 additions & 8 deletions
diff --git a/‎.github/workflows/release-asset-audit.py‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/release-asset-audit.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎bolt/docs/PacRetDesign.md‎
Lines changed: 228 additions & 0 deletions b/‎bolt/docs/PacRetDesign.md‎
Lines changed: 228 additions & 0 deletions
@@ -8,10 +8,15 @@
 
 import generate_test_report_lib
 
-PLATFORM_TITLES = {
-    "Windows": ":window: Windows x64 Test Results",
-    "Linux": ":penguin: Linux x64 Test Results",
-}
+def compute_platform_title() -> str:
+    logo = ":window:" if platform.system() == "Windows" else ":penguin:"
+    # On Linux the machine value is x86_64; on Windows it is AMD64.
+    if platform.machine() == "x86_64" or platform.machine() == "AMD64":
+        arch = "x64"
+    else:
+        arch = platform.machine()
+    return f"{logo} {platform.system()} {arch} Test Results"
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -22,7 +27,7 @@
     args = parser.parse_args()
 
     report = generate_test_report_lib.generate_report_from_files(
-        PLATFORM_TITLES[platform.system()], args.return_code, args.build_test_logs
+        compute_platform_title(), args.return_code, args.build_test_logs
     )
 
     print(report)
@@ -56,6 +56,7 @@ function start-group {
 export PIP_BREAK_SYSTEM_PACKAGES=1
 pip install -q -r "${MONOREPO_ROOT}"/.ci/all_requirements.txt
 
-if [[ "$GITHUB_ACTIONS" != "" ]]; then
+# The ARM64 builders run on AWS and don't have access to the GCS cache.
+if [[ "$GITHUB_ACTIONS" != "" ]] && [[ "$RUNNER_ARCH" != "ARM64" ]]; then
   python .ci/cache_lit_timing_files.py download
 fi
@@ -1,4 +1,4 @@
-name: Build CI Container
+name: Build CI Tooling Containers
 
 permissions:
   contents: read
@@ -101,7 +101,7 @@ jobs:
           }
 
           podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
-          for f in $(find . -iname *.tar); do
+          for f in $(find . -iname '*.tar'); do
             image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
             push_container $image_name
 
 
@@ -103,7 +103,7 @@ jobs:
           }
 
           podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
-          for f in $(find . -iname *.tar); do
+          for f in $(find . -iname '*.tar'); do
             image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
             push_container $image_name
 
 
@@ -24,17 +24,45 @@ concurrency:
 
 jobs:
   premerge-checks-linux:
-    name: Build and Test Linux
+    name: Build and Test Linux${{ (startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') && ' AArch64') || '' }}
     if: >-
         github.repository_owner == 'llvm' &&
         (github.event_name != 'pull_request' || github.event.action != 'closed')
-    runs-on: llvm-premerge-linux-runners
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on:
+          - depot-ubuntu-24.04-arm-16
+          - llvm-premerge-linux-runners
+    runs-on: ${{ matrix.runs-on }}
+    container:
+      # The llvm-premerge agents are already containers running this
+      # same image, so we can't use a container for the github action
+      # job.  The depot containers are running on VMs, so we can use a
+      # container.  This helps ensure the build environment is as close
+      # as possible on both the depot runners and the llvm-premerge runners.
+      image: ${{ (startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') && format('ghcr.io/{0}/arm64v8/ci-ubuntu-24.04',github.repository_owner) ) || null }}
+      # --privileged is needed to run the lldb tests that disable aslr.
+      # The SCCACHE environment variables need to be copied from the host
+      # to the container to make sure it is configured correctly to use the
+      # depot cache.
+      options: >-
+         --privileged
+         --env SCCACHE_WEBDAV_ENDPOINT
+         --env SCCACHE_WEBDAV_TOKEN
+    defaults:
+      run:
+        # The run step defaults to using sh as the shell when running in a
+        # container, so make bash the default to ensure consistency between
+        # container and non-container jobs.
+        shell: bash
     steps:
       - name: Checkout LLVM
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           fetch-depth: 2
       - name: Build and Test
+        continue-on-error: ${{ runner.arch == 'ARM64' }}
         run: |
           git config --global --add safe.directory '*'
 
@@ -54,11 +82,16 @@ jobs:
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++
 
-          # This environment variable is passes into the container through the
-          # runner pod definition. This differs between our two clusters which
-          # why we do not hardcode it.
-          export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET
-          export SCCACHE_GCS_RW_MODE=READ_WRITE
+          # The linux-premerge runners are hosted on GCP and have a different
+          # cache setup than the depot runners.
+          if [[ "${{ matrix.runs-on }}" = "llvm-premerge-linux-runners" ]]; then
+            # This environment variable is passed into the container through the
+            # runner pod definition. This differs between our two clusters, which
+            # is why we do not hardcode it.
+            export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET
+            export SCCACHE_GCS_RW_MODE=READ_WRITE
+          fi
+          env
 
           # Set the idle timeout to zero to ensure sccache runs for the
           # entire duration of the job. Otherwise it might stop if we run
@@ -78,7 +111,7 @@ jobs:
         if: '!cancelled()'
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
-          name: Premerge Artifacts (Linux)
+          name: Premerge Artifacts (Linux ${{ runner.arch }})
           path: artifacts/
           retention-days: 5
           include-hidden-files: 'true'
 
@@ -54,6 +54,8 @@ def _get_uploaders(release_version):
                 "tru",
                 "tstellar",
                 "github-actions[bot]",
+                "c-rhodes",
+                "dyung",
             ]
         )
 
 
@@ -0,0 +1,228 @@
+# Optimizing binaries with pac-ret hardening
+
+This is a design document about processing the `DW_CFA_AARCH64_negate_ra_state`
+DWARF instruction in BOLT. As it describes internal design decisions, the
+intended audience is BOLT developers. The document is an updated version of the
+[RFC posted on the LLVM Discourse](https://discourse.llvm.org/t/rfc-bolt-aarch64-handle-opnegaterastate-to-enable-optimizing-binaries-with-pac-ret-hardening/86594).
+
+
+`DW_CFA_AARCH64_negate_ra_state` is also referred to as  `.cfi_negate_ra_state`
+in assembly, or `OpNegateRAState` in BOLT sources. In this document, I will use
+**negate-ra-state** as a shorthand.
+
+## Introduction
+
+### Pointer Authentication
+
+For more information, see the [pac-ret section of the BOLT-binary-analysis document](BinaryAnalysis.md#pac-ret-analysis).
+
+### DW_CFA_AARCH64_negate_ra_state
+
+The negate-ra-state CFI is a vendor-specific Call Frame Instruction defined in
+the [Arm ABI](https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst#id1).
+
+```
+The DW_CFA_AARCH64_negate_ra_state operation negates bit[0] of the RA_SIGN_STATE pseudo-register.
+```
+
+This bit indicates to the unwinder whether the current return address is signed
+or not (hence the name). The unwinder uses this information to authenticate the
+pointer, and remove the Pointer Authentication Code (PAC) bits.
+Incorrect placement of negate-ra-state CFIs causes the unwinder to either attempt
+to authenticate an unsigned pointer (resulting in a segmentation fault), or skip
+authentication on a signed pointer, which can also cause a fault.
+
+Note: some unwinders use the `xpac` instruction to strip the PAC bits without
+authenticating the pointer. This is an incorrect (incomplete) implementation,
+as it allows control-flow modification in the case of unwinding.
+
+There are no DWARF instructions to directly set or clear the RA State. However,
+two other CFIs can also affect the RA state:
+- `DW_CFA_remember_state`: this CFI stores register rules onto an implicit stack.
+- `DW_CFA_restore_state`:  this CFI pops rules from this stack.
+
+Example:
+
+| CFI                            | Effect on RA state             |
+| ------------------------------ | ------------------------------ |
+| (default)                      | 0                              |
+| DW_CFA_AARCH64_negate_ra_state | 0 -> 1                         |
+| DW_CFA_remember_state          | 1 pushed to the stack          |
+| DW_CFA_AARCH64_negate_ra_state | 1 -> 0                         |
+| DW_CFA_restore_state           | 0 -> 1 (popped from the stack) |
+
+The Arm ABI also defines the DW_CFA_AARCH64_negate_ra_state_with_pc CFI, but it
+is not widely used, and is [likely to become deprecated](https://github.com/ARM-software/abi-aa/issues/327).
+
+### Where are these CFIs needed?
+
+Whenever two consecutive instructions have different RA states, the unwinder must
+be informed of the change. This typically occurs during pointer signing or
+authentication. If adjacent instructions differ in RA state but neither signs
+nor authenticates the return address, they must belong to different control flow
+paths. One is part of an execution path with signed RA, the other is part of a
+path with an unsigned RA.
+
+In the example below, the first BasicBlock ends in a conditional branch, and
+jumps to two different BasicBlocks, each with their own authentication, and
+return. The instructions on the border of the second and third BasicBlock have
+different RA states. The `ret` at the end of the second BasicBlock is in unsigned
+state. The start of the third BasicBlock is after the `paciasp` in the control
+flow, but before the authentication. In this case, a negate-ra-state is needed
+at the end of the second BasicBlock.
+
+```
+        +----------------+
+        |     paciasp    |
+        |                |
+        |      b.cc      |
+        +--------+-------+
+                 |
++----------------+
+|                |
+|       +--------v-------+
+|       |                |
+|       |    autiasp     |
+|       |      ret       |   // RA: unsigned
+|       +----------------+
++----------------+
+                 |
+        +--------v-------+  // RA: signed
+        |                |
+        |     autiasp    |
+        |      ret       |
+        +----------------+
+```
+
+> [!important]
+> The unwinder does not follow the control flow graph. It reads unwind
+> information in the layout order.
+
+Because these locations are dependent on how the function layout looks,
+negate-ra-state CFIs will become invalid during BasicBlock reordering.
+
+## Solution design
+
+The implementation introduces two new passes:
+1. `MarkRAStatesPass`: assigns the RA state to each instruction based on the CFIs
+    in the input binary
+2. `InsertNegateRAStatePass`: reads those assigned instruction RA states after
+    optimizations, and emits `DW_CFA_AARCH64_negate_ra_state` CFIs at the correct
+    places: wherever there is a state change between two consecutive instructions
+    in the layout order.
+
+To track metadata on individual instructions, the `MCAnnotation` class was
+extended. These also have helper functions in `MCPlusBuilder`.
+
+### Saving annotations at CFI reading
+
+CFIs are read and added to BinaryFunctions in `CFIReaderWriter::FillCFIInfoFor`.
+At this point, we add MCAnnotations about negate-ra-state, remember-state and
+restore-state CFIs to the instructions they refer to. This is to not interfere
+with the CFI processing that already happens in BOLT (e.g. remember-state and
+restore-state CFIs are removed in `normalizeCFIState` for reasons unrelated to PAC).
+
+As we add the MCAnnotations *to instructions*, we have to account for the case
+where the function starts with a CFI altering the RA state. As CFIs modify the RA
+state of the instructions before them, we cannot add the annotation to the first
+instruction.
+This special case is handled by adding an `initialRAState` bool to each BinaryFunction.
+If the `Offset` the CFI refers to is zero, we don't store an annotation, but set
+the `initialRAState` in `FillCFIInfoFor`. This information is then used in
+`MarkRAStates`.
+
+### Binaries without DWARF info
+
+In some cases, the DWARF tables are stripped from the binary. These programs
+usually have some other unwind-mechanism.
+These passes only run on functions that include at least one negate-ra-state CFI.
+This avoids processing functions that do not use Pointer Authentication, or
+functions that use Pointer Authentication, but do not have DWARF info.
+
+In summary:
+- pointer auth is not used: no change, the new passes do not run.
+- pointer auth is used, but DWARF info is stripped: no change, the new passes
+  do not run.
+- pointer auth is used, and we have DWARF CFIs: passes run, and rewrite the
+  negate-ra-state CFI.
+
+### MarkRAStates pass
+
+This pass runs before optimizations reorder anything.
+
+It processes MCAnnotations generated during the CFI reading stage to check if
+instructions have any of the three CFIs that can modify RA state:
+- negate-ra-state,
+- remember-state,
+- restore-state.
+
+Then it adds new MCAnnotations to each instruction, indicating their RA state.
+Those annotations are:
+- Signed,
+- Unsigned.
+
+Below is a simple example, that shows the two different types of annotations:
+what we have before the pass, and after it.
+
+| Instruction                   | Before          |  After   |
+| ----------------------------- | --------------- | -------- |
+| paciasp                       | negate-ra-state | unsigned |
+| stp x29, x30, [sp, #-0x10]!   |                 | signed   |
+| mov x29, sp                   |                 | signed   |
+| ldp x29, x30, [sp], #0x10     |                 | signed   |
+| autiasp                       | negate-ra-state | signed   |
+| ret                           |                 | unsigned |
+
+##### Error handling in MarkRAStates pass:
+
+Whenever the MarkRAStates pass finds inconsistencies in the current
+BinaryFunction, it marks the function as ignored using `BF.setIgnored()`. BOLT
+will not optimize this function but will emit it unchanged in the original section
+(`.bolt.org.text`).
+
+The inconsistencies are as follows:
+- finding a `pac*` instruction when already in signed state
+- finding an `aut*` instruction when already in unsigned state
+- finding `pac*` and `aut*` instructions without `.cfi_negate_ra_state`.
+
+Users will be informed about the number of ignored functions in the pass, the
+exact functions ignored, and the found inconsistency.
+
+### InsertNegateRAStatePass
+
+This pass runs after optimizations. It performs the _inverse_ of the MarkRAStates pass:
+1. it reads the RA state annotations attached to the instructions, and
+2. whenever the state changes, it adds a PseudoInstruction that holds an
+   OpNegateRAState CFI.
+
+##### Covering newly generated instructions:
+
+Some BOLT passes can add new Instructions. In InsertNegateRAStatePass, we have
+to know what RA state these have.
+
+The current solution has the `inferUnknownStates` function to cover these, using
+a fairly simple strategy: unknown states inherit the last known state.
+
+This will be updated to a more robust solution.
+
+> [!important]
+> As issue #160989 describes, unwind info is incorrect in stubs with multiple callers.
+> For this same reason, we cannot generate correct pac-specific unwind info: the signedness
+> of the _incorrect_ return address is meaningless.
+
+### Optimizations requiring special attention
+
+Marking states before optimizations ensures that instructions can be moved around
+freely. The only special case is function splitting. When a function is split,
+the split part becomes a new function in the emitted binary. For unwinding to
+work, it needs to "replay" all CFIs that lead up to the split point. BOLT does
+this for other CFIs. As negate-ra-state is not read (only stored as an Annotation),
+we have to do this manually in InsertNegateRAStatePass. Here, if the split part
+starts with an instruction that has Signed RA state, we add a negate-ra-state CFI
+to indicate this.
+
+## Option to disallow the feature
+
+The feature can be guarded with the `--update-branch-protection` flag, which is
+on by default. If the flag is set to false, and a function
+`containedNegateRAState()` after `FillCFIInfoFor()`, BOLT exits with an error.
Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ jobs:`
`103`	`103`	`}`
`104`	`104`
`105`	`105`	`podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io`
`106`		`- for f in $(find . -iname *.tar); do`
	`106`	`+ for f in $(find . -iname '*.tar'); do`
`107`	`107`	`image_name=$(podman load -q -i $f \| sed 's/Loaded image: //g')`
`108`	`108`	`push_container $image_name`
`109`	`109`
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,8 @@ def _get_uploaders(release_version):`
`54`	`54`	`"tru",`
`55`	`55`	`"tstellar",`
`56`	`56`	`"github-actions[bot]",`
	`57`	`+ "c-rhodes",`
	`58`	`+ "dyung",`
`57`	`59`	`]`
`58`	`60`	`)`
`59`	`61`