Skip to content

Conversation

@cofibrant
Copy link
Contributor

This patch adds several tests identifying potential opportunities for eliminating dead stores and redundant loads when using the llvm.matrix.column.major.store.* and llvm.matrix.column.major.load.* intrinsics.

CC @fhahn

@llvmbot llvmbot added llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms labels Oct 15, 2025
@llvmbot
Copy link
Member

llvmbot commented Oct 15, 2025

@llvm/pr-subscribers-llvm-analysis

@llvm/pr-subscribers-llvm-transforms

Author: Nathan Corbyn (cofibrant)

Changes

This patch adds several tests identifying potential opportunities for eliminating dead stores and redundant loads when using the llvm.matrix.column.major.store.* and llvm.matrix.column.major.load.* intrinsics.

CC @fhahn


Full diff: https://github.com/llvm/llvm-project/pull/163573.diff

3 Files Affected:

  • (added) llvm/test/Analysis/BasicAA/matrix-intrinsics.ll (+81)
  • (added) llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll (+53)
  • (added) llvm/test/Transforms/GVN/matrix-intrinsics.ll (+45)
diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
new file mode 100644
index 0000000000000..71647f139725e
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; BasicAA should prove that loads from sufficiently large static offsets
+; don't overlap with matrix loads with a statically known size.
+
+define <8 x double> @non_overlapping_strided_load(ptr %src) {
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 16 ; 16 = stride (8) * columns (2): first element past the store's strided extent
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) ; 4x2 store at %src; columns start at elements 0 and 8
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+; CHECK-LABEL: Function: non_overlapping_strided_load:
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Ref: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+
+define <8 x double> @non_overlapping_strided_load_i128(ptr %src) { ; i128-stride variant of non_overlapping_strided_load
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i128 u0x200000000 ; offset = 2 * stride (u0x100000000): first element past the store's strided extent
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 u0x100000000, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i128 u0x100000000, i1 false, i32 4, i32 2) ; 4x2 store at %src; columns start at elements 0 and u0x100000000
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 u0x100000000, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+; CHECK-LABEL: Function: non_overlapping_strided_load_i128
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; CHECK: Just Ref: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2)
+
+define <8 x double> @overlapping_strided_load(ptr %src) { ; counterpart of non_overlapping_strided_load with the offset reduced by one element
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i32 15 ; 15 < stride (8) * columns (2): starts inside the store's strided extent [0, 16)
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) ; 4x2 store at %src; columns start at elements 0 and 8
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+; CHECK-LABEL: Function: overlapping_strided_load:
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: NoModRef: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Ref: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+
+define <8 x double> @overlapping_strided_load_i128(ptr %src) { ; i128-stride variant of overlapping_strided_load
+entry:
+  %src.offset = getelementptr inbounds double, ptr %src, i128 u0x100000000 ; offset == stride, so the loads' first column starts where the store's second column starts
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 u0x100000000, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i128 u0x100000000, i1 false, i32 4, i32 2) ; 4x2 store at %src; columns start at elements 0 and u0x100000000
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 u0x100000000, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+; Function: overlapping_strided_load_i128
+; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2)
+; NoModRef: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2) <-> %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; NoModRef: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2)
+; Just Ref: %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr %src.offset, i128 4294967296, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i128(<8 x double> %l, ptr %src, i128 4294967296, i1 false, i32 4, i32 2)
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i128(ptr, i128, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i128(<8 x double>, ptr, i128, i1, i32, i32)
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
new file mode 100644
index 0000000000000..7199006ccbeb0
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=dse -S < %s | FileCheck %s
+
+define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) { ; DSE candidate: the zero store to %dst is overwritten before %dst is read
+; CHECK-LABEL: define void @dead_unstrided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; the assertions above record that -passes=dse currently leaves all three calls in place
+  call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; stride == rows (4), so the two columns are contiguous
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) ; reads only %src, which is noalias with %dst
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2) ; same pointer, stride and shape as the first store
+  ret void
+}
+
+define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) { ; DSE candidate: strided variant of the overwritten zero store
+; CHECK-LABEL: define void @dead_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; the assertions above record that -passes=dse currently leaves all three calls in place
+  call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2) ; stride 100 > rows 4, so the two columns are separated by a gap
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2) ; reads only %src (noalias with %dst), with a different stride (200)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2) ; same pointer, stride and shape as the first store
+  ret void
+}
+
+define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) { ; DSE candidate: both stores use the same runtime stride
+; CHECK-LABEL: define void @dead_dynamically_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT:    ret void
+;
+entry: ; the assertions above record that -passes=dse currently leaves all three calls in place
+  call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) ; stride is the function argument, not a constant
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2) ; reads only %src, which is noalias with %dst
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2) ; same pointer, same %stride value and shape as the first store
+  ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
new file mode 100644
index 0000000000000..7fd2855e11868
--- /dev/null
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=gvn -S < %s | FileCheck %s
+
+define <8 x double> @redundant_unstrided_load(ptr %src) { ; GVN candidate: %l.2 reloads the address %l read, with a store to %src in between
+; CHECK-LABEL: define <8 x double> @redundant_unstrided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[S:%.*]] = fadd <8 x double> [[L]], [[L_2]]
+; CHECK-NEXT:    ret <8 x double> [[S]]
+;
+entry: ; the assertions above record that -passes=gvn currently keeps both loads
+  %src.offset = getelementptr inbounds double, ptr %src, i32 8 ; 8 = stride (4) * columns (2): first element past the contiguous 4x2 store at %src
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2) ; stride == rows (4), so the store covers elements 0..7
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+define <8 x double> @redundant_strided_load(ptr %src) { ; GVN candidate: strided variant of redundant_unstrided_load
+; CHECK-LABEL: define <8 x double> @redundant_strided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT:    [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT:    [[S:%.*]] = fadd <8 x double> [[L]], [[L_2]]
+; CHECK-NEXT:    ret <8 x double> [[S]]
+;
+entry: ; the assertions above record that -passes=gvn currently keeps both loads
+  %src.offset = getelementptr inbounds double, ptr %src, i32 16 ; 16 = stride (8) * columns (2): first element past the store's strided extent
+  %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+  call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) ; 4x2 store at %src; columns start at elements 0 and 8
+  %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) ; same address, stride and shape as %l
+  %s = fadd <8 x double> %l, %l.2
+  ret <8 x double> %s
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)

@cofibrant cofibrant force-pushed the cofibrant/memory-location-matrix-load-store-tests branch from 0d89928 to d4db855 Compare October 15, 2025 15:40
@cofibrant cofibrant force-pushed the cofibrant/memory-location-matrix-load-store-tests branch from d4db855 to a85cc0a Compare October 15, 2025 16:55
@cofibrant
Copy link
Contributor Author

Currently blocked on a bug in the IR verifier

%s = fadd <8 x double> %l, %l.2
ret <8 x double> %s
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for DSE, we probably also want tests that cover the interaction between matrix intrinsics and normal loads/stores:

  • normal store is dead because the last read was from a matrix intrinsic that happened before the store
  • normal store is not dead because the last read by a matrix intrinsic happens after the store
  • matrix intrinsic store is dead because the last read was from a normal load that happened before the store
  • matrix intrinsic store is not dead because the last read from a normal load happens after the store

@cofibrant
Copy link
Contributor Author

Once I've actioned Jon's suggestion, this should be ready for review once #163729 lands

@cofibrant cofibrant force-pushed the cofibrant/memory-location-matrix-load-store-tests branch from a85cc0a to 3611780 Compare October 16, 2025 15:21
@cofibrant
Copy link
Contributor Author

Given the change to the objective of #163729, I've updated this PR so that these two changes are now decoupled

@cofibrant cofibrant force-pushed the cofibrant/memory-location-matrix-load-store-tests branch 2 times, most recently from ed18760 to 1eaff3a Compare October 20, 2025 09:13
@cofibrant cofibrant force-pushed the cofibrant/memory-location-matrix-load-store-tests branch from 1eaff3a to 7de09f4 Compare October 21, 2025 10:04
@cofibrant cofibrant requested a review from jroelofs October 21, 2025 15:04
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=gvn -S %s | FileCheck %s

define <8 x double> @redundant_unstrided_load(ptr %src) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might have missed it, but I think it would also be good to add a few tests for DSE where we have two matrix stores with different numbers of rows and columns (i.e., two stores to the same pointer, one storing more than the other, in both possible orderings; one case should eventually be simplified), and something similar for GVN with loads.

Copy link
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks

@fhahn fhahn merged commit c636a39 into llvm:main Oct 22, 2025
10 checks passed
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Oct 22, 2025
…or matrix store / load intrinsics (#163573)

This patch adds several tests identifying potential opportunities for
eliminating dead stores and redundant loads when using the
`llvm.matrix.column.major.store.*` and `llvm.matrix.column.major.load.*`
intrinsics.

PR: llvm/llvm-project#163573
@llvm-ci
Copy link
Collaborator

llvm-ci commented Oct 22, 2025

LLVM Buildbot has detected a new failure on builder clang-armv8-quick running on linaro-clang-armv8-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/154/builds/23027

Here is the relevant piece of the build log for reference:
Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'Clangd Unit Tests :: ./ClangdTests/244/335' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:/home/tcwg-buildbot/worker/clang-armv8-quick/stage1/tools/clang/tools/extra/clangd/unittests/./ClangdTests-Clangd Unit Tests-3533744-244-335.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=335 GTEST_SHARD_INDEX=244 /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/tools/clang/tools/extra/clangd/unittests/./ClangdTests
--

Note: This is test shard 245 of 335.
[==========] Running 4 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CompletionStringTest
[ RUN      ] CompletionStringTest.Documentation
[       OK ] CompletionStringTest.Documentation (105 ms)
[----------] 1 test from CompletionStringTest (105 ms total)

[----------] 1 test from FuzzyMatch
[ RUN      ] FuzzyMatch.Matches
[       OK ] FuzzyMatch.Matches (119 ms)
[----------] 1 test from FuzzyMatch (119 ms total)

[----------] 1 test from CrossFileRenameTests
[ RUN      ] CrossFileRenameTests.WithUpToDateIndex
ASTWorker building file /clangd-test/foo.h version null with command 
[/clangd-test]
clang -xobjective-c++ /clangd-test/foo.h
Driver produced command: cc1 -cc1 -triple armv8a-unknown-linux-gnueabihf -fsyntax-only -disable-free -clear-ast-before-backend -main-file-name foo.h -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -target-feature +read-tp-tpidruro -target-feature +vfp2 -target-feature +vfp2sp -target-feature +vfp3 -target-feature +vfp3d16 -target-feature +vfp3d16sp -target-feature +vfp3sp -target-feature +fp16 -target-feature +vfp4 -target-feature +vfp4d16 -target-feature +vfp4d16sp -target-feature +vfp4sp -target-feature +fp-armv8 -target-feature +fp-armv8d16 -target-feature +fp-armv8d16sp -target-feature +fp-armv8sp -target-feature -fullfp16 -target-feature +fp64 -target-feature +d32 -target-feature +sha2 -target-feature +aes -target-feature -fp16fml -target-feature +neon -target-abi aapcs-linux -mfloat-abi hard -debugger-tuning=gdb -fdebug-compilation-dir=/clangd-test -fcoverage-compilation-dir=/clangd-test -resource-dir lib/clang/22 -internal-isystem lib/clang/22/include -internal-isystem /usr/local/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -fdeprecated-macro -ferror-limit 19 -fno-signed-char -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fobjc-runtime=gcc -fobjc-encode-cxx-class-template-spec -fobjc-exceptions -fcxx-exceptions -fexceptions -no-round-trip-args -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -x objective-c++ /clangd-test/foo.h
Building first preamble for /clangd-test/foo.h version null
Built preamble of size 421072 for file /clangd-test/foo.h version null in 2.12 seconds
indexed preamble AST for /clangd-test/foo.h version null:
  symbol slab: 0 symbols, 68 bytes
  ref slab: 0 symbols, 0 refs, 72 bytes
  relations slab: 0 relations, 12 bytes
indexed file AST for /clangd-test/foo.h version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 3 symbols, 5 refs, 4232 bytes
  relations slab: 0 relations, 12 bytes
Build dynamic index for main-file symbols with estimated memory usage of 11148 bytes
ASTWorker building file /clangd-test/foo.cc version null with command 
[/clangd-test]
clang -xobjective-c++ /clangd-test/foo.cc
Driver produced command: cc1 -cc1 -triple armv8a-unknown-linux-gnueabihf -fsyntax-only -disable-free -clear-ast-before-backend -main-file-name foo.cc -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -target-cpu generic -target-feature +read-tp-tpidruro -target-feature +vfp2 -target-feature +vfp2sp -target-feature +vfp3 -target-feature +vfp3d16 -target-feature +vfp3d16sp -target-feature +vfp3sp -target-feature +fp16 -target-feature +vfp4 -target-feature +vfp4d16 -target-feature +vfp4d16sp -target-feature +vfp4sp -target-feature +fp-armv8 -target-feature +fp-armv8d16 -target-feature +fp-armv8d16sp -target-feature +fp-armv8sp -target-feature -fullfp16 -target-feature +fp64 -target-feature +d32 -target-feature +sha2 -target-feature +aes -target-feature -fp16fml -target-feature +neon -target-abi aapcs-linux -mfloat-abi hard -debugger-tuning=gdb -fdebug-compilation-dir=/clangd-test -fcoverage-compilation-dir=/clangd-test -resource-dir lib/clang/22 -internal-isystem lib/clang/22/include -internal-isystem /usr/local/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -fdeprecated-macro -ferror-limit 19 -fno-signed-char -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -fobjc-runtime=gcc -fobjc-encode-cxx-class-template-spec -fobjc-exceptions -fcxx-exceptions -fexceptions -no-round-trip-args -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -x objective-c++ /clangd-test/foo.cc
Building first preamble for /clangd-test/foo.cc version null
Built preamble of size 422020 for file /clangd-test/foo.cc version null in 0.22 seconds
indexed preamble AST for /clangd-test/foo.cc version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 0 symbols, 0 refs, 72 bytes
  relations slab: 0 relations, 12 bytes
Build dynamic index for header symbols with estimated memory usage of 6408 bytes
indexed file AST for /clangd-test/foo.cc version null:
  symbol slab: 3 symbols, 4584 bytes
  ref slab: 4 symbols, 9 refs, 4232 bytes
...

@cofibrant cofibrant deleted the cofibrant/memory-location-matrix-load-store-tests branch October 22, 2025 14:47
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

llvm:analysis Includes value tracking, cost tables and constant folding llvm:transforms

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants