From 8711cea80a35d672de2a3b69ec0a4c4207ae5c1b Mon Sep 17 00:00:00 2001 From: "jian.wu" Date: Tue, 23 Sep 2025 12:07:14 +0800 Subject: [PATCH 1/3] [DAGCombiner] Preserve debug location of original load in fold (conv (load x)) This patch fixes a debug information loss issue during the combine of a conversion (e.g., bitcast) with a load into a new load: `fold (conv (load x)) -> (load (conv*)x)`. The newly created load node was incorrectly using the debug location (`SDLoc`) of the conversion operation (the `conv` node, `N`) instead of the location of the original load operation (the `load` node, `LN0`). The location of the conversion operation often points to compiler-internal instructions and provides little value for source-level debugging. In contrast, the original load's location accurately represents the source of the data access in the user's code. This change ensures the new load inherits the debug location from `LN0` by using `SDLoc(LN0)`, which improves the debugging experience and fixes a test case failure observed in the Triton compiler. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- llvm/test/CodeGen/AMDGPU/combine-conv-load.ll | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a6ba6e518899f..4cb0a35aa7b25 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16703,7 +16703,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { } } SDValue Load = - DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), + DAG.getLoad(VT, SDLoc(LN0), LN0->getChain(), LN0->getBasePtr(), LN0->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; diff --git a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll new file mode 100644 index 0000000000000..900c973b712ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s + +; CHECK-LABEL: test: +; CHECK: .loc 1 8 16 ; test.py:8:16 +; CHECK-NEXT: s_load_dword + +; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define amdgpu_kernel void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1, ptr addrspace(1) inreg readnone captures(none) %2, ptr addrspace(1) inreg readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { + %5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7 + %6 = and i32 %5, 255, !dbg !7 + %7 = icmp eq i32 %6, 0, !dbg !7 + br i1 %7, label %8, label %10, !dbg !7 + +8: ; preds = %4 + %9 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6 + store <1 x float> %9, ptr addrspace(1) %1, align 4, !dbg !7 + br label %10, !dbg !7 + +10: ; preds = %8, %4 + ret void, !dbg !9 
+} + +; Function Attrs: alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" } +attributes #1 = { alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "test.py", directory: "/path") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 9, column: 20, scope: !4) +!8 = !DILocation(line: 8, column: 16, scope: !4) +!9 = !DILocation(line: 9, column: 4, scope: !4) From ef8440786cd05ac197c75751b5b388d54f2d36a6 Mon Sep 17 00:00:00 2001 From: "jian.wu" Date: Tue, 23 Sep 2025 17:44:25 +0800 Subject: [PATCH 2/3] update test case --- llvm/test/CodeGen/AMDGPU/combine-conv-load.ll | 41 ------------------- 
.../DebugInfo/AMDGPU/combine-conv-load.ll | 26 ++++++++++++ 2 files changed, 26 insertions(+), 41 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll create mode 100644 llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll diff --git a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll deleted file mode 100644 index 900c973b712ae..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s - -; CHECK-LABEL: test: -; CHECK: .loc 1 8 16 ; test.py:8:16 -; CHECK-NEXT: s_load_dword - -; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define amdgpu_kernel void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1, ptr addrspace(1) inreg readnone captures(none) %2, ptr addrspace(1) inreg readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 { - %5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7 - %6 = and i32 %5, 255, !dbg !7 - %7 = icmp eq i32 %6, 0, !dbg !7 - br i1 %7, label %8, label %10, !dbg !7 - -8: ; preds = %4 - %9 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6 - store <1 x float> %9, ptr addrspace(1) %1, align 4, !dbg !7 - br label %10, !dbg !7 - -10: ; preds = %8, %4 - ret void, !dbg !9 -} - -; Function Attrs: alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 - -attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" 
"amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" } -attributes #1 = { alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3} - -!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) -!1 = !DIFile(filename: "test.py", directory: "/path") -!2 = !{i32 2, !"Debug Info Version", i32 3} -!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} -!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) -!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) -!6 = !{} -!7 = !DILocation(line: 9, column: 20, scope: !4) -!8 = !DILocation(line: 8, column: 16, scope: !4) -!9 = !DILocation(line: 9, column: 4, scope: !4) diff --git a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll new file mode 100644 index 0000000000000..14ce1d9cba098 --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s + +; CHECK-LABEL: test: +; CHECK: .loc 1 8 16 prologue_end ; test.py:8:16 +; CHECK-NEXT: s_load_dword + +define void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1) local_unnamed_addr !dbg !4 { + %3 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6 + store <1 x float> %3, ptr addrspace(1) %1, align 4, !dbg !7 + + ret void, !dbg !9 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = 
distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly) +!1 = !DIFile(filename: "test.py", directory: "/path") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!5 = !DISubroutineType(cc: DW_CC_normal, types: !6) +!6 = !{} +!7 = !DILocation(line: 9, column: 20, scope: !4) +!8 = !DILocation(line: 8, column: 16, scope: !4) +!9 = !DILocation(line: 9, column: 4, scope: !4) From e647079f174f1996e5b345576d82d743e5583007 Mon Sep 17 00:00:00 2001 From: "jian.wu" Date: Tue, 23 Sep 2025 19:14:30 +0800 Subject: [PATCH 3/3] Simplify the test case --- llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll index 14ce1d9cba098..0bb3d383248fb 100644 --- a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll +++ b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll @@ -4,10 +4,9 @@ ; CHECK: .loc 1 8 16 prologue_end ; test.py:8:16 ; CHECK-NEXT: s_load_dword -define void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1) local_unnamed_addr !dbg !4 { - %3 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6 - store <1 x float> %3, ptr addrspace(1) %1, align 4, !dbg !7 - +define void @test(ptr addrspace(1) inreg readonly captures(none) %arg0, ptr addrspace(1) inreg writeonly captures(none) %arg1) local_unnamed_addr !dbg !4 { + %ld = load <1 x float>, ptr addrspace(1) %arg0, align 4, !dbg !8, !amdgpu.noclobber !6 + store <1 x float> %ld, ptr addrspace(1) %arg1, align 4, !dbg !7 ret void, !dbg !9 }