Rebase for new test + improve comment

lucas-rami · lucas-rami · commit 846238b1fc89 · 2025-10-07T15:22:33.000Z
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -484,6 +484,9 @@ class PreRARematStage : public GCNSchedStage {
     }
 
-    /// Returns whether is is always beneficial to rematerialize this register.
+    /// Returns whether it is always beneficial to rematerialize this register.
+    /// These are rematerializations that never move instructions into higher
+    /// frequency regions and at least shorten live intervals, so they are
+    /// always useful irrespective of RP targets.
     bool isAlwaysBeneficial() const {
       // When the using region is executed a single time, we know
       // rematerializing will be beneficial whatever the defining region's
@@ -555,7 +558,7 @@ class PreRARematStage : public GCNSchedStage {
     /// Per-region contribution weights to RP score depending on whether RP is
     /// guaranteed or only likely to be reduced in the region. Only their
-    /// relative value w.r.t. one another matter.
+    /// relative values w.r.t. one another matter.
-    static constexpr int WeightRP = 10, WeightRPMaybe = 5;
+    static constexpr int WeightRP = 2, WeightRPMaybe = 1;
 
     /// Number of 32-bit registers this rematerialization covers.
     const unsigned NumRegs;
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
@@ -11,11 +11,11 @@ define amdgpu_kernel void @uniform_build_vector(i64 %in, ptr addrspace(1) %out)
 ; GCN-NEXT:    s_mov_b32 s6, s5
 ; GCN-NEXT:    s_mov_b32 s7, s5
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    ; sched_barrier mask(0x00000000)
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -35,4 +35,4 @@ entry:
 declare void @llvm.amdgcn.sched.barrier(i32 immarg) #0
 
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
-declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1