From ff9e19478988a0a0f0d2a0ae7cf8018e296f56dd Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari
Date: Thu, 25 Sep 2025 14:27:36 +0530
Subject: [PATCH 1/6] [InstCombine] Optimize AMDGPU ballot + assume uniformity
 patterns

When we encounter assume(ballot(cmp) == -1), we know that cmp is uniform
across all lanes and evaluates to true. This optimization recognizes this
pattern and replaces the condition with a constant true, allowing subsequent
passes to eliminate dead code and optimize control flow.

The optimization handles both i32 and i64 ballot intrinsics and only applies
when the ballot result is compared against -1 (all lanes active). This is a
conservative approach that ensures correctness while enabling significant
optimizations for uniform control flow patterns.
---
 .../InstCombine/InstCombineCalls.cpp               |  33 ++++++
 .../amdgpu-assume-ballot-uniform.ll                 | 108 ++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e1e24a99d0474..5dd29a7c5155c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3540,6 +3540,39 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
   }
 
+  // Optimize AMDGPU ballot uniformity assumptions:
+  // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true.
+  // This allows us to optimize away the ballot and replace cmp with true.
+  Value *BallotInst;
+  if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst),
+                                      m_AllOnes()))) {
+    // Check if this is an AMDGPU ballot intrinsic.
+    if (auto *BallotCall = dyn_cast<IntrinsicInst>(BallotInst)) {
+      if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) {
+        Value *BallotCondition = BallotCall->getArgOperand(0);
+
+        // If ballot(cmp) == -1, then cmp is uniform across all lanes and
+        // evaluates to true. We can safely replace BallotCondition with true
+        // since ballot == -1 implies all lanes are true.
+        if (BallotCondition->getType()->isIntOrIntVectorTy(1) &&
+            !isa<Constant>(BallotCondition)) {
+
+          // Add the condition to the worklist for further optimization.
+          Worklist.pushValue(BallotCondition);
+
+          // Replace BallotCondition with true.
+          BallotCondition->replaceAllUsesWith(
+              ConstantInt::getTrue(BallotCondition->getType()));
+
+          // The assumption is now always true, so we can simplify it.
+          replaceUse(II->getOperandUse(0),
+                     ConstantInt::getTrue(II->getContext()));
+          return II;
+        }
+      }
+    }
+  }
+
   // If there is a dominating assume with the same condition as this one,
   // then this one is redundant, and should be removed.
KnownBits Known(1); diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll new file mode 100644 index 0000000000000..3bf3b317b0771 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll @@ -0,0 +1,108 @@ +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Test case for optimizing AMDGPU ballot + assume patterns +; When we assume that ballot(cmp) == -1, we know that cmp is uniform +; This allows us to optimize away the ballot and directly branch + +define void @test_assume_ballot_uniform(i32 %x) { +; CHECK-LABEL: @test_assume_ballot_uniform( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp eq i32 %x, 0 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + %all = icmp eq i64 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +; Test case with partial optimization - only ballot removal without branch optimization +define void @test_assume_ballot_partial(i32 %x) { +; CHECK-LABEL: @test_assume_ballot_partial( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp eq i32 %x, 0 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + %all = icmp eq i64 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +; Negative test - ballot not compared to -1 +define void @test_assume_ballot_not_uniform(i32 %x) { +; CHECK-LABEL: @test_assume_ballot_not_uniform( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]]) +; CHECK-NEXT: [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[SOME]]) +; CHECK-NEXT: br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp eq i32 %x, 0 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + %some = icmp ne i64 %ballot, 0 + call void @llvm.assume(i1 %some) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +; Test with 32-bit ballot +define void @test_assume_ballot_uniform_i32(i32 %x) { +; CHECK-LABEL: @test_assume_ballot_uniform_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp eq i32 %x, 0 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + %all = icmp eq i32 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +declare i64 @llvm.amdgcn.ballot.i64(i1) +declare i32 @llvm.amdgcn.ballot.i32(i1) +declare void @llvm.assume(i1) From d86e4924b143400edf116143e620e14bce509ee4 Mon Sep 17 00:00:00 2001 From: TejaX-Alaghari Date: Mon, 29 Sep 2025 12:50:10 +0530 Subject: [PATCH 2/6] [InstCombine] Add constant folding for AMDGPU ballot intrinsics Address reviewer feedback by implementing free-form ballot intrinsic optimization instead of assume-dependent patterns. This approach: 1. Optimizes ballot(constant) directly as a standard intrinsic optimization 2. 
Allows uniformity analysis to handle assumes through proper channels 3. Follows established AMDGPU intrinsic patterns (amdgcn_cos, amdgcn_sin) 4. Enables broader optimization opportunities beyond assume contexts Optimizations: - ballot(true) -> -1 (all lanes active) - ballot(false) -> 0 (no lanes active) This addresses the core reviewer concern about performing optimization in assume context rather than as a free-form pattern, and lets the uniformity analysis framework handle assumes as intended. Test cases focus on constant folding rather than assume-specific patterns, demonstrating the more general applicability of this approach. --- .github/copilot-instructions.md | 78 ++++++++++++- .../InstCombine/InstCombineCalls.cpp | 51 +++----- .../InstCombine/InstCombineInternal.h | 2 + .../amdgpu-assume-ballot-uniform.ll | 108 ----------------- .../amdgpu-ballot-constant-fold.ll | 109 ++++++++++++++++++ 5 files changed, 204 insertions(+), 144 deletions(-) delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll create mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 03748938700e3..922584f7bc9aa 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,4 +1,74 @@ -When performing a code review, pay close attention to code modifying a function's -control flow. Could the change result in the corruption of performance profile -data? Could the change result in invalid debug information, in particular for -branches and calls? +# LLVM Project AI Coding Agent Instructions + +## Architecture Overview + +LLVM is a compiler infrastructure with modular components: +- **Core LLVM** (`llvm/`): IR processing, optimizations, code generation +- **Clang** (`clang/`): C/C++/Objective-C frontend +- **LLD** (`lld/`): Linker +- **libc++** (`libcxx/`): C++ standard library +- **Target backends** (`llvm/lib/Target/{AMDGPU,X86,ARM,...}/`): Architecture-specific code generation + +## Essential Development Workflows + +### Build System (CMake + Ninja) +```bash +# Configure with common options for development +cmake -G Ninja -S llvm-project/llvm -B build \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DLLVM_ENABLE_PROJECTS="clang;lld" \ + -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" \ + -DLLVM_ENABLE_ASSERTIONS=ON + +# Build and install +cmake --build build +cmake --install build --prefix install/ +``` + +### Testing with LIT +- Use `opt < file.ll -passes=instcombine -S | FileCheck %s` pattern for IR transforms +- Test files go in `llvm/test/Transforms/{PassName}/` with `.ll` extension +- Always include both positive and negative test cases +- Use `CHECK-LABEL:` for function boundaries, `CHECK-NEXT:` for strict sequence + +### Key Patterns for Transforms + +**InstCombine Pattern** (`llvm/lib/Transforms/InstCombine/`): +- Implement in `InstCombine*.cpp` using visitor pattern (`visitCallInst`, `visitBinaryOperator`) +- Use `PatternMatch.h` matchers: `match(V, m_Add(m_Value(X), m_ConstantInt()))` +- Return `nullptr` for no change, modified instruction, or replacement +- Add to worklist with `Worklist.pushValue()` for dependent values + +**Target-Specific Intrinsics**: +- AMDGPU: `@llvm.amdgcn.*` intrinsics in `llvm/include/llvm/IR/IntrinsicsAMDGPU.td` +- Pattern: `if (II->getIntrinsicID() == Intrinsic::amdgcn_ballot)` + +## Code Quality Standards + +### Control Flow & Debug Info +When modifying control flow, ensure changes don't corrupt: +- Performance profiling data 
(branch weights, call counts)
+- Debug information for branches and calls
+- Exception handling unwind information
+
+### Target-Specific Considerations
+- **AMDGPU**: Wavefront uniformity analysis affects ballot intrinsics
+- **X86**: Vector width and ISA feature dependencies
+- Use `TargetTransformInfo` for cost models and capability queries
+
+### Testing Requirements
+- Every optimization needs regression tests showing before/after IR
+- Include edge cases: constants, undef, poison values
+- Test target-specific intrinsics with appropriate triple
+- Use `; RUN: opt < %s -passes=... -S | FileCheck %s` format
+
+## Common Development Pitfalls
+- Don't assume instruction operand order without checking `isCommutative()`
+- Verify type compatibility before creating new instructions
+- Consider poison/undef propagation in optimizations
+- Check for side effects before eliminating instructions
+
+## Pass Pipeline Context
+- InstCombine runs early and multiple times in the pipeline
+- Subsequent passes like SimplifyCFG will clean up control flow
+- Use `replaceAllUsesWith()` carefully to maintain SSA form

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5dd29a7c5155c..0ab648e94e034 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -85,6 +85,8 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+
+
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window", cl::init(3),
@@ -2987,6 +2989,20 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::amdgcn_ballot: {
+    // Optimize ballot intrinsics when the condition is known to be uniform.
+    Value *Condition = II->getArgOperand(0);
+
+    // If the condition is a constant, we can evaluate the ballot directly.
+    if (auto *ConstCond = dyn_cast<ConstantInt>(Condition)) {
+      // ballot(true) -> -1 (all lanes active)
+      // ballot(false) -> 0 (no lanes active)
+      uint64_t Result = ConstCond->isOne() ?
~0ULL : 0ULL; + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); + } + + break; + } case Intrinsic::ldexp: { // ldexp(ldexp(x, a), b) -> ldexp(x, a + b) // @@ -3540,38 +3556,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - // Optimize AMDGPU ballot uniformity assumptions: - // assume(icmp eq (ballot(cmp), -1)) implies that cmp is uniform and true - // This allows us to optimize away the ballot and replace cmp with true - Value *BallotInst; - if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), - m_AllOnes()))) { - // Check if this is an AMDGPU ballot intrinsic - if (auto *BallotCall = dyn_cast(BallotInst)) { - if (BallotCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { - Value *BallotCondition = BallotCall->getArgOperand(0); - - // If ballot(cmp) == -1, then cmp is uniform across all lanes and - // evaluates to true We can safely replace BallotCondition with true - // since ballot == -1 implies all lanes are true - if (BallotCondition->getType()->isIntOrIntVectorTy(1) && - !isa(BallotCondition)) { - - // Add the condition to the worklist for further optimization - Worklist.pushValue(BallotCondition); - - // Replace BallotCondition with true - BallotCondition->replaceAllUsesWith( - ConstantInt::getTrue(BallotCondition->getType())); - - // The assumption is now always true, so we can simplify it - replaceUse(II->getOperandUse(0), - ConstantInt::getTrue(II->getContext())); - return II; - } - } - } - } + // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. @@ -3586,6 +3571,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return eraseInstFromFunction(*II); } + + // Update the cache of affected values for this assumption (we might be // here because we just simplified the condition). 
AC.updateAffectedValues(cast(II)); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index e01c145bf5de3..820bec8d4b7ac 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -124,6 +124,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final BinaryOperator &I); Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( BinaryOperator &OldAShr); + + Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); diff --git a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll b/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll deleted file mode 100644 index 3bf3b317b0771..0000000000000 --- a/llvm/test/Transforms/InstCombine/amdgpu-assume-ballot-uniform.ll +++ /dev/null @@ -1,108 +0,0 @@ -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -; Test case for optimizing AMDGPU ballot + assume patterns -; When we assume that ballot(cmp) == -1, we know that cmp is uniform -; This allows us to optimize away the ballot and directly branch - -define void @test_assume_ballot_uniform(i32 %x) { -; CHECK-LABEL: @test_assume_ballot_uniform( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void -; -entry: - %cmp = icmp eq i32 %x, 0 - %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) - %all = icmp eq i64 %ballot, -1 - call void @llvm.assume(i1 %all) - br i1 %cmp, label %foo, label %bar - -foo: - ret void - -bar: - ret void -} - -; Test case with partial optimization - only ballot removal without branch optimization -define void @test_assume_ballot_partial(i32 %x) { -; CHECK-LABEL: @test_assume_ballot_partial( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void -; -entry: - %cmp = icmp eq i32 %x, 0 - %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) - %all = icmp eq i64 %ballot, -1 - call void @llvm.assume(i1 %all) - br i1 %cmp, label %foo, label %bar - -foo: - ret void - -bar: - ret void -} - -; Negative test - ballot not compared to -1 -define void @test_assume_ballot_not_uniform(i32 %x) { -; CHECK-LABEL: @test_assume_ballot_not_uniform( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]]) -; CHECK-NEXT: [[SOME:%.*]] = icmp ne i64 [[BALLOT]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[SOME]]) -; CHECK-NEXT: br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void -; -entry: - %cmp = icmp eq i32 %x, 0 - %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) - %some = icmp ne i64 %ballot, 0 - call void @llvm.assume(i1 %some) - br i1 %cmp, label %foo, label %bar - -foo: - ret void - -bar: - ret void -} - -; Test with 32-bit ballot -define void @test_assume_ballot_uniform_i32(i32 %x) { -; CHECK-LABEL: @test_assume_ballot_uniform_i32( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void -; -entry: - %cmp = icmp eq i32 %x, 0 - %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) - %all = icmp eq i32 %ballot, -1 - call void @llvm.assume(i1 %all) - br 
i1 %cmp, label %foo, label %bar - -foo: - ret void - -bar: - ret void -} - -declare i64 @llvm.amdgcn.ballot.i64(i1) -declare i32 @llvm.amdgcn.ballot.i32(i1) -declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll new file mode 100644 index 0000000000000..6180760f7d511 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Test cases for optimizing AMDGPU ballot intrinsics +; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0 + +define void @test_ballot_constant_true() { +; CHECK-LABEL: @test_ballot_constant_true( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 -1, -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true) + %all = icmp eq i64 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 true, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +define void @test_ballot_constant_false() { +; CHECK-LABEL: @test_ballot_constant_false( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NONE:%.*]] = icmp ne i64 0, 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[NONE]]) +; CHECK-NEXT: br i1 false, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false) + %none = icmp ne i64 %ballot, 0 + call void @llvm.assume(i1 %none) + br i1 false, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +; Test with 32-bit ballot constants +define void @test_ballot_i32_constant_true() { +; CHECK-LABEL: @test_ballot_i32_constant_true( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALL:%.*]] = icmp eq i32 -1, -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true) + %all = icmp eq i32 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 true, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +; Negative test - variable condition should not be optimized +define void @test_ballot_variable_condition(i32 %x) { +; CHECK-LABEL: @test_ballot_variable_condition( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]]) +; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) +; CHECK-NEXT: br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; +entry: + %cmp = icmp eq i32 %x, 0 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + %all = icmp eq i64 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +declare i64 @llvm.amdgcn.ballot.i64(i1) +declare i32 @llvm.amdgcn.ballot.i32(i1) +declare void @llvm.assume(i1) From a711d52301ca83dd76b8da934c955b7d494f360a Mon Sep 17 00:00:00 2001 From: TejaX-Alaghari Date: Mon, 29 Sep 2025 21:24:37 +0530 Subject: [PATCH 3/6] [InstCombine] Implement generic assume-based uniformity optimization 
Implement a comprehensive generic optimization for assume intrinsics that extracts uniformity information and optimizes dominated uses. The optimization recognizes multiple patterns that establish value uniformity and replaces dominated uses with uniform constants. Addresses uniformity analysis optimization opportunities identified in AMDGPU ballot/readfirstlane + assume patterns for improved code generation through constant propagation. --- .github/copilot-instructions.md | 78 +--------- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 16 ++- .../InstCombine/InstCombineCalls.cpp | 136 +++++++++++++++--- .../InstCombine/InstCombineInternal.h | 5 +- .../amdgpu-ballot-constant-fold.ll | 117 +++++---------- llvm/test/Transforms/InstCombine/assume.ll | 30 +++- 6 files changed, 195 insertions(+), 187 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 922584f7bc9aa..03748938700e3 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,74 +1,4 @@ -# LLVM Project AI Coding Agent Instructions - -## Architecture Overview - -LLVM is a compiler infrastructure with modular components: -- **Core LLVM** (`llvm/`): IR processing, optimizations, code generation -- **Clang** (`clang/`): C/C++/Objective-C frontend -- **LLD** (`lld/`): Linker -- **libc++** (`libcxx/`): C++ standard library -- **Target backends** (`llvm/lib/Target/{AMDGPU,X86,ARM,...}/`): Architecture-specific code generation - -## Essential Development Workflows - -### Build System (CMake + Ninja) -```bash -# Configure with common options for development -cmake -G Ninja -S llvm-project/llvm -B build \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DLLVM_ENABLE_PROJECTS="clang;lld" \ - -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" \ - -DLLVM_ENABLE_ASSERTIONS=ON - -# Build and install -cmake --build build -cmake --install build --prefix install/ -``` - -### Testing with LIT -- Use `opt < file.ll -passes=instcombine -S | FileCheck %s` pattern for IR transforms -- Test files go in `llvm/test/Transforms/{PassName}/` with `.ll` extension -- Always include both positive and negative test cases -- Use `CHECK-LABEL:` for function boundaries, `CHECK-NEXT:` for strict sequence - -### Key Patterns for Transforms - -**InstCombine Pattern** (`llvm/lib/Transforms/InstCombine/`): -- Implement in `InstCombine*.cpp` using visitor pattern (`visitCallInst`, `visitBinaryOperator`) -- Use `PatternMatch.h` matchers: `match(V, m_Add(m_Value(X), m_ConstantInt()))` -- Return `nullptr` for no change, modified instruction, or replacement -- Add to worklist with `Worklist.pushValue()` for dependent values - -**Target-Specific Intrinsics**: -- AMDGPU: `@llvm.amdgcn.*` intrinsics in `llvm/include/llvm/IR/IntrinsicsAMDGPU.td` -- Pattern: `if (II->getIntrinsicID() == Intrinsic::amdgcn_ballot)` - -## Code Quality Standards - -### Control Flow & Debug Info -When modifying control flow, ensure changes don't corrupt: -- Performance profiling data (branch weights, call counts) -- Debug information for branches and calls -- Exception handling unwind information - -### Target-Specific Considerations -- **AMDGPU**: Wavefront uniformity analysis affects ballot intrinsics -- **X86**: Vector width and ISA feature dependencies -- Use `TargetTransformInfo` for cost models and capability queries - -### Testing Requirements -- Every optimization needs regression tests showing before/after IR -- Include edge cases: constants, undef, poison values -- Test target-specific intrinsics with appropriate triple -- Use `; RUN: opt < %s 
-passes=... -S | FileCheck %s` format - -## Common Development Pitfalls -- Don't assume instruction operand order without checking `isCommutative()` -- Verify type compatibility before creating new instructions -- Consider poison/undef propagation in optimizations -- Check for side effects before eliminating instructions - -## Pass Pipeline Context -- InstCombine runs early and multiple times in the pipeline -- Subsequent passes like SimplifyCFG will clean up control flow -- Use `replaceAllUsesWith()` carefully to maintain SSA form +When performing a code review, pay close attention to code modifying a function's +control flow. Could the change result in the corruption of performance profile +data? Could the change result in invalid debug information, in particular for +branches and calls? diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 4fe5d00679436..fc4e64fcd52a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1322,12 +1322,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (isa(Arg)) return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); - if (auto *Src = dyn_cast(Arg)) { - if (Src->isZero()) { - // amdgcn.ballot(i1 0) is zero. - return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); - } - } + // For Wave32 targets, convert i64 ballot to i32 ballot + zext if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) { // %b64 = call i64 ballot.i64(...) // => @@ -1341,6 +1336,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Call->takeName(&II); return IC.replaceInstUsesWith(II, Call); } + + if (auto *Src = dyn_cast(Arg)) { + if (Src->isZero()) { + // amdgcn.ballot(i1 0) is zero. + return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); + } + // Note: ballot(true) is NOT constant folded because the result depends + // on the active lanes in the wavefront, not just the condition value. + } break; } case Intrinsic::amdgcn_wavefrontsize: { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 0ab648e94e034..efa31ebc45dc8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -85,8 +85,6 @@ using namespace PatternMatch; STATISTIC(NumSimplified, "Number of library calls simplified"); - - static cl::opt GuardWideningWindow( "instcombine-guard-widening-window", cl::init(3), @@ -2989,20 +2987,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } - case Intrinsic::amdgcn_ballot: { - // Optimize ballot intrinsics when the condition is known to be uniform - Value *Condition = II->getArgOperand(0); - - // If the condition is a constant, we can evaluate the ballot directly - if (auto *ConstCond = dyn_cast(Condition)) { - // ballot(true) -> -1 (all lanes active) - // ballot(false) -> 0 (no lanes active) - uint64_t Result = ConstCond->isOne() ? ~0ULL : 0ULL; - return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); - } - - break; - } case Intrinsic::ldexp: { // ldexp(ldexp(x, a), b) -> ldexp(x, a + b) // @@ -3556,8 +3540,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - - // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. 
KnownBits Known(1); @@ -3571,7 +3553,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return eraseInstFromFunction(*II); } - + // Try to extract uniformity information from the assume and optimize + // dominated uses of any variables that are established as uniform. + optimizeAssumedUniformValues(cast(II)); // Update the cache of affected values for this assumption (we might be // here because we just simplified the condition). @@ -5026,3 +5010,117 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, Call.setCalledFunction(FTy, NestF); return &Call; } + +/// Extract uniformity information from assume and optimize dominated uses. +/// This works with any assume pattern that establishes value uniformity. +void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) { + Value *AssumedCondition = Assume->getArgOperand(0); + + // Map of uniform values to their uniform constants + SmallDenseMap UniformValues; + + // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C + if (auto *ICmp = dyn_cast(AssumedCondition)) { + if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { + Value *LHS = ICmp->getOperand(0); + Value *RHS = ICmp->getOperand(1); + + // X == constant -> X is uniform and equals constant + if (auto *C = dyn_cast(RHS)) { + UniformValues[LHS] = C; + } else if (auto *C = dyn_cast(LHS)) { + UniformValues[RHS] = C; + } + + // Handle intrinsic patterns in equality comparisons + // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true + if (auto *IntrinsicCall = dyn_cast(LHS)) { + if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { + if (match(RHS, m_AllOnes())) { + Value *BallotArg = IntrinsicCall->getArgOperand(0); + if (BallotArg->getType()->isIntegerTy(1)) { + UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType()); + + // Special case: if BallotArg is an equality comparison, + // we know the operands are equal + if (auto *CmpInst = dyn_cast(BallotArg)) { + if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) { + Value *CmpLHS = CmpInst->getOperand(0); + Value *CmpRHS = CmpInst->getOperand(1); + + // If one operand is constant, the other is uniform and equals that constant + if (auto *C = dyn_cast(CmpRHS)) { + UniformValues[CmpLHS] = C; + } else if (auto *C = dyn_cast(CmpLHS)) { + UniformValues[CmpRHS] = C; + } + // TODO: Handle case where both operands are variables + } + } + } + } + } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) { + // assume(readfirstlane(x) == c) -> x is uniform and equals c + if (auto *C = dyn_cast(RHS)) { + Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0); + UniformValues[ReadFirstLaneArg] = C; + } + } + } + + // Handle the reverse case too + if (auto *IntrinsicCall = dyn_cast(RHS)) { + if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { + if (match(LHS, m_AllOnes())) { + Value *BallotArg = IntrinsicCall->getArgOperand(0); + if (BallotArg->getType()->isIntegerTy(1)) { + UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType()); + } + } + } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) { + if (auto *C = dyn_cast(LHS)) { + Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0); + UniformValues[ReadFirstLaneArg] = C; + } + } + } + } + } + + // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true + if (AssumedCondition->getType()->isIntegerTy(1) && !isa(AssumedCondition)) { + UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType()); + } + + // Now 
optimize dominated uses of all discovered uniform values + for (auto &[UniformValue, UniformConstant] : UniformValues) { + SmallVector DominatedUses; + + // Find all uses dominated by the assume + // Skip if the value doesn't have a use list (e.g., constants) + if (!UniformValue->hasUseList()) + continue; + + for (Use &U : UniformValue->uses()) { + Instruction *UseInst = dyn_cast(U.getUser()); + if (!UseInst || UseInst == Assume) + continue; + + // Critical: Check dominance using InstCombine's infrastructure + if (isValidAssumeForContext(Assume, UseInst, &DT)) { + DominatedUses.push_back(&U); + } + } + + // Replace only dominated uses with the uniform constant + for (Use *U : DominatedUses) { + U->set(UniformConstant); + Worklist.pushValue(U->getUser()); + } + + // Mark for further optimization if we made changes + if (!DominatedUses.empty()) { + Worklist.pushValue(UniformValue); + } + } +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 820bec8d4b7ac..2a920d13ae495 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -124,8 +124,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final BinaryOperator &I); Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( BinaryOperator &OldAShr); - - Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); @@ -231,6 +229,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final private: bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI); bool isDesirableIntType(unsigned BitWidth) const; + + /// Optimize uses of variables that are established as uniform by assume intrinsics. 
+ void optimizeAssumedUniformValues(AssumeInst *Assume); bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll index 6180760f7d511..b146487af9990 100644 --- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll +++ b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll @@ -1,109 +1,56 @@ -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s ; Test cases for optimizing AMDGPU ballot intrinsics -; Focus on constant folding ballot(true) -> -1 and ballot(false) -> 0 +; Focus on constant folding ballot(false) -> 0 and poison handling -define void @test_ballot_constant_true() { -; CHECK-LABEL: @test_ballot_constant_true( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 -1, -1 -; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) -; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void +; Test ballot with constant false condition gets folded +define i32 @test_ballot_constant_false() { +; CHECK-LABEL: @test_ballot_constant_false( +; CHECK-NEXT: ret i32 0 ; -entry: - %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true) - %all = icmp eq i64 %ballot, -1 - call void @llvm.assume(i1 %all) - br i1 true, label %foo, label %bar - -foo: - ret void - -bar: - ret void + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false) + ret i32 %ballot } -define void @test_ballot_constant_false() { -; CHECK-LABEL: @test_ballot_constant_false( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[NONE:%.*]] = icmp ne i64 0, 0 -; CHECK-NEXT: call void @llvm.assume(i1 [[NONE]]) -; CHECK-NEXT: br i1 false, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void +; Test ballot.i64 with constant false condition gets folded +define i64 @test_ballot_i64_constant_false() { +; CHECK-LABEL: @test_ballot_i64_constant_false( +; CHECK-NEXT: ret i64 0 ; -entry: %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false) - %none = icmp ne i64 %ballot, 0 - call void @llvm.assume(i1 %none) - br i1 false, label %foo, label %bar - -foo: - ret void - -bar: - ret void + ret i64 %ballot } -; Test with 32-bit ballot constants -define void @test_ballot_i32_constant_true() { -; CHECK-LABEL: @test_ballot_i32_constant_true( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALL:%.*]] = icmp eq i32 -1, -1 -; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) -; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void +; Test ballot with poison condition gets folded to poison +define i64 @test_ballot_poison() { +; CHECK-LABEL: @test_ballot_poison( +; CHECK-NEXT: ret i64 poison ; -entry: - %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 true) - %all = icmp eq i32 %ballot, -1 - call void @llvm.assume(i1 %all) - br i1 true, label %foo, label %bar - -foo: - ret void + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison) + ret i64 %ballot +} -bar: - ret void +; Test that ballot(true) is NOT constant folded (depends on active lanes) +define i64 @test_ballot_constant_true() { +; CHECK-LABEL: @test_ballot_constant_true( +; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; CHECK-NEXT: ret i64 
[[BALLOT]] +; + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true) + ret i64 %ballot } -; Negative test - variable condition should not be optimized -define void @test_ballot_variable_condition(i32 %x) { +; Test that ballot with variable condition is not optimized +define i64 @test_ballot_variable_condition(i32 %x) { ; CHECK-LABEL: @test_ballot_variable_condition( -; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]]) -; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1 -; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) -; CHECK-NEXT: br i1 [[CMP]], label [[FOO:%.*]], label [[BAR:%.*]] -; CHECK: foo: -; CHECK-NEXT: ret void -; CHECK: bar: -; CHECK-NEXT: ret void +; CHECK-NEXT: ret i64 [[BALLOT]] ; -entry: %cmp = icmp eq i32 %x, 0 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) - %all = icmp eq i64 %ballot, -1 - call void @llvm.assume(i1 %all) - br i1 %cmp, label %foo, label %bar - -foo: - ret void - -bar: - ret void + ret i64 %ballot } declare i64 @llvm.amdgcn.ballot.i64(i1) declare i32 @llvm.amdgcn.ballot.i32(i1) -declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 7b0b871513513..7b15d67dc4b69 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -82,7 +82,7 @@ define i32 @simple(i32 %a) #1 { ; CHECK-LABEL: @simple( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: ret i32 [[A]] +; CHECK-NEXT: ret i32 4 ; %cmp = icmp eq i32 %a, 4 tail call void @llvm.assume(i1 %cmp) @@ -1034,6 +1034,34 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) { ret i1 %q } +; Test AMDGPU ballot uniformity pattern optimization +; This demonstrates that assume(ballot(cmp) == -1) enables the optimization +; of cmp to true, which then optimizes the branch condition +define void @assume_ballot_uniform(i32 %x) { +; CHECK-LABEL: @assume_ballot_uniform( +; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1 +; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) +; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] +; CHECK: foo: +; CHECK-NEXT: ret void +; CHECK: bar: +; CHECK-NEXT: ret void +; + %cmp = icmp eq i32 %x, 0 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + %all = icmp eq i64 %ballot, -1 + call void @llvm.assume(i1 %all) + br i1 %cmp, label %foo, label %bar + +foo: + ret void + +bar: + ret void +} + +declare i64 @llvm.amdgcn.ballot.i64(i1) declare void @use(i1) declare void @llvm.dbg.value(metadata, metadata, metadata) From b16fe0282dfc6782575c2347771eaadaacd462f5 Mon Sep 17 00:00:00 2001 From: TejaX-Alaghari Date: Thu, 2 Oct 2025 11:51:43 +0530 Subject: [PATCH 4/6] [InstCombine] Add focused assume-based optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit implements two targeted optimizations for assume intrinsics: 1. Basic equality optimization: assume(x == c) replaces dominated uses of x with c 2. 
AMDGPU ballot optimization: assume(ballot(cmp) == -1) replaces dominated uses of cmp with true, since ballot == -1 means cmp is true on all active lanes Key design principles: - No uniformity analysis concepts - uses simple mathematical facts - Dominance-based replacement for correctness - Clean pattern matching without complex framework - Addresses reviewer feedback to keep it simple and focused Examples: assume(x == 42); use = add x, 1 → use = 43 assume(ballot(cmp) == -1); br cmp → br true This enables better optimization of GPU code patterns while remaining architecture-agnostic through the mathematical properties of the operations. --- .../InstCombine/InstCombineCalls.cpp | 191 +++++++----------- .../InstCombine/InstCombineInternal.h | 3 - llvm/test/Transforms/InstCombine/assume.ll | 9 +- 3 files changed, 79 insertions(+), 124 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index efa31ebc45dc8..a19701d9fe0c6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3540,6 +3540,79 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } + // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c + if (auto *ICmp = dyn_cast(IIOperand)) { + if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { + Value *LHS = ICmp->getOperand(0); + Value *RHS = ICmp->getOperand(1); + Value *Variable = nullptr; + Constant *ConstantVal = nullptr; + + if (auto *C = dyn_cast(RHS)) { + Variable = LHS; + ConstantVal = C; + } else if (auto *C = dyn_cast(LHS)) { + Variable = RHS; + ConstantVal = C; + } + + if (Variable && ConstantVal && Variable->hasUseList()) { + SmallVector DominatedUses; + for (Use &U : Variable->uses()) { + if (auto *UseInst = dyn_cast(U.getUser())) { + if (UseInst != II && UseInst != ICmp && + isValidAssumeForContext(II, UseInst, &DT)) { + DominatedUses.push_back(&U); + } + } + } + + for (Use *U : DominatedUses) { + U->set(ConstantVal); + Worklist.pushValue(U->getUser()); + } + + if (!DominatedUses.empty()) { + Worklist.pushValue(Variable); + } + } + } + } + + // Optimize AMDGPU ballot patterns in assumes: + // assume(ballot(cmp) == -1) means cmp is true on all active lanes + // We can replace uses of cmp with true in dominated contexts + Value *BallotInst; + if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) { + if (auto *IntrCall = dyn_cast(BallotInst)) { + if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { + Value *BallotArg = IntrCall->getArgOperand(0); + if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) { + // Find dominated uses and replace with true + SmallVector DominatedUses; + for (Use &U : BallotArg->uses()) { + if (auto *UseInst = dyn_cast(U.getUser())) { + if (UseInst != II && UseInst != IntrCall && + isValidAssumeForContext(II, UseInst, &DT)) { + DominatedUses.push_back(&U); + } + } + } + + // Replace dominated uses with true + for (Use *U : DominatedUses) { + U->set(ConstantInt::getTrue(BallotArg->getType())); + Worklist.pushValue(U->getUser()); + } + + if (!DominatedUses.empty()) { + Worklist.pushValue(BallotArg); + } + } + } + } + } + // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. 
KnownBits Known(1); @@ -3553,10 +3626,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return eraseInstFromFunction(*II); } - // Try to extract uniformity information from the assume and optimize - // dominated uses of any variables that are established as uniform. - optimizeAssumedUniformValues(cast(II)); - // Update the cache of affected values for this assumption (we might be // here because we just simplified the condition). AC.updateAffectedValues(cast(II)); @@ -5011,116 +5080,4 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, return &Call; } -/// Extract uniformity information from assume and optimize dominated uses. -/// This works with any assume pattern that establishes value uniformity. -void InstCombinerImpl::optimizeAssumedUniformValues(AssumeInst *Assume) { - Value *AssumedCondition = Assume->getArgOperand(0); - - // Map of uniform values to their uniform constants - SmallDenseMap UniformValues; - - // Pattern 1: assume(icmp eq (X, C)) -> X is uniform and equals C - if (auto *ICmp = dyn_cast(AssumedCondition)) { - if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { - Value *LHS = ICmp->getOperand(0); - Value *RHS = ICmp->getOperand(1); - - // X == constant -> X is uniform and equals constant - if (auto *C = dyn_cast(RHS)) { - UniformValues[LHS] = C; - } else if (auto *C = dyn_cast(LHS)) { - UniformValues[RHS] = C; - } - - // Handle intrinsic patterns in equality comparisons - // Pattern: assume(ballot(cmp) == -1) -> cmp is uniform and true - if (auto *IntrinsicCall = dyn_cast(LHS)) { - if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { - if (match(RHS, m_AllOnes())) { - Value *BallotArg = IntrinsicCall->getArgOperand(0); - if (BallotArg->getType()->isIntegerTy(1)) { - UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType()); - - // Special case: if BallotArg is an equality comparison, - // we know the operands are equal - if (auto *CmpInst = dyn_cast(BallotArg)) { - if (CmpInst->getPredicate() == ICmpInst::ICMP_EQ) { - Value *CmpLHS = CmpInst->getOperand(0); - Value *CmpRHS = CmpInst->getOperand(1); - - // If one operand is constant, the other is uniform and equals that constant - if (auto *C = dyn_cast(CmpRHS)) { - UniformValues[CmpLHS] = C; - } else if (auto *C = dyn_cast(CmpLHS)) { - UniformValues[CmpRHS] = C; - } - // TODO: Handle case where both operands are variables - } - } - } - } - } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) { - // assume(readfirstlane(x) == c) -> x is uniform and equals c - if (auto *C = dyn_cast(RHS)) { - Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0); - UniformValues[ReadFirstLaneArg] = C; - } - } - } - - // Handle the reverse case too - if (auto *IntrinsicCall = dyn_cast(RHS)) { - if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { - if (match(LHS, m_AllOnes())) { - Value *BallotArg = IntrinsicCall->getArgOperand(0); - if (BallotArg->getType()->isIntegerTy(1)) { - UniformValues[BallotArg] = ConstantInt::getTrue(BallotArg->getType()); - } - } - } else if (IntrinsicCall->getIntrinsicID() == Intrinsic::amdgcn_readfirstlane) { - if (auto *C = dyn_cast(LHS)) { - Value *ReadFirstLaneArg = IntrinsicCall->getArgOperand(0); - UniformValues[ReadFirstLaneArg] = C; - } - } - } - } - } - - // Pattern 2: assume(X) where X is i1 -> X is uniform and equals true - if (AssumedCondition->getType()->isIntegerTy(1) && !isa(AssumedCondition)) { - UniformValues[AssumedCondition] = ConstantInt::getTrue(AssumedCondition->getType()); - } - - // Now 
optimize dominated uses of all discovered uniform values - for (auto &[UniformValue, UniformConstant] : UniformValues) { - SmallVector DominatedUses; - - // Find all uses dominated by the assume - // Skip if the value doesn't have a use list (e.g., constants) - if (!UniformValue->hasUseList()) - continue; - - for (Use &U : UniformValue->uses()) { - Instruction *UseInst = dyn_cast(U.getUser()); - if (!UseInst || UseInst == Assume) - continue; - - // Critical: Check dominance using InstCombine's infrastructure - if (isValidAssumeForContext(Assume, UseInst, &DT)) { - DominatedUses.push_back(&U); - } - } - - // Replace only dominated uses with the uniform constant - for (Use *U : DominatedUses) { - U->set(UniformConstant); - Worklist.pushValue(U->getUser()); - } - - // Mark for further optimization if we made changes - if (!DominatedUses.empty()) { - Worklist.pushValue(UniformValue); - } - } -} + diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 2a920d13ae495..e01c145bf5de3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -229,9 +229,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final private: bool annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI); bool isDesirableIntType(unsigned BitWidth) const; - - /// Optimize uses of variables that are established as uniform by assume intrinsics. - void optimizeAssumedUniformValues(AssumeInst *Assume); bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 7b15d67dc4b69..0eb869dca0501 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1034,12 +1034,13 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) { ret i1 %q } -; Test AMDGPU ballot uniformity pattern optimization -; This demonstrates that assume(ballot(cmp) == -1) enables the optimization -; of cmp to true, which then optimizes the branch condition +; Test AMDGPU ballot pattern optimization +; assume(ballot(cmp) == -1) means cmp is true on all active lanes +; so dominated uses of cmp can be replaced with true define void @assume_ballot_uniform(i32 %x) { ; CHECK-LABEL: @assume_ballot_uniform( -; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]]) ; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1 ; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]]) ; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]] From 40eeaffac7bb50a2ec7ac6f86599c0b248fc140c Mon Sep 17 00:00:00 2001 From: TejaX-Alaghari Date: Sun, 5 Oct 2025 13:51:02 +0530 Subject: [PATCH 5/6] Address @ssahasra's review feedback - Remove 'dominated' terminology from comments and variable names (SSA values always dominate their uses) - Rename DominatedUses -> Uses throughout - Remove redundant UseInst != II check in ICmp block - Fix code formatting (clang-format) - Split long comment lines - Remove extra blank lines at EOF --- .../InstCombine/InstCombineCalls.cpp | 46 +++++++++---------- llvm/test/Transforms/InstCombine/assume.ll | 4 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git 
a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a19701d9fe0c6..d856b0559e421 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3540,14 +3540,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - // Basic assume equality optimization: assume(x == c) -> replace dominated uses of x with c + // Basic assume equality optimization: assume(x == c) -> replace uses of x + // with c if (auto *ICmp = dyn_cast(IIOperand)) { if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { Value *LHS = ICmp->getOperand(0); Value *RHS = ICmp->getOperand(1); Value *Variable = nullptr; Constant *ConstantVal = nullptr; - + if (auto *C = dyn_cast(RHS)) { Variable = LHS; ConstantVal = C; @@ -3555,24 +3556,24 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Variable = RHS; ConstantVal = C; } - + if (Variable && ConstantVal && Variable->hasUseList()) { - SmallVector DominatedUses; + SmallVector Uses; for (Use &U : Variable->uses()) { if (auto *UseInst = dyn_cast(U.getUser())) { - if (UseInst != II && UseInst != ICmp && + if (UseInst != ICmp && isValidAssumeForContext(II, UseInst, &DT)) { - DominatedUses.push_back(&U); + Uses.push_back(&U); } } } - - for (Use *U : DominatedUses) { + + for (Use *U : Uses) { U->set(ConstantVal); Worklist.pushValue(U->getUser()); } - - if (!DominatedUses.empty()) { + + if (!Uses.empty()) { Worklist.pushValue(Variable); } } @@ -3581,31 +3582,32 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // Optimize AMDGPU ballot patterns in assumes: // assume(ballot(cmp) == -1) means cmp is true on all active lanes - // We can replace uses of cmp with true in dominated contexts + // We can replace uses of cmp with true Value *BallotInst; - if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), m_AllOnes()))) { + if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), + m_AllOnes()))) { if (auto *IntrCall = dyn_cast(BallotInst)) { if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { Value *BallotArg = IntrCall->getArgOperand(0); if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) { - // Find dominated uses and replace with true - SmallVector DominatedUses; + // Find uses and replace with true + SmallVector Uses; for (Use &U : BallotArg->uses()) { if (auto *UseInst = dyn_cast(U.getUser())) { - if (UseInst != II && UseInst != IntrCall && + if (UseInst != IntrCall && isValidAssumeForContext(II, UseInst, &DT)) { - DominatedUses.push_back(&U); + Uses.push_back(&U); } } } - - // Replace dominated uses with true - for (Use *U : DominatedUses) { + + // Replace uses with true + for (Use *U : Uses) { U->set(ConstantInt::getTrue(BallotArg->getType())); Worklist.pushValue(U->getUser()); } - - if (!DominatedUses.empty()) { + + if (!Uses.empty()) { Worklist.pushValue(BallotArg); } } @@ -5079,5 +5081,3 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, Call.setCalledFunction(FTy, NestF); return &Call; } - - diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 0eb869dca0501..51f284c95ed73 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1034,9 +1034,9 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) { ret i1 %q } -; Test AMDGPU ballot pattern optimization +; Test AMDGPU ballot pattern optimization ; assume(ballot(cmp) == -1) means cmp is true 
on all active lanes
-; so dominated uses of cmp can be replaced with true
+; so uses of cmp can be replaced with true
 define void @assume_ballot_uniform(i32 %x) {
 ; CHECK-LABEL: @assume_ballot_uniform(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0

From 08355f9a05dec0d097e889cda76c435d7a5e3525 Mon Sep 17 00:00:00 2001
From: TejaX-Alaghari
Date: Tue, 7 Oct 2025 23:08:11 +0530
Subject: [PATCH 6/6] Address feedback on the location of the opt

- Remove redundant const propagation (assume equality opt) from InstCombine.
- Move the assume(ballot(cmp) == -1) optimization from InstCombine to GVN.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp       | 16 ++--
 .../InstCombine/InstCombineCalls.cpp            | 75 -------------------
 llvm/lib/Transforms/Scalar/GVN.cpp              | 18 +++++
 llvm/test/Transforms/GVN/assume-equal.ll        | 29 +++++++
 .../amdgpu-ballot-constant-fold.ll              | 56 --------------
 llvm/test/Transforms/InstCombine/assume.ll      | 31 +------
 6 files changed, 54 insertions(+), 171 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index fc4e64fcd52a1..4fe5d00679436 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1322,7 +1322,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (isa<PoisonValue>(Arg))
       return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
 
-    // For Wave32 targets, convert i64 ballot to i32 ballot + zext
+    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
+      if (Src->isZero()) {
+        // amdgcn.ballot(i1 0) is zero.
+        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+      }
+    }
     if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
       // %b64 = call i64 ballot.i64(...)
       // =>
@@ -1336,15 +1341,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       Call->takeName(&II);
       return IC.replaceInstUsesWith(II, Call);
     }
-
-    if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
-      if (Src->isZero()) {
-        // amdgcn.ballot(i1 0) is zero.
-        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
-      }
-      // Note: ballot(true) is NOT constant folded because the result depends
-      // on the active lanes in the wavefront, not just the condition value.
- } break; } case Intrinsic::amdgcn_wavefrontsize: { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d856b0559e421..e1e24a99d0474 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3540,81 +3540,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } - // Basic assume equality optimization: assume(x == c) -> replace uses of x - // with c - if (auto *ICmp = dyn_cast(IIOperand)) { - if (ICmp->getPredicate() == ICmpInst::ICMP_EQ) { - Value *LHS = ICmp->getOperand(0); - Value *RHS = ICmp->getOperand(1); - Value *Variable = nullptr; - Constant *ConstantVal = nullptr; - - if (auto *C = dyn_cast(RHS)) { - Variable = LHS; - ConstantVal = C; - } else if (auto *C = dyn_cast(LHS)) { - Variable = RHS; - ConstantVal = C; - } - - if (Variable && ConstantVal && Variable->hasUseList()) { - SmallVector Uses; - for (Use &U : Variable->uses()) { - if (auto *UseInst = dyn_cast(U.getUser())) { - if (UseInst != ICmp && - isValidAssumeForContext(II, UseInst, &DT)) { - Uses.push_back(&U); - } - } - } - - for (Use *U : Uses) { - U->set(ConstantVal); - Worklist.pushValue(U->getUser()); - } - - if (!Uses.empty()) { - Worklist.pushValue(Variable); - } - } - } - } - - // Optimize AMDGPU ballot patterns in assumes: - // assume(ballot(cmp) == -1) means cmp is true on all active lanes - // We can replace uses of cmp with true - Value *BallotInst; - if (match(IIOperand, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(BallotInst), - m_AllOnes()))) { - if (auto *IntrCall = dyn_cast(BallotInst)) { - if (IntrCall->getIntrinsicID() == Intrinsic::amdgcn_ballot) { - Value *BallotArg = IntrCall->getArgOperand(0); - if (BallotArg->getType()->isIntegerTy(1) && BallotArg->hasUseList()) { - // Find uses and replace with true - SmallVector Uses; - for (Use &U : BallotArg->uses()) { - if (auto *UseInst = dyn_cast(U.getUser())) { - if (UseInst != IntrCall && - isValidAssumeForContext(II, UseInst, &DT)) { - Uses.push_back(&U); - } - } - } - - // Replace uses with true - for (Use *U : Uses) { - U->set(ConstantInt::getTrue(BallotArg->getType())); - Worklist.pushValue(U->getUser()); - } - - if (!Uses.empty()) { - Worklist.pushValue(BallotArg); - } - } - } - } - } - // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. KnownBits Known(1); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index b9b5b5823d780..39727b1613653 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -2206,6 +2207,23 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) { std::swap(CmpLHS, CmpRHS); } + // Optimize AMDGPU ballot pattern: assume(ballot(cmp) == -1) or + // assume(ballot(cmp) == exec_mask). This implies cmp is true on all + // active lanes and hence can be replaced with true. 
+      if (isa<IntrinsicInst>(CmpLHS) && isa<Constant>(CmpRHS)) {
+        auto *IntrCall = cast<IntrinsicInst>(CmpLHS);
+        // Check if CmpLHS is a ballot intrinsic
+        if (IntrCall->getIntrinsicID() ==
+            Intrinsic::AMDGCNIntrinsics::amdgcn_ballot) {
+          Value *BallotArg = IntrCall->getArgOperand(0);
+          if (BallotArg->getType()->isIntegerTy(1) &&
+              (match(CmpRHS, m_AllOnes()) || !isa<ConstantInt>(CmpRHS))) {
+            CmpLHS = BallotArg;
+            CmpRHS = ConstantInt::getTrue(BallotArg->getType());
+          }
+        }
+      }
+
       // Handle degenerate case where we either haven't pruned a dead path or a
       // removed a trivial assume yet.
       if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
diff --git a/llvm/test/Transforms/GVN/assume-equal.ll b/llvm/test/Transforms/GVN/assume-equal.ll
index 0c922daf82b32..3eb10f4ab99e7 100644
--- a/llvm/test/Transforms/GVN/assume-equal.ll
+++ b/llvm/test/Transforms/GVN/assume-equal.ll
@@ -387,6 +387,35 @@ define i8 @assume_ptr_eq_same_prov(ptr %p, i64 %x) {
   ret i8 %v
 }
 
+; Test AMDGPU ballot pattern optimization
+; assume(ballot(cmp) == -1) means cmp is true on all active lanes
+; so uses of cmp can be replaced with true
+define void @assume_ballot_uniform(i32 %x) {
+; CHECK-LABEL: @assume_ballot_uniform(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
+; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]])
+; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
+; CHECK: foo:
+; CHECK-NEXT: ret void
+; CHECK: bar:
+; CHECK-NEXT: ret void
+;
+  %cmp = icmp eq i32 %x, 0
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %all = icmp eq i64 %ballot, -1
+  call void @llvm.assume(i1 %all)
+  br i1 %cmp, label %foo, label %bar
+
+foo:
+  ret void
+
+bar:
+  ret void
+}
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare noalias ptr @_Znwm(i64)
 declare void @_ZN1AC1Ev(ptr)
 declare void @llvm.assume(i1)
diff --git a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll b/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
deleted file mode 100644
index b146487af9990..0000000000000
--- a/llvm/test/Transforms/InstCombine/amdgpu-ballot-constant-fold.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S | FileCheck %s
-
-; Test cases for optimizing AMDGPU ballot intrinsics
-; Focus on constant folding ballot(false) -> 0 and poison handling
-
-; Test ballot with constant false condition gets folded
-define i32 @test_ballot_constant_false() {
-; CHECK-LABEL: @test_ballot_constant_false(
-; CHECK-NEXT: ret i32 0
-;
-  %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 false)
-  ret i32 %ballot
-}
-
-; Test ballot.i64 with constant false condition gets folded
-define i64 @test_ballot_i64_constant_false() {
-; CHECK-LABEL: @test_ballot_i64_constant_false(
-; CHECK-NEXT: ret i64 0
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 false)
-  ret i64 %ballot
-}
-
-; Test ballot with poison condition gets folded to poison
-define i64 @test_ballot_poison() {
-; CHECK-LABEL: @test_ballot_poison(
-; CHECK-NEXT: ret i64 poison
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 poison)
-  ret i64 %ballot
-}
-
-; Test that ballot(true) is NOT constant folded (depends on active lanes)
-define i64 @test_ballot_constant_true() {
-; CHECK-LABEL: @test_ballot_constant_true(
-; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; CHECK-NEXT: ret i64 [[BALLOT]]
-;
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-  ret i64 %ballot
-}
-
-; Test that ballot with variable condition is not optimized
-define i64 @test_ballot_variable_condition(i32 %x) {
-; CHECK-LABEL: @test_ballot_variable_condition(
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT: ret i64 [[BALLOT]]
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  ret i64 %ballot
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
-declare i32 @llvm.amdgcn.ballot.i32(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 51f284c95ed73..7b0b871513513 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -82,7 +82,7 @@ define i32 @simple(i32 %a) #1 {
 ; CHECK-LABEL: @simple(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT: ret i32 4
+; CHECK-NEXT: ret i32 [[A]]
 ;
   %cmp = icmp eq i32 %a, 4
   tail call void @llvm.assume(i1 %cmp)
@@ -1034,35 +1034,6 @@ define i1 @neg_assume_trunc_eq_one(i8 %x) {
   ret i1 %q
 }
 
-; Test AMDGPU ballot pattern optimization
-; assume(ballot(cmp) == -1) means cmp is true on all active lanes
-; so uses of cmp can be replaced with true
-define void @assume_ballot_uniform(i32 %x) {
-; CHECK-LABEL: @assume_ballot_uniform(
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[CMP]])
-; CHECK-NEXT: [[ALL:%.*]] = icmp eq i64 [[BALLOT]], -1
-; CHECK-NEXT: call void @llvm.assume(i1 [[ALL]])
-; CHECK-NEXT: br i1 true, label [[FOO:%.*]], label [[BAR:%.*]]
-; CHECK: foo:
-; CHECK-NEXT: ret void
-; CHECK: bar:
-; CHECK-NEXT: ret void
-;
-  %cmp = icmp eq i32 %x, 0
-  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
-  %all = icmp eq i64 %ballot, -1
-  call void @llvm.assume(i1 %all)
-  br i1 %cmp, label %foo, label %bar
-
-foo:
-  ret void
-
-bar:
-  ret void
-}
-
-declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare void @use(i1)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
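For illustration, here is a minimal sketch (not part of the patch or its tests) of the
follow-on cleanup the GVN change is meant to enable: once GVN has rewritten the branch
condition to true, a later SimplifyCFG run should be able to fold the branch and delete
the now-dead successor. The function name and the pipeline `opt -passes='gvn,simplifycfg'`
are illustrative assumptions; the exact output may differ.

; Illustrative only: assumed result of `opt -passes='gvn,simplifycfg'` on the
; uniform-ballot pattern exercised by @assume_ballot_uniform above.
define void @assume_ballot_uniform_cleaned(i32 %x) {
entry:
  %cmp = icmp eq i32 %x, 0
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %all = icmp eq i64 %ballot, -1
  call void @llvm.assume(i1 %all)
  ; GVN rewrites `br i1 %cmp, ...` to `br i1 true, ...`; SimplifyCFG is then
  ; expected to fold the branch, merge the taken block, and drop the dead one.
  ret void
}

declare i64 @llvm.amdgcn.ballot.i64(i1)
declare void @llvm.assume(i1)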