[PROTON][AMD] Fix failing proton tests for AMD GPUs (#8763)

ZelboK · danial javady · anmyachev · commit 7f153c3af65f · 2025-12-01T10:59:17.000Z
Fixes the Proton tests that broke after the upgrade to ROCm 7, and implements CircularStoreOp for global memory (gmem).

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [ ] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [ ] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [x] I have not added any `lit` tests.
  - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)

---------

Co-authored-by: danial javady <djavady@amd.com>
diff --git a/third_party/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AMDPatternProtonGPUOpToLLVM.cpp b/third_party/proton/Dialect/lib/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/AMDPatternProtonGPUOpToLLVM.cpp
@@ -2,8 +2,10 @@
 #include "Conversion/ProtonGPUToLLVM/ProtonAMDGPUToLLVM/TargetInfo.h"
 #include "Conversion/ProtonGPUToLLVM/Utility.h"
 #include "Dialect/ProtonGPU/IR/Dialect.h"
+#include "amd/lib/TritonAMDGPUToLLVM/Utility.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/PatternMatch.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 
@@ -37,7 +39,8 @@ struct CircularStoreOpConversion
       // TODO(crobeck): see what buffer ops performance looks like here for
       // global mem (address space 1) compared to predicated ops to shared
       // memory
-      llvm::report_fatal_error("unimplemented");
+      mlir::LLVM::AMD::llStore(rewriter, loc, dataPack.ptr, dataPack.record,
+                               dataPack.isWriter);
     } else if (addrSpace == 3) {
       targetInfo.getTritonTargetInfo().storeDShared(
           rewriter, loc, dataPack.ptr, std::nullopt, dataPack.record,
diff --git a/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp b/third_party/proton/csrc/lib/Profiler/RocTracer/RoctracerProfiler.cpp
@@ -350,19 +350,22 @@ void RoctracerProfiler::RoctracerProfilerPimpl::activityCallback(
     // data on stop
     maxCorrelationId =
         std::max<uint64_t>(maxCorrelationId, record->correlation_id);
-    // TODO(Keren): Roctracer doesn't support cuda graph yet.
+    bool hasCorrelation =
+        correlation.corrIdToExternId.contain(record->correlation_id);
     auto externId =
-        correlation.corrIdToExternId.contain(record->correlation_id)
+        hasCorrelation
             ? correlation.corrIdToExternId.at(record->correlation_id).first
             : Scope::DummyScopeId;
     auto isAPI = correlation.apiExternIds.contain(externId);
     bool isGraph = pImpl->CorrIdToIsHipGraph.contain(record->correlation_id);
-    processActivity(correlation.corrIdToExternId, correlation.apiExternIds,
-                    externId, dataSet, record, isAPI, isGraph);
-    // Track correlation ids from the same stream and erase those <
-    // correlationId
-    correlation.corrIdToExternId.erase(record->correlation_id);
-    correlation.apiExternIds.erase(externId);
+    if (hasCorrelation) {
+      processActivity(correlation.corrIdToExternId, correlation.apiExternIds,
+                      externId, dataSet, record, isAPI, isGraph);
+      // Track correlation ids from the same stream and erase those <
+      // correlationId
+    } else {
+      correlation.apiExternIds.erase(externId);
+    }
     roctracer::getNextRecord<true>(record, &record);
   }
   correlation.complete(maxCorrelationId);
diff --git a/third_party/proton/test/test_instrumentation.py b/third_party/proton/test/test_instrumentation.py
@@ -15,7 +15,6 @@
     is_cuda,
     is_hip,
     is_hip_cdna2,
-    is_hip_cdna4,
     supports_tma,
     supports_ws,
 )
@@ -644,7 +643,6 @@ def foo(x, y, size: tl.constexpr):
         assert trace_events[-1]["args"]["call_stack"][-2] == "test"
 
 
-@pytest.mark.skipif(is_hip_cdna4(), reason="nondeterministic failure")
 def test_globaltime(tmp_path: pathlib.Path):
     temp_file = tmp_path / "test_globaltime.chrome_trace"
     mode = proton.mode.Default(
@@ -760,7 +758,6 @@ def session_kernel_time(session_name: str) -> Tuple[int, int]:
     assert session1_loop_time / session0_loop_time < loop_threshold, "Loop kernel overhead too high"
 
 
-@pytest.mark.skipif(is_hip(), reason="not implemented yet")
 def test_gmem_buffer(tmp_path: pathlib.Path):
 
     @triton.jit
diff --git a/third_party/proton/test/test_profile.py b/third_party/proton/test/test_profile.py
@@ -80,7 +80,6 @@ def foo(x, y):
     assert data[0]["children"][1]["frame"]["name"] == "test2"
 
 
-@pytest.mark.skipif(is_hip(), reason="Currently broken after updating to ROCm 7")
 def test_cudagraph(tmp_path: pathlib.Path, device: str):
     if is_xpu():
         pytest.skip("xpu doesn't support cudagraph; FIXME: double check")