Commit ae77f77

Merge branch 'main' into gregory/windows-support
2 parents: 62774dc + a3adef5

57 files changed (+1247, -1203 lines)


.github/actions/setup-pytorch/action.yml

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ runs:
     uses: ./.github/actions/load
     env:
       # Increase this value to reset cache
-      CACHE_NUMBER: 11
+      CACHE_NUMBER: 12
     with:
       path: pytorch
       key: pytorch-$PYTORCH_CACHE_KEY-$CACHE_NUMBER

.github/pins/pytorch-upstream.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-8321eec009c8c79145ebccd51fdfc336e5f8b848
+487873f7cafeb0fd390eaefe40496b804bceabbd

.github/workflows/integration-tests.yml

Lines changed: 1 addition & 2 deletions
@@ -10,8 +10,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]

.github/workflows/integration-tests.yml.in

Lines changed: 1 addition & 2 deletions
@@ -9,8 +9,7 @@ name: Integration Tests
 on:
   workflow_dispatch:
   pull_request:
-    # You can name your branch dev-foo to get CI runs.
-    branches: [main, 'dev-**']
+    branches-ignore: ['llvm-**']
   merge_group:
     branches: [main, 'dev-**']
     types: [checks_requested]

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ python/triton/language/extra
 # Proton
 python/triton/profiler
 
+# Instrumentation
+python/triton/instrumentation
+
 # Python caches
 __pycache__/
 *.py[cod]

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 3 additions & 2 deletions
@@ -171,7 +171,7 @@ def forward(q, k, v, causal, sm_scale):
     assert Lk in {16, 32, 64, 128}
     o = torch.empty_like(q, dtype=torch.float32)
     BLOCK_M = 128
-    BLOCK_N = 64 if Lk <= 64 else 32
+    BLOCK_N = 64
     num_stages = 3
     num_warps = 8 if Lq == 64 else 16
     stage = 3 if causal else 1
@@ -205,7 +205,8 @@ def forward(q, k, v, causal, sm_scale):
         BLOCK_DMODEL=Lk,  #
         STAGE=stage,  #
         num_warps=num_warps,  #
-        num_stages=num_stages  #
+        num_stages=num_stages,  #
+        grf_mode='large',  #
     )
     return o
 

bin/RegisterTritonDialects.h

Lines changed: 0 additions & 2 deletions
@@ -91,8 +91,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUStreamPipeline();
   mlir::registerTritonAMDGPUStreamPipelineV2();
   mlir::registerTritonAMDGPUCanonicalizePointers();
-  mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
-  mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
 #endif
 
   // TODO: register Triton & TritonGPU passes

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 5 additions & 4 deletions
@@ -88,10 +88,11 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
       // encoding not available
       return resultVals;
     Attribute baseEncoding = encoding;
-    if (isa<AMDMfmaEncodingAttr>(baseEncoding))
-      // TODO: this logic seems incorrect for mfma layout. Skip for now.
-      // We saw mismatches for some flash-attention tests on AMD backend.
-      // Note that this logic works for sliced layout whose parent is
+    if (isa<AMDMfmaEncodingAttr>(baseEncoding) ||
+        isa<AMDWmmaEncodingAttr>(baseEncoding))
+      // TODO: this logic seems incorrect for mfma and wmma layout. Skip for
+      // now. We saw mismatches for some flash-attention and dot tests on AMD
+      // backend. Note that this logic works for sliced layout whose parent is
       // mfma layout. Therefore, this is not combined with the following check.
       return resultVals;
     while (auto sliced = dyn_cast<SliceEncodingAttr>(baseEncoding))
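The reworked comment is terse about why the MFMA/WMMA early return is kept separate from the slice-peeling loop that follows it, so here is a small illustrative sketch: a sliced layout whose parent is MFMA must not be rejected up front, because the reordering logic is known to work once the slice is unwrapped to its parent. The encoding value below is hypothetical and only stands in for that case.

// Illustration only; `enc` is a hypothetical sliced-of-MFMA encoding.
Attribute enc = sliceOfMfmaEncoding;
// The early return above does not fire: `enc` itself is a SliceEncodingAttr,
// not an AMDMfmaEncodingAttr or AMDWmmaEncodingAttr.
while (auto sliced = dyn_cast<SliceEncodingAttr>(enc))
  enc = sliced.getParent(); // unwrap to the MFMA parent and continue below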

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 22 deletions
@@ -27,33 +27,15 @@ constexpr int patternBenefitPrioritizeOverLLVMConversions = 10;
 constexpr int patternBenefitClampOptimizedPattern = 20;
 constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
 
-struct BackendCallbacks {
-  /**
-   * A backend-specific callback for appending auxiliary data during
-   * `LocalStoreOp` conversion.
-   *
-   * @param[in] op The reference to the re-written `LocalStoreOp`.
-   * @param[in] count The number of issued LLVM instructions.
-   * @param[in] type The input type of issued LLVM instructions.
-   */
-  std::function<void(triton::gpu::LocalStoreOp op, size_t llvmOpCount,
-                     Type llvmOpType)>
-      localStoreOpConversion = nullptr;
-};
-
 void populateElementwiseOpToLLVMPatterns(
     LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
     ModuleAxisInfoAnalysis &axisInfoAnalysis, const TargetInfoBase &targetInfo,
     PatternBenefit benefit);
 
-// The given callback is invoked at the end of a successful rewrite. The
-// callback receives 1) the current source op, 2) the number of issued LLVM
-// instructions and 3) their input types. Each MLIR backend can provide a
-// callback and, thus, handle backend-specific behaviors.
-void populateMemoryOpToLLVMPattern(
-    LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
-    RewritePatternSet &patterns, PatternBenefit benefit,
-    std::optional<BackendCallbacks> backendCallbacks = std::nullopt);
+void populateMemoryOpToLLVMPattern(LLVMTypeConverter &typeConverter,
+                                   const TargetInfoBase &targetInfo,
+                                   RewritePatternSet &patterns,
+                                   PatternBenefit benefit);
 
 void populateAssertOpToLLVMPattern(LLVMTypeConverter &typeConverter,
                                    RewritePatternSet &patterns,
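For context on what this removal drops: the deleted comments describe a per-backend hook that fired after each successful LocalStoreOp rewrite and reported how many LLVM instructions were issued and with what input type. Below is a minimal sketch, written against the pre-change header, of how a backend could have supplied that hook; the wrapper function and the lambda body are hypothetical, and only the BackendCallbacks struct and the populateMemoryOpToLLVMPattern signature come from the deleted lines.

// Hypothetical wiring of the removed BackendCallbacks hook (sketch only).
void populateMemoryPatternsWithInstrumentation(
    LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
    RewritePatternSet &patterns, PatternBenefit benefit) {
  BackendCallbacks callbacks;
  // Called after each successful LocalStoreOp rewrite with the count and the
  // input type of the LLVM instructions that were emitted for it.
  callbacks.localStoreOpConversion = [](triton::gpu::LocalStoreOp op,
                                        size_t llvmOpCount, Type llvmOpType) {
    // e.g. accumulate per-op instruction counts for backend-specific tuning.
  };
  populateMemoryOpToLLVMPattern(typeConverter, targetInfo, patterns, benefit,
                                callbacks);
}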

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 5 additions & 5 deletions
@@ -1366,11 +1366,11 @@ SmallVector<Value> loadSharedToDistributed(RankedTensorType dstTy,
                                            Location loc, RewriterBase &rewriter,
                                            const TargetInfoBase &target);
 
-void storeDistributedToShared(
-    MemDescType dstTy, RankedTensorType srcTy, Type elemLlvmTy,
-    ArrayRef<Value> srcVals, Value smemBase, ArrayRef<Value> dstStrides,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    std::pair<size_t, Type> *const llvmOpCount = nullptr);
+void storeDistributedToShared(MemDescType dstTy, RankedTensorType srcTy,
+                              Type elemLlvmTy, ArrayRef<Value> srcVals,
+                              Value smemBase, ArrayRef<Value> dstStrides,
+                              Location loc, RewriterBase &rewriter,
+                              const TargetInfoBase &target);
 
 inline Value getStructFromSharedMemoryObject(Location loc,
                                              const SharedMemoryObject &smemObj,
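The llvmOpCount out-parameter deleted here is the other half of the callback plumbing removed from PatternTritonGPUOpToLLVM.h above: the shared-memory store lowering could report how many LLVM ops it emitted and of which type. A rough sketch of how the two removed pieces plausibly fit together at a LocalStoreOp conversion site follows; this is an assumption for illustration, as the actual call site is not part of this diff.

// Assumed call-site shape, not taken from this commit.
std::pair<size_t, Type> llvmOpCount{0, Type()};
storeDistributedToShared(dstTy, srcTy, elemLlvmTy, srcVals, smemBase,
                         dstStrides, loc, rewriter, targetInfo, &llvmOpCount);
if (callbacks && callbacks->localStoreOpConversion)
  callbacks->localStoreOpConversion(op, llvmOpCount.first, llvmOpCount.second);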
