Skip to content

Commit 3ca2a37

Browse files
authored
[LLVM Pulldown] Bump to LLVM revision 2cde7ac1ba256deb757946c449b790468ad0c45e (#1153)
1 parent 2ebf458 commit 3ca2a37

File tree

5 files changed

+21
-58
lines changed

build_tools/llvm_version.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
928505c98345e0494f2d260788adb291efc9ee38
1+
2cde7ac1ba256deb757946c449b790468ad0c45e

build_tools/patches/0010-Extend-xegpu-op-fence-def.patch

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,18 @@
1-
From cfb25d2f76986fb2566b3a26fd82c3f152d88a8a Mon Sep 17 00:00:00 2001
1+
From 4c8eb35b04647e292bc129731c56487d277d8600 Mon Sep 17 00:00:00 2001
22
From: Garra1980 <igor.zamyatin@intel.com>
3-
Date: Wed, 30 Apr 2025 11:24:51 -0500
4-
Subject: [PATCH] Extend xegpu op fence def
3+
Date: Mon, 9 Mar 2026 01:35:37 +0100
4+
Subject: [PATCH] Extend xegpu op fence definition
55

66
---
77
.../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 24 +++++++++++++++++--
88
.../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 5 ++--
99
2 files changed, 25 insertions(+), 4 deletions(-)
1010

1111
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
12-
index f1bed70253ef..401043e590b1 100644
12+
index ce0cce65373e..ed2f97952f46 100644
1313
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
1414
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
15-
@@ -138,14 +138,34 @@ def XeGPU_CacheHintAttr
15+
@@ -159,14 +159,34 @@ def XeGPU_CacheHintAttr
1616
}
1717

1818
def XeGPU_FenceScopeWorkgroup: I32EnumAttrCase<"Workgroup", 0, "workgroup">;
@@ -50,18 +50,18 @@ index f1bed70253ef..401043e590b1 100644
5050
EnumAttr<XeGPU_Dialect, XeGPU_FenceScope, "fence_scope"> {
5151
let summary = [{Describes the scope of fence.
5252
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
53-
index 9b001a78d6fe..01c5357e33ca 100644
53+
index 48737352497e..4a5a3e8cfa76 100644
5454
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
5555
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
56-
@@ -976,8 +976,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
56+
@@ -1489,8 +1489,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
5757
within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
5858
}];
5959
let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind,
6060
- XeGPU_FenceScopeAttr: $fence_scope);
61-
- let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
61+
- let assemblyFormat = [{`memory_kind` `=` $memory_kind `,` `fence_scope` `=` $fence_scope attr-dict}];
6262
+ XeGPU_FenceScopeAttr: $fence_scope,
6363
+ OptionalAttr<XeGPU_FenceOpFlushAttr>: $fence_op_flush);
64-
+ let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope (`,` `fence_op_flush` `=` `` $fence_op_flush^)? attr-dict}];
64+
+ let assemblyFormat = [{`memory_kind` `=` $memory_kind `,` `fence_scope` `=` $fence_scope (`,` `fence_op_flush` `=` $fence_op_flush^)? attr-dict}];
6565
let extraClassDeclaration = extraBaseClassDeclaration;
6666
}
6767

build_tools/patches/wg_fa_support.patch

Lines changed: 0 additions & 37 deletions
This file was deleted.

lib/Conversion/XeGPUToVC/XeGPUToVC.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -764,7 +764,7 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase<XeGPUToVCPass> {
764764

765765
populateMathToVCPatterns(typeConverter, patterns);
766766

767-
mlir::vector::populateVectorMultiReductionReorderAndExpandPatterns(
767+
mlir::vector::populateVectorMultiReductionReorderPatterns(
768768
patterns, mlir::vector::VectorMultiReductionLowering::InnerReduction);
769769
mlir::vector::populateVectorMultiReductionFlatteningPatterns(
770770
patterns, mlir::vector::VectorMultiReductionLowering::InnerReduction);

test/Integration/Dialect/XeGPU/WG/flash_attention_fwd.mlir

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -6,12 +6,14 @@
66
#q = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64], inst_data = [8, 16]>
77
#k = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64], inst_data = [16, 16]>
88
#v = #k
9-
#kt = #xegpu.layout<sg_layout = [1, 8], sg_data = [64, 16], inst_data = [16, 16]>
9+
#kt = #xegpu.layout<sg_layout = [1, 8], sg_data = [64, 16], inst_data = [16, 16], order = [0, 1]>
1010
#k_prefetch = #xegpu.layout<sg_layout = [4, 2], sg_data = [16, 32], inst_data = [16, 16]>
1111
#v_prefetch = #k_prefetch
1212
#out = #q
13+
#out_t = #xegpu.layout<sg_layout = [1, 8], sg_data = [64, 16], inst_data = [16, 8], order = [0, 1]>
1314
#layout_128x1 = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 1], inst_data = [8, 1]>
14-
#layout_128x16 = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 16], inst_data = [8, 16]>
15+
#layout_128x16 = #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 16], inst_data = [8, 16] >
16+
#layout_128x16_t = #xegpu.layout<sg_layout = [1, 8], sg_data = [16, 16], inst_data = [16, 8], order = [0, 1]>
1517
#layout_128 = #xegpu.layout<sg_layout = [8], sg_data = [16], inst_data = [8]>
1618
module @flash_attention attributes {gpu.container_module} {
1719
gpu.module @flash_attention_fwd {
@@ -162,7 +164,6 @@ module @flash_attention attributes {gpu.container_module} {
162164
%qk_out_max_t3 = vector.multi_reduction <maximumf>, %qk_out_max_t2, %minus_inf_128
163165
{layout_result_0 = #xegpu.slice<#layout_128x16, dims = [1]>}
164166
[1] : vector<128x16xf32> to vector<128xf32>
165-
// %qk_out_max = vector.shape_cast %qk_out_max_t3 {layout_result_0 = #layout_128x1} : vector<128xf32> to vector<128x1xf32>
166167

167168
// Scale
168169
%qk_out_max_scaled = arith.mulf %qk_out_max_t3, %qk_scale_128 {layout_result_0 = #layout_128} : vector<128xf32>
@@ -174,8 +175,8 @@ module @flash_attention attributes {gpu.container_module} {
174175
%qk_out_2_scaled = arith.mulf %qk_out_2, %qk_scale_128x16 {layout_result_0 = #layout_128x16} : vector<128x16xf32>
175176
%qk_out_3_scaled = arith.mulf %qk_out_3, %qk_scale_128x16 {layout_result_0 = #layout_128x16} : vector<128x16xf32>
176177
// Broadcast m_ij_row to 128x16
177-
%m_ij_row_broadcasted0 = vector.shape_cast %m_ij_row {layout_result_0 = #layout_128x1, layout_operand_0 = #xegpu.slice<#layout_128x1, dims=[1]>} : vector<128xf32> to vector<128x1xf32>
178-
%m_ij_row_broadcasted = vector.broadcast %m_ij_row_broadcasted0 {layout_result_0 = #layout_128x16} : vector<128x1xf32> to vector<128x16xf32>
178+
%m_ij_row_broadcasted0 = vector.broadcast %m_ij_row {layout_result_0 = #layout_128x16_t} : vector<128xf32> to vector<16x128xf32>
179+
%m_ij_row_broadcasted = vector.transpose %m_ij_row_broadcasted0, [1, 0] {layout_result_0 = #layout_128x16} : vector<16x128xf32> to vector<128x16xf32>
179180
// Center qk_out by m_ij_row
180181
%qk_out_0_centered = arith.subf %qk_out_0_scaled, %m_ij_row_broadcasted {layout_result_0 = #layout_128x16} : vector<128x16xf32>
181182
%qk_out_1_centered = arith.subf %qk_out_1_scaled, %m_ij_row_broadcasted {layout_result_0 = #layout_128x16} : vector<128x16xf32>
@@ -193,16 +194,15 @@ module @flash_attention attributes {gpu.container_module} {
193194
%l_ij_row_t3 = vector.multi_reduction <add>, %l_ij_row_t2, %zero_128
194195
{layout_result_0 = #xegpu.slice<#layout_128x16, dims = [1]>}
195196
[1] : vector<128x16xf32> to vector<128xf32>
196-
// %l_ij_row = vector.shape_cast %l_ij_row_t3 {layout_result_0 = #layout_128x1} : vector<128xf32> to vector<128x1xf32>
197197
// Compute alpha
198198
%alpha_row_t1 = arith.subf %m_i_row, %m_ij_row {layout_result_0 = #layout_128} : vector<128xf32>
199199
%alpha_row = math.exp %alpha_row_t1 fastmath<fast> {layout_result_0 = #layout_128} : vector<128xf32>
200200
// Update l_i
201201
%l_i_row_new_t1 = arith.mulf %l_i_row, %alpha_row {layout_result_0 = #layout_128} : vector<128xf32>
202202
%l_i_row_new = arith.addf %l_i_row_new_t1, %l_ij_row_t3 {layout_result_0 = #layout_128} : vector<128xf32>
203203
// Update acc
204-
%alpha_row_broadcasted0 = vector.shape_cast %alpha_row {layout_result_0 = #layout_128x1, layout_operand_0 = #xegpu.slice<#layout_128x1, dims=[1]>} : vector<128xf32> to vector<128x1xf32>
205-
%alpha_row_broadcasted = vector.broadcast %alpha_row_broadcasted0 {layout_result_0 = #out} : vector<128x1xf32> to vector<128x64xf32>
204+
%alpha_row_broadcasted0 = vector.broadcast %alpha_row {layout_result_0 = #out_t} : vector<128xf32> to vector<64x128xf32>
205+
%alpha_row_broadcasted = vector.transpose %alpha_row_broadcasted0, [1, 0] {layout_result_0 = #out} : vector<64x128xf32> to vector<128x64xf32>
206206
%acc_in_updated = arith.mulf %acc_in, %alpha_row_broadcasted {layout_result_0 = #out} : vector<128x64xf32>
207207

208208
// Convert qk_out_tile to DPAS-A precision for P*V computation.
@@ -234,8 +234,8 @@ module @flash_attention attributes {gpu.container_module} {
234234
scf.yield %pv_out_iter3, %m_ij_row, %l_i_row_new : vector<128x64xf32>, vector<128xf32>, vector<128xf32>
235235
} {layout_result_0 = #out, layout_result_1 = #layout_128, layout_result_2 = #layout_128}// end of inner loop
236236
// Divide acc output by l_i
237-
%l_i_row_broadcast0 = vector.shape_cast %result#2 {layout_result_0 = #layout_128x1, layout_operand_0 = #xegpu.slice<#layout_128x1, dims=[0]>} : vector<128xf32> to vector<128x1xf32>
238-
%l_i_row_broadcast = vector.broadcast %l_i_row_broadcast0 {layout_result_0 = #out} : vector<128x1xf32> to vector<128x64xf32>
237+
%l_i_row_broadcast0 = vector.broadcast %result#2 {layout_result_0 = #out_t} : vector<128xf32> to vector<64x128xf32>
238+
%l_i_row_broadcast = vector.transpose %l_i_row_broadcast0, [1, 0] {layout_result_0 = #out} : vector<64x128xf32> to vector<128x64xf32>
239239
%o_val_final_t = arith.divf %result#0, %l_i_row_broadcast {layout_result_0 = #out} : vector<128x64xf32>
240240
// Store output tile.
241241
%o_val_final = arith.truncf %o_val_final_t {layout_result_0 = #out} : vector<128x64xf32> to vector<128x64xf16>

0 commit comments

Comments
 (0)