Commit 1ed9416

add tests for permanent layout
Signed-off-by: dchigarev <[email protected]>
1 parent 0d0047d commit 1ed9416

5 files changed: 348 additions & 0 deletions

mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir

Lines changed: 81 additions & 0 deletions
@@ -462,6 +462,47 @@ gpu.module @xevm_module{
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops_chunksize_perm_layout({{.*}}) {
// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
// CHECK-SAME: -> (vector<1x8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]] :
// CHECK-SAME: vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
// CHECK-NEXT: }
// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3 <{chunk_size = 8 : i64}>
// CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.module @xevm_module{
  gpu.func @scatter_ops_chunksize_perm_layout(%laneid: index, %src: memref<256xf16>) {
    gpu.warp_execute_on_lane_0(%laneid)[16] {
      %1 = arith.constant
        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
        dense<1>: vector<16xi1>
      %offset = arith.constant
        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
        dense<12> : vector<16xindex>
      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}>
        {
          layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
        }
        : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}>
        {
          layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>,
          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
        }
        : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
    }
    gpu.return
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
@@ -502,6 +543,46 @@ gpu.module @xevm_module{
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops_perm_layout({{.*}}) {
// CHECK: %[[OFFSETS:.*]] = arith.constant {{.*}} dense<12> : vector<16xindex>
// CHECK: %[[MASKS:.*]] = arith.constant {{.*}} dense<true> : vector<16xi1>
// CHECK: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[16]
// CHECK-SAME: -> (vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>) {
// CHECK: gpu.yield %{{.*}}, %{{.*}}, %[[OFFSETS]], %[[MASKS]]
// CHECK-SAME: : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
// CHECK-NEXT: }
// CHECK-NEXT: %[[T1:.*]] = xegpu.load %[[W]]#1[%[[W]]#2], %[[W]]#3
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
// CHECK-NEXT: xegpu.store %[[T1]], %[[W]]#1[%[[W]]#2], %[[W]]#3
// CHECK-SAME: : vector<1xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.module @xevm_module{
  gpu.func @scatter_ops_perm_layout(%src: memref<256xf16>, %laneid: index) {
    gpu.warp_execute_on_lane_0(%laneid)[16] {
      %1 = arith.constant
        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
        dense<1> : vector<16xi1>
      %offset = arith.constant
        {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>}
        dense<12> : vector<16xindex>
      %3 = xegpu.load %src[%offset], %1
        {
          layout_operand_1 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>
        } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
      xegpu.store %3, %src[%offset], %1
        {
          layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout_operand_2 = #xegpu.layout<lane_layout = [16], lane_data = [1]>,
          layout_operand_3 = #xegpu.layout<lane_layout = [16], lane_data = [1]>
        }
        : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
    }
    gpu.return
  }
}

// -----
// CHECK-LABEL: gpu.func @memref_extract_aligned_pointer_as_index(
// CHECK: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> (index, memref<256x256xf16>) {
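A note on the shapes these tests pin down: with chunk_size = 8, each of the 16 offsets addresses 8 consecutive f16 elements, so the subgroup-level load produces vector<16x8xf16>; under lane_layout = [16, 1] with lane_data = [1, 2], the 16 lanes tile the outer dimension, leaving each lane a 1x8 fragment, which is why the warp op above yields vector<1x8xf16> per lane. A minimal sketch of the distributed per-lane op, with placeholder operand names and shapes copied from the CHECK lines:

// Per-lane load after distribution: 16 lanes x (1x8 elements) covers the
// original vector<16x8xf16> payload. %lane_src, %lane_off, and %lane_mask
// are hypothetical per-lane values, not names from the test.
%frag = xegpu.load %lane_src[%lane_off], %lane_mask <{chunk_size = 8 : i64}>
    : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>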

mlir/test/Dialect/XeGPU/subgroup-distribute.mlir

Lines changed: 63 additions & 0 deletions
@@ -151,6 +151,43 @@ gpu.module @xevm_module{
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops_scf_yield_perm_layout
// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
// CHECK-NEXT: } else {
// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
// CHECK-NEXT: }
// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.module @xevm_module{
  gpu.func @scatter_ops_scf_yield_perm_layout(%src: memref<256xf16>, %pred : i1) {
    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
    %loaded = scf.if %pred -> (vector<16x8xf16>) {
      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
      }> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
      scf.yield %3 : vector<16x8xf16>
    } else {
      %3 = arith.constant {
        layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
      } dense<12.> : vector<16x8xf16>
      scf.yield %3 : vector<16x8xf16>
    } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
    xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
    gpu.return
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
@@ -177,6 +214,32 @@ gpu.module @xevm_module{
  }
}

// -----
// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield_perm_layout({{.*}}) {
// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
// CHECK: scf.if %[[PREDICATE]] {
// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
// CHECK-NEXT: }
gpu.module @xevm_module{
  gpu.func @scatter_ops_scf_non_yield_perm_layout(%src: memref<256xf16>) {
    %pred = llvm.mlir.poison : i1
    %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
    %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
    scf.if %pred {
      %3 = xegpu.load %src[%offset], %1 <{chunk_size=8,
        layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
      }> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
      xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
    }
    gpu.return
  }
}

// -----
// CHECK-LABEL: gpu.func @mma_transpose_b(
// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
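For the control-flow cases above, the expected output keeps the per-lane value 2-D (vector<1x8xf16>) while it crosses the scf.if boundary and only flattens it where the scattered store needs a 1-D vector; the two vector.shape_cast ops in the CHECK lines are that bridge. A minimal sketch of the boundary, with placeholder value names and shapes taken from the CHECK lines:

// Distributed scf.if yields the 2-D per-lane fragment; %then_val and
// %else_val stand in for the loaded and constant branches of the test.
%r = scf.if %pred -> (vector<1x8xf16>) {
  scf.yield %then_val : vector<1x8xf16>
} else {
  scf.yield %else_val : vector<1x8xf16>
}
// Flatten for the scattered store, which consumes a 1-D vector per lane.
%flat = vector.shape_cast %r : vector<1x8xf16> to vector<8xf16>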

mlir/test/Dialect/XeGPU/xegpu-blocking.mlir

Lines changed: 98 additions & 0 deletions
@@ -605,6 +605,26 @@ gpu.module @test_kernel {
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: load_with_offsets_perm_layout
  // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
  gpu.func @load_with_offsets_perm_layout(%src: ui64) -> vector<32xf32> {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>
    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>

    gpu.return %ld : vector<32xf32>
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: store_with_offsets
@@ -630,6 +650,31 @@ gpu.module @test_kernel {
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: store_with_offsets_perm_layout
  // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
  gpu.func @store_with_offsets_perm_layout(%src: ui64) {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>

    %st_vec = arith.constant dense<1023.0>: vector<32xf32>
    xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>,
      layout_operand_2 = #xegpu.layout<inst_data = [16]>,
      layout_operand_3 = #xegpu.layout<inst_data = [16]>,
      l1_hint = #xegpu.cache_hint<cached>} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>

    gpu.return
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: load_with_offsets_chunk
@@ -654,6 +699,30 @@ gpu.module @test_kernel {
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: load_with_offsets_chunk_perm_layout
  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
  // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
  gpu.func @load_with_offsets_chunk_perm_layout(%src: ui64) -> vector<32x4xf32> {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>
    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
    gpu.return %ld : vector<32x4xf32>
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: store_with_offsets_chunk
@@ -682,3 +751,32 @@ gpu.module @test_kernel {
    gpu.return
  }
}

// -----
gpu.module @test_kernel {
  // CHECK-LABEL: store_with_offsets_chunk_perm_layout
  // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32
  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
  // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
  gpu.func @store_with_offsets_chunk_perm_layout(%src: ui64) {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>

    %st_vec = arith.constant dense<1023.>: vector<32x4xf32>
    xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>,
      layout_operand_2 = #xegpu.layout<inst_data = [16, 2]>,
      layout_operand_3 = #xegpu.layout<inst_data = [16, 2]>,
      l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
    gpu.return
  }
}
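The CHECK-COUNT values in these blocking tests follow from simple tiling arithmetic (a derivation, not text from the diff): inst_data = [16] splits a 32-element access into 32/16 = 2 instructions, hence CHECK-COUNT-2, while inst_data = [16, 2] with chunk_size = 4 splits a 32x4 access into (32/16) x (4/2) = 4 instructions, hence CHECK-COUNT-4. The four dense<[...]> offset constants are the per-tile slices of the 32-element offset vector: [0, 8, ..., 120] and [128, ..., 248] are its two halves, and the tiles shifted by 2 address the second half of each 4-wide chunk. One of the four unrolled loads would look like the sketch below, with placeholder operand names and attributes copied from the CHECK-COUNT-4 line:

// A single 16x2 tile of the original 32x4 gather; %off_tile and %mask_tile
// are hypothetical per-tile slices of %cst and %mask.
%tile = xegpu.load %src[%off_tile], %mask_tile <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}>
    : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>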

mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir

Lines changed: 45 additions & 0 deletions
@@ -231,6 +231,28 @@ gpu.module @test {
    gpu.return %ld : vector<32xf32>
  }

  //-----

  // CHECK-LABEL: load_with_offsets_perm_layout
  // CHECK-SAME: [[arg0:%.+]]: ui64
  // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
  gpu.func @load_with_offsets_perm_layout(%src: ui64) -> vector<32xf32> {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>
    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>

    gpu.return %ld : vector<32xf32>
  }

  //-----

  // CHECK-LABEL: prefetch
@@ -385,6 +407,29 @@ gpu.module @test {
    gpu.return %ld : vector<32x4xf32>
  }

  //-----
  // CHECK-LABEL: load_with_offsets_chunk_perm_layout
  // CHECK-SAME: [[arg0:%.+]]: ui64
  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
  // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
  gpu.func @load_with_offsets_chunk_perm_layout(%src: ui64) -> vector<32x4xf32> {
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    %c17 = arith.constant 17: index
    %mask = vector.create_mask %c17: vector<32xi1>
    %ld = xegpu.load %src[%cst], %mask <{chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>
    gpu.return %ld : vector<32x4xf32>
  }

  //-----
  // CHECK-LABEL: store_chunk
  // CHECK-SAME: [[arg0:%.+]]: ui64
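One detail worth noting about the masks in these unroll tests (an inference from vector.create_mask semantics, not something the CHECK lines verify): %mask = vector.create_mask %c17 enables the first 17 of 32 lanes, so after splitting with inst_data = [16] the first 16-wide piece is fully active and the second piece has exactly one active lane. Conceptually the per-piece masks look like the sketch below; how the pass actually materializes them may differ:

// Hypothetical per-piece masks for the 17-active-lane case.
%c16 = arith.constant 16 : index
%c1 = arith.constant 1 : index
%m0 = vector.create_mask %c16 : vector<16xi1>  // lanes 0-15 all active
%m1 = vector.create_mask %c1 : vector<16xi1>   // only the first lane (global lane 16)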
