@@ -605,6 +605,26 @@ gpu.module @test_kernel {
605605 }
606606}
607607
608+ // -----
gpu.module @test_kernel {
  // Scattered load of 32 f32 elements whose layout property requests
  // inst_data = [16]. The expectations below require two 16-lane loads
  // whose property dictionary lists only chunk_size and the l1 cache hint.
  // CHECK-LABEL: load_with_offsets_perm_layout
  // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
  gpu.func @load_with_offsets_perm_layout(%src: ui64) -> vector<32xf32> {
    // 32 per-lane offsets: 0, 8, ..., 248 (stride 8).
    %offsets = arith.constant dense<[
        0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
        128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    // Mask built with bound 17, so it is not aligned to the 16-lane
    // instruction tile.
    %num_active = arith.constant 17 : index
    %pred = vector.create_mask %num_active : vector<32xi1>

    %loaded = xegpu.load %src[%offsets], %pred <{chunk_size = 1, layout = #xegpu.layout<inst_data = [16]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32>

    gpu.return %loaded : vector<32xf32>
  }
}
627+
608628// -----
609629gpu.module @test_kernel {
610630 // CHECK-LABEL: store_with_offsets
@@ -630,6 +650,31 @@ gpu.module @test_kernel {
630650 }
631651}
632652
653+ // -----
gpu.module @test_kernel {
  // Scattered store of 32 f32 elements annotated with inst_data = [16]
  // layouts. The expectations below require two 16-lane stores whose
  // property dictionary lists only chunk_size and the l1 cache hint.
  // CHECK-LABEL: store_with_offsets_perm_layout
  // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1>
  gpu.func @store_with_offsets_perm_layout(%src: ui64) {
    // 32 per-lane offsets: 0, 8, ..., 248 (stride 8).
    %offsets = arith.constant dense<[
        0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
        128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    // Mask built with bound 17, so it is not aligned to the 16-lane
    // instruction tile.
    %num_active = arith.constant 17 : index
    %pred = vector.create_mask %num_active : vector<32xi1>

    // Splat payload to be written.
    %payload = arith.constant dense<1023.0> : vector<32xf32>
    xegpu.store %payload, %src[%offsets], %pred {
        chunk_size = 1,
        layout = #xegpu.layout<inst_data = [16]>,
        layout_operand_2 = #xegpu.layout<inst_data = [16]>,
        layout_operand_3 = #xegpu.layout<inst_data = [16]>,
        l1_hint = #xegpu.cache_hint<cached>
    } : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1>

    gpu.return
  }
}
677+
633678// -----
634679gpu.module @test_kernel {
635680 // CHECK-LABEL: load_with_offsets_chunk
@@ -654,6 +699,30 @@ gpu.module @test_kernel {
654699 }
655700}
656701
702+ // -----
gpu.module @test_kernel {
  // Chunked scattered load (chunk_size = 4, result 32x4) whose layout
  // property requests inst_data = [16, 2]. The expectations below require
  // four 16x2 loads with chunk_size = 2; the expected offset constants are
  // the two 16-lane halves of the original offsets, plus the same halves
  // shifted by 2.
  // CHECK-LABEL: load_with_offsets_chunk_perm_layout
  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32>
  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
  // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32>
  gpu.func @load_with_offsets_chunk_perm_layout(%src: ui64) -> vector<32x4xf32> {
    // 32 per-lane offsets: 0, 8, ..., 248 (stride 8).
    %offsets = arith.constant dense<[
        0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
        128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    // Mask built with bound 17, so it is not aligned to the 16-lane
    // instruction tile.
    %num_active = arith.constant 17 : index
    %pred = vector.create_mask %num_active : vector<32xi1>

    %loaded = xegpu.load %src[%offsets], %pred <{chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32>

    gpu.return %loaded : vector<32x4xf32>
  }
}
725+
657726// -----
658727gpu.module @test_kernel {
659728 // CHECK-LABEL: store_with_offsets_chunk
@@ -682,3 +751,32 @@ gpu.module @test_kernel {
682751 gpu.return
683752 }
684753}
754+
755+ // -----
gpu.module @test_kernel {
  // Chunked scattered store (chunk_size = 4, value 32x4) annotated with
  // inst_data = [16, 2] layouts. The expectations below require four 16x2
  // stores with chunk_size = 2; the expected offset constants are the two
  // 16-lane halves of the original offsets, plus the same halves shifted
  // by 2.
  // CHECK-LABEL: store_with_offsets_chunk_perm_layout
  // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32>
  // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex>
  // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex>
  // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
  // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
  // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint<cached>}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1>
  gpu.func @store_with_offsets_chunk_perm_layout(%src: ui64) {
    // 32 per-lane offsets: 0, 8, ..., 248 (stride 8).
    %cst = arith.constant dense<[
      0, 8, 16, 24, 32, 40, 48, 56,
      64, 72, 80, 88, 96, 104, 112, 120,
      128, 136, 144, 152, 160, 168, 176, 184,
      192, 200, 208, 216, 224, 232, 240, 248
    ]> : vector<32xindex>

    // Mask built with bound 17, so it is not aligned to the 16-lane
    // instruction tile.
    %c17 = arith.constant 17 : index
    %mask = vector.create_mask %c17 : vector<32xi1>

    // Splat payload; written as 1023.0 to match the non-chunked store test.
    %st_vec = arith.constant dense<1023.0> : vector<32x4xf32>
    xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout<inst_data = [16, 2]>,
                                            layout_operand_2 = #xegpu.layout<inst_data = [16, 2]>,
                                            layout_operand_3 = #xegpu.layout<inst_data = [16, 2]>,
                                            l1_hint = #xegpu.cache_hint<cached>} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1>
    gpu.return
  }
}
0 commit comments