@@ -682,3 +682,77 @@ gpu.module @test_kernel {
     gpu.return
   }
 }
+
+// -----
+gpu.module @test_kernel {
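+  // Blocking to inst_data [1, 1, 16] drops the unit dims: the 1x1x32 chunked
+  // load is unrolled into two 16-element 1-D loads re-inserted into the result.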
+  // CHECK-LABEL: remove_unit_dim_inst_data
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
+  // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
+  // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  gpu.func @remove_unit_dim_inst_data(%src: ui64) -> vector<1x1x32xf32> {
+    %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+        [0,   8,   16,  24,  32,  40,  48,  56,
+         64,  72,  80,  88,  96,  104, 112, 120,
+         128, 136, 144, 152, 160, 168, 176, 184,
+         192, 200, 208, 216, 224, 232, 240, 248]
+      ]]> : vector<1x1x32xindex>
+
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+    %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>
+
+    gpu.return %ld : vector<1x1x32xf32>
+  }
+}
+
+// -----
+#l = #xegpu.layout<inst_data = [1, 16]>
+gpu.module @test_kernel {
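+  // With inst_data [1, 16], the 1x32 descriptors become 1x16 and each
+  // load/store is unrolled into two accesses at column offsets 0 and 16.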
+  // CHECK-LABEL: load_store_nd_with_offsets
+  // CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32>
+  // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
+  // CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index
+  // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
+  // CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[tdesc_b:%.+]] = xegpu.create_nd_tdesc [[arg1]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[cast_a0:%.+]] = vector.shape_cast [[ld_a0]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[cast_b0:%.+]] = vector.shape_cast [[ld_b0]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[add0:%.+]] = arith.addf [[cast_a0]], [[cast_b0]] : vector<16xf32>
+  // CHECK: [[ins0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0, 0], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+  // CHECK: [[cast_a1:%.+]] = vector.shape_cast [[ld_a1]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[cast_b1:%.+]] = vector.shape_cast [[ld_b1]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[add1:%.+]] = arith.addf [[cast_a1]], [[cast_b1]] : vector<16xf32>
+  // CHECK: [[ins1:%.+]] = vector.insert_strided_slice [[add1]], [[ins0]] {offsets = [0, 16], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+  // CHECK: [[ext0:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+  // CHECK: [[ext1:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+  // CHECK: xegpu.store_nd [[ext0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+  // CHECK: xegpu.store_nd [[ext1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+  gpu.func @load_store_nd_with_offsets(%A: memref<1024x1024xf32>, %B: memref<1024x1024xf32>, %C: memref<1024x1024xf32>) {
+    %c0 = arith.constant 0 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+    %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+    %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+
+    %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+    %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+
+    %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
+    xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
+    gpu.return
+  }
+}