
Commit 01fc929

save work and bug fixes
1 parent 586edb9 commit 01fc929

File tree

4 files changed: +408, -346 lines changed

mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td

Lines changed: 0 additions & 4 deletions
@@ -27,10 +27,6 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
                            "vector::VectorDialect"];
-  let options = [Option<
-    "enableSGReductions", "enable-sg-reductions", "bool",
-    /*default=*/"true",
-    "Enable subgroup reductions using subgroup shuffles.">];
 }
 
 def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 3 additions & 9 deletions
@@ -1415,11 +1415,6 @@ namespace {
 struct XeGPUSubgroupDistributePass final
     : public xegpu::impl::XeGPUSubgroupDistributeBase<
           XeGPUSubgroupDistributePass> {
-  XeGPUSubgroupDistributePass() = default;
-  XeGPUSubgroupDistributePass(const XeGPUSubgroupDistributePass &other) =
-      default;
-  XeGPUSubgroupDistributePass(xegpu::XeGPUSubgroupDistributeOptions options)
-      : XeGPUSubgroupDistributeBase(options) {}
   void runOnOperation() override;
 };
 } // namespace
@@ -1527,10 +1522,9 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     return laneVal;
   };
 
-  if (enableSGReductions)
-    vector::populateDistributeReduction(
-        patterns, warpReduction,
-        /*pattern benefit=*/regularPatternBenefit);
+  vector::populateDistributeReduction(
+      patterns, warpReduction,
+      /*pattern benefit=*/regularPatternBenefit);
 
   vector::populatePropagateWarpVectorDistributionPatterns(
       patterns, distributionFn, shuffleFn,
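
Note: together with the Passes.td hunk above, this commit removes the enable-sg-reductions pass option, so the subgroup-reduction distribution patterns are now registered unconditionally. A minimal sketch of the effect on a pass invocation follows; mlir-opt and the pass argument names come from the pass definitions above, while the extra flags and the input file name are illustrative assumptions (the affected test's full RUN line is outside the hunks shown here):

  # before this commit, reduction distribution could be disabled via the option
  mlir-opt --xegpu-subgroup-distribute="enable-sg-reductions=false" -canonicalize -cse input.mlir
  # after this commit, the pass takes no options and reductions are always distributed
  mlir-opt --xegpu-subgroup-distribute -canonicalize -cse input.mlir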

mlir/test/Dialect/XeGPU/propgate-layouts-and-subgroup-distribute.mlir

Lines changed: 136 additions & 31 deletions
@@ -2,17 +2,18 @@
 // RUN: -allow-unregistered-dialect -canonicalize -cse -split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: gpu.func @load_dpas_postop_store
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
-// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
-// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
-// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
-// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16>
+// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+// CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+// CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32>
+// CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
 gpu.module @xevm_module{
 gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
@@ -47,26 +48,29 @@ gpu.module @xevm_module{
 
 // -----
 // CHECK-LABEL: gpu.func @gemm
-// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
-// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
-// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
-// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
-// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
-// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
-// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
-// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) {
-// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
-// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
-// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
-// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
-// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
-// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
-// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
-// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+// CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<1024x1024xbf16>,
+// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: memref<1024x1024xf32>) {
+// CHECK-DAG: %[[BLOCK_ID_X:.*]] = gpu.block_id x
+// CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y
+// CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index
+// CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index
+// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
+// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32>
+// CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]])
+// CHECK-SAME: -> (vector<8x1xf32>) {
+// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16>
+// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16>
+// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16>
+// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16>
+// CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]]
+// CHECK-SAME: : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32>
+// CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32>
+// CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32>
+// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
 gpu.module @xevm_module{
 gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){
   %c0 = arith.constant 0 : index
@@ -109,3 +113,104 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar
   gpu.return
 }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield
+// CHECK: (%{{.*}}: memref<256xf16>, %[[PREDICATE:[a-zA-Z0-9]+]]: i1) {
+// CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.200000e+01> : vector<1x8xf16>
+// CHECK-DAG: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK-DAG: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[IF:.*]] = scf.if %[[PREDICATE]] -> (vector<1x8xf16>) {
+// CHECK-NEXT: %[[LD:.*]] = xegpu.load %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: %[[LD_CAST:.*]] = vector.shape_cast %[[LD]] : vector<8xf16> to vector<1x8xf16>
+// CHECK-NEXT: scf.yield %[[LD_CAST]] : vector<1x8xf16>
+// CHECK-NEXT: } else {
+// CHECK-NEXT: scf.yield %[[CST]] : vector<1x8xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[IF_CAST:.*]] = vector.shape_cast %[[IF]] : vector<1x8xf16> to vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[IF_CAST]], %{{.*}}[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @xevm_module{
+gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
+  %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+  %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+  %loaded = scf.if %pred -> (vector<16x8xf16>) {
+    %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+    } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+    scf.yield %3 : vector<16x8xf16>
+  } else {
+    %3 = arith.constant {
+      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+    } dense<12.> : vector<16x8xf16>
+    scf.yield %3 : vector<16x8xf16>
+  } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
+  xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: scf.if %[[PREDICATE]] {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}>
+// CHECK-SAME: vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-NEXT: }
+gpu.module @xevm_module{
+gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
+  %pred = llvm.mlir.poison : i1
+  %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+  %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+  scf.if %pred {
+    %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+      layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+    } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+    xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+  }
+  gpu.return
+}
+}
+
+// -----
+// CHECK-LABEL: gpu.func @mma_transpose_b(
+// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
+// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32>
+// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array<i64: 1, 0>}>
+// CHECK-SAME: !xegpu.tensor_desc<16x8xi32> -> vector<8xi32>
+// CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32>
+// CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16>
+// CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16>
+// CHECK-NEXT: %[[C:.*]] = xegpu.dpas %[[A]], %[[BCAST2]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
+gpu.module @xevm_module{
+gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16>
+    -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
+  %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32>
+    -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
+  %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}
+    : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32>
+  %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}
+    : vector<16x8xi32> to vector<16x16xf16>
+  %5 = vector.transpose %4, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}
+    : vector<16x16xf16> to vector<16x16xf16>
+  %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+    : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
+  %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32>
+    -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>,
+    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+
+}
+}
