Skip to content

Store transpose attribute in Subgroup2DBlockIO layouts #4549

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: alex/use_subgroup_2d_block_encoding_pr
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 71 additions & 3 deletions test/TritonIntelGPU/optimize-block-io-encoding.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [32, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
// CHECK: #mma = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [8, 16], numBlocks=2, order=[1, 0], kWidth=1, threadsPerWarp=16}>
// CHECK: #mma1 = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [16, 16], numBlocks=2, order=[0, 1], kWidth=2, threadsPerWarp=16}>
// CHECK: #mma = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [8, 16], numBlocks = 2, isTransposed = false, order = [1, 0], kWidth = 1, threadsPerWarp = 16}>
// CHECK: #mma1 = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [16, 16], numBlocks = 2, isTransposed = false, order = [0, 1], kWidth = 2, threadsPerWarp = 16}>
// CHECK: #mma2 = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32, ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64"} {
Expand Down Expand Up @@ -66,11 +66,79 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.tar

// -----

// COM: Dot Operand B transpose is supported
#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [32, 1], order = [1, 0]}>
#blocked2 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 32], order = [0, 1]}>
#blocked3 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
// CHECK: #mma = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [8, 16], numBlocks = 2, isTransposed = false, order = [1, 0], kWidth = 1, threadsPerWarp = 16}>
// CHECK: #mma1 = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [16, 8], numBlocks = 1, isTransposed = true, order = [0, 1], kWidth = 2, threadsPerWarp = 16}>
// CHECK: #mma2 = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32, ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.support_sg_2d_block, ttig.target_arch = "spir64"} {
// COM: Matmul over block pointers where the B operand is created with column-major
// COM: strides ([%c1_i64, %c5120_i64]) and loaded with block_io = "column_major".
// COM: The pass is expected to assign B the isTransposed = true subgroup_2d_block
// COM: layout (#mma1 above) while A keeps a non-transposed layout (#mma).
tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%c4_i32 = arith.constant 4 : i32
%c256_i32 = arith.constant 256 : i32
%c1024_i64 = arith.constant 1024 : i64
%c5120_i64 = arith.constant 5120 : i64
%c1_i64 = arith.constant 1 : i64
%c0_i32 = arith.constant 0 : i32
%c4096_i64 = arith.constant 4096 : i64
%c32_i32 = arith.constant 32 : i32
%c64_i32 = arith.constant 64 : i32
%c5120_i32 = arith.constant 5120 : i32
%cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #blocked>
%0 = tt.get_program_id x : i32
%1 = arith.divsi %0, %c64_i32 : i32
%2 = arith.muli %1, %c4_i32 : i32
%3 = arith.subi %c4_i32, %2 : i32
%4 = arith.minsi %3, %c4_i32 : i32
%5 = arith.remsi %0, %4 : i32
%6 = arith.addi %2, %5 : i32
%7 = arith.remsi %0, %c64_i32 : i32
%8 = arith.divsi %7, %4 : i32
%9 = arith.muli %6, %c256_i32 : i32
// CHECK: tt.make_tensor_ptr {{.*}} : <tensor<256x32xf16, #mma>>
%10 = tt.make_tensor_ptr %arg0, [%c1024_i64, %c5120_i64], [%c5120_i64, %c1_i64], [%9, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #blocked1>>
%11 = arith.muli %8, %c256_i32 : i32
// CHECK: tt.make_tensor_ptr {{.*}} : <tensor<32x256xf16, #mma1>>
%12 = tt.make_tensor_ptr %arg1, [%c5120_i64, %c4096_i64], [%c1_i64, %c5120_i64], [%c0_i32, %11] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #blocked2>>
// COM: K-loop: both loads keep their original blocked layouts in the input IR;
// COM: the pass rewrites the pointer types and inserts convert_layout ops back
// COM: to the blocked layouts so downstream users are unchanged.
%13:3 = scf.for %arg3 = %c0_i32 to %c5120_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %10, %arg6 = %12) -> (tensor<256x256xf32, #blocked>, !tt.ptr<tensor<256x32xf16, #blocked1>>, !tt.ptr<tensor<32x256xf16, #blocked2>>) : i32 {
// CHECK: %[[A_LOAD:.*]] = tt.load %arg5 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #mma>>
// CHECK: {{.*}} = ttg.convert_layout %[[A_LOAD]] : tensor<256x32xf16, #mma> -> tensor<256x32xf16, #blocked1>
%17 = tt.load %arg5 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "row_major"} : !tt.ptr<tensor<256x32xf16, #blocked1>>
// CHECK: %[[B_LOAD:.*]] = tt.load %arg6 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "column_major"} : !tt.ptr<tensor<32x256xf16, #mma1>>
// CHECK: {{.*}} = ttg.convert_layout %[[B_LOAD]] : tensor<32x256xf16, #mma1> -> tensor<32x256xf16, #blocked2>
%18 = tt.load %arg6 {boundaryCheck = array<i32: 0, 1>, ttig.block_io = "column_major"} : !tt.ptr<tensor<32x256xf16, #blocked2>>
%19 = ttg.convert_layout %17 : tensor<256x32xf16, #blocked1> -> tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
%20 = ttg.convert_layout %18 : tensor<32x256xf16, #blocked2> -> tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
%21 = ttg.convert_layout %arg4 : tensor<256x256xf32, #blocked> -> tensor<256x256xf32, #mma>
%22 = ttg.convert_layout %19 : tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> -> tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
%23 = ttg.convert_layout %20 : tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
// CHECK: tt.dot {{.*}} : tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma2, kWidth = 1}>> * tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma2, kWidth = 2}>> -> tensor<256x256xf32, #mma2>
%24 = tt.dot %22, %23, %21, inputPrecision = tf32 : tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<256x256xf32, #mma>
%25 = ttg.convert_layout %24 : tensor<256x256xf32, #mma> -> tensor<256x256xf32, #blocked>
// CHECK: tt.advance {{.*}} : <tensor<256x32xf16, #mma>>
%26 = tt.advance %arg5, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #blocked1>>
// CHECK: tt.advance {{.*}} : <tensor<32x256xf16, #mma1>>
%27 = tt.advance %arg6, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #blocked2>>
scf.yield %25, %26, %27 : tensor<256x256xf32, #blocked>, !tt.ptr<tensor<256x32xf16, #blocked1>>, !tt.ptr<tensor<32x256xf16, #blocked2>>
}
// COM: Epilogue: truncate the f32 accumulator and store the f16 result tile.
%14 = tt.make_tensor_ptr %arg2, [%c1024_i64, %c4096_i64], [%c4096_i64, %c1_i64], [%9, %11] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #blocked3>>
%15 = arith.truncf %13#0 : tensor<256x256xf32, #blocked> to tensor<256x256xf16, #blocked>
%16 = ttg.convert_layout %15 : tensor<256x256xf16, #blocked> -> tensor<256x256xf16, #blocked3>
tt.store %14, %16 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #blocked3>>
tt.return
}
}

// -----

// COM: Dot operand A transpose currently not supported by subgroup 2d block io encoding
#blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 1], warpsPerCTA = [2, 16], order = [0, 1]}>
#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 16], warpsPerCTA = [16, 2], order = [1, 0]}>
// CHECK: #mma = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [16, 16], numBlocks=2, order=[0, 1], kWidth=2, threadsPerWarp=16}>
// CHECK: #mma = #ttig.subgroup_2d_block<{warpsPerCTA = [8, 4], instrShape = [16, 16], numBlocks = 2, isTransposed = false, order = [0, 1], kWidth = 2, threadsPerWarp = 16}>
// CHECK: #mma1 = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
// CHECK-NOT: #mma2
#mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ def Subgroup2DBlockEncodingAttr : DistributedEncoding<"Subgroup2DBlockEncoding",
For the layout, the following parameters are required:
- `instrShape` : contains the (height, width) block parameters for the block io operation
- `numBlocks` : the block count parameter allows a single load to load multiple blocks in row-major order (useful for increasing cache line utilization)
- `isTransposed` : indicates whether the data should be transposed post-load. The `instrShape` describes the shape of the data to load pre-transpose, i.e. if this is true then the output from the instruction (load + transpose) will be the transposed `instrShape`.
- `threadsPerWarp` : currently a scalar, this parameter allows us to support different subgroup / warp configurations. Because the 2d block io operation is a subgroup operation, the size of the subgroup is important in determining the ordering of the loaded tensor.
- `warpsPerCTA` : the number of warps per block / subgroups per workgroup and their distribution
- `order` : The order within the block, used to determine along which dimension to broadcast.
Expand All @@ -310,14 +311,15 @@ def Subgroup2DBlockEncodingAttr : DistributedEncoding<"Subgroup2DBlockEncoding",
"CTALayoutAttr":$CTALayout,
ArrayRefParameter<"unsigned">:$instrShape,
"unsigned":$numBlocks,
"bool":$isTransposed,
ArrayRefParameter<"unsigned">:$order,
"unsigned":$kWidth,
"unsigned":$threadsPerWarp
);

let extraClassDeclaration = extraDistributedDeclaration # [{
SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
static SmallVector<unsigned, 3> getInstrShapeForLayout(DistributedEncodingTrait layout, ArrayRef<int64_t> shape, bool memoryRowMajor, unsigned kWidth, MLIRContext* context);
static SmallVector<unsigned, 3> getInstrShapeForLayout(DistributedEncodingTrait layout, ArrayRef<int64_t> shape, bool memoryRowMajor, bool isTransposed, unsigned kWidth, MLIRContext* context);
}];

let hasCustomAssemblyFormat = 1;
Expand Down
46 changes: 31 additions & 15 deletions third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,17 @@ static LogicalResult parseIntAttrValue(AsmParser &parser, Attribute attr,
return success();
}

// Extract a boolean from `attr` into `value`. Emits a parser error naming
// `desc` and returns failure when the attribute is not a BoolAttr.
static LogicalResult parseBoolAttrValue(AsmParser &parser, Attribute attr,
                                        bool &value, StringRef desc) {
  if (auto boolAttr = mlir::dyn_cast<BoolAttr>(attr)) {
    value = boolAttr.getValue();
    return success();
  }
  parser.emitError(parser.getNameLoc(), "expected a bool type in ") << desc;
  return failure();
}

// parse an array of integers
static LogicalResult parseIntArrayAttr(AsmParser &parser,
const NamedAttribute &attr,
Expand All @@ -83,6 +94,11 @@ static LogicalResult parseUInt(AsmParser &parser, const NamedAttribute &attr,
return parseIntAttrValue(parser, attr.getValue(), value, desc);
};

// parse a named attribute as a bool value
static LogicalResult parseBool(AsmParser &parser, const NamedAttribute &attr,
bool &value, StringRef desc) {
return parseBoolAttrValue(parser, attr.getValue(), value, desc);
};

//===----------------------------------------------------------------------===//
// Attribute methods
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -531,8 +547,8 @@ void maybePrintCTALayout(mlir::MLIRContext *context, mlir::AsmPrinter &printer,
LogicalResult Subgroup2DBlockEncodingAttr::verify(
function_ref<InFlightDiagnostic()> emitError,
ArrayRef<unsigned> warpsPerCTA, CTALayoutAttr CTALayout,
ArrayRef<unsigned> instrShape, unsigned numBlocks, ArrayRef<unsigned> order,
unsigned kWidth, unsigned threadsPerWarp) {
ArrayRef<unsigned> instrShape, unsigned numBlocks, bool isTransposed,
ArrayRef<unsigned> order, unsigned kWidth, unsigned threadsPerWarp) {
if (instrShape.size() != 2) {
return emitError() << "instrShape must be rank 2 but was: "
<< instrShape.size();
Expand Down Expand Up @@ -569,6 +585,7 @@ Attribute Subgroup2DBlockEncodingAttr::parse(AsmParser &parser, Type type) {
std::optional<SmallVector<unsigned>> CTAOrder;
SmallVector<unsigned> instrShape;
unsigned numBlocks = 0;
bool isTransposed = false;
SmallVector<unsigned> order;
unsigned kWidth = 0;
unsigned threadsPerWarp = 0;
Expand Down Expand Up @@ -601,6 +618,10 @@ Attribute Subgroup2DBlockEncodingAttr::parse(AsmParser &parser, Type type) {
if (parseUInt(parser, attr, numBlocks, "numBlocks").failed())
return {};
}
if (attr.getName() == "isTransposed") {
if (parseBool(parser, attr, isTransposed, "isTransposed").failed())
return {};
}
if (attr.getName() == "order") {
if (parseIntArrayAttr(parser, attr, order, "order").failed())
return {};
Expand All @@ -622,7 +643,7 @@ Attribute Subgroup2DBlockEncodingAttr::parse(AsmParser &parser, Type type) {

return parser.getChecked<Subgroup2DBlockEncodingAttr>(
parser.getContext(), warpsPerCTA, *CTALayout, instrShape, numBlocks,
order, kWidth, threadsPerWarp);
isTransposed, order, kWidth, threadsPerWarp);
}

SmallVector<unsigned> Subgroup2DBlockEncodingAttr::getRepOrder() const {
Expand Down Expand Up @@ -652,9 +673,10 @@ void Subgroup2DBlockEncodingAttr::print(AsmPrinter &printer) const {
maybePrintCTALayout(getContext(), printer, getCTALayout(), getRank());

printer << ", instrShape = [" << getInstrShape()
<< "], numBlocks=" << getNumBlocks() << ", order=[" << getOrder()
<< "], kWidth=" << getKWidth()
<< ", threadsPerWarp=" << getThreadsPerWarp() << "}>";
<< "], numBlocks = " << getNumBlocks()
<< ", isTransposed = " << getIsTransposed() << ", order = ["
<< getOrder() << "], kWidth = " << getKWidth()
<< ", threadsPerWarp = " << getThreadsPerWarp() << "}>";
}

LinearLayout
Expand All @@ -664,21 +686,15 @@ Subgroup2DBlockEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {

SmallVector<unsigned, 3> Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
DistributedEncodingTrait layout, ArrayRef<int64_t> tensorShape,
bool memoryRowMajor, unsigned kWidth, MLIRContext *context) {
bool memoryRowMajor, bool isTransposed, unsigned kWidth,
MLIRContext *context) {
const auto rank = tensorShape.size();

std::optional<LinearLayout> llEncoding = layout.toLinearLayout(tensorShape);
assert(llEncoding.has_value() && "invalid dot layout to linear layout");
LinearEncodingAttr llAttr = LinearEncodingAttr::get(context, *llEncoding);
SmallVector<unsigned> threadOrder = llAttr.getThreadOrder();

const bool valueRowMajor =
(threadOrder[rank - 2] == 1 && threadOrder[rank - 1] == 0);
assert((valueRowMajor ||
(threadOrder[rank - 2] == 0 && threadOrder[rank - 1] == 1)) &&
"Only row_major or column_major is allowed");
const bool isTransposeRequired = valueRowMajor ^ memoryRowMajor;

auto dotEncodingAttr = dyn_cast<DotOperandEncodingAttr>(layout);
const unsigned opIdx = dotEncodingAttr ? dotEncodingAttr.getOpIdx() : 2;

Expand Down Expand Up @@ -725,7 +741,7 @@ SmallVector<unsigned, 3> Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
unsigned dpasOperandsPerTileY =
isOperandA ? numReps[2] : repCluster[dimOuter];

if (isTransposeRequired) {
if (isTransposed) {
std::swap(tileWidth, tileHeight);

const unsigned threadsPerWarp = dpasLayout.getThreadsPerWarp();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -602,11 +602,15 @@ subgroup2DBlockToLinearLayout(ArrayRef<int64_t> blockShape,
assert(rank == layout.getRank() && "unexpected block shape rank, layout rank "
"and block shape rank must be equal");
auto dimNames = standardOutDimNames(ctx, rank);
auto loadTileSize = layout.getInstrShape();
auto loadTileSize = SmallVector<unsigned>(layout.getInstrShape());
assert(loadTileSize.size() == 2);
StringAttr kRegister = S("register");
StringAttr kLane = S("lane");
StringAttr kWarp = S("warp");

if (layout.getIsTransposed())
std::swap(loadTileSize[0], loadTileSize[1]);

// Start by creating register/lane bases corresponding to the desired load
// tile size
auto [regBases, laneBases] = createRegisterLaneBases(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1467,7 +1467,8 @@ struct LoadOpConversion
} else {
auto tileParams = Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
cast<DistributedEncodingTrait>(encoding), tensorType.getShape(),
memoryRowMajor, elemSizeInBits / 8, rewriter.getContext());
memoryRowMajor, isTransposeRequired, elemSizeInBits / 8,
rewriter.getContext());
return std::make_tuple(tileParams[0], tileParams[1], tileParams[2]);
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,14 +287,14 @@ class TritonIntelGPUOptimizeBlockIOEncodingPass

auto tileParams = Subgroup2DBlockEncodingAttr::getInstrShapeForLayout(
cast<DistributedEncodingTrait>(dotOperandEncoding),
oldTensorType.getShape(), memoryRowMajor, elemSizeInBits / 8,
&getContext());
oldTensorType.getShape(), memoryRowMajor, isTransposeRequired,
elemSizeInBits / 8, &getContext());
SmallVector<unsigned> instrShape{tileParams[0], tileParams[1]};
const unsigned vBlocks = tileParams[2];

auto subgroup2DBlockEncoding = Subgroup2DBlockEncodingAttr::get(
&getContext(), dpasLayout.getWarpsPerCTA(), CTALayout, instrShape,
tileParams[2],
tileParams[2], isTransposeRequired,
getOrderForDotOperand(dotOperandEncoding.getOpIdx(), /*rank*/ rank,
/*kContig*/ true),
kWidth, dpasLayout.getThreadsPerWarp());
Expand Down
Loading