Skip to content

Commit b8e202a

Browse files
committed
Minor changes
1 parent ba1a5b7 commit b8e202a

File tree

5 files changed

+18
-56
lines changed

5 files changed

+18
-56
lines changed

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def get_dpas_layout(num_warps: ttgl.constexpr, m_shape: ttgl.constexpr, n_shape:
201201
key=['M', 'N', 'K'],
202202
)
203203
@gluon.jit
204-
def gluon_matmul_kernel_dpas_tensor_desc(
204+
def gluon_matmul_kernel_with_tensor_descriptors(
205205
# Pointers to matrices
206206
a_ptr, b_ptr, c_ptr,
207207
# Matrix dimensions
@@ -268,7 +268,7 @@ def gluon_matmul_kernel_dpas_tensor_desc(
268268
key=['B', 'M', 'N', 'K'],
269269
)
270270
@gluon.jit
271-
def gluon_matmul_kernel_dpas_tensor_desc_batched(
271+
def gluon_matmul_kernel_with_tensor_descriptors_batched(
272272
# Pointers to matrices
273273
a_ptr, b_ptr, c_ptr,
274274
# Matrix dimensions
@@ -461,8 +461,10 @@ def get_benchmark(
461461
providers_filter: Optional[list[str]] = None,
462462
transpose_a=False,
463463
transpose_b=False,
464-
matmul_kernel=matmul_kernel_with_block_pointers,
465-
matmul_kernel_batched=matmul_kernel_with_block_pointers_batched,
464+
triton_matmul_kernel=matmul_kernel_with_block_pointers,
465+
triton_matmul_kernel_batched=matmul_kernel_with_block_pointers_batched,
466+
gluon_matmul_kernel=gluon_matmul_kernel_with_tensor_descriptors,
467+
gluon_matmul_kernel_batched=gluon_matmul_kernel_with_tensor_descriptors_batched,
466468
plot_name='matmul-performance',
467469
):
468470
"""
@@ -472,13 +474,13 @@ def get_benchmark(
472474
supported_providers = {
473475
'gluon': 'Gluon',
474476
'triton': 'Triton',
475-
#'onednn': 'OneDNN',
477+
'onednn': 'OneDNN',
476478
}
477479
# use_cutlass
478-
# if not (transpose_a or transpose_b):
479-
# if torch.xpu.get_device_name() != 'Intel(R) Arc(TM) Graphics':
480-
# # FIXME: enable cutlass on LNL
481-
# supported_providers['cutlass'] = 'CUTLASS'
480+
if not (transpose_a or transpose_b):
481+
if torch.xpu.get_device_name() != 'Intel(R) Arc(TM) Graphics':
482+
# FIXME: enable cutlass on LNL
483+
supported_providers['cutlass'] = 'CUTLASS'
482484
providers = benchmark_suite.filter_providers(supported_providers, providers_filter)
483485

484486
# Benchmark Performance
@@ -532,8 +534,8 @@ def benchmark(B, M, N, K, provider):
532534
else:
533535
raise AssertionError(f'Unexpected shape of length {len(a.shape)}')
534536

535-
kernel = matmul_kernel if provider == 'triton' else gluon_matmul_kernel_dpas_tensor_desc
536-
batched_kernel = matmul_kernel_batched if provider == 'triton' else gluon_matmul_kernel_dpas_tensor_desc_batched
537+
kernel = triton_matmul_kernel if provider == 'triton' else gluon_matmul_kernel
538+
batched_kernel = triton_matmul_kernel_batched if provider == 'triton' else gluon_matmul_kernel_batched
537539

538540
matmul_fn = lambda: matmul(
539541
a,

third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ calculateRepCluster(unsigned capRepeatCount, unsigned capSystolicDepth,
7171
ArrayRef<int64_t> retShape, unsigned threadsPerWarp,
7272
unsigned int a_bitwidth, bool is_a_FP8,
7373
ArrayRef<int64_t> a_shape, ArrayRef<int64_t> b_shape,
74-
// RankedTensorType oldRetType,
7574
SmallVector<unsigned> warpsPerTile);
7675

7776
} // namespace mlir::triton::gpu::intel

third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ struct TritonIntelTensorDescToBlockPointer
147147
tensorType, pointerType.getAddressSpace());
148148

149149
auto makeTensorPtr = builder.create<tt::MakeTensorPtrOp>(
150-
loc, resultType, base, shape, strides, offsets,
150+
builder, loc, resultType, base, shape, strides, offsets,
151151
builder.getDenseI32ArrayAttr({1, 0}));
152152
return makeTensorPtr;
153153
};

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ SmallVector<int64_t> DpasEncodingAttr::calculateDPASRepetitions(
211211
ArrayRef<unsigned> repCluster, unsigned repeatCount, unsigned systolicDepth,
212212
unsigned executionSize, unsigned opsPerChannel) {
213213
// Always return a 3D shape repetitions for the ease of value handling, same
214-
// to mma
214+
// to mma.
215215
size_t rank = shape.size();
216216
SmallVector<int64_t> rep(3, 1);
217217

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 3 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -212,17 +212,11 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
212212
size_t rank = retShape.size();
213213

214214
SmallVector<unsigned> repCluster = ttgi::calculateRepCluster(
215-
// dpasCap,
216215
dpasCap.repeatCount, dpasCap.systolicDepth, dpasCap.executionSize,
217-
opsPerChan,
218-
// rank,
219-
retShape,
220-
// mod,
221-
threadsPerWarp, oldAType.getElementType().getIntOrFloatBitWidth(),
216+
opsPerChan, retShape, threadsPerWarp,
217+
oldAType.getElementType().getIntOrFloatBitWidth(),
222218
isa<Float8E5M2Type, Float8E4M3FNType>(oldAType.getElementType()),
223-
oldAType.getShape(), oldBType.getShape(),
224-
// oldRetType,
225-
warpsPerTile);
219+
oldAType.getShape(), oldBType.getShape(), warpsPerTile);
226220

227221
unsigned repeatCount =
228222
std::min(dpasCap.repeatCount, (unsigned)retShape[rank - 2] /*M*/);
@@ -237,39 +231,6 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
237231
dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
238232
threadsPerWarp);
239233

240-
// if (dpasCap.isPVC() || dpasCap.isFalconShore()) {
241-
// unsigned dpasElemBitWidths =
242-
// oldAType.getElementType().getIntOrFloatBitWidth();
243-
//
244-
// // We are upcasting FP8 to FP16
245-
// if (isa<Float8E5M2Type, Float8E4M3FNType>(oldAType.getElementType()))
246-
// dpasElemBitWidths = 2 * dpasElemBitWidths;
247-
//
248-
// // Enlarge the repCluster size to use the large 2D load for A and B
249-
// // operands.
250-
// unsigned maxRepClusterM =
251-
// PVC_2D_LOAD_MAXIMUM_NUMBER_OF_ROWS / dpasCap.repeatCount;
252-
// SmallVector<int64_t> repA =
253-
// dpasEnc.getDPASRepetitions(oldAType.getShape(), 0);
254-
// unsigned repClusterDimM =
255-
// std::min(maxRepClusterM, static_cast<unsigned>(repA[1]));
256-
//
257-
// unsigned maxRepClusterN =
258-
// PVC_2D_LOAD_MAXIMUM_BYTES_OF_COLS /
259-
// ((dpasElemBitWidths / 8) * dpasCap.executionSize);
260-
// SmallVector<int64_t> repB =
261-
// dpasEnc.getDPASRepetitions(oldBType.getShape(), 1);
262-
// unsigned repClusterDimN =
263-
// std::min(maxRepClusterN, static_cast<unsigned>(repB[2]));
264-
// repCluster[rank - 2] = repClusterDimM;
265-
// repCluster[rank - 1] = repClusterDimN;
266-
//
267-
// dpasEnc = ttgi::DpasEncodingAttr::get(
268-
// oldRetType.getContext(), repeatCount, dpasCap.systolicDepth,
269-
// dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
270-
// threadsPerWarp);
271-
// }
272-
273234
RankedTensorType newRetType =
274235
RankedTensorType::get(retShape, oldRetType.getElementType(), dpasEnc);
275236

0 commit comments

Comments (0)