Skip to content

Commit 8aa0ea5

Browse files
committed
Test
1 parent d2e9b1b commit 8aa0ea5

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def make_ttgir(mod, metadata, opt, properties):
253 253      passes.ttgpuir.add_optimize_dot_operands(pm, True)
254 254      intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
255 255      intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
256     -    intel.passes.ttgpuir.add_remove_layout_conversions(pm)
    256 +    #intel.passes.ttgpuir.add_remove_layout_conversions(pm)
257 257      intel.passes.ttgpuir.add_reduce_data_duplication(pm)
258 258      passes.ttgpuir.add_reorder_instructions(pm)
259 259      passes.common.add_cse(pm)

third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,12 @@ RankedTensorType getOptimizedType(RankedTensorType type,
142 142      [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum());
143 143      assert(ctaSplitNum == 1 && "Expecting single CTA");
144 144
    145 +    llvm::errs() << linearLayout << "\n";
    146 +
145 147      RankedTensorType::Builder typeBuilder(type);
146 148      int32_t numWorkGroupPos = linearLayout.getInDimSizeLog2(kWarp);
147 149      unsigned sizePerThread =
148     -        numWorkGroupPos == 0 ? 1 : linearLayout.getBasis(kWarp, 0)[0];
    150 +        numWorkGroupPos == 0 ? 1 : linearLayout.getBasis(kWarp, 0)[0] / threadsPerWarp;
149 151      CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(builder.getContext(), 1);
150 152      auto newEncoding = builder.getAttr<BlockedEncodingAttr>(
151 153          sizePerThread, threadsPerWarp, warpsPerCTA, /*order=*/0, ctaLayout);

0 commit comments

Comments (0)