[mlir][xegpu] Improve XeGPU op verification logic for SIMT flavor and update tests. #127920
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu
Author: Charitha Saumya (charithaintc)
Changes: This PR adds the required changes for XeGPU ops to support SIMT distribution.
Patch is 107.29 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/127920.diff. 6 files affected:
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index cc2e93fb19a70..ccd91a928e1dd 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -103,7 +103,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
CArg<"mlir::Attribute", "mlir::Attribute()">:$sg_map)>
];
-
+
let extraClassDeclaration = [{
using TensorType::clone;
using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
@@ -176,6 +176,11 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
return scatter_attr.getChunkSize().getInt();
return 1;
}
+
+ // This returns a vector type that represents the fragment of data owned by
+ // a work item in SIMT mode if this tensor descriptor is used in a XeGPU
+ // load/store operation.
+ FailureOr<VectorType> getDistributedVectorType();
}];
let hasCustomAssemblyFormat = true;
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 06fd03f3af3ad..af3faf141a66e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -8,8 +8,12 @@
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/LogicalResult.h"
namespace mlir {
namespace xegpu {
@@ -276,14 +280,13 @@ LogicalResult TensorDescType::verify(
if (scatterAttr) {
// Validate subgroup mapping rules for scattered tensors.
// A work-item's slice of the tensor with shape [sg_size] or
- // [sg_size, chunk_size] will be [1] or [1, chunks_size] respectively,
- // the mapping should reflect that.
+ // [sg_size, chunk_size] will be [1] or [1, 32/element_ty_bit_width]
+ // respectively, the mapping should reflect that.
if (wiData[0] != 1)
return emitError()
<< "cannot map over non-contiguous scattered row elements";
- unsigned chunkSize = scatterAttr.getChunkSize().getInt();
- if (wiData[1] != chunkSize)
+ if (wiData[1] != (32 / elementType.getIntOrFloatBitWidth()))
return emitError() << "work item data mapping must match the number of "
"contiguous elements";
}
@@ -307,6 +310,85 @@ LogicalResult TensorDescType::verify(
return success();
}
+// If tensor descriptor has a sg_map attribute it is used in SIMT mode.
+// In this mode, the distributed vector shape is determined as follows:
+// Definitions:
+// wi_data_size = wi_data[0] × wi_data[1]
+// subgroup_size = wi_layout[0] × wi_layout[1]
+// distribution_unit_size = subgroup_size × wi_data_size
+// ---------------------------------------------------------------------
+// Case 1: Regular loads/stores.
+// ---------------------------------------------------------------------
+// Distributed vector shape must be:
+// [chunk_size / wi_data_size, wi_data_size]
+// If the tensor descriptor shape is 1D, first dimension is ignored (set to 1).
+// [wi_data_size]
+// ---------------------------------------------------------------------
+// Case 2: Block loads/stores
+// ---------------------------------------------------------------------
+// Additional definitions:
+// tensor_size = tensor_desc[0] * .. * tensor_desc[r-1] * array_length
+// n_distribution_units = tensor_size / distribution_unit_size
+// Given above definitions, the following conditions must be met:
+// * tensor_desc[0] % (wi_layout[0] × wi_data[0]) == 0
+// * tensor_desc[1] % (wi_layout[1] × wi_data[1]) == 0
+// Distributed vector shape must be:
+// [n_distribution_units, wi_data_size]
+FailureOr<VectorType> TensorDescType::getDistributedVectorType() {
+ auto sgMap = llvm::dyn_cast_if_present<SGMapAttr>(getSgMap());
+ // If no sg_map is provided, tensor desc is not used in SIMT mode.
+ if (!sgMap)
+ return failure();
+
+ SmallVector<int64_t> wiData(sgMap.getWiData());
+ SmallVector<int64_t> wiLayout(sgMap.getWiLayout());
+ auto tdescShape = getShape();
+
+ auto wiDataSize = 1, sgSize = 1;
+ for (auto [wiDim, wiDataDim] : llvm::zip_equal(wiLayout, wiData)) {
+ wiDataSize *= wiDataDim;
+ sgSize *= wiDim;
+ }
+
+ // Case 1: regular loads/stores
+ auto scatterAttr =
+ llvm::dyn_cast_if_present<ScatterTensorDescAttr>(getEncoding());
+ if (scatterAttr) {
+ auto chunkSize = scatterAttr.getChunkSize().getInt();
+ // Check if the first dimension of the tensor descriptor shape is
+ // distributable.
+ if (tdescShape[0] % (wiLayout[0]) != 0)
+ return failure();
+ if (chunkSize > 1)
+ return VectorType::get({chunkSize / wiDataSize, wiDataSize},
+ getElementType());
+ return VectorType::get({wiDataSize}, getElementType());
+ }
+
+ // Case 2: block loads/stores
+ // Tensor descriptor shape can be 1D. For the 1D case, outer dims of wiData
+ // and wiLayout must be 1.
+ if (tdescShape.size() == 1) {
+ if (wiData[0] != 1 || wiLayout[0] != 1)
+ return failure();
+ wiData = {wiData[1]};
+ wiLayout = {wiLayout[1]};
+ }
+ // Check if the tensor descriptor shape is distributable.
+ int64_t tensorSize = 1;
+ for (auto [tdescDim, wiDim, wiDataDim] :
+ llvm::zip_equal(tdescShape, wiLayout, wiData)) {
+ if (tdescDim % (wiDim * wiDataDim) != 0)
+ return failure();
+ tensorSize *= tdescDim;
+ }
+ // tensorSize must be adjusted for array_length.
+ tensorSize *= getArrayLength();
+
+ return VectorType::get({tensorSize / (sgSize * wiDataSize), wiDataSize},
+ getElementType());
+}
+
} // namespace xegpu
} // namespace mlir
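As a worked example of the distribution rules above (the descriptor shape and sg_map values are assumed for illustration, not taken from the patch): a block load from a 16x16 f16 tensor_desc with wi_layout = [1, 16] and wi_data = [1, 1] gives wi_data_size = 1, subgroup_size = 16, tensor_size = 256, and hence a distributed fragment of vector<16x1xf16> per work item. A minimal standalone C++ sketch of the same arithmetic:

// Standalone sketch of the distribution arithmetic described in the comment
// block above; all shape and sg_map values here are assumed examples.
#include <cassert>
#include <cstdint>

int main() {
  // Block case: 16x16 f16 descriptor, wi_layout = [1, 16], wi_data = [1, 1].
  int64_t tdescShape[2] = {16, 16};
  int64_t wiLayout[2] = {1, 16};
  int64_t wiData[2] = {1, 1};
  int64_t arrayLength = 1;

  int64_t wiDataSize = wiData[0] * wiData[1]; // 1
  int64_t sgSize = wiLayout[0] * wiLayout[1]; // 16, the subgroup size

  // Distributability conditions from the comment block.
  assert(tdescShape[0] % (wiLayout[0] * wiData[0]) == 0);
  assert(tdescShape[1] % (wiLayout[1] * wiData[1]) == 0);

  int64_t tensorSize = tdescShape[0] * tdescShape[1] * arrayLength; // 256
  // Distributed shape [n_distribution_units, wi_data_size] = [16, 1],
  // i.e. one vector<16x1xf16> fragment per work item.
  assert(tensorSize / (sgSize * wiDataSize) == 16 && wiDataSize == 1);

  // Scattered case with chunk_size = 8 and f32 elements (wi_data_size = 1):
  // distributed shape is [chunk_size / wi_data_size, wi_data_size] = [8, 1].
  int64_t chunkSize = 8;
  assert(chunkSize / wiDataSize == 8);
  return 0;
}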
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 25dc1f22f0432..ad8b4bf3427cc 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -10,9 +10,12 @@
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Support/LLVM.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/LogicalResult.h"
#define DEBUG_TYPE "xegpu"
@@ -73,43 +76,6 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) {
kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH;
}
-// Validations for nd instruction arguments is successful if any of these are
-// true:
-// - tensor descriptor and the output vector shapes exactly match.
-// - tensor descriptor has a sg_map attribute and the distributed vector shape
-// matches the tensor descriptor shape when scaled using sg_map factors on
-// each dimension.
-static bool isArgShapesValid(ArrayRef<int64_t> descShape,
- ArrayRef<int64_t> valShape, SGMapAttr sgMap) {
- // Equal shapes with no distribution - no further verification needed.
- if (descShape == valShape && !sgMap)
- return true;
-
- // Unknown distribution - cannot perform operation on partial shape.
- if (!sgMap)
- return false;
-
- // Invalid rank or mixed rank usage.
- size_t descRank = descShape.size();
- if (descRank > 2 || valShape.size() != descRank)
- return false;
-
- // For 1D, SG map is guaranteed to be unit size in the outer dimension.
- // Only take the distribution over the innermost dimension for validation.
- ArrayRef<uint32_t> wiLayout = sgMap.getWiLayout();
- SmallVector<uint32_t> mapLayout(wiLayout.begin(), wiLayout.end());
- if (descRank == 1)
- mapLayout = {wiLayout.back()};
-
- for (const auto &[factor, dim, expected] :
- llvm::zip_equal(mapLayout, valShape, descShape)) {
- if (factor * dim != expected)
- return false;
- }
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// XeGPU_CreateNdDescOp
//===----------------------------------------------------------------------===//
@@ -280,7 +246,8 @@ LogicalResult LoadNdOp::verify() {
return emitOpError("invalid l3_hint: ") << getL3HintAttr();
auto array_len = tdescTy.getArrayLength();
- auto tdescShape = getShapeOf(tdescTy);
+ // adjusted tensor descriptor shape tracks the expected shape of the result.
+ auto adjustedTdescShape = getShapeOf(tdescTy);
auto valueShape = getShapeOf(valueTy);
if (getTranspose()) {
@@ -292,7 +259,7 @@ LogicalResult LoadNdOp::verify() {
});
if (valid)
- transpose(trans, tdescShape);
+ transpose(trans, adjustedTdescShape);
else
mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";
}
@@ -301,8 +268,8 @@ LogicalResult LoadNdOp::verify() {
if (tdescTy.getRank() == 2) {
const int axis = 0;
auto vnni_factor = valueShape.back();
- tdescShape[axis] /= vnni_factor;
- tdescShape.push_back(vnni_factor);
+ adjustedTdescShape[axis] /= vnni_factor;
+ adjustedTdescShape.push_back(vnni_factor);
} else {
mlir::emitWarning(getLoc())
<< "Invalid Packed Attr. It is ignored (available for 2D "
@@ -311,17 +278,35 @@ LogicalResult LoadNdOp::verify() {
}
if (array_len > 1) {
- auto it = tdescShape.begin();
- tdescShape.insert(it, array_len);
+ auto it = adjustedTdescShape.begin();
+ adjustedTdescShape.insert(it, array_len);
}
- auto sgMap = tdescTy.getSGMapAttr();
- if (!isArgShapesValid(tdescShape, valueShape, sgMap))
- return emitOpError() << "Result shape doesn't match TensorDesc shape."
- << "The expected shape is " << makeString(tdescShape)
- << ". But the given shape is "
- << makeString(valueShape) << ".\n";
- return success();
+ auto sgMap = tdescTy.getSGMapAttr();
+ // sg_map not present means IR is in VC mode. In this case value shape must
+ // match adjusted tensor descriptor shape.
+ if (!sgMap)
+ return valueShape == adjustedTdescShape
+ ? success()
+ : emitOpError()
+ << "Result shape " << makeString(valueShape)
+ << " is not consistent with tensor descripter " << tdescTy;
+
+ // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+ // value shape.
+ auto expectedValueShapeOrFailure = tdescTy.getDistributedVectorType();
+ if (failed(expectedValueShapeOrFailure))
+ return emitOpError() << "Failed to compute distributed vector shape for "
+ "tensor descriptor "
+ << tdescTy;
+
+ return valueTy == expectedValueShapeOrFailure.value()
+ ? success()
+ : emitOpError()
+ << "Result shape " << makeString(valueShape)
+ << " is not consistent with distributed vector shape "
+ << makeString(expectedValueShapeOrFailure.value().getShape())
+ << " for tensor descriptor " << tdescTy;
}
//===----------------------------------------------------------------------===//
@@ -351,14 +336,33 @@ LogicalResult StoreNdOp::verify() {
auto tdescShape = getShapeOf(dstTy);
auto valueShape = getShapeOf(valTy);
- auto sgMap = dstTy.getSGMapAttr();
- if (!isArgShapesValid(tdescShape, valueShape, sgMap))
- return emitOpError() << "Result shape doesn't match TensorDesc shape."
- << "The expected shape is " << makeString(tdescShape)
- << ". But the given shape is "
- << makeString(valueShape) << ".\n";
- return success();
+ auto sgMap = dstTy.getSGMapAttr();
+ // sg_map not present means IR is in VC mode. In this case value shape must
+ // match adjusted tensor descriptor shape.
+ if (!sgMap)
+ return valueShape == tdescShape
+ ? success()
+ : emitOpError()
+ << "Result shape " << makeString(valueShape)
+ << " is not consistent with tensor descripter shape "
+ << makeString(tdescShape);
+
+ // sg_map present means IR is in SIMT mode. In this case sg_map determines the
+ // value shape.
+ auto expectedValueShapeOrFailure = dstTy.getDistributedVectorType();
+ if (failed(expectedValueShapeOrFailure))
+ return emitOpError() << "Failed to compute distributed vector shape for "
+ "tensor descriptor "
+ << dstTy;
+
+ return valTy == expectedValueShapeOrFailure.value()
+ ? success()
+ : emitOpError()
+ << "Result shape " << makeString(valueShape)
+ << " is not consistent with distributed vector shape "
+ << makeString(expectedValueShapeOrFailure.value().getShape())
+ << " for tensor descriptor " << dstTy;
}
//===----------------------------------------------------------------------===//
@@ -509,22 +513,25 @@ LogicalResult LoadGatherOp::verify() {
transpose({1, 0}, tdescShape);
}
- if (auto sgMap = tdescTy.getSGMapAttr()) {
- auto valueVecTy = cast<VectorType>(valueTy);
- const int32_t wiData =
- sgMap.getWiData()[0] > 1 ? sgMap.getWiData()[0] : sgMap.getWiData()[1];
- // All represent the same concept: a number of row elements to store.
- if (valueVecTy.getNumElements() != wiData ||
- valueVecTy.getNumElements() != tdescTy.getChunkSize()) {
- return emitOpError("Chunk size, vector size and wi_data must match.");
- }
- // Work-item's slice (i.e., vector shape to load) is [1] or [1, chunk_size].
- tdescShape[tdescTy.getRank() - 1] = 1;
- }
-
- if (valueShape != tdescShape)
+ auto sgMap = tdescTy.getSGMapAttr();
+ // In VC mode, sg_map is not present. In this case, value shape must match
+ // the tensor descriptor shape.
+ if (!sgMap)
+ return valueShape == tdescShape
+ ? success()
+ : emitOpError("Unexpected result shape")
+ << "(Expected shape: " << makeString(tdescShape)
+ << ", Given shape: " << makeString(valueShape) << ").\n";
+ // In SIMT mode, sg_map, wi_data, and chunk_size determine the value shape.
+ auto distributedVectorShapeOrFailure = tdescTy.getDistributedVectorType();
+ if (failed(distributedVectorShapeOrFailure))
+ return emitOpError("Failed to compute distributed vector shape for "
+ "tensor descriptor ")
+ << tdescTy;
+ if (cast<VectorType>(valueTy) != distributedVectorShapeOrFailure.value())
return emitOpError("Unexpected result shape")
- << "(Expected shape: " << makeString(tdescShape)
+ << "(Expected shape: "
+ << makeString(distributedVectorShapeOrFailure.value().getShape())
<< ", Given shape: " << makeString(valueShape) << ").\n";
return success();
@@ -561,22 +568,25 @@ LogicalResult StoreScatterOp::verify() {
transpose({1, 0}, tdescShape);
}
- if (auto sgMap = tdescTy.getSGMapAttr()) {
- auto valueVecTy = cast<VectorType>(valueTy);
- const int32_t wiData =
- sgMap.getWiData()[0] > 1 ? sgMap.getWiData()[0] : sgMap.getWiData()[1];
- // All represent the same concept: a number of row elements to store.
- if (valueVecTy.getNumElements() != wiData ||
- valueVecTy.getNumElements() != tdescTy.getChunkSize()) {
- return emitOpError("Chunk size, vector size and wi_data must match.");
- }
- // Work-item's slice (i.e., vector to store) is [1] or [1, chunk_size].
- tdescShape[tdescTy.getRank() - 1] = 1;
- }
-
- if (valueShape != tdescShape)
+ auto sgMap = tdescTy.getSGMapAttr();
+ // In VC mode, sg_map is not present. In this case, value shape must match
+ // the tensor descriptor shape.
+ if (!sgMap)
+ return valueShape == tdescShape
+ ? success()
+ : emitOpError("Unexpected value shape")
+ << "(Expected shape: " << makeString(tdescShape)
+ << ", Given shape: " << makeString(valueShape) << ").\n";
+ // In SIMT mode, sg_map, wi_data, and chunk_size determine the value shape.
+ auto distributedVectorShapeOrFailure = tdescTy.getDistributedVectorType();
+ if (failed(distributedVectorShapeOrFailure))
+ return emitOpError("Failed to compute distributed vector shape for "
+ "tensor descriptor ")
+ << tdescTy;
+ if (cast<VectorType>(valueTy) != distributedVectorShapeOrFailure.value())
return emitOpError("Unexpected value shape")
- << "(Expected shape: " << makeString(tdescShape)
+ << "(Expected shape: "
+ << makeString(distributedVectorShapeOrFailure.value().getShape())
<< ", Given shape: " << makeString(valueShape) << ").\n";
return success();
@@ -610,20 +620,61 @@ void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
LogicalResult DpasOp::verify() {
int64_t lhsRank = getLhsType().getRank();
int64_t rhsRank = getRhsType().getRank();
-
- if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3))
- return emitOpError("expecting lhs to be a 2D vector, and rhs to be either "
- "2D or 3D (packed) vector.");
-
+ int64_t resultRank = getResultType().getRank();
auto lhsShape = getLhsType().getShape();
auto rhsShape = getRhsType().getShape();
- auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
- if (bK != lhsShape[1])
+ auto resultShape = getResultType().getShape();
+
+ auto sgMapA = (*this)->getAttrOfType<xegpu::SGMapAttr>("sg_map_a");
+ auto sgMapB = (*this)->getAttrOfType<xegpu::SGMapAttr>("sg_map_b");
+ auto sgMapC = (*this)->getAttrOfType<xegpu::SGMapAttr>("sg_map_c");
+
+ // If sg_maps are not present, then the operation is in VC mode.
+ if (!sgMapA && !sgMapB && !sgMapC) {
+ if (lhsRank != 2 || (rhsRank != 2 && rhsRank != 3) || resultRank != 2)
+ return emitOpError(
+ "expecting lhs and result to be a 2D vector, and rhs to be either "
+ "2D or 3D (packed) vector.");
+ auto bK = rhsRank == 3 ? rhsShape[0] * rhsShape[2] : rhsShape[0];
+ if (bK != lhsShape[1])
+ return emitOpError("K-dimension mismatch.");
+ if (lhsShape[0] != resultShape[0])
+ return emitOpError("M-dimension mismatch.");
+ if (rhsShape[1] != resultShape[1])
+ return emitOpError("N-dimension mismatch.");
+ return success();
+ }
+ // Otherwise, in SIMT mode we expect sg_map attributes for all operands and
+ // result of DPAS operation.
+ if (!sgMapA || !sgMapB || !sgMapC)
+ return emitOpError("sg_map attributes for all operands and outputs are "
+ "expected in SIMT xegpu::Dpas operation");
+
+ // In SIMT mode, all data fragments must be 2D.
+ if (lhsRank != 2 || rhsRank != 2 || resultRank != 2)
+ return emitOpError("expecting lhs, rhs, and result to be a 2D vector.");
+
+ auto wiLayoutA = sgMapA.getWiLayout();
+ auto wiLayoutB = sgMapB.getWiLayout();
+ auto wiLayoutC = sgMapC.getWiLayout();
+ // Obtain the expanded shapes of the operands and result using wi_layout.
+ // NOTE: For B, get rid of the packed dimension for the expanded shape.
+ SmallVector<int64_t> expandedShapeA = {lhsShape[0] * wiLayoutA[0],
+ lhsShape[1] * wiLayoutA[1]};
+ SmallVector<int64_t> expandedShapeB = {
+ rhsShape[0] * rhsShape[1] * wiLayoutB[0], 1 * wiLayoutB[1]};
+ SmallVector<int64_t> expandedShapeC = {resultShape[0] * wiLayoutC[0],
+ resultShape[1] * wiLayoutC[1]};
+ auto bK = expandedShapeB[0];
+ if (bK != expandedShapeA[1])
return emitOpError("K-dimension mismatch.");
+ if (expandedShapeA[0] != expandedShapeC[0])
+ return emitOpError("M-dimension mismatch.");
+ if (expandedShapeB[1] != expandedShapeC[1])
+ return emitOpError("N-dimension mismatch.");
return success();
}
-
} // namespace xegpu
} // namespace mlir
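To make the expanded-shape check concrete, consider a hedged example modeled on an 8x16x16 f16 DPAS with a 16-lane subgroup (all per-lane fragment shapes and wi_layout values below are assumptions for illustration):

// Sketch of the DPAS expanded-shape arithmetic from the verifier above; the
// per-lane fragment shapes and wi_layouts are assumed example values.
#include <cassert>
#include <cstdint>

int main() {
  // Per-lane fragments: A = vector<8x1xf16>, B = vector<8x2xf16> (packed),
  // C = vector<8x1xf32>; all three use wi_layout = [1, 16].
  int64_t lhsShape[2] = {8, 1}, rhsShape[2] = {8, 2}, resShape[2] = {8, 1};
  int64_t layA[2] = {1, 16}, layB[2] = {1, 16}, layC[2] = {1, 16};

  int64_t expA[2] = {lhsShape[0] * layA[0], lhsShape[1] * layA[1]}; // {8, 16}
  // For B, the packed (vnni) dimension is folded back into the row count.
  int64_t expB[2] = {rhsShape[0] * rhsShape[1] * layB[0], layB[1]}; // {16, 16}
  int64_t expC[2] = {resShape[0] * layC[0], resShape[1] * layC[1]}; // {8, 16}

  assert(expB[0] == expA[1]); // K-dimension: 16 == 16
  assert(expA[0] == expC[0]); // M-dimension: 8 == 8
  assert(expB[1] == expC[1]); // N-dimension: 16 == 16
  return 0;
}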
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
deleted file mode 100644
index 472176af72b19..0000000000000...
[truncated]
@Jianhui-Li @chencha3 @Garra1980 Can you please take a look? Invalid test cases are still WIP.
Jianhui-Li left a comment:
Overall LGTM.
@chencha3 @Garra1980 Could you please have a look and review/approve this? Thanks in advance.
@akroviakov Hi Artem, if you have the bandwidth, please review this PR.
llvm::ArrayRef<int64_t> shape, mlir::Type elementType,
mlir::Attribute encoding, mlir::Attribute sg_map) {
  size_t rank = shape.size();
  unsigned packingFactor = 32 / elementType.getIntOrFloatBitWidth();
It may be worth making 32 a named variable just to clarify its meaning right away; this could also spare a comment below.
This is because each work item accesses data at 32-bit granularity.
Good point. I have added a comment on this.
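For illustration, the suggestion could look like the sketch below; the constant name is hypothetical, and the patch ultimately addressed this with a comment rather than a rename:

// Hypothetical sketch of the reviewer's suggestion: give the 32-bit
// work-item access granularity a name instead of using a bare literal.
#include "mlir/IR/Types.h"

static constexpr unsigned kWiAccessBits = 32;

static unsigned getPackingFactor(mlir::Type elementType) {
  // Each work item accesses data at 32-bit granularity.
  return kWiAccessBits / elementType.getIntOrFloatBitWidth();
}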
chencha3 left a comment:
LGTM
Co-authored-by: Artem Kroviakov <[email protected]>
adam-smnk left a comment:
Looks good - thanks for bearing with all the iterations 👍
Thanks very much. These are really great comments. I think it looks much better now. One more thing: I am updating the op description as well to reflect the SIMT changes. I will notify you once I have done that. After that we can merge this? :-)
Awesome. Sure thing 👍
Done!
adam-smnk left a comment:
Just small details
Fixed. Thanks for catching them.
This causes a Bazel build failure, see log.
Hi, do you know the process for fixing this now that it is already merged?
Typically a quick fix goes in without approval; if a fix will take some time, the change should be reverted.
Fix here: #128595. Can someone help merge this?