diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6098eb34d04d5..5b1d7bb87a219 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1097,6 +1097,10 @@ def GPU_YieldOp : GPU_Op<"yield", [Pure, ReturnLike, Terminator]>,
     ```
   }];

+  let builders = [
+    OpBuilder<(ins), [{ /* nothing to do */ }]>
+  ];
+
   let assemblyFormat = "attr-dict ($values^ `:` type($values))?";
 }

@@ -2921,4 +2925,138 @@ def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
   }];
 }

+def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
+    [DeclareOpInterfaceMethods<RegionBranchOpInterface>,
+     SingleBlockImplicitTerminator<"gpu::YieldOp">,
+     RecursiveMemoryEffects]> {
+  let summary = "Executes operations in the associated region on thread #0 of a "
+                "SPMD program";
+  let description = [{
+    `warp_execute_on_lane_0` is an operation used to bridge the gap between
+    vector programming and SPMD programming models such as GPU SIMT. It allows
+    one to trivially convert a region of vector code meant to run on multiple
+    threads into a valid SPMD region, and then allows incremental
+    transformations to distribute vector operations over the threads.
+
+    Any code present in the region is only executed on the first thread/lane,
+    selected by the `laneid` operand. The `laneid` operand is an integer ID
+    between [0, `warp_size`). The `warp_size` attribute indicates the number
+    of lanes in a warp.
+
+    Operands are vector values distributed on all lanes that may be used by
+    the single lane execution. The matching region argument is a vector of all
+    the values of those lanes available to the single active lane. The
+    distributed dimension is implicit based on the shape of the operand and
+    argument. The properties of the distribution may be described by extra
+    attributes (e.g. affine map).
+
+    Return values are distributed on all lanes using `laneid` as the index.
+    The vector is distributed based on the shape ratio between the vector type
+    of the yield and the result type. If the shapes are the same, the value is
+    broadcasted to all lanes. In the future the distribution can be made more
+    explicit using affine_maps and will support having multiple IDs.
+
+    Therefore the `warp_execute_on_lane_0` operation allows one to implicitly
+    copy between lane 0 and the lanes of the warp. When distributing a vector
+    from lane 0 to all the lanes, the data is distributed in a block cyclic
+    way. For example `vector<64xf32>` gets distributed on 32 threads and maps
+    to `vector<2xf32>`, where thread 0 contains vector[0] and vector[1].
+
+    During lowering, values passed as operands and return values need to be
+    visible to different lanes within the warp. This would usually be done by
+    going through memory.
+
+    The region is *not* isolated from above. For values coming from the parent
+    region without going through the operands, only the lane 0 value is
+    accessible, so this generally only makes sense for uniform values.
+
+    Example:
+    ```
+    // Execute in parallel on all threads/lanes.
+    gpu.warp_execute_on_lane_0 (%laneid)[32] {
+      // Serial code running only on thread/lane 0.
+      ...
+    }
+    // Execute in parallel on all threads/lanes.
+    ```
+
+    This may be lowered to an scf.if region as below:
+    ```
+      // Execute in parallel on all threads/lanes.
+      %cnd = arith.cmpi eq, %laneid, %c0 : index
+      scf.if %cnd {
+        // Serial code running only on thread/lane 0.
+        ...
+      }
+      // Execute in parallel on all threads/lanes.
+    ```
+
+    When the region has operands and/or return values:
+    ```
+    // Execute in parallel on all threads/lanes.
+    %0 = gpu.warp_execute_on_lane_0(%laneid)[32]
+        args(%v0 : vector<4xi32>) -> (vector<1xf32>) {
+    ^bb0(%arg0 : vector<128xi32>) :
+      // Serial code running only on thread/lane 0.
+      ...
+      gpu.yield %1 : vector<32xf32>
+    }
+    // Execute in parallel on all threads/lanes.
+    ```
+
+    Values at the region boundary would go through memory:
+    ```
+    // Execute in parallel on all threads/lanes.
+    ...
+    // Store the data from each thread into memory and synchronize.
+    %tmp0 = memref.alloc() : memref<128xf32>
+    %tmp1 = memref.alloc() : memref<32xf32>
+    %cnd = arith.cmpi eq, %laneid, %c0 : index
+    vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
+    some_synchronization_primitive
+    scf.if %cnd {
+      // Serialized code running only on thread 0.
+      // Load the data from all the threads into a register from thread 0.
+      // This allows thread 0 to access data from all the threads.
+      %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
+      ...
+      // Store the data from thread 0 into memory.
+      vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
+    }
+    // Synchronize and load the data in a block cyclic way so that the
+    // vector is distributed on all threads.
+    some_synchronization_primitive
+    %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<32xf32>
+    // Execute in parallel on all threads/lanes.
+    ```
+
+  }];
+
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+  let arguments = (ins Index:$laneid, I64Attr:$warp_size,
+                   Variadic<AnyType>:$args);
+  let results = (outs Variadic<AnyType>:$results);
+  let regions = (region SizedRegion<1>:$warpRegion);
+
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
+                   "int64_t":$warpSize)>,
+    // `blockArgTypes` are different from the `args` types, as they represent
+    // all the `args` instances visible to lane 0. Therefore we need to
+    // explicitly pass the types.
+    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
+                   "int64_t":$warpSize, "ValueRange":$args,
+                   "TypeRange":$blockArgTypes)>
+  ];
+
+  let extraClassDeclaration = [{
+    bool isDefinedOutsideOfRegion(Value value) {
+      return !getRegion().isAncestor(value.getParentRegion());
+    }
+  }];
+}
+
 #endif // GPU_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index c5b08d6aa022b..d0f11acb44835 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2983,138 +2983,5 @@ def Vector_YieldOp : Vector_Op<"yield", [
   let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
 }

-def Vector_WarpExecuteOnLane0Op : Vector_Op<"warp_execute_on_lane_0",
-    [DeclareOpInterfaceMethods<RegionBranchOpInterface>,
-     SingleBlockImplicitTerminator<"vector::YieldOp">,
-     RecursiveMemoryEffects]> {
-  let summary = "Executes operations in the associated region on thread #0 of a"
-                "SPMD program";
-  let description = [{
-    `warp_execute_on_lane_0` is an operation used to bridge the gap between
-    vector programming and SPMD programming model like GPU SIMT. It allows to
-    trivially convert a region of vector code meant to run on a multiple threads
-    into a valid SPMD region and then allows incremental transformation to
-    distribute vector operations on the threads.
-
-    Any code present in the region would only be executed on first thread/lane
-    based on the `laneid` operand.
The `laneid` operand is an integer ID between - [0, `warp_size`). The `warp_size` attribute indicates the number of lanes in - a warp. - - Operands are vector values distributed on all lanes that may be used by - the single lane execution. The matching region argument is a vector of all - the values of those lanes available to the single active lane. The - distributed dimension is implicit based on the shape of the operand and - argument. the properties of the distribution may be described by extra - attributes (e.g. affine map). - - Return values are distributed on all lanes using laneId as index. The - vector is distributed based on the shape ratio between the vector type of - the yield and the result type. - If the shapes are the same this means the value is broadcasted to all lanes. - In the future the distribution can be made more explicit using affine_maps - and will support having multiple Ids. - - Therefore the `warp_execute_on_lane_0` operations allow to implicitly copy - between lane0 and the lanes of the warp. When distributing a vector - from lane0 to all the lanes, the data are distributed in a block cyclic way. - For exemple `vector<64xf32>` gets distributed on 32 threads and map to - `vector<2xf32>` where thread 0 contains vector[0] and vector[1]. - - During lowering values passed as operands and return value need to be - visible to different lanes within the warp. This would usually be done by - going through memory. - - The region is *not* isolated from above. For values coming from the parent - region not going through operands only the lane 0 value will be accesible so - it generally only make sense for uniform values. - - Example: - ``` - // Execute in parallel on all threads/lanes. - vector.warp_execute_on_lane_0 (%laneid)[32] { - // Serial code running only on thread/lane 0. - ... - } - // Execute in parallel on all threads/lanes. - ``` - - This may be lowered to an scf.if region as below: - ``` - // Execute in parallel on all threads/lanes. - %cnd = arith.cmpi eq, %laneid, %c0 : index - scf.if %cnd { - // Serial code running only on thread/lane 0. - ... - } - // Execute in parallel on all threads/lanes. - ``` - - When the region has operands and/or return values: - ``` - // Execute in parallel on all threads/lanes. - %0 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0 : vector<4xi32>) -> (vector<1xf32>) { - ^bb0(%arg0 : vector<128xi32>) : - // Serial code running only on thread/lane 0. - ... - vector.yield %1 : vector<32xf32> - } - // Execute in parallel on all threads/lanes. - ``` - - values at the region boundary would go through memory: - ``` - // Execute in parallel on all threads/lanes. - ... - // Store the data from each thread into memory and Synchronization. - %tmp0 = memreg.alloc() : memref<128xf32> - %tmp1 = memreg.alloc() : memref<32xf32> - %cnd = arith.cmpi eq, %laneid, %c0 : index - vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32> - some_synchronization_primitive - scf.if %cnd { - // Serialized code running only on thread 0. - // Load the data from all the threads into a register from thread 0. This - // allow threads 0 to access data from all the threads. - %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32> - ... - // Store the data from thread 0 into memory. - vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32> - } - // Synchronization and load the data in a block cyclic way so that the - // vector is distributed on all threads. 
-    some_synchronization_primitive
-    %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<32xf32>
-    // Execute in parallel on all threads/lanes.
-    ```
-
-  }];
-
-  let hasVerifier = 1;
-  let hasCustomAssemblyFormat = 1;
-  let arguments = (ins Index:$laneid, I64Attr:$warp_size,
-                   Variadic<AnyType>:$args);
-  let results = (outs Variadic<AnyType>:$results);
-  let regions = (region SizedRegion<1>:$warpRegion);
-
-  let skipDefaultBuilders = 1;
-  let builders = [
-    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
-                   "int64_t":$warpSize)>,
-    // `blockArgTypes` are different than `args` types as they are they
-    // represent all the `args` instances visibile to lane 0. Therefore we need
-    // to explicit pass the type.
-    OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
-                   "int64_t":$warpSize, "ValueRange":$args,
-                   "TypeRange":$blockArgTypes)>
-  ];
-
-  let extraClassDeclaration = [{
-    bool isDefinedOutsideOfRegion(Value value) {
-      return !getRegion().isAncestor(value.getParentRegion());
-    }
-  }];
-}

 #endif // MLIR_DIALECT_VECTOR_IR_VECTOR_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index 8907a2a583609..dda45219b2acc 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_
 #define MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_

+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"

 namespace mlir {
@@ -23,15 +24,15 @@ struct WarpExecuteOnLane0LoweringOptions {
   /// type may be VectorType or a scalar) and be availble for the current warp.
   /// If there are several warps running in parallel the allocation needs to be
   /// split so that each warp has its own allocation.
-  using WarpAllocationFn =
-      std::function<Value(Location, OpBuilder &, WarpExecuteOnLane0Op, Type)>;
+  using WarpAllocationFn = std::function<Value(
+      Location, OpBuilder &, gpu::WarpExecuteOnLane0Op, Type)>;
   WarpAllocationFn warpAllocationFn = nullptr;
   /// Lamdba function to let user emit operation to syncronize all the thread
   /// within a warp. After this operation all the threads can see any memory
   /// written before the operation.
   using WarpSyncronizationFn =
-      std::function<void(Location, OpBuilder &, WarpExecuteOnLane0Op)>;
+      std::function<void(Location, OpBuilder &, gpu::WarpExecuteOnLane0Op)>;
   WarpSyncronizationFn warpSyncronizationFn = nullptr;
 };

@@ -48,17 +49,17 @@ using DistributionMapFn = std::function<AffineMap(Value)>;
 ///
 /// Example:
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%id){
+/// %0 = gpu.warp_execute_on_lane_0(%id){
 ///   ...
 ///   vector.transfer_write %v, %A[%c0] : vector<32xf32>, memref<128xf32>
-///   vector.yield
+///   gpu.yield
 /// }
 /// ```
 /// To
 /// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
+/// %r:3 = gpu.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
 ///   ...
-///   vector.yield %v : vector<32xf32>
+///   gpu.yield %v : vector<32xf32>
 /// }
 /// vector.transfer_write %v, %A[%id] : vector<1xf32>, memref<128xf32>
 ///
@@ -73,7 +74,7 @@ void populateDistributeTransferWriteOpPatterns(

 /// Move scalar operations with no dependency on the warp op outside of the
 /// region.
-void moveScalarUniformCode(WarpExecuteOnLane0Op op);
+void moveScalarUniformCode(gpu::WarpExecuteOnLane0Op op);

 /// Lambda signature to compute a warp shuffle of a given value of a given lane
 /// within a given warp size.
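Since both callbacks now receive the GPU-dialect op, downstream users of these options must update the lambdas they install. A minimal sketch of the wiring, not part of this patch: `getScratchBuffer` is a hypothetical client-provided allocation helper, while `gpu.barrier` is a real op with the required semantics when one warp runs per block.

```
// Sketch only: installing the lowering callbacks with the GPU-dialect op type.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"

using namespace mlir;

// Hypothetical helper supplied by the client: returns a memref of `type`
// visible to every lane of `warpOp` (e.g. in workgroup memory).
Value getScratchBuffer(Location loc, OpBuilder &builder,
                       gpu::WarpExecuteOnLane0Op warpOp, Type type);

vector::WarpExecuteOnLane0LoweringOptions makeWarpLoweringOptions() {
  vector::WarpExecuteOnLane0LoweringOptions options;
  // Called once per value crossing the warp-op boundary.
  options.warpAllocationFn = [](Location loc, OpBuilder &builder,
                                gpu::WarpExecuteOnLane0Op warpOp,
                                Type type) -> Value {
    return getScratchBuffer(loc, builder, warpOp, type);
  };
  // Called where all lanes must observe the memory written above.
  options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
                                    gpu::WarpExecuteOnLane0Op) {
    builder.create<gpu::BarrierOp>(loc);
  };
  return options;
}
```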
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 956877497d933..f019007faede8 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -36,6 +36,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/StringSaver.h"
 #include <cassert>
+#include <numeric>

 using namespace mlir;
 using namespace mlir::gpu;
@@ -2188,6 +2189,187 @@ LogicalResult gpu::DynamicSharedMemoryOp::verify() {
   return success();
 }

+//===----------------------------------------------------------------------===//
+// GPU WarpExecuteOnLane0Op
+//===----------------------------------------------------------------------===//
+
+void WarpExecuteOnLane0Op::print(OpAsmPrinter &p) {
+  p << "(" << getLaneid() << ")";
+
+  SmallVector<StringRef> coreAttr = {getWarpSizeAttrName()};
+  auto warpSizeAttr = getOperation()->getAttr(getWarpSizeAttrName());
+  p << "[" << llvm::cast<IntegerAttr>(warpSizeAttr).getInt() << "]";
+
+  if (!getArgs().empty())
+    p << " args(" << getArgs() << " : " << getArgs().getTypes() << ")";
+  if (!getResults().empty())
+    p << " -> (" << getResults().getTypes() << ')';
+  p << " ";
+  p.printRegion(getRegion(),
+                /*printEntryBlockArgs=*/true,
+                /*printBlockTerminators=*/!getResults().empty());
+  p.printOptionalAttrDict(getOperation()->getAttrs(), coreAttr);
+}
+
+ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser,
+                                        OperationState &result) {
+  // Create the region.
+  result.regions.reserve(1);
+  Region *warpRegion = result.addRegion();
+
+  auto &builder = parser.getBuilder();
+  OpAsmParser::UnresolvedOperand laneId;
+
+  // Parse the `laneid` operand.
+  if (parser.parseLParen() ||
+      parser.parseOperand(laneId, /*allowResultNumber=*/false) ||
+      parser.parseRParen())
+    return failure();
+
+  int64_t warpSize;
+  if (parser.parseLSquare() || parser.parseInteger(warpSize) ||
+      parser.parseRSquare())
+    return failure();
+  result.addAttribute(getWarpSizeAttrName(OperationName(getOperationName(),
+                                                        builder.getContext())),
+                      builder.getI64IntegerAttr(warpSize));
+
+  if (parser.resolveOperand(laneId, builder.getIndexType(), result.operands))
+    return failure();
+
+  llvm::SMLoc inputsOperandsLoc;
+  SmallVector<OpAsmParser::UnresolvedOperand> inputsOperands;
+  SmallVector<Type> inputTypes;
+  if (succeeded(parser.parseOptionalKeyword("args"))) {
+    if (parser.parseLParen())
+      return failure();
+
+    inputsOperandsLoc = parser.getCurrentLocation();
+    if (parser.parseOperandList(inputsOperands) ||
+        parser.parseColonTypeList(inputTypes) || parser.parseRParen())
+      return failure();
+  }
+  if (parser.resolveOperands(inputsOperands, inputTypes, inputsOperandsLoc,
+                             result.operands))
+    return failure();
+
+  // Parse optional results type list.
+  if (parser.parseOptionalArrowTypeList(result.types))
+    return failure();
+  // Parse the region.
+  if (parser.parseRegion(*warpRegion, /*arguments=*/{},
+                         /*argTypes=*/{}))
+    return failure();
+  WarpExecuteOnLane0Op::ensureTerminator(*warpRegion, builder, result.location);
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+  return success();
+}
+
+void WarpExecuteOnLane0Op::getSuccessorRegions(
+    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+  if (!point.isParent()) {
+    regions.push_back(RegionSuccessor(getResults()));
+    return;
+  }
+
+  // The warp region is always executed.
+  regions.push_back(RegionSuccessor(&getWarpRegion()));
+}
+
+void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
+                                 TypeRange resultTypes, Value laneId,
+                                 int64_t warpSize) {
+  build(builder, result, resultTypes, laneId, warpSize,
+        /*operands=*/std::nullopt, /*argTypes=*/std::nullopt);
+}
+
+void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
+                                 TypeRange resultTypes, Value laneId,
+                                 int64_t warpSize, ValueRange args,
+                                 TypeRange blockArgTypes) {
+  result.addOperands(laneId);
+  result.addAttribute(getAttributeNames()[0],
+                      builder.getI64IntegerAttr(warpSize));
+  result.addTypes(resultTypes);
+  result.addOperands(args);
+  assert(args.size() == blockArgTypes.size());
+  OpBuilder::InsertionGuard guard(builder);
+  Region *warpRegion = result.addRegion();
+  Block *block = builder.createBlock(warpRegion);
+  for (auto [type, arg] : llvm::zip_equal(blockArgTypes, args))
+    block->addArgument(type, arg.getLoc());
+}
+
+/// Helper to check if the distributed vector type is consistent with the
+/// expanded type and distributed size.
+static LogicalResult verifyDistributedType(Type expanded, Type distributed,
+                                           int64_t warpSize, Operation *op) {
+  // If the types match, there is no distribution.
+  if (expanded == distributed)
+    return success();
+  auto expandedVecType = llvm::dyn_cast<VectorType>(expanded);
+  auto distributedVecType = llvm::dyn_cast<VectorType>(distributed);
+  if (!expandedVecType || !distributedVecType)
+    return op->emitOpError("expected vector type for distributed operands.");
+  if (expandedVecType.getRank() != distributedVecType.getRank() ||
+      expandedVecType.getElementType() != distributedVecType.getElementType())
+    return op->emitOpError(
+        "expected distributed vectors to have same rank and element type.");
+
+  SmallVector<int64_t> scales(expandedVecType.getRank(), 1);
+  for (int64_t i = 0, e = expandedVecType.getRank(); i < e; i++) {
+    int64_t eDim = expandedVecType.getDimSize(i);
+    int64_t dDim = distributedVecType.getDimSize(i);
+    if (eDim == dDim)
+      continue;
+    if (eDim % dDim != 0)
+      return op->emitOpError()
+             << "expected expanded vector dimension #" << i << " (" << eDim
+             << ") to be a multipler of the distributed vector dimension ("
+             << dDim << ")";
+    scales[i] = eDim / dDim;
+  }
+  if (std::accumulate(scales.begin(), scales.end(), 1,
+                      std::multiplies<int64_t>()) != warpSize)
+    return op->emitOpError()
+           << "incompatible distribution dimensions from " << expandedVecType
+           << " to " << distributedVecType << " with warp size = " << warpSize;
+
+  return success();
+}
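To make the rule above concrete: each expanded dimension must be a multiple of the matching distributed dimension, and the product of the per-dimension ratios must equal the warp size. A standalone sketch of the same arithmetic, illustrative only and not part of the patch:

```
// Standalone sketch of the shape-ratio rule verifyDistributedType enforces.
#include <cassert>
#include <cstdint>
#include <vector>

static bool isValidDistribution(const std::vector<int64_t> &expanded,
                                const std::vector<int64_t> &distributed,
                                int64_t warpSize) {
  if (expanded.size() != distributed.size())
    return false; // Ranks must match.
  int64_t product = 1;
  for (size_t i = 0; i < expanded.size(); ++i) {
    if (expanded[i] % distributed[i] != 0)
      return false; // Each dimension must divide evenly.
    product *= expanded[i] / distributed[i];
  }
  return product == warpSize;
}

int main() {
  // vector<64x8xf32> distributed over 32 lanes as vector<2x8xf32>:
  // ratios 32 * 1 == warp size, accepted.
  assert(isValidDistribution({64, 8}, {2, 8}, 32));
  // vector<128x128xi32> to vector<4x4xi32>: ratios 32 * 32 != 32, rejected
  // (the case the invalid.mlir test below exercises).
  assert(!isValidDistribution({128, 128}, {4, 4}, 32));
  return 0;
}
```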
+
+LogicalResult WarpExecuteOnLane0Op::verify() {
+  if (getArgs().size() != getWarpRegion().getNumArguments())
+    return emitOpError(
+        "expected same number op arguments and block arguments.");
+  auto yield =
+      cast<gpu::YieldOp>(getWarpRegion().getBlocks().begin()->getTerminator());
+  if (yield.getNumOperands() != getNumResults())
+    return emitOpError(
+        "expected same number of yield operands and return values.");
+  int64_t warpSize = getWarpSize();
+  for (auto [regionArg, arg] :
+       llvm::zip_equal(getWarpRegion().getArguments(), getArgs())) {
+    if (failed(verifyDistributedType(regionArg.getType(), arg.getType(),
+                                     warpSize, getOperation())))
+      return failure();
+  }
+  for (auto [yieldOperand, result] :
+       llvm::zip_equal(yield.getOperands(), getResults())) {
+    if (failed(verifyDistributedType(yieldOperand.getType(), result.getType(),
+                                     warpSize, getOperation())))
+      return failure();
+  }
+  return success();
+}
+
+bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
+  return succeeded(
+      verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
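For clients creating the moved op programmatically, the second builder above takes the distributed `args` together with the expanded `blockArgTypes` and creates the entry block; the caller still populates the body and the `gpu.yield` terminator. A hedged sketch of its use (types and values are illustrative):

```
// Sketch: creating gpu.warp_execute_on_lane_0 through the new builder.
// `distArg` is assumed to be a distributed vector<1xf32> value.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"

using namespace mlir;

gpu::WarpExecuteOnLane0Op buildWarpOp(OpBuilder &builder, Location loc,
                                      Value laneId, Value distArg) {
  // One result distributed as vector<1xf32>; the region sees the expanded
  // vector<32xf32> form of `distArg` as its block argument (warp size 32).
  auto distResultTy = VectorType::get({1}, builder.getF32Type());
  auto expandedArgTy = VectorType::get({32}, builder.getF32Type());
  return builder.create<gpu::WarpExecuteOnLane0Op>(
      loc, TypeRange{distResultTy}, laneId, /*warpSize=*/32,
      ValueRange{distArg}, TypeRange{expandedArgTy});
}
```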
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index db199a46e1637..2224c24dfc433 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -6501,188 +6501,6 @@ void SplatOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
   setResultRanges(getResult(), argRanges.front());
 }

-//===----------------------------------------------------------------------===//
-// WarpExecuteOnLane0Op
-//===----------------------------------------------------------------------===//
-
-void WarpExecuteOnLane0Op::print(OpAsmPrinter &p) {
-  p << "(" << getLaneid() << ")";
-
-  SmallVector<StringRef> coreAttr = {getWarpSizeAttrName()};
-  auto warpSizeAttr = getOperation()->getAttr(getWarpSizeAttrName());
-  p << "[" << llvm::cast<IntegerAttr>(warpSizeAttr).getInt() << "]";
-
-  if (!getArgs().empty())
-    p << " args(" << getArgs() << " : " << getArgs().getTypes() << ")";
-  if (!getResults().empty())
-    p << " -> (" << getResults().getTypes() << ')';
-  p << " ";
-  p.printRegion(getRegion(),
-                /*printEntryBlockArgs=*/true,
-                /*printBlockTerminators=*/!getResults().empty());
-  p.printOptionalAttrDict(getOperation()->getAttrs(), coreAttr);
-}
-
-ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser,
-                                        OperationState &result) {
-  // Create the region.
-  result.regions.reserve(1);
-  Region *warpRegion = result.addRegion();
-
-  auto &builder = parser.getBuilder();
-  OpAsmParser::UnresolvedOperand laneId;
-
-  // Parse predicate operand.
-  if (parser.parseLParen() ||
-      parser.parseOperand(laneId, /*allowResultNumber=*/false) ||
-      parser.parseRParen())
-    return failure();
-
-  int64_t warpSize;
-  if (parser.parseLSquare() || parser.parseInteger(warpSize) ||
-      parser.parseRSquare())
-    return failure();
-  result.addAttribute(getWarpSizeAttrName(OperationName(getOperationName(),
-                                                        builder.getContext())),
-                      builder.getI64IntegerAttr(warpSize));
-
-  if (parser.resolveOperand(laneId, builder.getIndexType(), result.operands))
-    return failure();
-
-  llvm::SMLoc inputsOperandsLoc;
-  SmallVector<OpAsmParser::UnresolvedOperand> inputsOperands;
-  SmallVector<Type> inputTypes;
-  if (succeeded(parser.parseOptionalKeyword("args"))) {
-    if (parser.parseLParen())
-      return failure();
-
-    inputsOperandsLoc = parser.getCurrentLocation();
-    if (parser.parseOperandList(inputsOperands) ||
-        parser.parseColonTypeList(inputTypes) || parser.parseRParen())
-      return failure();
-  }
-  if (parser.resolveOperands(inputsOperands, inputTypes, inputsOperandsLoc,
-                             result.operands))
-    return failure();
-
-  // Parse optional results type list.
-  if (parser.parseOptionalArrowTypeList(result.types))
-    return failure();
-  // Parse the region.
-  if (parser.parseRegion(*warpRegion, /*arguments=*/{},
-                         /*argTypes=*/{}))
-    return failure();
-  WarpExecuteOnLane0Op::ensureTerminator(*warpRegion, builder, result.location);
-
-  // Parse the optional attribute list.
-  if (parser.parseOptionalAttrDict(result.attributes))
-    return failure();
-  return success();
-}
-
-void WarpExecuteOnLane0Op::getSuccessorRegions(
-    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
-  if (!point.isParent()) {
-    regions.push_back(RegionSuccessor(getResults()));
-    return;
-  }
-
-  // The warp region is always executed
-  regions.push_back(RegionSuccessor(&getWarpRegion()));
-}
-
-void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
-                                 TypeRange resultTypes, Value laneId,
-                                 int64_t warpSize) {
-  build(builder, result, resultTypes, laneId, warpSize,
-        /*operands=*/std::nullopt, /*argTypes=*/std::nullopt);
-}
-
-void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
-                                 TypeRange resultTypes, Value laneId,
-                                 int64_t warpSize, ValueRange args,
-                                 TypeRange blockArgTypes) {
-  result.addOperands(laneId);
-  result.addAttribute(getAttributeNames()[0],
-                      builder.getI64IntegerAttr(warpSize));
-  result.addTypes(resultTypes);
-  result.addOperands(args);
-  assert(args.size() == blockArgTypes.size());
-  OpBuilder::InsertionGuard guard(builder);
-  Region *warpRegion = result.addRegion();
-  Block *block = builder.createBlock(warpRegion);
-  for (auto [type, arg] : llvm::zip_equal(blockArgTypes, args))
-    block->addArgument(type, arg.getLoc());
-}
-
-/// Helper check if the distributed vector type is consistent with the expanded
-/// type and distributed size.
-static LogicalResult verifyDistributedType(Type expanded, Type distributed,
-                                           int64_t warpSize, Operation *op) {
-  // If the types matches there is no distribution.
-  if (expanded == distributed)
-    return success();
-  auto expandedVecType = llvm::dyn_cast<VectorType>(expanded);
-  auto distributedVecType = llvm::dyn_cast<VectorType>(distributed);
-  if (!expandedVecType || !distributedVecType)
-    return op->emitOpError("expected vector type for distributed operands.");
-  if (expandedVecType.getRank() != distributedVecType.getRank() ||
-      expandedVecType.getElementType() != distributedVecType.getElementType())
-    return op->emitOpError(
-        "expected distributed vectors to have same rank and element type.");
-
-  SmallVector<int64_t> scales(expandedVecType.getRank(), 1);
-  for (int64_t i = 0, e = expandedVecType.getRank(); i < e; i++) {
-    int64_t eDim = expandedVecType.getDimSize(i);
-    int64_t dDim = distributedVecType.getDimSize(i);
-    if (eDim == dDim)
-      continue;
-    if (eDim % dDim != 0)
-      return op->emitOpError()
-             << "expected expanded vector dimension #" << i << " (" << eDim
-             << ") to be a multipler of the distributed vector dimension ("
-             << dDim << ")";
-    scales[i] = eDim / dDim;
-  }
-  if (std::accumulate(scales.begin(), scales.end(), 1,
-                      std::multiplies<int64_t>()) != warpSize)
-    return op->emitOpError()
-           << "incompatible distribution dimensions from " << expandedVecType
-           << " to " << distributedVecType << " with warp size = " << warpSize;
-
-  return success();
-}
-
-LogicalResult WarpExecuteOnLane0Op::verify() {
-  if (getArgs().size() != getWarpRegion().getNumArguments())
-    return emitOpError(
-        "expected same number op arguments and block arguments.");
-  auto yield =
-      cast<YieldOp>(getWarpRegion().getBlocks().begin()->getTerminator());
-  if (yield.getNumOperands() != getNumResults())
-    return emitOpError(
-        "expected same number of yield operands and return values.");
-  int64_t warpSize = getWarpSize();
-  for (auto [regionArg, arg] :
-       llvm::zip_equal(getWarpRegion().getArguments(), getArgs())) {
-    if (failed(verifyDistributedType(regionArg.getType(), arg.getType(),
-                                     warpSize, getOperation())))
-      return failure();
-  }
-  for (auto [yieldOperand, result] :
-       llvm::zip_equal(yield.getOperands(), getResults())) {
-    if (failed(verifyDistributedType(yieldOperand.getType(), result.getType(),
-                                     warpSize, getOperation())))
-      return failure();
-  }
-  return success();
-}
-
-bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
-  return succeeded(
-      verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
-}
-
 Value mlir::vector::makeArithReduction(OpBuilder &b, Location loc,
                                        CombiningKind kind, Value v1, Value acc,
                                        arith::FastMathFlagsAttr fastmath,
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index dc5eb2527f949..3e14259836995 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -8,6 +8,7 @@

 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -22,14 +23,15 @@

 using namespace mlir;
 using namespace mlir::vector;
+using namespace mlir::gpu;

 /// Currently the distribution map is implicit based on the vector shape. In the
 /// future it will be part of the op.
 /// Example:
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
 ///   ...
-///   vector.yield %3 : vector<32x16x64xf32>
+///   gpu.yield %3 : vector<32x16x64xf32>
 /// }
 /// ```
 /// Would have an implicit map of:
@@ -117,13 +119,13 @@ struct DistributedLoadStoreHelper {
   /// 2. vectors of type V transit through a memref<V>
   ///
   /// When broadcastMode is true, the load is not distributed to account for
-  /// the broadcast semantics of the `vector.warp_execute_on_lane_0` op.
+  /// the broadcast semantics of the `gpu.warp_execute_on_lane_0` op.
   ///
   /// Example:
   ///
   /// ```
-  /// %r = vector.warp_execute_on_lane_0(...) -> (f32) {
-  ///   vector.yield %cst : f32
+  /// %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
+  ///   gpu.yield %cst : f32
   /// }
   /// // Both types are f32. The constant %cst is broadcasted to all lanes.
   /// ```
@@ -180,10 +182,10 @@ static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
          "expected WarpOp with single block");

   auto yield =
-      cast<vector::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
+      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());

   rewriter.modifyOpInPlace(
-      yield, [&]() { yield.getOperandsMutable().assign(newYieldedValues); });
+      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
   return newWarpOp;
 }
@@ -195,7 +197,7 @@ static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
     llvm::SmallVector<size_t> &indices) {
   SmallVector<Type> types(warpOp.getResultTypes().begin(),
                           warpOp.getResultTypes().end());
-  auto yield = cast<vector::YieldOp>(
+  auto yield = cast<gpu::YieldOp>(
       warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
   llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
                                               yield.getOperands().end());
@@ -233,7 +235,7 @@ static bool canBeHoisted(Operation *op,
 /// condition and is not dead.
 static OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
                                 const std::function<bool(Operation *)> &fn) {
-  auto yield = cast<vector::YieldOp>(
+  auto yield = cast<gpu::YieldOp>(
       warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
   for (OpOperand &yieldOperand : yield->getOpOperands()) {
     Value yieldValues = yieldOperand.get();
@@ -348,7 +350,7 @@ struct WarpOpToScfIfPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
     // TODO: at this point, we can reuse the shared memory from previous
     // buffers.
     SmallVector<Value> replacements;
-    auto yieldOp = cast<vector::YieldOp>(ifOp.thenBlock()->getTerminator());
+    auto yieldOp = cast<gpu::YieldOp>(ifOp.thenBlock()->getTerminator());
     Location yieldLoc = yieldOp.getLoc();
     for (const auto &it : llvm::enumerate(yieldOp.getOperands())) {
       Value sequentialVal = it.value();
@@ -370,8 +372,8 @@ struct WarpOpToScfIfPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
     rewriter.setInsertionPointAfter(ifOp);
     // Result type and yielded value type are the same. This is a broadcast.
     // E.g.:
-    // %r = vector.warp_execute_on_lane_0(...) -> (f32) {
-    //   vector.yield %cst : f32
+    // %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
+    //   gpu.yield %cst : f32
     // }
     // Both types are f32. The constant %cst is broadcasted to all lanes.
     // This is described in more detail in the documentation of the op.
@@ -472,17 +474,17 @@ static VectorType getDistributedType(VectorType originalType, AffineMap map,
 ///
 /// Example:
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%id){
+/// %0 = gpu.warp_execute_on_lane_0(%id){
 ///   ...
 ///   vector.transfer_write %v, %A[%c0] : vector<32xf32>, memref<128xf32>
-///   vector.yield
+///   gpu.yield
 /// }
 /// ```
 /// To
 /// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
+/// %r:3 = gpu.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
 ///   ...
-///   vector.yield %v : vector<32xf32>
+///   gpu.yield %v : vector<32xf32>
 /// }
 /// vector.transfer_write %v, %A[%id] : vector<1xf32>, memref<128xf32>
 struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
@@ -598,7 +600,7 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {

     // Do not process warp ops that contain only TransferWriteOps.
     if (llvm::all_of(warpOp.getOps(),
-                     llvm::IsaPred<vector::TransferWriteOp, vector::YieldOp>))
+                     llvm::IsaPred<vector::TransferWriteOp, gpu::YieldOp>))
       return failure();

     SmallVector<Value> yieldValues = {writeOp.getVector()};
@@ -617,13 +619,13 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
         cast<vector::TransferWriteOp>(rewriter.clone(*writeOp.getOperation()));
     newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0]));
     rewriter.eraseOp(writeOp);
-    rewriter.create<vector::YieldOp>(newWarpOp.getLoc());
+    rewriter.create<gpu::YieldOp>(newWarpOp.getLoc());
     return success();
   }

   LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
-    auto yield = cast<vector::YieldOp>(
+    auto yield = cast<gpu::YieldOp>(
         warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
     Operation *lastNode = yield->getPrevNode();
     auto writeOp = dyn_cast_or_null<vector::TransferWriteOp>(lastNode);
@@ -658,19 +660,19 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {

 /// Sink out elementwise op feeding into a warp op yield.
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
 ///   ...
 ///   %3 = arith.addf %1, %2 : vector<32xf32>
-///   vector.yield %3 : vector<32xf32>
+///   gpu.yield %3 : vector<32xf32>
 /// }
 /// ```
 /// To
 /// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
+/// %r:3 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
 /// vector<1xf32>, vector<1xf32>) {
 ///   ...
 ///   %4 = arith.addf %2, %3 : vector<32xf32>
-///   vector.yield %4, %2, %3 : vector<32xf32>, vector<32xf32>,
+///   gpu.yield %4, %2, %3 : vector<32xf32>, vector<32xf32>,
 ///   vector<32xf32>
 /// }
 /// %0 = arith.addf %r#1, %r#2 : vector<1xf32>
@@ -728,15 +730,15 @@ struct WarpOpElementwise : public OpRewritePattern<WarpExecuteOnLane0Op> {

 /// Sink out splat constant op feeding into a warp op yield.
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
 ///   ...
 ///   %cst = arith.constant dense<2.0> : vector<32xf32>
-///   vector.yield %cst : vector<32xf32>
+///   gpu.yield %cst : vector<32xf32>
 /// }
 /// ```
 /// To
 /// ```
-/// vector.warp_execute_on_lane_0(%arg0 {
+/// gpu.warp_execute_on_lane_0(%arg0 {
 ///   ...
 /// }
 /// %0 = arith.constant dense<2.0> : vector<1xf32>
@@ -821,20 +823,20 @@ bool delinearizeLaneId(OpBuilder &builder, Location loc,

 /// Sink out transfer_read op feeding into a warp op yield.
 /// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
 ///   ...
 //    %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
 //    vector<32xf32>
-///   vector.yield %2 : vector<32xf32>
+///   gpu.yield %2 : vector<32xf32>
 /// }
 /// ```
 /// To
 /// ```
-/// %dead = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
+/// %dead = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
 /// vector<1xf32>, vector<1xf32>) {
 ///   ...
 ///   %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
-///   vector<32xf32> vector.yield %2 : vector<32xf32>
+///   vector<32xf32> gpu.yield %2 : vector<32xf32>
 /// }
 /// %0 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<1xf32>
 struct WarpOpTransferRead : public OpRewritePattern<WarpExecuteOnLane0Op> {
@@ -959,7 +961,7 @@ struct WarpOpDeadResult : public OpRewritePattern<WarpExecuteOnLane0Op> {
     newYieldValues.reserve(warpOp->getNumResults());
     DenseMap<Value, int64_t> dedupYieldOperandPositionMap;
     DenseMap<OpResult, int64_t> dedupResultPositionMap;
-    auto yield = cast<vector::YieldOp>(
+    auto yield = cast<gpu::YieldOp>(
         warpOp.getBodyRegion().getBlocks().begin()->getTerminator());

     // Some values may be yielded multiple times and correspond to multiple
@@ -1016,7 +1018,7 @@ struct WarpOpForwardOperand : public OpRewritePattern<WarpExecuteOnLane0Op> {
                                 PatternRewriter &rewriter) const override {
     SmallVector<Type> resultTypes;
     SmallVector<Value> yieldValues;
-    auto yield = cast<vector::YieldOp>(
+    auto yield = cast<gpu::YieldOp>(
         warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
     Value valForwarded;
     unsigned resultIndex;
@@ -1135,16 +1137,16 @@ struct WarpOpShapeCast : public OpRewritePattern<WarpExecuteOnLane0Op> {

 /// Sink out vector.create_mask op feeding into a warp op yield.
 /// ```
 /// %0 = ...
-/// %1 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %1 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
 ///   ...
 ///   %mask = vector.create_mask %0 : vector<32xi1>
-///   vector.yield %mask : vector<32xi1>
+///   gpu.yield %mask : vector<32xi1>
 /// }
 /// ```
 /// To
 /// ```
 /// %0 = ...
-/// vector.warp_execute_on_lane_0(%arg0) {
+/// gpu.warp_execute_on_lane_0(%arg0) {
 ///   ...
 /// }
 /// %cmp = arith.cmpi ult, %laneid, %0
@@ -1652,28 +1654,28 @@ struct WarpOpInsertElement : public OpRewritePattern<WarpExecuteOnLane0Op> {
 /// WarpExecuteOnLane0Op. The new scf.for region will contain a new
 /// WarpExecuteOnLane0Op region. Example:
 /// ```
-/// %w = vector.warp_execute_on_lane_0(%laneid) -> (vector<4xf32>) {
+/// %w = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4xf32>) {
 ///   ...
 ///   %v1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %v)
 ///   -> (vector<128xf32>) {
 ///     ...
 ///     scf.yield %r : vector<128xf32>
 ///   }
-///   vector.yield %v1 : vector<128xf32>
+///   gpu.yield %v1 : vector<128xf32>
 /// }
 /// ```
 /// To:
-/// %w0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<4xf32>) {
+/// %w0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<4xf32>) {
 ///   ...
-///   vector.yield %v : vector<128xf32>
+///   gpu.yield %v : vector<128xf32>
 /// }
 /// %w = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%varg = %q0)
 ///   -> (vector<4xf32>) {
-///   %iw = vector.warp_execute_on_lane_0(%laneid)
+///   %iw = gpu.warp_execute_on_lane_0(%laneid)
 ///   args(%varg : vector<4xf32>) -> (vector<4xf32>) {
 ///   ^bb0(%arg: vector<128xf32>):
 ///     ...
-///     vector.yield %ir : vector<128xf32>
+///     gpu.yield %ir : vector<128xf32>
 ///   }
 ///   scf.yield %iw : vector<4xf32>
 /// }
@@ -1686,7 +1688,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
-    auto yield = cast<vector::YieldOp>(
+    auto yield = cast<gpu::YieldOp>(
        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
     // Only pick up forOp if it is the last op in the region.
     Operation *lastNode = yield->getPrevNode();
@@ -1722,7 +1724,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
     WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
         rewriter, warpOp, escapingValues.getArrayRef(), distTypes,
         newRetIndices);
-    yield = cast<vector::YieldOp>(
+    yield = cast<gpu::YieldOp>(
         newWarpOp.getBodyRegion().getBlocks().begin()->getTerminator());

     SmallVector<Value> newOperands;
@@ -1774,7 +1776,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
     rewriter.eraseOp(forOp.getBody()->getTerminator());
     rewriter.mergeBlocks(forOp.getBody(), innerWarp.getBody(), argMapping);
     rewriter.setInsertionPointToEnd(innerWarp.getBody());
-    rewriter.create<vector::YieldOp>(innerWarp.getLoc(), yieldOperands);
+    rewriter.create<gpu::YieldOp>(innerWarp.getLoc(), yieldOperands);
     rewriter.setInsertionPointAfter(innerWarp);
     if (!innerWarp.getResults().empty())
       rewriter.create<scf::YieldOp>(forOp.getLoc(), innerWarp.getResults());
@@ -1807,17 +1809,17 @@
 /// The vector is reduced in parallel. Currently limited to vector size
 /// matching the warpOp size.
E.g.: /// ``` -/// %r = vector_ext.warp_execute_on_lane_0(%laneid)[32] -> (f32) { +/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { /// %0 = "some_def"() : () -> (vector<32xf32>) /// %1 = vector.reduction "add", %0 : vector<32xf32> into f32 -/// vector_ext.yield %1 : f32 +/// gpu.yield %1 : f32 /// } /// ``` /// is lowered to: /// ``` -/// %0 = vector_ext.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { +/// %0 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { /// %1 = "some_def"() : () -> (vector<32xf32>) -/// vector_ext.yield %1 : vector<32xf32> +/// gpu.yield %1 : vector<32xf32> /// } /// %a = vector.extract %0[0] : f32 from vector<1xf32> /// %r = ("warp.reduction %a") diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir index cd62b7b13fa9a..2242786fe6759 100644 --- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir +++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir @@ -2,7 +2,7 @@ func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) { %c0 = arith.constant 0 : index - vector.warp_execute_on_lane_0(%arg0)[32] { + gpu.warp_execute_on_lane_0(%arg0)[32] { // CHECK:%[[val:[0-9]+]] = llvm.extractelement // CHECK:%[[base:[0-9]+]] = llvm.extractvalue // CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]] diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index 2a0f7e8c6b10c..16148a493ce6e 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -877,3 +877,89 @@ gpu.binary @binary [#gpu.object<#rocdl.target, ]>, bin = "BLOB"> ] + +// ----- + +func.func @warp_wrong_num_outputs(%laneid: index) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected same number of yield operands and return values.}} + %2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) { + } + return +} + +// ----- + +func.func @warp_wrong_num_inputs(%laneid: index) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected same number op arguments and block arguments.}} + gpu.warp_execute_on_lane_0(%laneid)[64] { + ^bb0(%arg0 : vector<128xi32>) : + } + return +} + +// ----- + +func.func @warp_wrong_return_distribution(%laneid: index) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}} + %2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) { + %0 = arith.constant dense<2>: vector<128xi32> + gpu.yield %0 : vector<128xi32> + } + return +} + + +// ----- + +func.func @warp_wrong_arg_distribution(%laneid: index, %v0 : vector<4xi32>) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}} + gpu.warp_execute_on_lane_0(%laneid)[64] + args(%v0 : vector<4xi32>) { + ^bb0(%arg0 : vector<128xi32>) : + } + return +} + +// ----- + +func.func @warp_2_distributed_dims(%laneid: index) { + // expected-error@+1 {{incompatible distribution dimensions from 'vector<128x128xi32>' to 'vector<4x4xi32>' with warp size = 32}} + %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) { + %0 = arith.constant dense<2>: vector<128x128xi32> + gpu.yield %0 : vector<128x128xi32> + } + return +} + +// ----- + +func.func @warp_2_distributed_dims(%laneid: index) { + // expected-error@+1 {{expected expanded vector dimension #1 (8) to be a multipler of the distributed vector dimension (3)}} + %2 = 
gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x3xi32>) { + %0 = arith.constant dense<2>: vector<4x8xi32> + gpu.yield %0 : vector<4x8xi32> + } + return +} + +// ----- + +func.func @warp_mismatch_rank(%laneid: index) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected distributed vectors to have same rank and element type.}} + %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) { + %0 = arith.constant dense<2>: vector<128xi32> + gpu.yield %0 : vector<128xi32> + } + return +} + +// ----- + +func.func @warp_mismatch_rank(%laneid: index) { + // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected vector type for distributed operands.}} + %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (i32) { + %0 = arith.constant dense<2>: vector<128xi32> + gpu.yield %0 : vector<128xi32> + } + return +} diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index b9c0a0e79e8f2..c0ff2044b76c4 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -464,3 +464,39 @@ gpu.binary @kernel_attrs_2 [ ]>, bin = "BLOB"> ] + +// CHECK-LABEL: func @warp_execute_on_lane_0( +func.func @warp_execute_on_lane_0(%laneid: index) { +// CHECK-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { +// CHECK-NEXT: } + } +// CHECK-NEXT: return + return +} + +// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d +func.func @warp_execute_on_lane_0_2d(%laneid: index) { + // CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>) + %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) { + %0 = arith.constant dense<2>: vector<4x32xi32> + // CHECK: gpu.yield %{{.+}} : vector<4x32xi32> + gpu.yield %0 : vector<4x32xi32> + } + return +} + +// CHECK-LABEL: func @warp_operand_result( +func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) { +// CHECK-NEXT: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) { + %2 = gpu.warp_execute_on_lane_0(%laneid)[32] + args(%v0 : vector<4xi32>) -> (vector<4xi32>) { + ^bb0(%arg0 : vector<128xi32>) : + %0 = arith.constant dense<2>: vector<128xi32> + %1 = arith.addi %arg0, %0 : vector<128xi32> +// CHECK: gpu.yield %{{.*}} : vector<128xi32> + gpu.yield %1 : vector<128xi32> +// CHECK-NEXT: } + } + return %2 : vector<4xi32> +} diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index d591c60acb64e..b3077a38e92c0 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1611,92 +1611,6 @@ func.func @invalid_splat(%v : f32) { // ----- -func.func @warp_wrong_num_outputs(%laneid: index) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected same number of yield operands and return values.}} - %2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) { - } - return -} - -// ----- - -func.func @warp_wrong_num_inputs(%laneid: index) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected same number op arguments and block arguments.}} - vector.warp_execute_on_lane_0(%laneid)[64] { - ^bb0(%arg0 : vector<128xi32>) : - } - return -} - -// ----- - -func.func @warp_wrong_return_distribution(%laneid: index) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}} - %2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) { - %0 = arith.constant dense<2>: vector<128xi32> - vector.yield 
%0 : vector<128xi32> - } - return -} - - -// ----- - -func.func @warp_wrong_arg_distribution(%laneid: index, %v0 : vector<4xi32>) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}} - vector.warp_execute_on_lane_0(%laneid)[64] - args(%v0 : vector<4xi32>) { - ^bb0(%arg0 : vector<128xi32>) : - } - return -} - -// ----- - -func.func @warp_2_distributed_dims(%laneid: index) { - // expected-error@+1 {{incompatible distribution dimensions from 'vector<128x128xi32>' to 'vector<4x4xi32>' with warp size = 32}} - %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) { - %0 = arith.constant dense<2>: vector<128x128xi32> - vector.yield %0 : vector<128x128xi32> - } - return -} - -// ----- - -func.func @warp_2_distributed_dims(%laneid: index) { - // expected-error@+1 {{expected expanded vector dimension #1 (8) to be a multipler of the distributed vector dimension (3)}} - %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x3xi32>) { - %0 = arith.constant dense<2>: vector<4x8xi32> - vector.yield %0 : vector<4x8xi32> - } - return -} - -// ----- - -func.func @warp_mismatch_rank(%laneid: index) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected distributed vectors to have same rank and element type.}} - %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) { - %0 = arith.constant dense<2>: vector<128xi32> - vector.yield %0 : vector<128xi32> - } - return -} - -// ----- - -func.func @warp_mismatch_rank(%laneid: index) { - // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected vector type for distributed operands.}} - %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (i32) { - %0 = arith.constant dense<2>: vector<128xi32> - vector.yield %0 : vector<128xi32> - } - return -} - -// ----- - func.func @vector_mask_multiple_ops(%t0: tensor, %t1: tensor, %idx: index, %val: vector<16xf32>, %m0: vector<16xi1>) { %ft0 = arith.constant 0.0 : f32 // expected-error@+1 {{'vector.mask' op expects only one operation to mask}} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 3baacba9b6124..7a0f67590f3ec 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -942,41 +942,6 @@ func.func @vector_splat_0d(%a: f32) -> vector { return %0 : vector } -// CHECK-LABEL: func @warp_execute_on_lane_0( -func.func @warp_execute_on_lane_0(%laneid: index) { -// CHECK-NEXT: vector.warp_execute_on_lane_0(%{{.*}})[32] { - vector.warp_execute_on_lane_0(%laneid)[32] { -// CHECK-NEXT: } - } -// CHECK-NEXT: return - return -} - -// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d -func.func @warp_execute_on_lane_0_2d(%laneid: index) { - // CHECK: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>) - %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) { - %0 = arith.constant dense<2>: vector<4x32xi32> - // CHECK: vector.yield %{{.+}} : vector<4x32xi32> - vector.yield %0 : vector<4x32xi32> - } - return -} - -// CHECK-LABEL: func @warp_operand_result( -func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) { -// CHECK-NEXT: %{{.*}} = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) { - %2 = vector.warp_execute_on_lane_0(%laneid)[32] - args(%v0 : vector<4xi32>) -> (vector<4xi32>) { - ^bb0(%arg0 : vector<128xi32>) : - %0 = arith.constant dense<2>: vector<128xi32> - %1 = arith.addi %arg0, %0 : vector<128xi32> -// CHECK: 
vector.yield %{{.*}} : vector<128xi32> - vector.yield %1 : vector<128xi32> -// CHECK-NEXT: } - } - return %2 : vector<4xi32> -} // CHECK-LABEL: func @vector_mask func.func @vector_mask(%a: vector<8xi32>, %m0: vector<8xi1>) -> i32 { diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index b4491812dc26c..dbe0b39422369 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -44,7 +44,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index, // CHECK-SCF-IF-DAG: %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32 // CHECK-SCF-IF: scf.if %[[is_lane_0]] { - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) { ^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>): // CHECK-SCF-IF: %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32> @@ -55,7 +55,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index, %3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32> // CHECK-SCF-IF: vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]] // CHECK-SCF-IF: vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]] - vector.yield %2, %3 : vector<32xf32>, vector<64xf32> + gpu.yield %2, %3 : vector<32xf32>, vector<64xf32> } // CHECK-SCF-IF: } // CHECK-SCF-IF: gpu.barrier @@ -77,17 +77,17 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index, // CHECK-HOIST: memref.subview // CHECK-HOIST: memref.subview // CHECK-HOIST: memref.subview -// CHECK-HOIST: vector.warp_execute_on_lane_0 +// CHECK-HOIST: gpu.warp_execute_on_lane_0 -// CHECK-D: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) { +// CHECK-D: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) { // CHECK-D: arith.addf {{.*}} : vector<32xf32> // CHECK-D: arith.addf {{.*}} : vector<64xf32> -// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32> +// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32> // CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32 // CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}] // CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32 -// CHECK-DIST-AND-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-DIST-AND-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> // CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32> // CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32> @@ -99,7 +99,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index, func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>, %arg3: memref<1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> %sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> %sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>> @@ -121,20 +121,20 @@ func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: 
memref<1024xf32> // ----- // CHECK-D-LABEL: func @warp_extract( -// CHECK-D: %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>) +// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>) // CHECK-D: "test.dummy_op" // CHECK-D: "test.dummy_op" -// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32> +// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32> // CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32> // CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32> // CHECK-D: } func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %c0 = arith.constant 0 : index %v = "test.dummy_op"() : () -> (vector<1xf32>) %v1 = "test.dummy_op"() : () -> (vector<1x1xf32>) @@ -149,20 +149,20 @@ func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : ind // Check that we can distribute writes of the maximum allowed number of elements. // CHECK-D-LABEL: func @warp_extract_4_elems( -// CHECK-D: %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>) +// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>) // CHECK-D: "test.dummy_op" // CHECK-D: "test.dummy_op" -// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32> +// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32> // CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<4x1xf32> // CHECK-D: } -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<4xf32> // CHECK-D: } func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %c0 = arith.constant 0 : index %v = "test.dummy_op"() : () -> (vector<4xf32>) %v1 = "test.dummy_op"() : () -> (vector<4x1xf32>) @@ -179,7 +179,7 @@ func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g // CHECK-D-LABEL: func @warp_extract_5_elems( // CHECK-D: arith.constant 0 : index -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: %[[V:.+]] = "test.dummy_op" // CHECK-D: %[[V1:.+]] = "test.dummy_op" // CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<5x1xf32> @@ -187,7 +187,7 @@ func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g // CHECK-D: } func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %c0 = arith.constant 0 : index %v = "test.dummy_op"() : () -> (vector<5xf32>) %v1 = "test.dummy_op"() : () -> (vector<5x1xf32>) @@ 
-204,7 +204,7 @@ func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g // CHECK-D-LABEL: func @warp_extract_8_elems( // CHECK-D: arith.constant 0 : index -// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: %[[V:.+]] = "test.dummy_op" // CHECK-D: %[[V1:.+]] = "test.dummy_op" // CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<8x1xf32> @@ -212,7 +212,7 @@ func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g // CHECK-D: } func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) { - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %c0 = arith.constant 0 : index %v = "test.dummy_op"() : () -> (vector<8xf32>) %v1 = "test.dummy_op"() : () -> (vector<8x1xf32>) @@ -226,14 +226,14 @@ func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g // CHECK-PROP-LABEL: func @warp_dead_result( func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) { - // CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) - %r:3 = vector.warp_execute_on_lane_0(%laneid)[32] -> + // CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) + %r:3 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>, vector<1xf32>) { %2 = "some_def"() : () -> (vector<32xf32>) %3 = "some_def"() : () -> (vector<32xf32>) %4 = "some_def"() : () -> (vector<32xf32>) - // CHECK-PROP: vector.yield %{{.*}} : vector<32xf32> - vector.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32> + // CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32> + gpu.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32> } // CHECK-PROP: return %[[R]] : vector<1xf32> return %r#1 : vector<1xf32> @@ -245,10 +245,10 @@ func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) { // CHECK-PROP-SAME: %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>) func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>) -> (vector<4xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] + %r = gpu.warp_execute_on_lane_0(%laneid)[32] args(%v0 : vector<4xf32>) -> (vector<4xf32>) { ^bb0(%arg0 : vector<128xf32>) : - vector.yield %arg0 : vector<128xf32> + gpu.yield %arg0 : vector<128xf32> } // CHECK-PROP: return %[[V]] : vector<4xf32> return %r : vector<4xf32> @@ -263,21 +263,21 @@ func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) { %c0 = arith.constant 0 : index %c32 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - // CHECK-PROP: %[[R:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>) - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> + // CHECK-PROP: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>) + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<2xf32>) { // CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32> // CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32> // CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32> // CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32> - // CHECK-PROP: vector.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32> + // CHECK-PROP: gpu.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, 
vector<32xf32>, vector<64xf32>, vector<64xf32> %2 = "some_def"() : () -> (vector<32xf32>) %3 = "some_def"() : () -> (vector<32xf32>) %4 = "some_def"() : () -> (vector<64xf32>) %5 = "some_def"() : () -> (vector<64xf32>) %6 = arith.addf %2, %3 : vector<32xf32> %7 = arith.addf %4, %5 : vector<64xf32> - vector.yield %6, %7 : vector<32xf32>, vector<64xf32> + gpu.yield %6, %7 : vector<32xf32>, vector<64xf32> } // CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32> // CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32> @@ -292,18 +292,18 @@ func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) { // ----- // CHECK-PROP-LABEL: func @warp_propagate_scalar_arith( -// CHECK-PROP: %[[r:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} { +// CHECK-PROP: %[[r:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} { // CHECK-PROP: %[[some_def0:.*]] = "some_def" // CHECK-PROP: %[[some_def1:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def0]], %[[some_def1]] +// CHECK-PROP: gpu.yield %[[some_def0]], %[[some_def1]] // CHECK-PROP: } // CHECK-PROP: arith.addf %[[r]]#0, %[[r]]#1 : f32 func.func @warp_propagate_scalar_arith(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (f32) %1 = "some_def"() : () -> (f32) %2 = arith.addf %0, %1 : f32 - vector.yield %2 : f32 + gpu.yield %2 : f32 } vector.print %r : f32 return @@ -312,13 +312,13 @@ func.func @warp_propagate_scalar_arith(%laneid: index) { // ----- // CHECK-PROP-LABEL: func @warp_propagate_cast( -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-PROP: %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32 // CHECK-PROP: return %[[result]] func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %casted = arith.sitofp %i : i32 to f32 - vector.yield %casted : f32 + gpu.yield %casted : f32 } return %r : f32 } @@ -341,10 +341,10 @@ func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: me %c0 = arith.constant 0 : index %c32 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) { %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32> %3 = vector.transfer_read %src[%c32], %cst : memref<1024xf32>, vector<64xf32> - vector.yield %2, %3 : vector<32xf32>, vector<64xf32> + gpu.yield %2, %3 : vector<32xf32>, vector<64xf32> } %id2 = affine.apply #map0()[%laneid] vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32> @@ -355,15 +355,15 @@ func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: me // ----- // CHECK-PROP-LABEL: func @fold_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) +// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) // CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32> +// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32> // CHECK-PROP: vector.print %[[r]] : vector<1xf32> func.func @fold_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { + %r = 
gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { %0 = "some_def"() : () -> (vector<1xf32>) %1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32> - vector.yield %1 : vector<32xf32> + gpu.yield %1 : vector<32xf32> } vector.print %r : vector<1xf32> return @@ -372,16 +372,16 @@ func.func @fold_vector_broadcast(%laneid: index) { // ----- // CHECK-PROP-LABEL: func @extract_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) +// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) // CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32> +// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32> // CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32> // CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> func.func @extract_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { %0 = "some_def"() : () -> (vector<1xf32>) %1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32> - vector.yield %1 : vector<64xf32> + gpu.yield %1 : vector<64xf32> } vector.print %r : vector<2xf32> return @@ -390,16 +390,16 @@ func.func @extract_vector_broadcast(%laneid: index) { // ----- // CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (f32) +// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (f32) // CHECK-PROP: %[[some_def:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[some_def]] : f32 +// CHECK-PROP: gpu.yield %[[some_def]] : f32 // CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32> // CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32> func.func @extract_scalar_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) { %0 = "some_def"() : () -> (f32) %1 = vector.broadcast %0 : f32 to vector<64xf32> - vector.yield %1 : vector<64xf32> + gpu.yield %1 : vector<64xf32> } vector.print %r : vector<2xf32> return @@ -408,16 +408,16 @@ func.func @extract_scalar_vector_broadcast(%laneid: index) { // ----- // CHECK-PROP-LABEL: func @warp_scf_for( -// CHECK-PROP: %[[INI:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) { +// CHECK-PROP: %[[INI:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) { // CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]] : vector<128xf32> +// CHECK-PROP: gpu.yield %[[INI1]] : vector<128xf32> // CHECK-PROP: } // CHECK-PROP: %[[F:.*]] = scf.for %[[IT:.+]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) { // CHECK-PROP: %[[A:.*]] = arith.addi %[[IT]], %{{.*}} : index -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) { // CHECK-PROP: ^bb0(%[[ARG:.*]]: vector<128xf32>): // CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[A]], %[[ARG]]) : (index, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32> +// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32> // CHECK-PROP: } // CHECK-PROP: scf.yield %[[W]] : vector<4xf32> // CHECK-PROP: } @@ 
-426,14 +426,14 @@ func.func @warp_scf_for(%arg0: index) { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { + %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { %ini = "some_def"() : () -> (vector<128xf32>) %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { %add = arith.addi %arg3, %c1 : index %acc = "some_def"(%add, %arg4) : (index, vector<128xf32>) -> (vector<128xf32>) scf.yield %acc : vector<128xf32> } - vector.yield %3 : vector<128xf32> + gpu.yield %3 : vector<128xf32> } "some_use"(%0) : (vector<4xf32>) -> () return @@ -442,16 +442,16 @@ func.func @warp_scf_for(%arg0: index) { // ----- // CHECK-PROP-LABEL: func @warp_scf_for_use_from_above( -// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { // CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> // CHECK-PROP: %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: gpu.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32> // CHECK-PROP: } // CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) { -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) { // CHECK-PROP: ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>): // CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32> +// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32> // CHECK-PROP: } // CHECK-PROP: scf.yield %[[W]] : vector<4xf32> // CHECK-PROP: } @@ -460,14 +460,14 @@ func.func @warp_scf_for_use_from_above(%arg0: index) { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { + %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { %ini = "some_def"() : () -> (vector<128xf32>) %use_from_above = "some_def_above"() : () -> (vector<128xf32>) %3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) { %acc = "some_def"(%arg4, %use_from_above) : (vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) scf.yield %acc : vector<128xf32> } - vector.yield %3 : vector<128xf32> + gpu.yield %3 : vector<128xf32> } "some_use"(%0) : (vector<4xf32>) -> () return @@ -476,17 +476,17 @@ func.func @warp_scf_for_use_from_above(%arg0: index) { // ----- // CHECK-PROP-LABEL: func @warp_scf_for_swap( -// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { // CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> // CHECK-PROP: %[[INI2:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: vector.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: gpu.yield %[[INI1]], 
%[[INI2]] : vector<128xf32>, vector<128xf32> // CHECK-PROP: } // CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { // CHECK-PROP: ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>): // CHECK-PROP: %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32> // CHECK-PROP: %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: vector.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32> +// CHECK-PROP: gpu.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32> // CHECK-PROP: } // CHECK-PROP: scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32> // CHECK-PROP: } @@ -496,7 +496,7 @@ func.func @warp_scf_for_swap(%arg0: index) { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %0:2 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) { + %0:2 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) { %ini1 = "some_def"() : () -> (vector<128xf32>) %ini2 = "some_def"() : () -> (vector<128xf32>) %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) { @@ -504,7 +504,7 @@ func.func @warp_scf_for_swap(%arg0: index) { %acc1 = "some_def"(%arg4) : (vector<128xf32>) -> (vector<128xf32>) %acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>) scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32> } - vector.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32> + gpu.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32> } "some_use"(%0#0) : (vector<4xf32>) -> () "some_use"(%0#1) : (vector<4xf32>) -> () @@ -515,7 +515,7 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) { // CHECK-PROP-LABEL: func @warp_scf_for_swap_no_yield( // CHECK-PROP: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-PROP-NEXT: vector.warp_execute_on_lane_0(%{{.*}})[32] { +// CHECK-PROP-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-PROP-NEXT: "some_op"() : () -> () // CHECK-PROP-NEXT: } // CHECK-PROP-NEXT: } @@ -523,7 +523,7 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - vector.warp_execute_on_lane_0(%arg0)[32] { + gpu.warp_execute_on_lane_0(%arg0)[32] { scf.for %arg3 = %c0 to %c128 step %c1 { "some_op"() : () -> () } @@ -538,15 +538,15 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) { #map2 = affine_map<()[s0] -> (s0 * 4 + 128)> // CHECK-PROP-LABEL: func @warp_scf_for_multiple_yield( -// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { +// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { // CHECK-PROP-NEXT: "some_def"() : () -> vector<32xf32> -// CHECK-PROP-NEXT: vector.yield %{{.*}} : vector<32xf32> +// CHECK-PROP-NEXT: gpu.yield %{{.*}} : vector<32xf32> // CHECK-PROP-NEXT: } -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> // CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> //
CHECK-PROP: %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> // CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32> // CHECK-PROP: arith.addf {{.*}} : vector<4xf32> @@ -559,7 +559,7 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2 %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0:3 = vector.warp_execute_on_lane_0(%arg0)[32] -> + %0:3 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<1xf32>, vector<4xf32>, vector<4xf32>) { %def = "some_def"() : () -> (vector<32xf32>) %r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32> @@ -574,7 +574,7 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2 %7 = arith.addf %5, %arg5 : vector<128xf32> scf.yield %6, %7 : vector<128xf32>, vector<128xf32> } - vector.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32> + gpu.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32> } %1 = affine.apply #map()[%arg0] vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32> @@ -594,8 +594,8 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2 // CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 // CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 // CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) { -// CHECK-PROP: vector.yield %{{.*}} : vector<32xf32> +// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) { +// CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32> // CHECK-PROP: } // CHECK-PROP: %[[a:.*]] = vector.extract %[[warp_op]][0] : f32 from vector<1xf32> // CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] @@ -610,10 +610,10 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2 // CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] // CHECK-PROP: return %[[a4]] : f32 func.func @vector_reduction(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<32xf32>) %1 = vector.reduction <add>, %0 : vector<32xf32> into f32 - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -624,11 +624,11 @@ func.func @vector_reduction(%laneid: index) -> (f32) { // CHECK-PROP-SAME: %[[ID:[a-zA-Z0-9]+]] // CHECK-PROP-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK-PROP-SAME: %[[DEST:[a-zA-Z0-9]+]] -// CHECK-PROP: vector.warp_execute_on_lane_0(%[[ID]])[32] +// CHECK-PROP: gpu.warp_execute_on_lane_0(%[[ID]])[32] // CHECK-PROP-NEXT: "some_def"() : () -> vector<4096xf32> // CHECK-PROP-NEXT: %{{.*}} = vector.reduction // CHECK-PROP: %[[DEF:.*]] = arith.divf %{{.*}}, %{{.*}} : vector<1xf32> -// CHECK-PROP-NOT: vector.warp_execute_on_lane_0 +// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-PROP: scf.for // CHECK-PROP: %{{.*}} = arith.subf %{{.*}}, %[[DEF]] : vector<1xf32> func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<128xf32>){ @@ -637,7 +637,7 @@ func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<12 %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index %f0 = arith.constant
0.000000e+00 : f32 - vector.warp_execute_on_lane_0(%arg0)[32]{ + gpu.warp_execute_on_lane_0(%arg0)[32]{ %cst_1 = arith.constant dense<2.621440e+05> : vector<1xf32> %0 = "some_def"() : () -> (vector<4096xf32>) %1 = vector.reduction <add>, %0, %cst : vector<4096xf32> into f32 @@ -657,10 +657,10 @@ func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<12 func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) { %c0 = arith.constant 0: index %f0 = arith.constant 0.0: f32 - // CHECK-D: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) { - // CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] { + // CHECK-D: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) { + // CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] { // CHECK-D: vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32> - vector.warp_execute_on_lane_0(%laneid)[32] { + gpu.warp_execute_on_lane_0(%laneid)[32] { %0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32> %1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32> %2 = vector.extractelement %1[] : vector<f32> @@ -682,8 +682,8 @@ func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32> // CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 // CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 // CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) { -// CHECK-PROP: vector.yield %{{.*}} : vector<64xf32> +// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) { +// CHECK-PROP: gpu.yield %{{.*}} : vector<64xf32> // CHECK-PROP: } // CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32 // CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] @@ -698,10 +698,10 @@ func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32> // CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]] // CHECK-PROP: return %[[a4]] : f32 func.func @vector_reduction_large(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<64xf32>) %1 = vector.reduction <add>, %0 : vector<64xf32> into f32 - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -716,8 +716,8 @@ func.func @vector_reduction_large(%laneid: index) -> (f32) { // CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32 // CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32 // CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[warp_op:.*]]:2 = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) { -// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32 +// CHECK-PROP: %[[warp_op:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) { +// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32 // CHECK-PROP: } // CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32 // CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]] @@ -733,11 +733,11 @@ func.func @vector_reduction_large(%laneid: index) -> (f32) { // CHECK-PROP: %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1 // CHECK-PROP: return %[[a5]] : f32 func.func @vector_reduction_acc(%laneid: index) -> (f32) { - %r =
gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<64xf32>) %1 = "some_def"() : () -> (f32) %2 = vector.reduction <add>, %0, %1 : vector<64xf32> into f32 - vector.yield %2 : f32 + gpu.yield %2 : f32 } return %r : f32 } @@ -746,15 +746,15 @@ func.func @vector_reduction_acc(%laneid: index) -> (f32) { // CHECK-PROP-LABEL: func @warp_duplicate_yield( func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) { - // CHECK-PROP: %{{.*}}:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>) - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) { + // CHECK-PROP: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>) + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) { %2 = "some_def"() : () -> (vector<32xf32>) %3 = "some_def"() : () -> (vector<32xf32>) %4 = arith.addf %2, %3 : vector<32xf32> %5 = arith.addf %2, %2 : vector<32xf32> // CHECK-PROP-NOT: arith.addf -// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32> - vector.yield %4, %5 : vector<32xf32>, vector<32xf32> +// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32> + gpu.yield %4, %5 : vector<32xf32>, vector<32xf32> } return %r#0, %r#1 : vector<1xf32>, vector<1xf32> } @@ -765,9 +765,9 @@ func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32> // CHECK-PROP: %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32> // CHECK-PROP: return %[[C]] : vector<1xf32> func.func @warp_constant(%laneid: index) -> (vector<1xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { %cst = arith.constant dense<2.0> : vector<32xf32> - vector.yield %cst : vector<32xf32> + gpu.yield %cst : vector<32xf32> } return %r : vector<1xf32> } @@ -779,18 +779,18 @@ func.func @warp_constant(%laneid: index) -> (vector<1xf32>) { // CHECK-PROP-LABEL: func.func @vector_extract_1d( // CHECK-PROP-DAG: %[[C5_I32:.*]] = arith.constant 5 : i32 // CHECK-PROP-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) { +// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<64xf32> -// CHECK-PROP: vector.yield %[[V]] : vector<64xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<64xf32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32> // CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[E]], %[[C5_I32]] // CHECK-PROP: return %[[SHUFFLED]] : f32 func.func @vector_extract_1d(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<64xf32>) %1 = vector.extract %0[9] : f32 from vector<64xf32> - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -798,17 +798,17 @@ func.func @vector_extract_1d(%laneid: index) -> (f32) { // ----- // CHECK-PROP-LABEL: func.func @vector_extract_2d( -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<3xf32> from vector<5x3xf32> // CHECK-PROP: return %[[E]] func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { %0 = "some_def"() : () -> (vector<5x96xf32>) %1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32> - vector.yield %1 : vector<96xf32> + gpu.yield %1 : vector<96xf32> } return %r : vector<3xf32> } @@ -816,17 +816,17 @@ func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) { // ----- // CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast_scalar( -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][1, 2] : f32 from vector<5x96xf32> // CHECK-PROP: return %[[E]] func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<5x96xf32>) %1 = vector.extract %0[1, 2] : f32 from vector<5x96xf32> - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -834,17 +834,17 @@ func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) { // ----- // CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast( -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<96xf32> from vector<5x96xf32> // CHECK-PROP: return %[[E]] func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) { %0 = "some_def"() : () -> (vector<5x96xf32>) %1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32> - vector.yield %1 : vector<96xf32> + gpu.yield %1 : vector<96xf32> } return %r : vector<96xf32> } @@ -852,17 +852,17 @@ func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) { // ----- // CHECK-PROP-LABEL: func.func @vector_extract_3d( -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<8x128x96xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<8x128x96xf32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<4x96xf32> from vector<8x4x96xf32> // CHECK-PROP: return %[[E]] func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { %0 = "some_def"() : () -> (vector<8x128x96xf32>) %1 = vector.extract %0[2] : vector<128x96xf32> from 
vector<8x128x96xf32> - vector.yield %1 : vector<128x96xf32> + gpu.yield %1 : vector<128x96xf32> } return %r : vector<4x96xf32> } @@ -870,17 +870,17 @@ func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) { // ----- // CHECK-PROP-LABEL: func.func @vector_extractelement_0d( -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) { +// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) { // CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<f32> -// CHECK-PROP: vector.yield %[[V]] : vector<f32> +// CHECK-PROP: gpu.yield %[[V]] : vector<f32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][] : f32 from vector<f32> // CHECK-PROP: return %[[E]] : f32 func.func @vector_extractelement_0d(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<f32>) %1 = vector.extractelement %0[] : vector<f32> - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -888,18 +888,18 @@ func.func @vector_extractelement_0d(%laneid: index) -> (f32) { // ----- // CHECK-PROP-LABEL: func.func @vector_extractelement_1element( -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { +// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<1xf32> -// CHECK-PROP: vector.yield %[[V]] : vector<1xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<1xf32> // CHECK-PROP: } // CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][0] : f32 from vector<1xf32> // CHECK-PROP: return %[[E]] : f32 func.func @vector_extractelement_1element(%laneid: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<1xf32>) %c0 = arith.constant 0 : index %1 = vector.extractelement %0[%c0 : index] : vector<1xf32> - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -911,9 +911,9 @@ func.func @vector_extractelement_1element(%laneid: index) -> (f32) { // CHECK-PROP-LABEL: func.func @vector_extractelement_1d( // CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index // CHECK-PROP-DAG: %[[C32:.*]] = arith.constant 32 : i32 -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) { +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def" -// CHECK-PROP: vector.yield %[[V]] : vector<96xf32> +// CHECK-PROP: gpu.yield %[[V]] : vector<96xf32> // CHECK-PROP: } // CHECK-PROP: %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]] // CHECK-PROP: %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]] @@ -922,10 +922,10 @@ func.func @vector_extractelement_1element(%laneid: index) -> (f32) { // CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32 // CHECK-PROP: return %[[SHUFFLED]] func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) { %0 = "some_def"() : () -> (vector<96xf32>) %1 = vector.extractelement %0[%pos : index] : vector<96xf32> - vector.yield %1 : f32 + gpu.yield %1 : f32 } return %r : f32 } @@ -935,16 +935,16 @@ func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) { // Index-typed values cannot be shuffled at the
moment. // CHECK-PROP-LABEL: func.func @vector_extractelement_1d_index( -// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (index) { +// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (index) { // CHECK-PROP: "some_def" // CHECK-PROP: vector.extract -// CHECK-PROP: vector.yield {{.*}} : index +// CHECK-PROP: gpu.yield {{.*}} : index // CHECK-PROP: } func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (index) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (index) { %0 = "some_def"() : () -> (vector<96xindex>) %1 = vector.extractelement %0[%pos : index] : vector<96xindex> - vector.yield %1 : index + gpu.yield %1 : index } return %r : index } @@ -956,14 +956,14 @@ func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index func.func @lane_dependent_warp_propagate_read( %laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) { // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-PROP-NOT: vector.warp_execute_on_lane_0 + // CHECK-PROP-NOT: gpu.warp_execute_on_lane_0 // CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32> // CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) { %2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32> - vector.yield %2 : vector<1x32xf32> + gpu.yield %2 : vector<1x32xf32> } vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32> return @@ -974,9 +974,9 @@ func.func @lane_dependent_warp_propagate_read( func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) -> vector<1x1x4xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) { %2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32> - vector.yield %2 : vector<32x4x32xf32> + gpu.yield %2 : vector<32x4x32xf32> } return %r : vector<1x1x4xf32> } @@ -997,9 +997,9 @@ func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) -> func.func @warp_propagate_read_broadcast(%laneid: index, %src: memref<32x1xf32>) -> vector<1x4xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) { %2 = vector.transfer_read %src[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0)>} : memref<32x1xf32>, vector<32x64xf32> - vector.yield %2 : vector<32x64xf32> + gpu.yield %2 : vector<32x64xf32> } return %r : vector<1x4xf32> } @@ -1020,14 +1020,14 @@ func.func @dont_duplicate_read( %laneid: index, %src: memref<1024xf32>) -> vector<1xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 -// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { +// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) { // CHECK-PROP-NEXT: vector.transfer_read // CHECK-PROP-NEXT: "blocking_use" -// CHECK-PROP-NEXT: vector.yield - 
%r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { +// CHECK-PROP-NEXT: gpu.yield + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32> "blocking_use"(%2) : (vector<32xf32>) -> () - vector.yield %2 : vector<32xf32> + gpu.yield %2 : vector<32xf32> } return %r : vector<1xf32> } @@ -1038,16 +1038,16 @@ func.func @dont_duplicate_read( func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) { - // CHECK-PROP: %[[SINGLE_RES:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) { - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] + // CHECK-PROP: %[[SINGLE_RES:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) { ^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>): // CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32> %2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32> - // CHECK-PROP: vector.yield %[[SINGLE_VAL]] : vector<32xf32> - vector.yield %2, %2 : vector<32xf32>, vector<32xf32> + // CHECK-PROP: gpu.yield %[[SINGLE_VAL]] : vector<32xf32> + gpu.yield %2, %2 : vector<32xf32>, vector<32xf32> } // CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32> @@ -1062,7 +1062,7 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v // CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-SCF-IF: scf.if{{.*}}{ - %r:4 = vector.warp_execute_on_lane_0(%laneid)[32] + %r:4 = gpu.warp_execute_on_lane_0(%laneid)[32] args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) { ^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>): @@ -1084,8 +1084,8 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v %rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32> %rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32> - // CHECK-SCF-IF-NOT: vector.yield - vector.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32> + // CHECK-SCF-IF-NOT: gpu.yield + gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32> } // CHECK-SCF-IF: gpu.barrier @@ -1113,7 +1113,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, % // CHECK-SCF-IF: gpu.barrier // CHECK-SCF-IF: scf.if{{.*}}{ - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) { ^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>): @@ -1127,8 +1127,8 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, % %r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32> %r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32> - // CHECK-SCF-IF-NOT: vector.yield - vector.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32> + // CHECK-SCF-IF-NOT: gpu.yield + gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32> } // CHECK-SCF-IF: gpu.barrier @@ -1145,7 +1145,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, % // CHECK-PROP: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 mod 3)> // CHECK-PROP-LABEL: func
@vector_insertelement_1d( // CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32) // CHECK-PROP: %[[INSERTING_LANE:.*]] = affine.apply #[[$MAP]]()[%[[POS]]] // CHECK-PROP: %[[INSERTING_POS:.*]] = affine.apply #[[$MAP1]]()[%[[POS]]] // CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[INSERTING_LANE]] : index @@ -1157,11 +1157,11 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, % // CHECK-PROP: } // CHECK-PROP: return %[[R]] func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { %0 = "some_def"() : () -> (vector<96xf32>) %f = "another_def"() : () -> (f32) %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32> - vector.yield %1 : vector<96xf32> + gpu.yield %1 : vector<96xf32> } return %r : vector<3xf32> } @@ -1170,17 +1170,17 @@ func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32 // CHECK-PROP-LABEL: func @vector_insertelement_1d_broadcast( // CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]] +// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]] // CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [%[[POS]]] : f32 into vector<96xf32> func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (vector<96xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) { %0 = "some_def"() : () -> (vector<96xf32>) %f = "another_def"() : () -> (f32) %1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32> - vector.yield %1 : vector<96xf32> + gpu.yield %1 : vector<96xf32> } return %r : vector<96xf32> } @@ -1188,17 +1188,17 @@ func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (ve // ----- // CHECK-PROP-LABEL: func @vector_insertelement_0d( -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]] +// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]] // CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [] : f32 into vector<f32> func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) { %0 = "some_def"() : () -> (vector<f32>) %f = "another_def"() : () -> (f32) %1 = vector.insertelement %f, %0[] : vector<f32> - vector.yield %1 : vector<f32> + gpu.yield %1 : vector<f32> } return %r : vector<f32> } @@ -1208,10 +1208,10 @@ func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) { // CHECK-PROP-LABEL: func @vector_insert_1d( // CHECK-PROP-SAME: %[[LANEID:.*]]: index // CHECK-PROP-DAG: %[[C26:.*]] = arith.constant 26 : index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32) +//
CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]] +// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]] // CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C26]] // CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) { // CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [1] @@ -1221,11 +1221,11 @@ func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) { // CHECK-PROP: } // CHECK-PROP: return %[[R]] func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) { %0 = "some_def"() : () -> (vector<96xf32>) %f = "another_def"() : () -> (f32) %1 = vector.insert %f, %0[79] : f32 into vector<96xf32> - vector.yield %1 : vector<96xf32> + gpu.yield %1 : vector<96xf32> } return %r : vector<3xf32> } @@ -1234,18 +1234,18 @@ func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) { // CHECK-PROP-LABEL: func @vector_insert_2d_distr_src( // CHECK-PROP-SAME: %[[LANEID:.*]]: index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]] +// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]] // CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<3xf32> into vector<4x3xf32> // CHECK-PROP: return %[[INSERT]] func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) { %0 = "some_def"() : () -> (vector<4x96xf32>) %s = "another_def"() : () -> (vector<96xf32>) %1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32> - vector.yield %1 : vector<4x96xf32> + gpu.yield %1 : vector<4x96xf32> } return %r : vector<4x3xf32> } @@ -1255,10 +1255,10 @@ func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) { // CHECK-PROP-LABEL: func @vector_insert_2d_distr_pos( // CHECK-PROP-SAME: %[[LANEID:.*]]: index // CHECK-PROP: %[[C19:.*]] = arith.constant 19 : index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]] +// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]] // CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C19]] // CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<4x96xf32>) { // CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [3] : vector<96xf32> into vector<4x96xf32> @@ -1268,11 +1268,11 @@ func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) { // CHECK-PROP: } // CHECK-PROP: return %[[R]] func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { %0 = "some_def"() : () -> (vector<128x96xf32>) %s =
"another_def"() : () -> (vector<96xf32>) %1 = vector.insert %s, %0[79] : vector<96xf32> into vector<128x96xf32> - vector.yield %1 : vector<128x96xf32> + gpu.yield %1 : vector<128x96xf32> } return %r : vector<4x96xf32> } @@ -1281,18 +1281,18 @@ func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) { // CHECK-PROP-LABEL: func @vector_insert_2d_broadcast( // CHECK-PROP-SAME: %[[LANEID:.*]]: index -// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>) +// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>) // CHECK-PROP: %[[VEC:.*]] = "some_def" // CHECK-PROP: %[[VAL:.*]] = "another_def" -// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]] +// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]] // CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<96xf32> into vector<4x96xf32> // CHECK-PROP: return %[[INSERT]] func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) { %0 = "some_def"() : () -> (vector<4x96xf32>) %s = "another_def"() : () -> (vector<96xf32>) %1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32> - vector.yield %1 : vector<4x96xf32> + gpu.yield %1 : vector<4x96xf32> } return %r : vector<4x96xf32> } @@ -1310,12 +1310,12 @@ func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) { // CHECK-PROP-SAME: %[[AR2:[^ :]*]]: memref<1x4x1024xf32>) // CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-PROP-DAG: %[[THREADID:.*]] = gpu.thread_id x -// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]] +// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]] // CHECK-PROP: %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}] // CHECK-PROP: %[[EXTRACT:.*]] = vector.extract %[[GATHER]][0] : vector<64xi32> from vector<1x64xi32> // CHECK-PROP: %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex> // CHECK-PROP: %[[EXTRACTELT:.*]] = vector.extract %[[CAST]][{{.*}}] : index from vector<64xindex> -// CHECK-PROP: vector.yield %[[EXTRACTELT]] : index +// CHECK-PROP: gpu.yield %[[EXTRACTELT]] : index // CHECK-PROP: %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[THREADID]]] // CHECK-PROP: %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[W]], %[[APPLY]]], // CHECK-PROP: return %[[TRANSFERREAD]] @@ -1329,14 +1329,14 @@ func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1 %cst_2 = arith.constant dense<0> : vector<64xindex> %cst_6 = arith.constant 0.000000e+00 : f32 - %18 = vector.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) { + %18 = gpu.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) { ^bb0(%arg4: vector<1x64xindex>): %28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32> %29 = vector.extract %28[0] : vector<64xi32> from vector<1x64xi32> %30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex> %36 = vector.extractelement %30[%c0_i32 : index] : vector<64xindex> %37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32> - vector.yield %37 : vector<64xf32> + gpu.yield %37 : vector<64xf32> } return %18 : vector<2xf32> } @@ 
-1347,16 +1347,16 @@ func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1 // same value. // CHECK-PROP-LABEL: func @dont_fold_vector_broadcast( -// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>) +// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>) // CHECK-PROP: %[[some_def:.*]] = "some_def" // CHECK-PROP: %[[broadcast:.*]] = vector.broadcast %[[some_def]] : vector<64xf32> to vector<1x64xf32> -// CHECK-PROP: vector.yield %[[broadcast]] : vector<1x64xf32> +// CHECK-PROP: gpu.yield %[[broadcast]] : vector<1x64xf32> // CHECK-PROP: vector.print %[[r]] : vector<1x2xf32> func.func @dont_fold_vector_broadcast(%laneid: index) { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) { %0 = "some_def"() : () -> (vector<64xf32>) %1 = vector.broadcast %0 : vector<64xf32> to vector<1x64xf32> - vector.yield %1 : vector<1x64xf32> + gpu.yield %1 : vector<1x64xf32> } vector.print %r : vector<1x2xf32> return @@ -1367,10 +1367,10 @@ func.func @dont_fold_vector_broadcast(%laneid: index) { func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>) -> vector<4xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) { %2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32> %3 = vector.shape_cast %2 : vector<32x4x32xf32> to vector<4096xf32> - vector.yield %3 : vector<4096xf32> + gpu.yield %3 : vector<4096xf32> } return %r : vector<4xf32> } @@ -1384,9 +1384,9 @@ func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>) func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index) -> vector<1xf32> { %f0 = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { %1 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> - vector.yield %1 : vector<1xf32> + gpu.yield %1 : vector<1xf32> } return %r : vector<1xf32> } @@ -1400,31 +1400,31 @@ func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<409 func.func @warp_propagate_multi_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>, vector<1xf32>) { %f0 = arith.constant 0.000000e+00 : f32 - %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) { %0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> "some_use"(%0) : (vector<1xf32>) -> () %1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32> - vector.yield %0, %1 : vector<1xf32>, vector<1xf32> + gpu.yield %0, %1 : vector<1xf32>, vector<1xf32> } return %r#0, %r#1 : vector<1xf32>, vector<1xf32> } // CHECK-PROP-LABEL: func.func @warp_propagate_multi_transfer_read -// CHECK-PROP: vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) +// CHECK-PROP: gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) // CHECK-PROP: %[[INNER_READ:.+]] = vector.transfer_read // CHECK-PROP: "some_use"(%[[INNER_READ]]) -// CHECK-PROP: vector.yield %[[INNER_READ]] : 
vector<1xf32> +// CHECK-PROP: gpu.yield %[[INNER_READ]] : vector<1xf32> // CHECK-PROP: vector.transfer_read // ----- func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>) { %f0 = arith.constant 0.000000e+00 : f32 - %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) { %0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32> %1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32> %max = arith.maximumf %0, %1 : vector<64xf32> - vector.yield %max : vector<64xf32> + gpu.yield %max : vector<64xf32> } return %r : vector<1xf32> } @@ -1437,25 +1437,25 @@ func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096 func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>) { %c0 = arith.constant 0 : index - vector.warp_execute_on_lane_0(%laneid)[32] -> () { + gpu.warp_execute_on_lane_0(%laneid)[32] -> () { %mask = "mask_def_0"() : () -> (vector<4096xi1>) %mask2 = "mask_def_1"() : () -> (vector<32xi1>) %0 = "some_def_0"() : () -> (vector<4096xf32>) %1 = "some_def_1"() : () -> (vector<32xf32>) vector.transfer_write %0, %dest[%c0], %mask : vector<4096xf32>, memref<4096xf32> vector.transfer_write %1, %dest[%c0], %mask2 : vector<32xf32>, memref<4096xf32> - vector.yield + gpu.yield } return } // CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_masked_write( -// CHECK-DIST-AND-PROP: %[[W:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) { +// CHECK-DIST-AND-PROP: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) { // CHECK-DIST-AND-PROP: %[[M0:.*]] = "mask_def_0" // CHECK-DIST-AND-PROP: %[[M1:.*]] = "mask_def_1" // CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def_0" // CHECK-DIST-AND-PROP: %[[V1:.*]] = "some_def_1" -// CHECK-DIST-AND-PROP: vector.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]] +// CHECK-DIST-AND-PROP: gpu.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]] // CHECK-DIST-AND-PROP-SAME: vector<32xf32>, vector<32xi1>, vector<4096xf32>, vector<4096xi1> // CHECK-DIST-AND-PROP: } // CHECK-DIST-AND-PROP: vector.transfer_write %[[W]]#2, {{.*}}, %[[W]]#3 {in_bounds = [true]} : vector<128xf32>, memref<4096xf32> @@ -1466,12 +1466,12 @@ func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>) func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> (vector<2xf32>, vector<2x2xf32>) { %f0 = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) { %mask = "mask_def_0"() : () -> (vector<128xi1>) %0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> %mask2 = "mask_def_1"() : () -> (vector<128x2xi1>) %1 = vector.transfer_read %src[%c0, %index], %f0, %mask2 {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<128x2xf32> - vector.yield %0, %1 : vector<128xf32>, vector<128x2xf32> + gpu.yield %0, %1 : vector<128xf32>, vector<128x2xf32> } return %r#0, %r#1 : vector<2xf32>, vector<2x2xf32> } @@ -1481,10 +1481,10 @@ func.func @warp_propagate_masked_transfer_read(%laneid: index, 
%src: memref<4096 // CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read // CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index // CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index -// CHECK-PROP: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) { +// CHECK-PROP: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) { // CHECK-PROP: %[[M0:.*]] = "mask_def_0" // CHECK-PROP: %[[M1:.*]] = "mask_def_1" -// CHECK-PROP: vector.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1> +// CHECK-PROP: gpu.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1> // CHECK-PROP: } // CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]] // CHECK-PROP: vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[ARG2]]], {{.*}}, %[[R]]#1 {{.*}} vector<2x2xf32> @@ -1496,10 +1496,10 @@ func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096 func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> vector<2xf32> { %f0 = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) { %mask = "mask_def_0"() : () -> (vector<128xi1>) %0 = vector.transfer_read %src[%index, %c0], %f0, %mask {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<4096x4096xf32>, vector<128xf32> - vector.yield %0 : vector<128xf32> + gpu.yield %0 : vector<128xf32> } return %r : vector<2xf32> } @@ -1509,9 +1509,9 @@ func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %s // CHECK-PROP-LABEL: func.func @warp_propagate_nontrivial_map_masked_transfer_read // CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index // CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index -// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) { +// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) { // CHECK-PROP: %[[M0:.*]] = "mask_def_0" -// CHECK-PROP: vector.yield %[[M0]] : vector<128xi1> +// CHECK-PROP: gpu.yield %[[M0]] : vector<128xi1> // CHECK-PROP: } // CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG0]]] // CHECK-PROP: vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[C0]]], {{.*}}, %[[R]] @@ -1522,11 +1522,11 @@ func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %s func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: index, %src: memref<4096x4096xf32>, %index: index, %index2: index, %mask_ub: index) -> (vector<2xf32>, vector<2xf32>) { %f0 = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) { %mask = vector.create_mask %mask_ub: vector<128xi1> %0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> %1 = vector.transfer_read %src[%c0, %index2], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32> - vector.yield %0, %1 : vector<128xf32>, vector<128xf32> + gpu.yield %0, %1 : vector<128xf32>, vector<128xf32> } return %r#0, %r#1 : vector<2xf32>, vector<2xf32> } @@ -1542,12 +1542,12 @@ func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: 
index, %src: func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref<128xf32>, %f1: f32) -> (vector<2xf32>, vector<4xf32>) { %f0 = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) { + %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) { %cst = arith.constant dense<2.0> : vector<128xf32> %0 = vector.transfer_read %buffer[%c0], %f0 {in_bounds = [true]} : memref<128xf32>, vector<128xf32> vector.transfer_write %cst, %buffer[%c0] : vector<128xf32>, memref<128xf32> %1 = vector.broadcast %f1 : f32 to vector<64xf32> - vector.yield %1, %0 : vector<64xf32>, vector<128xf32> + gpu.yield %1, %0 : vector<64xf32>, vector<128xf32> } return %r#0, %r#1 : vector<2xf32>, vector<4xf32> } @@ -1561,9 +1561,9 @@ func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref // ----- func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1> { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) { %1 = vector.create_mask %m0 : vector<32xi1> - vector.yield %1 : vector<32xi1> + gpu.yield %1 : vector<32xi1> } return %r : vector<1xi1> } @@ -1577,9 +1577,9 @@ func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1 // ----- func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: index, %m2: index) -> vector<1x2x4xi1> { - %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) { + %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) { %1 = vector.create_mask %m0, %m1, %m2 : vector<16x4x4xi1> - vector.yield %1 : vector<16x4x4xi1> + gpu.yield %1 : vector<16x4x4xi1> } return %r : vector<1x2x4xi1> } @@ -1596,10 +1596,10 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) { %c0 = arith.constant 0 : index - vector.warp_execute_on_lane_0(%laneid)[32] -> () { + gpu.warp_execute_on_lane_0(%laneid)[32] -> () { %0 = "some_def"() : () -> (vector<4x1024xf32>) vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32> - vector.yield + gpu.yield } return } @@ -1607,9 +1607,9 @@ func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) { // CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)> // CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write( -// CHECK-DIST-AND-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) { +// CHECK-DIST-AND-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) { // CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def" -// CHECK-DIST-AND-PROP: vector.yield %[[V0]] +// CHECK-DIST-AND-PROP: gpu.yield %[[V0]] // CHECK-DIST-AND-PROP-SAME: vector<4x1024xf32> // CHECK-DIST-AND-PROP: } diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir index 378e5b39415b5..f1abf77753b87 100644 --- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir +++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir @@ -20,7 +20,7 @@ func.func @gpu_func(%in: memref<1024xf32>, %out: memref<1xf32>) { gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) threads(%arg6, 
%arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) {
-    vector.warp_execute_on_lane_0(%arg6)[32] {
+    gpu.warp_execute_on_lane_0(%arg6)[32] {
       %init = vector.transfer_read %out[%c0], %cst_0 {in_bounds = [true]} : memref<1xf32>, vector<1xf32>
       %13 = scf.for %arg0 = %c0 to %c1024 step %c32 iter_args(%arg1 = %init) -> (vector<1xf32>) {
         %20 = vector.transfer_read %in[%arg0], %cst_0 {in_bounds = [true]} : memref<1024xf32>, vector<32xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index 7e9234901ffa1..8ce24bfe3640a 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -38,7 +38,7 @@ func.func @gpu_func(%arg1: memref<32xf32>, %arg2: memref<32xf32>) {
   gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1)
   threads(%arg6, %arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) {
-    vector.warp_execute_on_lane_0(%arg6)[32] {
+    gpu.warp_execute_on_lane_0(%arg6)[32] {
     %0 = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true]} : memref<32xf32>, vector<32xf32>
     %1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<32xf32>, vector<32xf32>
     %2 = arith.addf %0, %1 : vector<32xf32>
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 72aaa7dc4f897..9d8969edfd90f 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -519,7 +519,7 @@ struct TestVectorScanLowering
 /// Allocate shared memory for a single warp to test lowering of
 /// WarpExecuteOnLane0Op.
 static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
-                                        WarpExecuteOnLane0Op warpOp,
+                                        gpu::WarpExecuteOnLane0Op warpOp,
                                         Type type) {
   static constexpr int64_t kSharedMemorySpace = 3;
   // Compute type of shared memory buffer.
@@ -583,8 +583,9 @@ struct TestVectorDistribution
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestVectorDistribution)

   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<scf::SCFDialect, memref::MemRefDialect, gpu::GPUDialect,
-                    affine::AffineDialect>();
+    registry
+        .insert<scf::SCFDialect, memref::MemRefDialect, gpu::GPUDialect,
+                affine::AffineDialect>();
   }

   StringRef getArgument() const final { return "test-vector-warp-distribute"; }
@@ -622,7 +623,7 @@ struct TestVectorDistribution
     RewritePatternSet patterns(&getContext());
     getOperation().walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<WarpExecuteOnLane0Op>(op)) {
+      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
         if (hoistUniform) {
           moveScalarUniformCode(warpOp);
         }
@@ -677,7 +678,7 @@ struct TestVectorDistribution
       WarpExecuteOnLane0LoweringOptions options;
       options.warpAllocationFn = allocateGlobalSharedMemory;
       options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
-                                        WarpExecuteOnLane0Op warpOp) {
+                                        gpu::WarpExecuteOnLane0Op warpOp) {
         builder.create<gpu::BarrierOp>(loc);
       };
       // Test on one pattern in isolation.