diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index c5d86e713f253..ea658fb16a36c 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -804,6 +804,15 @@ elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams); /// Get the address space which should be used for allocas uint64_t getAllocaAddressSpace(mlir::DataLayout *dataLayout); +/// The two vectors of MLIR values have the following property: +/// \p extents1[i] must have the same value as \p extents2[i] +/// The function returns a new vector of MLIR values that preserves +/// the same property vs \p extents1 and \p extents2, but allows +/// more optimizations. For example, if extents1[j] is a known constant, +/// and extents2[j] is not, then result[j] is the MLIR value extents1[j]. +llvm::SmallVector deduceOptimalExtents(mlir::ValueRange extents1, + mlir::ValueRange extents2); + } // namespace fir::factory #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index c8aad644bc784..6e85b8f4ddf86 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -508,6 +508,11 @@ genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity source, mlir::Type toType, bool preserveLowerBounds); +/// A shortcut for loadTrivialScalar(getElementAt()), +/// which designates and loads an element of an array. 
+Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder, + Entity entity, mlir::ValueRange oneBasedIndices); + } // namespace hlfir #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td index 644f1e3c3af2b..90cf6e74241bd 100644 --- a/flang/include/flang/Optimizer/HLFIR/Passes.td +++ b/flang/include/flang/Optimizer/HLFIR/Passes.td @@ -43,6 +43,17 @@ def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::ml def SimplifyHLFIRIntrinsics : Pass<"simplify-hlfir-intrinsics"> { let summary = "Simplify HLFIR intrinsic operations that don't need to result in runtime calls"; + let options = [Option<"allowNewSideEffects", "allow-new-side-effects", "bool", + /*default=*/"false", + "If enabled, then the HLFIR operations simplification " + "may introduce operations with side effects. " + "For example, hlfir.matmul may be inlined as " + "an hlfir.eval_in_mem with hlfir.assign inside it. " + "The hlfir.assign has a write effect on the memory " + "argument of hlfir.eval_in_mem, which may block " + "some existing MLIR transformations (e.g. 
CSE) " + "that otherwise would have been possible across " + "the hlfir.matmul.">]; } def InlineElementals : Pass<"inline-elementals"> { diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index d01becfe80093..218f98ef9ef42 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1740,3 +1740,17 @@ uint64_t fir::factory::getAllocaAddressSpace(mlir::DataLayout *dataLayout) { return mlir::cast(addrSpace).getUInt(); return 0; } + +llvm::SmallVector +fir::factory::deduceOptimalExtents(mlir::ValueRange extents1, + mlir::ValueRange extents2) { + llvm::SmallVector extents; + extents.reserve(extents1.size()); + for (auto [extent1, extent2] : llvm::zip(extents1, extents2)) { + if (!fir::getIntIfConstant(extent1) && fir::getIntIfConstant(extent2)) + extents.push_back(extent2); + else + extents.push_back(extent1); + } + return extents; +} diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 94238bc24e453..5e5d0bbd68132 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -939,8 +939,10 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( doLoop = builder.create(loc, one, ub, one, isUnordered, /*finalCountValue=*/false, parentLoop.getRegionIterArgs()); - // Return the results of the child loop from its parent loop. - builder.create(loc, doLoop.getResults()); + if (!reductionInits.empty()) { + // Return the results of the child loop from its parent loop. 
+ builder.create(loc, doLoop.getResults()); + } } builder.setInsertionPointToStart(doLoop.getBody()); @@ -955,7 +957,8 @@ llvm::SmallVector hlfir::genLoopNestWithReductions( reductionValues = genBody(loc, builder, oneBasedIndices, parentLoop.getRegionIterArgs()); builder.setInsertionPointToEnd(parentLoop.getBody()); - builder.create(loc, reductionValues); + if (!reductionValues.empty()) + builder.create(loc, reductionValues); builder.setInsertionPointAfter(outerLoop); return outerLoop->getResults(); } @@ -1410,3 +1413,11 @@ void hlfir::computeEvaluateOpIn(mlir::Location loc, fir::FirOpBuilder &builder, builder.clone(op, mapper); return; } + +hlfir::Entity hlfir::loadElementAt(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity entity, + mlir::ValueRange oneBasedIndices) { + return loadTrivialScalar(loc, builder, + getElementAt(loc, builder, entity, oneBasedIndices)); +} diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index 314ced8679521..0fe3620b7f1ae 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -28,6 +28,13 @@ namespace hlfir { #include "flang/Optimizer/HLFIR/Passes.h.inc" } // namespace hlfir +#define DEBUG_TYPE "simplify-hlfir-intrinsics" + +static llvm::cl::opt forceMatmulAsElemental( + "flang-inline-matmul-as-elemental", + llvm::cl::desc("Expand hlfir.matmul as elemental operation"), + llvm::cl::init(false)); + namespace { class TransposeAsElementalConversion @@ -467,9 +474,442 @@ class CShiftAsElementalConversion } }; +template +class MatmulConversion : public mlir::OpRewritePattern { +public: + using mlir::OpRewritePattern::OpRewritePattern; + + llvm::LogicalResult + matchAndRewrite(Op matmul, mlir::PatternRewriter &rewriter) const override { + mlir::Location loc = matmul.getLoc(); + fir::FirOpBuilder builder{rewriter, matmul.getOperation()}; + 
hlfir::Entity lhs = hlfir::Entity{matmul.getLhs()}; + hlfir::Entity rhs = hlfir::Entity{matmul.getRhs()}; + mlir::Value resultShape, innerProductExtent; + std::tie(resultShape, innerProductExtent) = + genResultShape(loc, builder, lhs, rhs); + + if (forceMatmulAsElemental || isMatmulTranspose) { + // Generate hlfir.elemental that produces the result of + // MATMUL/MATMUL(TRANSPOSE). + // Note that this implementation is very suboptimal for MATMUL, + // but is quite good for MATMUL(TRANSPOSE), e.g.: + // R(1:N) = R(1:N) + MATMUL(TRANSPOSE(X(1:N,1:N)), Y(1:N)) + // Inlining MATMUL(TRANSPOSE) as hlfir.elemental may result + // in merging the inner product computation with the elemental + // addition. Note that the inner product computation will + // benefit from processing the lowermost dimensions of X and Y, + // which may be the best when they are contiguous. + // + // This is why we always inline MATMUL(TRANSPOSE) as an elemental. + // MATMUL is inlined below by default unless forceMatmulAsElemental. + hlfir::ExprType resultType = + mlir::cast(matmul.getType()); + hlfir::ElementalOp newOp = genElementalMatmul( + loc, builder, resultType, resultShape, lhs, rhs, innerProductExtent); + rewriter.replaceOp(matmul, newOp); + return mlir::success(); + } + + // Generate hlfir.eval_in_mem to mimic the MATMUL implementation + // from Fortran runtime. The implementation needs to operate + // with the result array as an in-memory object. + hlfir::EvaluateInMemoryOp evalOp = + builder.create( + loc, mlir::cast(matmul.getType()), resultShape); + builder.setInsertionPointToStart(&evalOp.getBody().front()); + + // Embox the raw array pointer to simplify designating it. + // TODO: this currently results in redundant lower bounds + // addition for the designator, but this should be fixed in + // hlfir::Entity::mayHaveNonDefaultLowerBounds(). 
+ mlir::Value resultArray = evalOp.getMemory(); + mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType()); + resultArray = builder.createBox(loc, fir::BoxType::get(arrayType), + resultArray, resultShape, /*slice=*/nullptr, + /*lengths=*/{}, /*tdesc=*/nullptr); + + // The contiguous MATMUL version is best for the cases + // where the input arrays and (maybe) the result are contiguous + // in their lowermost dimensions. + // Especially, when LLVM can recognize the contiguity + // and vectorize the loops properly. + // Note that the contiguous MATMUL inlining is correct + // even when the input arrays are not contiguous. + // TODO: we can try to recognize the cases when the contiguity + // is not statically obvious and try to generate an explicitly + // contiguous version under a dynamic check. This should allow + // LLVM to vectorize the loops better. Note that this can + // also be postponed up to the LoopVersioning pass. + // The fallback implementation may use genElementalMatmul() with + // an hlfir.assign into the result of eval_in_mem. + mlir::LogicalResult rewriteResult = + genContiguousMatmul(loc, builder, hlfir::Entity{resultArray}, + resultShape, lhs, rhs, innerProductExtent); + + if (mlir::failed(rewriteResult)) { + // Erase the unclaimed eval_in_mem op. + rewriter.eraseOp(evalOp); + return rewriter.notifyMatchFailure(matmul, + "genContiguousMatmul() failed"); + } + + rewriter.replaceOp(matmul, evalOp); + return mlir::success(); + } + +private: + static constexpr bool isMatmulTranspose = + std::is_same_v; + + // Return a tuple of: + // * A fir.shape operation representing the shape of the result + // of a MATMUL/MATMUL(TRANSPOSE). + // * An extent of the dimensions of the input array + // that are processed during the inner product computation. 
+ static std::tuple + genResultShape(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity input1, hlfir::Entity input2) { + mlir::Value input1Shape = hlfir::genShape(loc, builder, input1); + llvm::SmallVector input1Extents = + hlfir::getExplicitExtentsFromShape(input1Shape, builder); + if (input1Shape.getUses().empty()) + input1Shape.getDefiningOp()->erase(); + mlir::Value input2Shape = hlfir::genShape(loc, builder, input2); + llvm::SmallVector input2Extents = + hlfir::getExplicitExtentsFromShape(input2Shape, builder); + if (input2Shape.getUses().empty()) + input2Shape.getDefiningOp()->erase(); + + llvm::SmallVector newExtents; + mlir::Value innerProduct1Extent, innerProduct2Extent; + if (input1Extents.size() == 1) { + assert(!isMatmulTranspose && + "hlfir.matmul_transpose's first operand must be rank-2 array"); + assert(input2Extents.size() == 2 && + "hlfir.matmul second argument must be rank-2 array"); + newExtents.push_back(input2Extents[1]); + innerProduct1Extent = input1Extents[0]; + innerProduct2Extent = input2Extents[0]; + } else { + if (input2Extents.size() == 1) { + assert(input1Extents.size() == 2 && + "hlfir.matmul first argument must be rank-2 array"); + if constexpr (isMatmulTranspose) + newExtents.push_back(input1Extents[1]); + else + newExtents.push_back(input1Extents[0]); + } else { + assert(input1Extents.size() == 2 && input2Extents.size() == 2 && + "hlfir.matmul arguments must be rank-2 arrays"); + if constexpr (isMatmulTranspose) + newExtents.push_back(input1Extents[1]); + else + newExtents.push_back(input1Extents[0]); + + newExtents.push_back(input2Extents[1]); + } + if constexpr (isMatmulTranspose) + innerProduct1Extent = input1Extents[0]; + else + innerProduct1Extent = input1Extents[1]; + + innerProduct2Extent = input2Extents[0]; + } + // The inner product dimensions of the input arrays + // must match. Pick the best (e.g. constant) out of them + // so that the inner product loop bound can be used in + // optimizations. 
+ llvm::SmallVector innerProductExtent = + fir::factory::deduceOptimalExtents({innerProduct1Extent}, + {innerProduct2Extent}); + return {builder.create(loc, newExtents), + innerProductExtent[0]}; + } + + static mlir::Value castToProductType(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::Value value, mlir::Type type) { + if (mlir::isa(type)) + return builder.createConvert(loc, builder.getIntegerType(1), value); + + // TODO: the multiplications/additions by/of zero resulting from + // complex * real are optimized by LLVM under -fno-signed-zeros + // -fno-honor-nans. + // We can make them disappear by default if we: + // * either expand the complex multiplication into real + // operations, OR + // * set nnan nsz fast-math flags to the complex operations. + if (fir::isa_complex(type) && !fir::isa_complex(value.getType())) { + mlir::Value zeroCmplx = fir::factory::createZeroValue(builder, loc, type); + fir::factory::Complex helper(builder, loc); + mlir::Type partType = helper.getComplexPartType(type); + return helper.insertComplexPart( + zeroCmplx, castToProductType(loc, builder, value, partType), + /*isImagPart=*/false); + } + return builder.createConvert(loc, type, value); + } + + // Generate an update of the inner product value: + // acc += v1 * v2, OR + // acc ||= v1 && v2 + static mlir::Value genAccumulateProduct(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::Type resultType, + mlir::Value acc, mlir::Value v1, + mlir::Value v2) { + acc = castToProductType(loc, builder, acc, resultType); + v1 = castToProductType(loc, builder, v1, resultType); + v2 = castToProductType(loc, builder, v2, resultType); + mlir::Value result; + if (mlir::isa(resultType)) + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + else if (mlir::isa(resultType)) + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + else if (mlir::isa(resultType)) + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + else if 
(mlir::isa(resultType)) + result = builder.create( + loc, acc, builder.create(loc, v1, v2)); + else + llvm_unreachable("unsupported type"); + + return builder.createConvert(loc, resultType, result); + } + + static mlir::LogicalResult + genContiguousMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::Entity result, mlir::Value resultShape, + hlfir::Entity lhs, hlfir::Entity rhs, + mlir::Value innerProductExtent) { + // This code does not support MATMUL(TRANSPOSE), and it is supposed + // to be inlined as hlfir.elemental. + if constexpr (isMatmulTranspose) + return mlir::failure(); + + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::Type resultElementType = result.getFortranElementType(); + llvm::SmallVector resultExtents = + mlir::cast(resultShape.getDefiningOp()).getExtents(); + + // The inner product loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer/logical inner product is + // always unordered. + // Note that isUnordered is currently applied to all loops + // in the loop nests generated below, while it has to be applied + // only to one. + bool isUnordered = mlir::isa(resultElementType) || + mlir::isa(resultElementType) || + static_cast(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + // Insert the initialization loop nest that fills the whole result with + // zeroes. 
+ mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + auto genInitBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, oneBasedIndices); + builder.create(loc, initValue, resultElement); + return {}; + }; + + hlfir::genLoopNestWithReductions(loc, builder, resultExtents, + /*reductionInits=*/{}, genInitBody, + /*isUnordered=*/true); + + if (lhs.getRank() == 2 && rhs.getRank() == 2) { + // LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NCOLS + // DO 2 I = 1, NROWS + // 2 RESULT(I,J) = RESULT(I,J) + LHS(I,K)*RHS(K,J) + auto genMatrixMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value I = oneBasedIndices[0]; + mlir::Value J = oneBasedIndices[1]; + mlir::Value K = oneBasedIndices[2]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, {I, J}); + hlfir::Entity resultElementValue = + hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {I, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); + mlir::Value productValue = genAccumulateProduct( + loc, builder, resultElementType, resultElementValue, + lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + + // Note that the loops are inserted in reverse order, + // so innerProductExtent should be passed as the last extent. 
+ hlfir::genLoopNestWithReductions( + loc, builder, + {resultExtents[0], resultExtents[1], innerProductExtent}, + /*reductionInits=*/{}, genMatrixMatrix, isUnordered); + return mlir::success(); + } + + if (lhs.getRank() == 2 && rhs.getRank() == 1) { + // LHS(NROWS,N) * RHS(N) -> RESULT(NROWS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NROWS + // 2 RES(J) = RES(J) + LHS(J,K)*RHS(K) + auto genMatrixVector = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value J = oneBasedIndices[0]; + mlir::Value K = oneBasedIndices[1]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, {J}); + hlfir::Entity resultElementValue = + hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {J, K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K}); + mlir::Value productValue = genAccumulateProduct( + loc, builder, resultElementType, resultElementValue, + lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + hlfir::genLoopNestWithReductions( + loc, builder, {resultExtents[0], innerProductExtent}, + /*reductionInits=*/{}, genMatrixVector, isUnordered); + return mlir::success(); + } + if (lhs.getRank() == 1 && rhs.getRank() == 2) { + // LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS) + // + // Insert the computation loop nest: + // DO 2 K = 1, N + // DO 2 J = 1, NCOLS + // 2 RES(J) = RES(J) + LHS(K)*RHS(K,J) + auto genVectorMatrix = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value J = oneBasedIndices[0]; + mlir::Value K = oneBasedIndices[1]; + hlfir::Entity resultElement = + hlfir::getElementAt(loc, builder, result, {J}); + hlfir::Entity resultElementValue = + 
hlfir::loadTrivialScalar(loc, builder, resultElement); + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, {K}); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, {K, J}); + mlir::Value productValue = genAccumulateProduct( + loc, builder, resultElementType, resultElementValue, + lhsElementValue, rhsElementValue); + builder.create(loc, productValue, resultElement); + return {}; + }; + hlfir::genLoopNestWithReductions( + loc, builder, {resultExtents[0], innerProductExtent}, + /*reductionInits=*/{}, genVectorMatrix, isUnordered); + return mlir::success(); + } + + llvm_unreachable("unsupported MATMUL arguments' ranks"); + } + + static hlfir::ElementalOp + genElementalMatmul(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::ExprType resultType, mlir::Value resultShape, + hlfir::Entity lhs, hlfir::Entity rhs, + mlir::Value innerProductExtent) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::Type resultElementType = resultType.getElementType(); + auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange resultIndices) -> hlfir::Entity { + mlir::Value initValue = + fir::factory::createZeroValue(builder, loc, resultElementType); + // The inner product loop may be unordered if FastMathFlags::reassoc + // transformations are allowed. The integer/logical inner product is + // always unordered. 
+ bool isUnordered = mlir::isa(resultElementType) || + mlir::isa(resultElementType) || + static_cast(builder.getFastMathFlags() & + mlir::arith::FastMathFlags::reassoc); + + auto genBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange oneBasedIndices, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + llvm::SmallVector lhsIndices; + llvm::SmallVector rhsIndices; + // MATMUL: + // LHS(NROWS,N) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // LHS(NROWS,N) * RHS(N) -> RESULT(NROWS) + // LHS(N) * RHS(N,NCOLS) -> RESULT(NCOLS) + // + // MATMUL(TRANSPOSE): + // TRANSPOSE(LHS(N,NROWS)) * RHS(N,NCOLS) -> RESULT(NROWS,NCOLS) + // TRANSPOSE(LHS(N,NROWS)) * RHS(N) -> RESULT(NROWS) + // + // The resultIndices iterate over (NROWS[,NCOLS]). + // The oneBasedIndices iterate over (N). + if (lhs.getRank() > 1) + lhsIndices.push_back(resultIndices[0]); + lhsIndices.push_back(oneBasedIndices[0]); + + if constexpr (isMatmulTranspose) { + // Swap the LHS indices for TRANSPOSE. + std::swap(lhsIndices[0], lhsIndices[1]); + } + + rhsIndices.push_back(oneBasedIndices[0]); + if (rhs.getRank() > 1) + rhsIndices.push_back(resultIndices.back()); + + hlfir::Entity lhsElementValue = + hlfir::loadElementAt(loc, builder, lhs, lhsIndices); + hlfir::Entity rhsElementValue = + hlfir::loadElementAt(loc, builder, rhs, rhsIndices); + mlir::Value productValue = genAccumulateProduct( + loc, builder, resultElementType, reductionArgs[0], lhsElementValue, + rhsElementValue); + return {productValue}; + }; + llvm::SmallVector innerProductValue = + hlfir::genLoopNestWithReductions(loc, builder, {innerProductExtent}, + {initValue}, genBody, isUnordered); + return hlfir::Entity{innerProductValue[0]}; + }; + hlfir::ElementalOp elementalOp = hlfir::genElementalOp( + loc, builder, resultElementType, resultShape, /*typeParams=*/{}, + genKernel, + /*isUnordered=*/true, /*polymorphicMold=*/nullptr, resultType); + + return elementalOp; + } +}; + class SimplifyHLFIRIntrinsics : public 
hlfir::impl::SimplifyHLFIRIntrinsicsBase { public: + using SimplifyHLFIRIntrinsicsBase< + SimplifyHLFIRIntrinsics>::SimplifyHLFIRIntrinsicsBase; + void runOnOperation() override { mlir::MLIRContext *context = &getContext(); @@ -482,6 +922,22 @@ class SimplifyHLFIRIntrinsics patterns.insert(context); patterns.insert(context); patterns.insert(context); + patterns.insert>(context); + + // If forceMatmulAsElemental is false, then hlfir.matmul inlining + // will introduce hlfir.eval_in_mem operation with new memory side + // effects. This conflicts with CSE and optimized bufferization, e.g.: + // A(1:N,1:N) = A(1:N,1:N) - MATMUL(...) + // If we introduce hlfir.eval_in_mem before CSE, then the current + // MLIR CSE won't be able to optimize the trivial loads of 'N' value + // that happen before and after hlfir.matmul. + // If 'N' loads are not optimized, then the optimized bufferization + // won't be able to prove that the slices of A are identical + // on both sides of the assignment. + // This is actually the CSE problem, but we can work it around + // for the time being. + if (forceMatmulAsElemental || this->allowNewSideEffects) + patterns.insert>(context); if (mlir::failed(mlir::applyPatternsGreedily( getOperation(), std::move(patterns), config))) { diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index e1d7376ec3805..1cc3f0b81c20a 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -232,6 +232,12 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, if (optLevel.isOptimizingForSpeed()) { addCanonicalizerPassWithoutRegionSimplification(pm); pm.addPass(mlir::createCSEPass()); + // Run SimplifyHLFIRIntrinsics pass late after CSE, + // and allow introducing operations with new side effects. 
+ addNestedPassToAllTopLevelOperations(pm, []() { + return hlfir::createSimplifyHLFIRIntrinsics( + {/*allowNewSideEffects=*/true}); + }); addNestedPassToAllTopLevelOperations( pm, hlfir::createOptimizedBufferization); addNestedPassToAllTopLevelOperations( diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 55e86da2dfdf1..dd46aecb3274c 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -35,15 +35,19 @@ ! O2-NEXT: (S) {{.*}} num-dce'd ! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] ! O2-NEXT: 'fir.global' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'func.func' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.declare_reduction' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! O2-NEXT: 'omp.private' Pipeline +! O2-NEXT: SimplifyHLFIRIntrinsics ! O2-NEXT: OptimizedBufferization ! O2-NEXT: InlineHLFIRAssign ! 
ALL: LowerHLFIROrderedAssignments diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 29a0f66157971..51e68d2157631 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -36,15 +36,19 @@ func.func @_QQmain() { // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd // PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] // PASSES-NEXT: 'fir.global' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'func.func' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.declare_reduction' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: 'omp.private' Pipeline +// PASSES-NEXT: SimplifyHLFIRIntrinsics // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: InlineHLFIRAssign // PASSES-NEXT: LowerHLFIROrderedAssignments diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir new file mode 100644 index 0000000000000..d29e9a26c20ba --- /dev/null +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-matmul.fir @@ -0,0 +1,660 @@ +// Test hlfir.matmul simplification to hlfir.eval_in_mem or hlfir.elemental: +// RUN: fir-opt --simplify-hlfir-intrinsics=allow-new-side-effects=false %s | FileCheck %s --check-prefixes=ALL,NOANSE +// RUN: fir-opt --simplify-hlfir-intrinsics=allow-new-side-effects=true %s | FileCheck %s --check-prefixes=ALL,ANSE +// RUN: fir-opt --simplify-hlfir-intrinsics -flang-inline-matmul-as-elemental %s | FileCheck %s --check-prefixes=ALL,ELEMENTAL + +func.func @matmul_matrix_matrix_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : 
!hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_integer( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0 : i32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_14:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]] = arith.subi %[[VAL_16]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_18]] : index +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_14]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_19]], %[[VAL_21]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: hlfir.assign 
%[[VAL_4]] to %[[VAL_22]] : i32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_23:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]] = arith.subi %[[VAL_26]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_29:.*]] = arith.addi %[[VAL_25]], %[[VAL_28]] : index +// ANSE: %[[VAL_30:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_31:.*]] = arith.addi %[[VAL_24]], %[[VAL_30]] : index +// ANSE: %[[VAL_32:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_29]], %[[VAL_31]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref +// ANSE: %[[VAL_34:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_25]], %[[VAL_23]] : (!hlfir.expr, index, index) -> i16 +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_23]], %[[VAL_24]] : (!hlfir.expr, index, index) -> i32 +// ANSE: %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (i16) -> i32 +// ANSE: %[[VAL_37:.*]] = arith.muli %[[VAL_36]], %[[VAL_35]] : i32 +// ANSE: %[[VAL_38:.*]] = arith.addi %[[VAL_33]], %[[VAL_37]] : i32 +// ANSE: hlfir.assign %[[VAL_38]] to %[[VAL_32]] : i32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0 : i32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] 
{dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] unordered iter_args(%[[VAL_15:.*]] = %[[VAL_3]]) -> (i32) { +// ELEMENTAL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_14]] : (!hlfir.expr, index, index) -> i16 +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> i32 +// ELEMENTAL: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (i16) -> i32 +// ELEMENTAL: %[[VAL_19:.*]] = arith.muli %[[VAL_18]], %[[VAL_17]] : i32 +// ELEMENTAL: %[[VAL_20:.*]] = arith.addi %[[VAL_15]], %[[VAL_19]] : i32 +// ELEMENTAL: fir.result %[[VAL_20]] : i32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_13]] : i32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent 
%[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_14:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]] = arith.subi %[[VAL_16]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_19:.*]] = arith.addi %[[VAL_15]], %[[VAL_18]] : index +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_14]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_19]], %[[VAL_21]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_22]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_23:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], 
%[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]] = arith.subi %[[VAL_26]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_29:.*]] = arith.addi %[[VAL_25]], %[[VAL_28]] : index +// ANSE: %[[VAL_30:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_31:.*]] = arith.addi %[[VAL_24]], %[[VAL_30]] : index +// ANSE: %[[VAL_32:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_29]], %[[VAL_31]]) : (!fir.box>, index, index) -> !fir.ref +// ANSE: %[[VAL_33:.*]] = fir.load %[[VAL_32]] : !fir.ref +// ANSE: %[[VAL_34:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_25]], %[[VAL_23]] : (!hlfir.expr, index, index) -> f32 +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_23]], %[[VAL_24]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (f16) -> f32 +// ANSE: %[[VAL_37:.*]] = arith.mulf %[[VAL_34]], %[[VAL_36]] : f32 +// ANSE: %[[VAL_38:.*]] = arith.addf %[[VAL_33]], %[[VAL_37]] : f32 +// ANSE: hlfir.assign %[[VAL_38]] to %[[VAL_32]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: 
index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_14]] : (!hlfir.expr, index, index) -> f32 +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_19:.*]] = arith.mulf %[[VAL_16]], %[[VAL_18]] : f32 +// ELEMENTAL: %[[VAL_20:.*]] = arith.addf %[[VAL_15]], %[[VAL_19]] : f32 +// ELEMENTAL: fir.result %[[VAL_20]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_13]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_complex(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr>) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func @matmul_matrix_matrix_complex( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape 
%[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.undefined complex +// ANSE: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_18]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_24:.*]] = arith.addi %[[VAL_17]], %[[VAL_23]] : index +// ANSE: %[[VAL_25:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_22]], %[[VAL_24]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_16]] to %[[VAL_25]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_27:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_28:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi %[[VAL_28]], %[[VAL_31]] : index +// 
ANSE: %[[VAL_33:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_34:.*]] = arith.addi %[[VAL_27]], %[[VAL_33]] : index +// ANSE: %[[VAL_35:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_32]], %[[VAL_34]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// ANSE: %[[VAL_37:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_28]], %[[VAL_26]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_38:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_26]], %[[VAL_27]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (complex) -> complex +// ANSE: %[[VAL_40:.*]] = fir.mulc %[[VAL_37]], %[[VAL_39]] : complex +// ANSE: %[[VAL_41:.*]] = fir.addc %[[VAL_36]], %[[VAL_40]] : complex +// ANSE: hlfir.assign %[[VAL_41]] to %[[VAL_35]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_14:.*]] = fir.insert_value %[[VAL_13]], %[[VAL_3]], [0 : index] : (complex, f32) -> 
complex +// ELEMENTAL: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_16:.*]] = fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (complex) { +// ELEMENTAL: %[[VAL_19:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_17]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_20:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]], %[[VAL_12]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (complex) -> complex +// ELEMENTAL: %[[VAL_22:.*]] = fir.mulc %[[VAL_19]], %[[VAL_21]] : complex +// ELEMENTAL: %[[VAL_23:.*]] = fir.addc %[[VAL_18]], %[[VAL_22]] : complex +// ELEMENTAL: fir.result %[[VAL_23]] : complex +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_16]] : complex +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_complex_real(%arg0: !hlfir.expr>, %arg1: !hlfir.expr) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr>, !hlfir.expr) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func @matmul_matrix_matrix_complex_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> 
index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.undefined complex +// ANSE: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_18]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_24:.*]] = arith.addi %[[VAL_17]], %[[VAL_23]] : index +// ANSE: %[[VAL_25:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_22]], %[[VAL_24]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_16]] to %[[VAL_25]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_27:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_28:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, 
index) +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi %[[VAL_28]], %[[VAL_31]] : index +// ANSE: %[[VAL_33:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_34:.*]] = arith.addi %[[VAL_27]], %[[VAL_33]] : index +// ANSE: %[[VAL_35:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_32]], %[[VAL_34]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// ANSE: %[[VAL_37:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_28]], %[[VAL_26]] : (!hlfir.expr>, index, index) -> complex +// ANSE: %[[VAL_38:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_26]], %[[VAL_27]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_39:.*]] = fir.undefined complex +// ANSE: %[[VAL_40:.*]] = fir.insert_value %[[VAL_39]], %[[VAL_4]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_41:.*]] = fir.insert_value %[[VAL_40]], %[[VAL_4]], [1 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_42:.*]] = fir.convert %[[VAL_38]] : (f16) -> f32 +// ANSE: %[[VAL_43:.*]] = fir.insert_value %[[VAL_41]], %[[VAL_42]], [0 : index] : (complex, f32) -> complex +// ANSE: %[[VAL_44:.*]] = fir.mulc %[[VAL_37]], %[[VAL_43]] : complex +// ANSE: %[[VAL_45:.*]] = fir.addc %[[VAL_36]], %[[VAL_44]] : complex +// ANSE: hlfir.assign %[[VAL_45]] to %[[VAL_35]] : complex, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: 
%[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_14:.*]] = fir.insert_value %[[VAL_13]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_16:.*]] = fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (complex) { +// ELEMENTAL: %[[VAL_19:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_17]] : (!hlfir.expr>, index, index) -> complex +// ELEMENTAL: %[[VAL_20:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]], %[[VAL_12]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_21:.*]] = fir.undefined complex +// ELEMENTAL: %[[VAL_22:.*]] = fir.insert_value %[[VAL_21]], %[[VAL_3]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_23:.*]] = fir.insert_value %[[VAL_22]], %[[VAL_3]], [1 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_25:.*]] = fir.insert_value %[[VAL_23]], %[[VAL_24]], [0 : index] : (complex, f32) -> complex +// ELEMENTAL: %[[VAL_26:.*]] = fir.mulc %[[VAL_19]], %[[VAL_25]] : complex +// ELEMENTAL: %[[VAL_27:.*]] = fir.addc %[[VAL_18]], %[[VAL_26]] : complex +// ELEMENTAL: fir.result %[[VAL_27]] : complex +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_16]] : complex +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_matrix_logical(%arg0: !hlfir.expr>, %arg1: !hlfir.expr>) -> !hlfir.expr> { + %res = hlfir.matmul %arg0 
%arg1 : (!hlfir.expr>, !hlfir.expr>) -> !hlfir.expr> + return %res : !hlfir.expr> +} +// ALL-LABEL: func.func @matmul_matrix_matrix_logical( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr>, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr>) -> !hlfir.expr> { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant false +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ANSE: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_10:.*]] = fir.shape %[[VAL_6]], %[[VAL_9]] : (index, index) -> !fir.shape<2> +// ANSE: %[[VAL_11:.*]] = hlfir.eval_in_mem shape %[[VAL_10]] : (!fir.shape<2>) -> !hlfir.expr> { +// ANSE: ^bb0(%[[VAL_12:.*]]: !fir.ref>>): +// ANSE: %[[VAL_13:.*]] = fir.embox %[[VAL_12]](%[[VAL_10]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// ANSE: %[[VAL_14:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4> +// ANSE: fir.do_loop %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_19:.*]] = arith.subi %[[VAL_17]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_20:.*]] = arith.addi %[[VAL_16]], %[[VAL_19]] : index +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_18]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_15]], %[[VAL_21]] 
: index +// ANSE: %[[VAL_23:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_20]], %[[VAL_22]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: hlfir.assign %[[VAL_14]] to %[[VAL_23]] : !fir.logical<4>, !fir.ref> +// ANSE: } +// ANSE: } +// ANSE: fir.do_loop %[[VAL_24:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_25:.*]] = %[[VAL_3]] to %[[VAL_9]] step %[[VAL_3]] unordered { +// ANSE: fir.do_loop %[[VAL_26:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_27:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_2]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_28:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// ANSE: %[[VAL_29:.*]] = arith.subi %[[VAL_27]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_30:.*]] = arith.addi %[[VAL_26]], %[[VAL_29]] : index +// ANSE: %[[VAL_31:.*]] = arith.subi %[[VAL_28]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_32:.*]] = arith.addi %[[VAL_25]], %[[VAL_31]] : index +// ANSE: %[[VAL_33:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_30]], %[[VAL_32]]) : (!fir.box>>, index, index) -> !fir.ref> +// ANSE: %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref> +// ANSE: %[[VAL_35:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_26]], %[[VAL_24]] : (!hlfir.expr>, index, index) -> !fir.logical<1> +// ANSE: %[[VAL_36:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_24]], %[[VAL_25]] : (!hlfir.expr>, index, index) -> !fir.logical<4> +// ANSE: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1 +// ANSE: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.logical<1>) -> i1 +// ANSE: %[[VAL_39:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1 +// ANSE: %[[VAL_40:.*]] = arith.andi %[[VAL_38]], %[[VAL_39]] : i1 +// ANSE: %[[VAL_41:.*]] = arith.ori %[[VAL_37]], %[[VAL_40]] : i1 +// ANSE: %[[VAL_42:.*]] = fir.convert %[[VAL_41]] : (i1) -> !fir.logical<4> +// ANSE: hlfir.assign %[[VAL_42]] to %[[VAL_33]] : !fir.logical<4>, !fir.ref> +// ANSE: } +// 
ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_11]] : !hlfir.expr> +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant false +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr>) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_9:.*]] = fir.shape %[[VAL_5]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : (!fir.shape<2>) -> !hlfir.expr> { +// ELEMENTAL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ELEMENTAL: %[[VAL_13:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +// ELEMENTAL: %[[VAL_14:.*]] = fir.do_loop %[[VAL_15:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] unordered iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (!fir.logical<4>) { +// ELEMENTAL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_15]] : (!hlfir.expr>, index, index) -> !fir.logical<1> +// ELEMENTAL: %[[VAL_18:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_15]], %[[VAL_12]] : (!hlfir.expr>, index, index) -> !fir.logical<4> +// ELEMENTAL: %[[VAL_19:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1 +// ELEMENTAL: %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<1>) -> i1 +// ELEMENTAL: %[[VAL_21:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1 +// ELEMENTAL: %[[VAL_22:.*]] = arith.andi %[[VAL_20]], %[[VAL_21]] : i1 +// ELEMENTAL: %[[VAL_23:.*]] = arith.ori %[[VAL_19]], %[[VAL_22]] : i1 +// ELEMENTAL: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i1) -> !fir.logical<4> +// ELEMENTAL: fir.result %[[VAL_24]] : 
!fir.logical<4> +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_14]] : !fir.logical<4> +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_10]] : !hlfir.expr> +// ELEMENTAL: } + +func.func @matmul_matrix_vector_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_vector_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +// ANSE: %[[VAL_9:.*]] = hlfir.eval_in_mem shape %[[VAL_8]] : (!fir.shape<1>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_10:.*]]: !fir.ref>): +// ANSE: %[[VAL_11:.*]] = fir.embox %[[VAL_10]](%[[VAL_8]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_11]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_14:.*]] = arith.subi %[[VAL_13]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_14]] : index +// ANSE: %[[VAL_16:.*]] = hlfir.designate %[[VAL_11]] (%[[VAL_15]]) : (!fir.box>, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_16]] : f32, !fir.ref +// ANSE: } +// ANSE: fir.do_loop %[[VAL_17:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to 
%[[VAL_6]] step %[[VAL_3]] { +// ANSE: %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_11]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_20:.*]] = arith.subi %[[VAL_19]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_21:.*]] = arith.addi %[[VAL_18]], %[[VAL_20]] : index +// ANSE: %[[VAL_22:.*]] = hlfir.designate %[[VAL_11]] (%[[VAL_21]]) : (!fir.box>, index) -> !fir.ref +// ANSE: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref +// ANSE: %[[VAL_24:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_18]], %[[VAL_17]] : (!hlfir.expr, index, index) -> f32 +// ANSE: %[[VAL_25:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_17]] : (!hlfir.expr, index) -> f16 +// ANSE: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (f16) -> f32 +// ANSE: %[[VAL_27:.*]] = arith.mulf %[[VAL_24]], %[[VAL_26]] : f32 +// ANSE: %[[VAL_28:.*]] = arith.addf %[[VAL_23]], %[[VAL_27]] : f32 +// ANSE: hlfir.assign %[[VAL_28]] to %[[VAL_22]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_9]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_7:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_9:.*]]: index): +// ELEMENTAL: %[[VAL_10:.*]] = fir.do_loop %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_6]] step %[[VAL_2]] iter_args(%[[VAL_12:.*]] = %[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_13:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_9]], %[[VAL_11]] : (!hlfir.expr, index, index) -> f32 +// ELEMENTAL: %[[VAL_14:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_11]] : 
(!hlfir.expr, index) -> f16 +// ELEMENTAL: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] : f32 +// ELEMENTAL: %[[VAL_17:.*]] = arith.addf %[[VAL_12]], %[[VAL_16]] : f32 +// ELEMENTAL: fir.result %[[VAL_17]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_10]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_8]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_vector_matrix_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_vector_matrix_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { + +// NOANSE: hlfir.matmul + +// ANSE: %[[VAL_2:.*]] = arith.constant 0 : index +// ANSE: %[[VAL_3:.*]] = arith.constant 1 : index +// ANSE: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// ANSE: %[[VAL_5:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// ANSE: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_5]] {dim = 0 : index} : (!fir.shape<1>) -> index +// ANSE: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ANSE: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ANSE: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> +// ANSE: %[[VAL_10:.*]] = hlfir.eval_in_mem shape %[[VAL_9]] : (!fir.shape<1>) -> !hlfir.expr { +// ANSE: ^bb0(%[[VAL_11:.*]]: !fir.ref>): +// ANSE: %[[VAL_12:.*]] = fir.embox %[[VAL_11]](%[[VAL_9]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// ANSE: fir.do_loop %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_3]] unordered { +// ANSE: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_12]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_15:.*]] = arith.subi %[[VAL_14]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_16:.*]] = arith.addi 
%[[VAL_13]], %[[VAL_15]] : index +// ANSE: %[[VAL_17:.*]] = hlfir.designate %[[VAL_12]] (%[[VAL_16]]) : (!fir.box>, index) -> !fir.ref +// ANSE: hlfir.assign %[[VAL_4]] to %[[VAL_17]] : f32, !fir.ref +// ANSE: } +// ANSE: fir.do_loop %[[VAL_18:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_3]] { +// ANSE: fir.do_loop %[[VAL_19:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_3]] { +// ANSE: %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_12]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// ANSE: %[[VAL_21:.*]] = arith.subi %[[VAL_20]]#0, %[[VAL_3]] : index +// ANSE: %[[VAL_22:.*]] = arith.addi %[[VAL_19]], %[[VAL_21]] : index +// ANSE: %[[VAL_23:.*]] = hlfir.designate %[[VAL_12]] (%[[VAL_22]]) : (!fir.box>, index) -> !fir.ref +// ANSE: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref +// ANSE: %[[VAL_25:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_18]] : (!hlfir.expr, index) -> f32 +// ANSE: %[[VAL_26:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_18]], %[[VAL_19]] : (!hlfir.expr, index, index) -> f16 +// ANSE: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (f16) -> f32 +// ANSE: %[[VAL_28:.*]] = arith.mulf %[[VAL_25]], %[[VAL_27]] : f32 +// ANSE: %[[VAL_29:.*]] = arith.addf %[[VAL_24]], %[[VAL_28]] : f32 +// ANSE: hlfir.assign %[[VAL_29]] to %[[VAL_23]] : f32, !fir.ref +// ANSE: } +// ANSE: } +// ANSE: } +// ANSE: return %[[VAL_10]] : !hlfir.expr +// ANSE: } + +// ELEMENTAL: %[[VAL_2:.*]] = arith.constant 1 : index +// ELEMENTAL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ELEMENTAL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<1>) -> index +// ELEMENTAL: %[[VAL_6:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ELEMENTAL: %[[VAL_7:.*]] = hlfir.get_extent %[[VAL_6]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ELEMENTAL: %[[VAL_8:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1> +// ELEMENTAL: %[[VAL_9:.*]] = hlfir.elemental 
%[[VAL_8]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ELEMENTAL: ^bb0(%[[VAL_10:.*]]: index): +// ELEMENTAL: %[[VAL_11:.*]] = fir.do_loop %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_13:.*]] = %[[VAL_3]]) -> (f32) { +// ELEMENTAL: %[[VAL_14:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_12]] : (!hlfir.expr, index) -> f32 +// ELEMENTAL: %[[VAL_15:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_12]], %[[VAL_10]] : (!hlfir.expr, index, index) -> f16 +// ELEMENTAL: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (f16) -> f32 +// ELEMENTAL: %[[VAL_17:.*]] = arith.mulf %[[VAL_14]], %[[VAL_16]] : f32 +// ELEMENTAL: %[[VAL_18:.*]] = arith.addf %[[VAL_13]], %[[VAL_17]] : f32 +// ELEMENTAL: fir.result %[[VAL_18]] : f32 +// ELEMENTAL: } +// ELEMENTAL: hlfir.yield_element %[[VAL_11]] : f32 +// ELEMENTAL: } +// ELEMENTAL: return %[[VAL_9]] : !hlfir.expr +// ELEMENTAL: } + +func.func @matmul_transpose_matrix_matrix_integer(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_transpose_matrix_matrix_integer( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { +// ALL: %[[VAL_2:.*]] = arith.constant 1 : index +// ALL: %[[VAL_3:.*]] = arith.constant 0 : i32 +// ALL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_8:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_9:.*]] = fir.shape %[[VAL_6]], %[[VAL_8]] : (index, index) -> !fir.shape<2> +// ALL: %[[VAL_10:.*]] = hlfir.elemental %[[VAL_9]] unordered : 
(!fir.shape<2>) -> !hlfir.expr { +// ALL: ^bb0(%[[VAL_11:.*]]: index, %[[VAL_12:.*]]: index): +// ALL: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] unordered iter_args(%[[VAL_15:.*]] = %[[VAL_3]]) -> (i32) { +// ALL: %[[VAL_16:.*]] = hlfir.apply %[[VAL_0]], %[[VAL_14]], %[[VAL_11]] : (!hlfir.expr, index, index) -> i16 +// ALL: %[[VAL_17:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_14]], %[[VAL_12]] : (!hlfir.expr, index, index) -> i32 +// ALL: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (i16) -> i32 +// ALL: %[[VAL_19:.*]] = arith.muli %[[VAL_18]], %[[VAL_17]] : i32 +// ALL: %[[VAL_20:.*]] = arith.addi %[[VAL_15]], %[[VAL_19]] : i32 +// ALL: fir.result %[[VAL_20]] : i32 +// ALL: } +// ALL: hlfir.yield_element %[[VAL_13]] : i32 +// ALL: } +// ALL: return %[[VAL_10]] : !hlfir.expr +// ALL: } + +func.func @matmul_transpose_matrix_vector_real(%arg0: !hlfir.expr, %arg1: !hlfir.expr) -> !hlfir.expr { + %res = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + return %res : !hlfir.expr +} +// ALL-LABEL: func.func @matmul_transpose_matrix_vector_real( +// ALL-SAME: %[[VAL_0:.*]]: !hlfir.expr, +// ALL-SAME: %[[VAL_1:.*]]: !hlfir.expr) -> !hlfir.expr { +// ALL: %[[VAL_2:.*]] = arith.constant 1 : index +// ALL: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// ALL: %[[VAL_4:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// ALL: %[[VAL_5:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 0 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_6:.*]] = hlfir.get_extent %[[VAL_4]] {dim = 1 : index} : (!fir.shape<2>) -> index +// ALL: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +// ALL: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<1>) -> !hlfir.expr { +// ALL: ^bb0(%[[VAL_9:.*]]: index): +// ALL: %[[VAL_10:.*]] = fir.do_loop %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_12:.*]] = %[[VAL_3]]) -> (f32) { +// ALL: %[[VAL_13:.*]] = 
hlfir.apply %[[VAL_0]], %[[VAL_11]], %[[VAL_9]] : (!hlfir.expr, index, index) -> f32 +// ALL: %[[VAL_14:.*]] = hlfir.apply %[[VAL_1]], %[[VAL_11]] : (!hlfir.expr, index) -> f16 +// ALL: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (f16) -> f32 +// ALL: %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] : f32 +// ALL: %[[VAL_17:.*]] = arith.addf %[[VAL_12]], %[[VAL_16]] : f32 +// ALL: fir.result %[[VAL_17]] : f32 +// ALL: } +// ALL: hlfir.yield_element %[[VAL_10]] : f32 +// ALL: } +// ALL: return %[[VAL_8]] : !hlfir.expr +// ALL: } + +// Check that the inner-product loop uses the best known extent +// of the input matrices: +func.func @matmul_matrix_matrix_deduce_bounds(%arg0: !hlfir.expr, %arg1: !hlfir.expr, %arg2: !hlfir.expr<10x?xi16>) -> (!hlfir.expr, !hlfir.expr) { + %res1 = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + %res2 = hlfir.matmul %arg1 %arg2 : (!hlfir.expr, !hlfir.expr<10x?xi16>) -> !hlfir.expr + return %res1, %res2 : !hlfir.expr, !hlfir.expr +} +// ALL-LABEL: func.func @matmul_matrix_matrix_deduce_bounds( + +// ANSE: %[[VAL_6:.*]] = arith.constant 10 : index +// ANSE: hlfir.eval_in_mem shape {{.*}} +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_6]] +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: hlfir.eval_in_mem shape {{.*}} +// ANSE: fir.do_loop +// ANSE: fir.do_loop +// ANSE: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_6]] +// ANSE: fir.do_loop +// ANSE: fir.do_loop + +// ELEMENTAL: %[[VAL_5:.*]] = arith.constant 10 : index +// ELEMENTAL: hlfir.elemental %{{.*}} +// ELEMENTAL: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_5]] +// ELEMENTAL: hlfir.elemental %{{.*}} +// ELEMENTAL: fir.do_loop %{{.*}} = %{{.*}} to %[[VAL_5]]