@@ -2494,230 +2494,6 @@ static bool isMatvecLike(linalg::LinalgOp linalgOp) {
   return true;
 }
 
-//====---------------------------------------------------------------------===//
-// Warp Reduction Pipeline Configuration
-//====---------------------------------------------------------------------===//
-
-/// Set the configuration for reductions that can be mapped to warp reductions.
-static LogicalResult
-setWarpReductionConfig(IREE::GPU::TargetAttr target,
-                       mlir::FunctionOpInterface entryPoint,
-                       linalg::LinalgOp op) {
-  if (!target.supportsSubgroupShuffle())
-    return failure();
-
-  SmallVector<unsigned> parallelDims;
-  SmallVector<unsigned> reductionDims;
-  op.getParallelDims(parallelDims);
-  op.getReductionDims(reductionDims);
-
-  SmallVector<int64_t> bounds = op.getStaticLoopRanges();
-  int64_t numParallelDims = op.getNumParallelLoops();
-
-  if (reductionDims.empty())
-    return failure();
-
-  // Make sure reduction dimensions are static and innermost ones.
-  int64_t numDynamicReductionDims = 0;
-  for (unsigned dim : reductionDims) {
-    if (ShapedType::isDynamic(bounds[dim])) {
-      numDynamicReductionDims++;
-    }
-    if (dim < numParallelDims) {
-      return failure();
-    }
-  }
-  int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic);
-
-  // Distribution of multi-dim masked writes currently aren't fully supported.
-  if (numDynamicReductionDims > 1) {
-    return failure();
-  }
-
-  if (op.getRegionOutputArgs().size() != 1)
-    return failure();
-
-  // Only support projected permutation, this could be extended to projected
-  // permutated with broadcast.
-  if (llvm::any_of(op.getDpsInputOperands(), [&](OpOperand *input) {
-        return !op.getMatchingIndexingMap(input).isProjectedPermutation();
-      }))
-    return failure();
-
-  bool foundSingleReductionOutput = false;
-  for (auto [index, initOpOperand] : llvm::enumerate(op.getDpsInitsMutable())) {
-    // Only single combiner operations are supported for now.
-    SmallVector<Operation *> combinerOps;
-    if (matchReduction(op.getRegionOutputArgs(), index, combinerOps) &&
-        combinerOps.size() == 1) {
-      if (foundSingleReductionOutput)
-        return failure();
-      foundSingleReductionOutput = true;
-      continue;
-    }
-    if (!op.getMatchingIndexingMap(&initOpOperand).isIdentity())
-      return failure();
-  }
-  if (!foundSingleReductionOutput)
-    return failure();
-
-  SmallVector<int64_t> workgroupTileSizes(op.getNumParallelLoops(), 1);
-
-  int64_t reductionSize = 1;
-  for (int64_t dim : reductionDims)
-    reductionSize *= bounds[dim];
-
-  int64_t subgroupSize = 0;
-  for (int s : target.getWgp().getSubgroupSizeChoices().asArrayRef()) {
-    if (reductionSize % s == 0) {
-      subgroupSize = s;
-      break;
-    }
-  }
-  if (subgroupSize == 0)
-    return failure();
-
-  // Without any bounds on dynamic dims, we need specialization to
-  // get peak performance. For now, just use the warp size.
-  if (numDynamicDims > 0) {
-    SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
-    int64_t preferredSubgroupSize = target.getPreferredSubgroupSize();
-    // We should set the subgroup size on:
-    // Priority 1: The innermost reduction dimension with static shapes.
-    // Priority 2: If there's no reduction dimension with static shapes
-    // then the innermost reduction dim.
-    unsigned lastNonDynamicReductionDim = reductionDims.back();
-    if (reductionDims.size() > 1) {
-      for (unsigned dim : reductionDims) {
-        if (ShapedType::isDynamic(bounds[dim])) {
-          reductionTileSizes[dim] = 1;
-        } else {
-          lastNonDynamicReductionDim = dim;
-        }
-      }
-    }
-    reductionTileSizes[lastNonDynamicReductionDim] = preferredSubgroupSize;
-    TileSizesListType tileSizes;
-    tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
-    tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level
-    std::array<int64_t, 3> workgroupSize = {preferredSubgroupSize, 1, 1};
-    if (failed(setOpConfigAndEntryPointFnTranslation(
-            entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction,
-            workgroupSize, preferredSubgroupSize))) {
-      return failure();
-    }
-    return success();
-  }
-
-  const Type elementType =
-      llvm::cast<ShapedType>(op.getDpsInitOperand(0)->get().getType())
-          .getElementType();
-  if (!elementType.isIntOrFloat())
-    return failure();
-  unsigned bitWidth = elementType.getIntOrFloatBitWidth();
-  // Reduction distribution only supports 8/16/32 bit types now.
-  if (bitWidth != 32 && bitWidth != 16 && bitWidth != 8)
-    return failure();
-
-  const unsigned largestLoadSizeInBits = 128;
-  unsigned vectorSize = largestLoadSizeInBits / bitWidth;
-  while ((reductionSize / vectorSize) % subgroupSize != 0)
-    vectorSize /= 2;
-
-  // Deduce the workgroup size we should use for reduction. Currently a
-  // workgroup processes all elements in reduction dimensions. Need to make sure
-  // the workgroup size we use can divide the total reduction size, and it's
-  // also within hardware limitations.
-  const int64_t maxWorkgroupSize = 1024;
-  int64_t groupSize = reductionSize / vectorSize;
-  if (groupSize > maxWorkgroupSize) {
-    groupSize = llvm::APIntOps::GreatestCommonDivisor(
-                    {64, uint64_t(groupSize)}, {64, uint64_t(maxWorkgroupSize)})
-                    .getZExtValue();
-  }
-
-  // Then we need to strike a balance--
-  // 1) parallel dimensions are distributed to workgroups. If there are many
-  // workgroups dispatched, we'd want to have each GPU core hosting multiple
-  // of them for occupancy.
-  // 2) we want each thread to read quite a few 128-bit vectors for better
-  // memory cache behavior.
-  // Both means we cannot use a too large workgroup size.
-
-  std::optional<int64_t> parallelSize = 1;
-  for (int64_t dim : parallelDims) {
-    if (ShapedType::isDynamic(bounds[dim])) {
-      parallelSize = std::nullopt;
-      break;
-    }
-    *parallelSize *= bounds[dim];
-  }
-  // Total parallel size that can fill the GPU with enough workgroups.
-  // TODO: query from the target device; roughly 2x hardware compute unit.
-  const int parallelThreshold = 256;
-  // How many 128-bit vectors each thread should at least read.
-  const int targetVectorCount = 8;
-  while (parallelSize && *parallelSize > parallelThreshold &&
-         (groupSize / 2) % subgroupSize == 0 &&
-         reductionSize / (groupSize * vectorSize) < targetVectorCount) {
-    // Use less subgroups per workgroup..
-    groupSize /= 2;
-    // in order to host more workgroups per hardware compute unit.
-    *parallelSize /= 2;
-  }
-
-  // Current warp reduction pattern is a two step butterfly warp reduce.
-  // First, do warp reductions along multiple subgroups.
-  // Second, reduce results from multiple subgroups using single warp reduce.
-  // The final warp reduce requires subgroup count <= subgroup size to work.
-  if ((groupSize / subgroupSize) > subgroupSize)
-    return failure();
-
-  // With just one subgroup per workgroup, make each subgroup do more work and
-  // process a few reductions (rows) along the last parallel dimension.
-  //
-  // TODO: This is enabled for matvec on ROCm for now. We should
-  // validate this strategy and extend to more linalg generics and to CUDA.
-  if (isROCmBackend(target) && ShapedType::isStaticShape(bounds) &&
-      isMatvecLike(op)) {
-    int64_t parallelIdx = *llvm::find_if(
-        parallelDims, [&](int64_t currIdx) { return bounds[currIdx] != 1; });
-    int64_t parallelBound = bounds[parallelIdx];
-    int64_t numParallelReductions = 1;
-    const int64_t maxParallelFactor = groupSize / 4;
-    for (int64_t parallelFactor = 2; (parallelFactor < maxParallelFactor) &&
-                                     (parallelBound % parallelFactor == 0) &&
-                                     (parallelBound > parallelFactor);
-         parallelFactor *= 2) {
-      numParallelReductions = parallelFactor;
-    }
-    workgroupTileSizes[parallelIdx] = numParallelReductions;
-  }
-
-  std::array<int64_t, 3> workgroupSize = {groupSize, 1, 1};
-  SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0);
-  int64_t remainingGroupSize = groupSize;
-  for (int i = reductionDims.size() - 1; i >= 0; --i) {
-    int64_t dim = reductionDims[i];
-    int64_t bound = bounds[dim];
-    if (i == reductionDims.size() - 1)
-      bound /= vectorSize;
-    APInt size = llvm::APIntOps::GreatestCommonDivisor(
-        {64, uint64_t(remainingGroupSize)}, {64, uint64_t(bound)});
-    reductionTileSizes[dim] = size.getSExtValue();
-    if (i == reductionDims.size() - 1)
-      reductionTileSizes[dim] *= vectorSize;
-    remainingGroupSize /= size.getSExtValue();
-  }
-  TileSizesListType tileSizes;
-  tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
-  tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction,
-      workgroupSize, subgroupSize);
-  return success();
-}
-
 static bool hasTwoOrThreeLoopsInfo(linalg::LinalgOp linalgOp) {
   return linalgOp.getNumParallelLoops() >= 2 &&
          linalgOp.getNumParallelLoops() <= 3;
@@ -3083,10 +2859,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG() << "Vector Distribution Subgroup Reduction Config";
     return success();
   }
-  if (succeeded(setWarpReductionConfig(target, entryPointFn, linalgOp))) {
-    LDBG() << "Warp Reduction Config";
-    return success();
-  }
   if (succeeded(setConvolutionConfig(target, entryPointFn, linalgOp, 16))) {
     LDBG() << "Convolution Config";
     return success();
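For readers skimming the removed heuristic: the core of `setWarpReductionConfig` was the static-shape path that derives a flat workgroup size from the reduction size, the element bit width, and the chosen subgroup size. Below is a minimal standalone sketch of that arithmetic only; the function name `pickWarpReductionGroupSize`, the use of `std::gcd` in place of `llvm::APIntOps::GreatestCommonDivisor`, and the omission of the occupancy-balancing loop (which also depends on the parallel extent) are illustrative simplifications, not part of this commit.

```cpp
// Standalone illustration (not part of the commit): mirrors the workgroup-size
// arithmetic of the deleted setWarpReductionConfig for static shapes.
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <optional>

// Returns the flat workgroup size the heuristic would pick, or std::nullopt if
// the shape cannot be mapped to the two-step butterfly warp reduction.
std::optional<int64_t> pickWarpReductionGroupSize(int64_t reductionSize,
                                                  unsigned bitWidth,
                                                  int64_t subgroupSize) {
  // The deleted code only proceeded with a subgroup size that divides the
  // total reduction size; treat that as a precondition here.
  if (reductionSize % subgroupSize != 0)
    return std::nullopt;

  // Start from 128-bit loads and shrink the per-thread vector width until the
  // vectorized reduction still splits evenly across subgroups.
  unsigned vectorSize = 128 / bitWidth;
  while ((reductionSize / vectorSize) % subgroupSize != 0)
    vectorSize /= 2;

  // One workgroup handles the whole reduction; keep its size a divisor of the
  // (vectorized) reduction and within the 1024-thread hardware limit.
  const int64_t maxWorkgroupSize = 1024;
  int64_t groupSize = reductionSize / vectorSize;
  if (groupSize > maxWorkgroupSize)
    groupSize = std::gcd(groupSize, maxWorkgroupSize);

  // Two-step butterfly reduction: the cross-subgroup step is itself a single
  // warp reduce, so the subgroup count must not exceed the subgroup size.
  if (groupSize / subgroupSize > subgroupSize)
    return std::nullopt;
  return groupSize;
}

int main() {
  // Example: a 4096-element f32 reduction with subgroup size 64.
  if (auto size = pickWarpReductionGroupSize(4096, 32, 64))
    std::printf("workgroup size: %lld\n", static_cast<long long>(*size));
}
```

For the example in `main`, the sketch picks a vector width of 4 (128 bits of f32) and a 1024-thread workgroup, which is what the deleted static-shape path would have chosen before its occupancy-balancing and matvec-specific adjustments.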