@@ -428,3 +428,216 @@ func.func @step_invariant() {
// CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
// CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
// CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>

// -----

// 1-d parallel reduction mapped to block.x and thread.x.

// CHECK-LABEL: @parallel_reduction_1d
func.func @parallel_reduction_1d() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<64xf32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %2 = arith.addf %arg3, %arg4 : f32
        scf.reduce.return %2 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<64xf32>
  return
}
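
// In each #gpu.loop_dim_map above, `processor` picks the hardware dimension
// while `map` and `bound` are identity affine maps, so induction variables and
// trip counts are forwarded to the hardware ids and launch bounds unchanged.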

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
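
// The checks above show the expected shape of the lowering: the outer loop
// becomes the block dimension and the inner reduction loop the thread
// dimension of the gpu.launch, while the scf.reduce body reappears as the
// gpu.all_reduce region with scf.reduce.return rewritten to gpu.yield.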

// -----

// 2-d parallel reduction mapped to block.x, thread.x, and thread.y.

// CHECK-LABEL: @parallel_reduction_2d
func.func @parallel_reduction_2d() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<8x8xf32>
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg4: f32, %arg5: f32):
        %2 = arith.addf %arg4, %arg5 : f32
        scf.reduce.return %2 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<8x8xf32>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
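
// Both induction variables of the 2-d reduction loop map to thread ids, yet a
// single gpu.all_reduce suffices: it combines partial values across every
// thread of the block, regardless of how the iteration space was mapped.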

// -----

// Tiled 1-d parallel reduction mapped to block.x and thread.x.

// CHECK-LABEL: @parallel_reduction_1d_tiled
func.func @parallel_reduction_1d_tiled() {
  %c128 = arith.constant 128 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %alloc_0 = memref.alloc() : memref<8192xf32>
  %alloc_1 = memref.alloc() : memref<64xf32>
  scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) {
    %subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref<f32, strided<[], offset: ?>>
    %0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
    %subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>>
    %1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 {
      %2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>>
      scf.reduce(%2 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %3 = arith.addf %arg3, %arg4 : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %1, %subview[] : memref<f32, strided<[], offset: ?>>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc_0 : memref<8192xf32>
  memref.dealloc %alloc_1 : memref<64xf32>
  return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32>
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map2(%[[dim0]])
// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32>
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>
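
// Here every block reduces its own 128-element tile: #map2 corresponds to the
// (d0 * 128) offset computed by the affine.apply in the input, the subviews
// carve out the per-block destination slot and source tile, and thread.x
// drives the gpu.all_reduce within the tile.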

// -----

// 1-d parallel reduction, unsigned int. Cannot be mapped.

// CHECK-LABEL: @parallel_reduction_1d_uint
func.func @parallel_reduction_1d_uint(%cst : ui32) {
  %alloc = memref.alloc() : memref<ui32>
  %alloc_0 = memref.alloc() : memref<64xui32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xui32>
      scf.reduce(%1 : ui32) {
      ^bb0(%arg3: ui32, %arg4: ui32):
        scf.reduce.return %arg3 : ui32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<ui32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<ui32>
  memref.dealloc %alloc_0 : memref<64xui32>
  return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce

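// The nest is left untouched, presumably because a gpu.all_reduce cannot be
// materialized for an unsigned integer element type, so the conversion
// rejects the reduction rather than producing invalid IR.
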
// -----

// 1-d parallel reduction, not isolated from above. Cannot be mapped.

// CHECK-LABEL: @parallel_reduction_1d_outside
func.func @parallel_reduction_1d_outside() {
  %alloc = memref.alloc() : memref<f32>
  %alloc_0 = memref.alloc() : memref<64xf32>
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %const = arith.constant 1.000000e+00 : f32
  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
      scf.reduce(%1 : f32) {
      ^bb0(%arg3: f32, %arg4: f32):
        %2 = arith.addf %arg3, %arg4 : f32
        %3 = arith.addf %2, %const : f32
        scf.reduce.return %3 : f32
      }
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    memref.store %0, %alloc[] : memref<f32>
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  memref.dealloc %alloc : memref<f32>
  memref.dealloc %alloc_0 : memref<64xf32>
  return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce
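
// The reduce body reads %const, which is defined outside the loop nest. Since
// that body would become the gpu.all_reduce region, which may not capture
// values from above, the conversion bails out and keeps the scf.parallel nest.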