// RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-fuse-and-hoist-parallel-loops))' --split-input-file | FileCheck %s

+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
#map = affine_map<(d0) -> (d0 * 2)>
#map1 = affine_map<(d0) -> (d0 * 4)>
#map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
#map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)>
#map4 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32>
+  attributes {translation_info = #translation_info} {
  %c4 = arith.constant 4 : index
  %c128 = arith.constant 128 : index
  %c0 = arith.constant 0 : index
@@ -62,11 +65,14 @@ func.func @forall_fuse_then_hoist(%3: tensor<128x128xf16>, %4: tensor<128x128xf1 |

// -----

+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
#map = affine_map<(d0) -> (d0 * 2)>
#map1 = affine_map<(d0) -> (d0 * 4)>
#map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
#map3 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: tensor<128x128xf32>) -> tensor<128x128xf32>
+  attributes {translation_info = #translation_info} {
  %c4 = arith.constant 4 : index
  %c128 = arith.constant 128 : index
  %c0 = arith.constant 0 : index
@@ -113,12 +119,15 @@ func.func @forall_fuse_then_hoist_mixed_mappings(%3: tensor<128x128xf16>, %5: te |

// -----

+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+
#map = affine_map<(d0) -> (d0 * 2)>
#map1 = affine_map<(d0) -> (d0 * 4)>
#map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)>
#map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)>
#map4 = affine_map<(d0) -> (d0 * 16)>
-func.func @forall_fuse_then_hoist_with_fill(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>) -> tensor<128x128xf32> {
+func.func @forall_fuse_then_hoist_with_fill(%3: tensor<128x128xf16>, %4: tensor<128x128xf16>) -> tensor<128x128xf32>
+  attributes {translation_info = #translation_info} {
  %c4 = arith.constant 4 : index
  %c128 = arith.constant 128 : index
  %c0 = arith.constant 0 : index
@@ -340,3 +349,108 @@ func.func @hoist_with_single_trip_loops(%2: tensor<128x128xf16>, %3: tensor<128x
// CHECK: scf.forall.in_parallel
// CHECK: scf.forall.in_parallel
// CHECK: return
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 2)>
+#map1 = affine_map<(d0) -> (d0 * 16)>
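+// Negative test: without a translation_info attribute the workgroup size is
+// unknown, so neither thread-mapped forall can be fused.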
+func.func @no_fuse_forall_without_workgroup_size(%arg0: tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %2 = scf.forall (%arg5, %arg6) in (64, 1) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %4 = affine.apply #map(%arg5)
+    %extracted_slice = tensor.extract_slice %arg0[%4, %arg6] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%4, %arg6] [2, 128] [1, 1] : tensor<128x128xf32> to tensor<2x128xf32>
+    %5 = linalg.copy ins(%extracted_slice : tensor<2x128xf32>) outs(%extracted_slice_0 : tensor<2x128xf32>) -> tensor<2x128xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg7[%4, %arg6] [2, 128] [1, 1] : tensor<2x128xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %2[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @no_fuse_forall_without_workgroup_size
+// CHECK-COUNT-2: scf.forall {{.*}} -> (tensor<128x128xf32>)
+
+// -----
+
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64>
+#map1 = affine_map<(d0) -> (d0 * 16)>
+func.func @no_fuse_forall_workgroup_size_mismatch(%arg0: tensor<128x128xf32>) -> tensor<128x128xf32>
+  attributes {translation_info = #translation_info} {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %2 = scf.forall (%arg5, %arg6) in (128, 1) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %extracted_slice = tensor.extract_slice %arg0[%arg5, %arg6] [1, 128] [1, 1] : tensor<128x128xf32> to tensor<1x128xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%arg5, %arg6] [1, 128] [1, 1] : tensor<128x128xf32> to tensor<1x128xf32>
+    %5 = linalg.copy ins(%extracted_slice : tensor<1x128xf32>) outs(%extracted_slice_0 : tensor<1x128xf32>) -> tensor<1x128xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %5 into %arg7[%arg5, %arg6] [1, 128] [1, 1] : tensor<1x128xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  // We have 128 threads but only use 64 here, so loops cannot be fused.
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %2[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%extracted_slice_0, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @no_fuse_forall_workgroup_size_mismatch
+// CHECK-COUNT-2: scf.forall {{.*}} -> (tensor<128x128xf32>)
+
+// -----
+
+#translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
+#map = affine_map<(d0) -> (d0 * 4)>
+#map1 = affine_map<(d0) -> (d0 * 16)>
+func.func @fuse_direct_forall_use(%arg0: tensor<128x128xf32>, %arg1: tensor<16x16xf32>) -> tensor<128x128xf32>
+  attributes {translation_info = #translation_info} {
+  %0 = tensor.empty() : tensor<128x128xf32>
+  %1 = tensor.empty() : tensor<16x16xf32>
+  %2 = scf.forall (%arg5, %arg6) in (4, 4) shared_outs(%arg7 = %1) -> (tensor<16x16xf32>) {
+    %4 = affine.apply #map(%arg5)
+    %5 = affine.apply #map(%arg6)
+    %extracted_slice = tensor.extract_slice %arg1[%4, %5] [4, 4] [1, 1] : tensor<16x16xf32> to tensor<4x4xf32>
+    %extracted_slice_0 = tensor.extract_slice %arg7[%4, %5] [4, 4] [1, 1] : tensor<16x16xf32> to tensor<4x4xf32>
+    %6 = linalg.copy ins(%extracted_slice : tensor<4x4xf32>) outs(%extracted_slice_0 : tensor<4x4xf32>) -> tensor<4x4xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %6 into %arg7[%4, %5] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<16x16xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
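+  // The producer result %2 feeds linalg.matmul directly (not via extract_slice),
+  // so fusion is expected to route it through an iree_gpu.barrier_region.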
+  %3 = scf.forall (%arg5, %arg6) in (8, 8) shared_outs(%arg7 = %0) -> (tensor<128x128xf32>) {
+    %6 = affine.apply #map1(%arg5)
+    %7 = affine.apply #map1(%arg6)
+    %extracted_slice_0 = tensor.extract_slice %arg0[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg7[%6, %7] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32>
+    %8 = linalg.matmul ins(%2, %extracted_slice_0 : tensor<16x16xf32>, tensor<16x16xf32>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %8 into %arg7[%6, %7] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32>
+    }
+  } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+  return %3 : tensor<128x128xf32>
+}
+
+// CHECK-LABEL: func @fuse_direct_forall_use
+// CHECK: %[[FUSED_LOOP:.+]] = scf.forall
+// CHECK: %[[BARRIER:.+]] = iree_gpu.barrier_region
+// CHECK: linalg.matmul ins(%[[BARRIER]]
+// CHECK: return %[[FUSED_LOOP]]