diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 42a017db300af..3adfd5f4f2c43 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1055,7 +1055,7 @@ def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>, imposed by one's target platform. }]; let assemblyFormat = [{ - $format attr-dict ($args^ `:` type($args))? + $format attr-dict (`,` $args^ `:` type($args))? }]; } diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index 318f0f78efa5b..f52dd6c0d0ce3 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -633,7 +633,7 @@ gpu.module @test_module_29 { // CHECK-NEXT: %[[EL1:.*]] = llvm.getelementptr %[[ALLOC]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, f64)> // CHECK-NEXT: llvm.store %[[EXT]], %[[EL1]] : f64, !llvm.ptr // CHECK-NEXT: llvm.call @vprintf(%[[FORMATSTART]], %[[ALLOC]]) : (!llvm.ptr, !llvm.ptr) -> i32 - gpu.printf "Hello: %d\n" %arg0, %arg1 : i32, f32 + gpu.printf "Hello: %d\n", %arg0, %arg1 : i32, f32 gpu.return } } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir index 1b904fa142bad..2dc6a5ab2a86c 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir @@ -36,7 +36,7 @@ gpu.module @test_module { // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64 // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 - gpu.printf "Hello: %d\n" %arg0 : i32 + gpu.printf "Hello: %d\n", %arg0 : i32 gpu.return } } diff 
--git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir index 870f5c5016ece..00d1d7d852680 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir @@ -9,7 +9,7 @@ gpu.module @test_module { // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4> // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8> // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) vararg(!llvm.func, ...)>) : (!llvm.ptr<4>, i32) -> i32 - gpu.printf "Hello: %d\n" %arg0 : i32 + gpu.printf "Hello: %d\n", %arg0 : i32 gpu.return } } diff --git a/mlir/test/Conversion/GPUToSPIRV/printf.mlir b/mlir/test/Conversion/GPUToSPIRV/printf.mlir index bc091124ea4c6..7fe9752b088db 100644 --- a/mlir/test/Conversion/GPUToSPIRV/printf.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/printf.mlir @@ -62,7 +62,7 @@ module attributes { // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr, UniformConstant> // CHECK-NEXT: [[FMTSTR_PTR1:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr, UniformConstant> to !spirv.ptr // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR1]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr, i32, f32, i32 -> i32 - gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index + gpu.printf "\nHello, world : %d %f \n Thread id: %d\n", %arg0, %arg1, %2: i32, f32, index // CHECK: spirv.Return gpu.return diff --git a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir index 91d7f1cd6c67d..85805da3ac10e 100644 --- a/mlir/test/Dialect/GPU/indirect-device-func-call.mlir +++ b/mlir/test/Dialect/GPU/indirect-device-func-call.mlir @@ -6,7 +6,7 @@ gpu.module @kernels { func.func @hello(%arg0 : f32) { %tid_x = gpu.thread_id x %csti8 = arith.constant 2 : i8 - gpu.printf 
"Hello from %lld, %d, %f\n" %tid_x, %csti8, %arg0 : index, i8, f32 + gpu.printf "Hello from %lld, %d, %f\n", %tid_x, %csti8, %arg0 : index, i8, f32 return } // CHECK-LABEL: @hello_indirect diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index c0ff2044b76c4..99915c493ea46 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -229,9 +229,22 @@ module attributes {gpu.container_module} { // CHECK-LABEL: gpu.func @printf_test // CHECK: (%[[ARG0:.*]]: i32) - // CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32 + // CHECK: gpu.printf "Value: %d", %[[ARG0]] : i32 gpu.func @printf_test(%arg0 : i32) { - gpu.printf "Value: %d" %arg0 : i32 + gpu.printf "Value: %d", %arg0 : i32 + gpu.return + } + + // CHECK-LABEL: gpu.func @printf_empty + // CHECK: gpu.printf "]" + // CHECK: scf.if + // CHECK: gpu.printf ", " + gpu.func @printf_empty(%arg0 : i32) { + gpu.printf "]" + %1 = arith.cmpi slt, %arg0, %arg0 : i32 + scf.if %1 { + gpu.printf ", " + } gpu.return } diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir index 732f40c4333df..f02b26dba97d5 100644 --- a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir +++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir @@ -23,7 +23,7 @@ func.func @test_math(%arg0 : f32) { threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { // CHECK-NVVM: __nv_expf %s1 = math.exp %arg0 : f32 - gpu.printf "%f" %s1 : f32 + gpu.printf "%f", %s1 : f32 gpu.terminator } return diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir index 06a9c1ca0d114..3d6527fe59b2c 100644 --- a/mlir/test/Integration/GPU/CUDA/assert.mlir +++ b/mlir/test/Integration/GPU/CUDA/assert.mlir @@ -16,10 +16,10 @@ gpu.module @kernels { gpu.func @test_assert(%c0: i1, %c1: i1) kernel { %0 = gpu.thread_id x cf.assert %c1, "passing assertion" - gpu.printf "thread %lld: print after passing assertion\n" %0 : index + gpu.printf "thread %lld: 
print after passing assertion\n", %0 : index // Test callsite(callsite(name)) location. cf.assert %c0, "failing assertion" loc(callsite(callsite("callee_func_name"("callee_file.cc":7:9) at "caller_file.cc":10:8) at "caller2_file.cc":11:12)) - gpu.printf "thread %lld: print after failing assertion\n" %0 : index + gpu.printf "thread %lld: print after failing assertion\n", %0 : index gpu.return } } diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir index 99ea1208e9c5e..15b0bf02d911a 100644 --- a/mlir/test/Integration/GPU/CUDA/printf.mlir +++ b/mlir/test/Integration/GPU/CUDA/printf.mlir @@ -14,7 +14,7 @@ module attributes {gpu.container_module} { %0 = gpu.thread_id x %csti8 = arith.constant 2 : i8 %cstf32 = arith.constant 3.0 : f32 - gpu.printf "Hello from %lld, %d, %f\n" %0, %csti8, %cstf32 : index, i8, f32 + gpu.printf "Hello from %lld, %d, %f\n", %0, %csti8, %cstf32 : index, i8, f32 gpu.return } } diff --git a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir index c70c940564a26..a22a34b9393a3 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir @@ -43,7 +43,7 @@ module attributes {gpu.container_module} { %cnd2 = arith.cmpi eq, %bidY, %c3 : index scf.if %cnd1 { scf.if %cnd2 { - gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n" + gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n", %cidX_i32, %cidY_i32, %cidZ_i32, diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir index b50772f8249fb..95bde40deb48e 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir @@ -85,7 +85,7 @@ module @mymod { // Step 7. 
First thread does TMA load scf.if %10 { - gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index + gpu.printf "[GPU] TMA SIZE %d\0A", %c8192 : index nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType } else { @@ -98,16 +98,16 @@ module @mymod { // Step 9. Print loaded data in 128b swizzled scf.if %10 { - gpu.printf "===--- Matrix A ---=== %d \0A" %c-1_i32 : i32 + gpu.printf "===--- Matrix A ---=== %d \0A", %c-1_i32 : i32 scf.for %arg12 = %c0 to %c128 step %c1 { scf.for %arg13 = %c0 to %c64 step %c1 { %15 = memref.load %7[%arg12, %arg13] : !shmemlhs %16 = arith.extf %15 : f16 to f32 - gpu.printf "%.0f, " %16 : f32 + gpu.printf "%.0f, ", %16 : f32 } - gpu.printf "%d\0A" %c-1_i32 : i32 + gpu.printf "%d\0A", %c-1_i32 : i32 } - gpu.printf "===----------------=== %d \0A" %c-1_i32 : i32 + gpu.printf "===----------------=== %d \0A", %c-1_i32 : i32 } gpu.terminator } diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir index 65e5fc0aff6aa..fce16f3df2368 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir @@ -109,7 +109,7 @@ module @mymod { // Step 6. First thread does TMA load scf.if %10 { - gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index + gpu.printf "[GPU] TMA SIZE %d\0A", %c32768 : index nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem1 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1]>, 3> nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[128, 1], offset: 4096>, 3> @@ -124,7 +124,7 @@ module @mymod { // Step 8. 
 Print loaded data in 128b swizzled scf.if %10 { - gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32 + gpu.printf "===--- Matrix B ---=== %d \n", %c-1_i32 : i32 scf.for %ii = %c0 to %c64 step %c1 { scf.for %j = %c0 to %c128 step %c1 { %lhs0 = memref.load %rhsShmem[%ii, %j] : !shmemrhs @@ -133,7 +133,7 @@ } - gpu.printf "%d\n" %c-1_i32 : i32 + gpu.printf "%d\n", %c-1_i32 : i32 } - gpu.printf "===----------------=== %d \n" %c-1_i32 : i32 + gpu.printf "===----------------=== %d \n", %c-1_i32 : i32 } gpu.barrier gpu.terminator diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir index 391fda82e1e19..acca9811f5702 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir @@ -80,8 +80,8 @@ module @mymod { nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : > %11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3> %12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3> - gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32 - gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32 + gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A", %11 : f32 + gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A", %12 : f32 nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : , swizzle = none, l2promo = none, oob = zero, interleave = none>, > -> memref<64x8xf32, 3> nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : , swizzle = none, l2promo = none, oob = zero, interleave = none>, > -> memref<8x128xf32, 3> } else { @@ -92,8 +92,8 @@ scf.if %10 { %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3> %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3> - gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32 - gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32 + gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32 + gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32 } gpu.terminator } 
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir index f83f65bb2963c..fe6c645357ecb 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir @@ -96,8 +96,8 @@ func.func @main() { scf.if %10 { %11 = memref.load %out[%c45, %c7] : memref<64x8xf32, 3> %12 = memref.load %out_1[%c7, %c0] : memref<8x128xf32, 3> - gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32 - gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32 + gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A", %11 : f32 + gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A", %12 : f32 } gpu.terminator } diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir index d5e6e3757540b..4a0e4d34bfab5 100644 --- a/mlir/test/Integration/GPU/ROCM/printf.mlir +++ b/mlir/test/Integration/GPU/ROCM/printf.mlir @@ -13,7 +13,7 @@ module attributes {gpu.container_module} { gpu.module @kernels { gpu.func @hello() kernel { %0 = gpu.thread_id x - gpu.printf "Hello from %d\n" %0 : index + gpu.printf "Hello from %d\n", %0 : index gpu.return } }