@@ -1016,7 +1016,7 @@ Syntax:
10161016
10171017.. code-block :: llvm
10181018
1019- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
1019+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group )
10201020 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
10211021 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
10221022 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
@@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. The dimension of the
10341034tensor data ranges from 1d to 5d with the coordinates specified
10351035by the ``i32 %d0 ... i32 %d4 `` arguments.
10361036
1037- * The last two arguments to these intrinsics are boolean flags
1038- indicating support for cache_hint and/or multicast modifiers.
1039- These flag arguments must be compile-time constants. The backend
1040- looks through these flags and lowers the intrinsics appropriately.
1037+ * The last three arguments to these intrinsics are flags
1038+ indicating support for multicast, cache_hint and cta_group::1/2
1039+ modifiers. These flag arguments must be compile-time constants.
1040+ The backend looks through these flags and lowers the intrinsics
1041+ appropriately.
10411042
1042- * The Nth argument ( denoted by ``i1 flag_ch ``) when set, indicates
1043+ * The argument denoted by ``i1 % flag_ch `` when set, indicates
10431044 a valid cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
10441045 variant of the PTX instruction.
10451046
1046- * The [N-1]th argument (denoted by ``i1 flag_mc ``) when set, indicates
1047- the presence of a multicast mask (``i16 %mc ``) and generates the PTX
1048- instruction with the ``.multicast::cluster `` modifier.
1047+ * The argument denoted by ``i1 %flag_mc `` when set, indicates
1048+ the presence of a multicast mask (``i16 %mc ``) and generates
1049+ the PTX instruction with the ``.multicast::cluster `` modifier.
1050+
1051+ * The argument denoted by ``i32 %flag_cta_group `` takes values within
1052+ the range [0, 3) i.e. {0,1,2}. When the value of ``%flag_cta_group ``
1053+ is not within the range, it may raise an error from the Verifier.
1054+ The default value is '0' with no cta_group modifier in the
1055+ instruction. The values of '1' and '2' lower to ``cta_group::1 ``
1056+ and ``cta_group::2 `` variants of the PTX instruction respectively.
10491057
10501058For more information, refer PTX ISA
10511059`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
@@ -1058,7 +1066,7 @@ Syntax:
10581066
10591067.. code-block :: llvm
10601068
1061- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
1069+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group )
10621070 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
10631071 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
10641072
@@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this
10741082mode, the tensor has to be at least three-dimensional. Along with the tensor
10751083coordinates, im2col offsets are also specified (denoted by
10761084``i16 im2col0...i16 %im2col2 ``). The number of im2col offsets is two less
1077- than the number of dimensions of the tensor operation. The last two arguments
1078- to these intrinsics are boolean flags, with the same functionality as described
1085+ than the number of dimensions of the tensor operation. The last three arguments
1086+ to these intrinsics are flags, with the same functionality as described
10791087in the ``tile `` mode intrinsics above.
10801088
10811089For more information, refer PTX ISA
0 commit comments