@@ -1072,6 +1072,8 @@ Syntax:
10721072 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
10731073 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
10741074
1075+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1076+
10751077 Overview:
10761078"""""""""
10771079
@@ -1082,7 +1084,13 @@ global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
10821084in ``tile `` mode. In tile mode, the multi-dimensional layout of the
10831085source tensor is preserved at the destination. The dimension of the
10841086tensor data ranges from 1d to 5d with the coordinates specified
1085- by the ``i32 %d0 ... i32 %d4 `` arguments.
1087+ by the ``i32 %d0 ... i32 %d4 `` arguments. In ``tile.gather4 `` mode,
1088+ four rows of the 2D source tensor are combined to form a single 2D
1089+ destination tensor. The first coordinate ``i32 %x0 `` denotes the column
1090+ index, followed by four coordinates indicating the four row indices.
1091+ So, this mode takes a total of 5 coordinates as input arguments.
1092+ For more information on ``gather4 `` mode, refer PTX ISA
1093+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
10861094
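As an illustration, a minimal sketch of a call to the ``gather4`` variant is
shown below (the SSA names ``%dst``, ``%bar``, ``%tmap`` and the coordinates
are placeholders, and the trailing flag operands, described below, are all
constant zero, i.e. the plain form without multicast, cache_hint or cta_group):

.. code-block:: llvm

  ; Hypothetical example: gather rows %y0..%y3 starting at column %x0.
  ; The i16 %mc and i64 %ch operands are ignored because their flags are 0;
  ; the final i32 0 is assumed to select the default (no cta_group) form.
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(
      ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3,
      i16 0, i64 0, i1 0, i1 0, i32 0)
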
10871095* The last three arguments to these intrinsics are flags
10881096 indicating support for multicast, cache_hint and cta_group::1/2
@@ -1116,10 +1124,18 @@ Syntax:
11161124
11171125.. code-block :: llvm
11181126
1119- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3 ) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1127+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7 ) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
11201128 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
11211129 declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
11221130
1131+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1132+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1133+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1134+
1135+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1136+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1137+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1138+
11231139 Overview:
11241140"""""""""
11251141
@@ -1131,10 +1147,105 @@ in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
11311147are unrolled into a single dimensional column at the destination. In this
11321148mode, the tensor has to be at least three-dimensional. Along with the tensor
11331149coordinates, im2col offsets are also specified (denoted by
1134- ``i16 im2col0...i16 %im2col2 ``). The number of im2col offsets is two less
1135- than the number of dimensions of the tensor operation. The last three arguments
1136- to these intrinsics are flags, with the same functionality as described
1137- in the ``tile `` mode intrinsics above.
1150+ ``i16 %im2col0 ... i16 %im2col2 ``). For the ``im2col `` mode, the number of offsets
1151+ is two less than the number of dimensions of the tensor operation. For the
1152+ ``im2col.w `` and ``im2col.w.128 `` modes, the number of offsets is always 2,
1153+ denoted by the ``i16 %wHalo `` and ``i16 %wOffset `` arguments. For more information
1154+ on ``im2col.w `` and ``im2col.w.128 `` modes, refer PTX ISA
1155+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1156+
1157+ The last three arguments to these intrinsics are flags, with the same functionality
1158+ as described in the ``tile `` mode intrinsics above.
1159+
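As an illustration, a minimal sketch of a call to the ``im2col.w`` 3D variant
(value names such as ``%dst``, ``%bar`` and ``%tmap`` are placeholders; the
flag operands are constant zero, i.e. the plain form):

.. code-block:: llvm

  ; Hypothetical example: 3D im2col.w copy with the wHalo/wOffset offsets.
  ; The i16 %mc and i64 %ch operands are ignored because their flags are 0;
  ; the final i32 0 is assumed to select the default (no cta_group) form.
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(
      ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset,
      i16 0, i64 0, i1 0, i1 0, i32 0)
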
1160+ For more information, refer PTX ISA
1161+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
1162+
1163+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d ``'
1164+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1165+
1166+ Syntax:
1167+ """""""
1168+
1169+ .. code-block :: llvm
1170+
1171+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
1172+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...)
1173+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
1174+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1175+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1176+
1177+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1178+
1179+ Overview:
1180+ """""""""
1181+
1182+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d ``' intrinsics
1183+ correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.* ``
1184+ set of PTX instructions. These instructions initiate an asynchronous
1185+ copy of tensor data from global memory to shared::cta memory in
1186+ ``tile `` mode. In tile mode, the multi-dimensional layout of the
1187+ source tensor is preserved at the destination. The dimension of the
1188+ tensor data ranges from 1d to 5d with the coordinates specified
1189+ by the ``i32 %d0 ... i32 %d4 `` arguments. In ``tile.gather4 `` mode,
1190+ four rows of the 2D source tensor are combined to form a single 2D
1191+ destination tensor. The first coordinate ``i32 %x0 `` denotes the column
1192+ index, followed by four coordinates indicating the four row indices.
1193+ So, this mode takes a total of 5 coordinates as input arguments.
1194+ For more information on ``gather4 `` mode, refer PTX ISA
1195+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1196+
1197+ * The last argument to these intrinsics is a boolean flag
1198+ indicating support for cache_hint. This flag argument must
1199+ be a compile-time constant. When set, it indicates a valid
1200+ cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
1201+ variant of the PTX instruction.
1202+
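As an illustration, a minimal sketch of a call to the 1D ``shared::cta`` tile
variant (value names are placeholders; the flag is set, so ``%ch`` is assumed
to carry a valid L2 cache-hint value):

.. code-block:: llvm

  ; Hypothetical example: 1D tile copy into shared::cta memory with the
  ; cache_hint flag enabled, generating the .L2::cache_hint variant.
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(
      ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %d0, i64 %ch, i1 1)
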
1203+ For more information, refer PTX ISA
1204+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
1205+
1206+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d ``'
1207+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1208+
1209+ Syntax:
1210+ """""""
1211+
1212+ .. code-block :: llvm
1213+
1214+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
1215+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
1216+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
1217+
1218+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1219+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1220+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1221+
1222+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1223+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1224+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1225+
1226+ Overview:
1227+ """""""""
1228+
1229+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d ``' intrinsics
1230+ correspond to the ``cp.async.bulk.tensor.[3-5]d.shared::cta.global.* ``
1231+ set of PTX instructions. These instructions initiate an asynchronous copy
1232+ of tensor data from global memory to shared::cta memory in ``im2col `` mode.
1233+ In im2col mode, some dimensions of the source tensor are unrolled into a
1234+ single dimensional column at the destination. In this mode, the tensor has
1235+ to be at least three-dimensional. Along with the tensor coordinates, im2col
1236+ offsets are also specified (denoted by ``i16 %im2col0 ... i16 %im2col2 ``).
1237+ For the ``im2col `` mode, the number of offsets is two less than the number
1238+ of dimensions of the tensor operation. For the ``im2col.w `` and ``im2col.w.128 ``
1239+ modes, the number of offsets is always 2, denoted by the ``i16 %wHalo `` and
1240+ ``i16 %wOffset `` arguments. For more information on ``im2col.w `` and
1241+ ``im2col.w.128 `` modes, refer PTX ISA
1242+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1243+
1244+ * The last argument to these intrinsics is a boolean flag
1245+ indicating support for cache_hint. This flag argument must
1246+ be a compile-time constant. When set, it indicates a valid
1247+ cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
1248+ variant of the PTX instruction.
11381249
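As an illustration, a minimal sketch of a call to the ``im2col.w`` 3D variant
targeting shared::cta memory (value names are placeholders; the cache_hint
flag is zero, so the ``i64`` operand is ignored):

.. code-block:: llvm

  ; Hypothetical example: 3D im2col.w copy into shared::cta memory,
  ; plain form (no cache_hint).
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(
      ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset,
      i64 0, i1 0)
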
11391250For more information, refer PTX ISA
11401251`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
@@ -1153,6 +1264,8 @@ Syntax:
11531264 declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
11541265 declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
11551266
1267+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1268+
11561269 Overview:
11571270"""""""""
11581271
@@ -1162,6 +1275,12 @@ These instructions initiate an asynchronous copy of tensor data from
11621275shared::cta to global memory (indicated by the ``s2g `` prefix)
11631276in ``tile `` mode. The dimension of the tensor data ranges from 1d to 5d
11641277with the coordinates specified by the ``i32 %d0 ... i32 %d4 `` arguments.
1278+ In ``tile.scatter4 `` mode, a single 2D source tensor is divided across
1279+ four rows of the 2D destination tensor. The first coordinate ``i32 %x0 ``
1280+ denotes the column index, followed by four coordinates indicating the
1281+ four row indices. So, this mode takes a total of 5 coordinates as input arguments.
1282+ For more information on ``scatter4 `` mode, refer PTX ISA
1283+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
11651284
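As an illustration, a minimal sketch of a call to the ``scatter4`` variant is
shown below (value names such as ``%src`` and ``%tmap`` are placeholders; the
cache_hint flag, described below, is zero, so the ``i64`` operand is ignored):

.. code-block:: llvm

  ; Hypothetical example: scatter a 2D shared::cta tensor into four rows
  ; (%y0..%y3) of the destination tensor, starting at column %x0.
  call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(
      ptr addrspace(3) %src, ptr %tmap,
      i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3,
      i64 0, i1 0)
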
11661285* The last argument to these intrinsics is a boolean flag
11671286 indicating support for cache_hint. This flag argument must
@@ -1214,6 +1333,8 @@ Syntax:
12141333 declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
12151334 declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
12161335
1336+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1337+
12171338 Overview:
12181339"""""""""
12191340
@@ -1225,6 +1346,13 @@ multi-dimensional layout of the source tensor is preserved at the destination.
12251346The dimension of the tensor data ranges from 1d to 5d with the coordinates
12261347specified by the ``i32 %d0 ... i32 %d4 `` arguments.
12271348
1349+ In ``tile.gather4 `` mode, four rows of the 2-dimensional source tensor are
1350+ fetched into the L2 cache. The first coordinate ``i32 %x0 `` denotes the column index,
1351+ followed by four coordinates indicating the four row indices. So, this mode takes
1352+ a total of 5 coordinates as input arguments.
1353+ For more information on ``gather4 `` mode, refer PTX ISA
1354+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1355+
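As an illustration, a minimal sketch of a call to the ``gather4`` prefetch
variant is shown below (value names are placeholders; the flag, described
below, is set, so ``%ch`` is assumed to carry a valid L2 cache-hint value):

.. code-block:: llvm

  ; Hypothetical example: prefetch four rows (%y0..%y3) of a 2D tensor,
  ; starting at column %x0, into the L2 cache with a cache hint.
  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(
      ptr %tmap, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3,
      i64 %ch, i1 1)
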
12281356* The last argument to these intrinsics is a boolean flag
12291357 indicating support for cache_hint. This flag argument must
12301358 be a compile-time constant. When set, it indicates a valid
@@ -1246,6 +1374,14 @@ Syntax:
12461374 declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
12471375 declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
12481376
1377+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1378+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1379+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1380+
1381+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1382+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1383+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1384+
12491385 Overview:
12501386"""""""""
12511387
@@ -1256,9 +1392,16 @@ of tensor data from global memory to the L2 cache. In im2col mode, some
12561392dimensions of the source tensor are unrolled into a single dimensional
12571393column at the destination. In this mode, the tensor has to be at least
12581394three-dimensional. Along with the tensor coordinates, im2col offsets are
1259- also specified (denoted by ``i16 im2col0...i16 %im2col2 ``). The number
1260- of im2col offsets is two less than the number of dimensions of the tensor
1261- operation. The last argument to these intrinsics is a boolean flag, with
1395+ also specified (denoted by ``i16 %im2col0 ... i16 %im2col2 ``). For the ``im2col ``
1396+ mode, the number of offsets is two less than the number of dimensions of
1397+ the tensor operation. For the ``im2col.w `` and ``im2col.w.128 `` modes,
1398+ the number of offsets is always 2, denoted by the ``i16 %wHalo `` and
1399+ ``i16 %wOffset `` arguments. For more information on ``im2col.w `` and
1400+ ``im2col.w.128 `` modes, refer PTX ISA
1401+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1402+
1404+ The last argument to these intrinsics is a boolean flag, with
12621405the same functionality as described in the ``tile `` mode intrinsics above.
12631406
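As an illustration, a minimal sketch of a call to the ``im2col.w`` 3D prefetch
variant (value names are placeholders; the cache_hint flag is zero, so the
``i64`` operand is ignored):

.. code-block:: llvm

  ; Hypothetical example: prefetch a 3D tensor in im2col.w mode into the
  ; L2 cache, plain form (no cache_hint).
  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(
      ptr %tmap, i32 %d0, i32 %d1, i32 %d2,
      i16 %wHalo, i16 %wOffset, i64 0, i1 0)
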
12641407For more information, refer PTX ISA