Skip to content

Commit 4177189

Browse files
Merge branch 'llvm:main' into restrict-bounded-wrap-iter
2 parents 78dee26 + abe0cd4 commit 4177189

File tree

149 files changed

+5798
-1838
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

149 files changed

+5798
-1838
lines changed

clang/lib/Driver/ToolChain.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,20 @@ static void getAArch64MultilibFlags(const Driver &D,
230230
Result.push_back(BranchProtectionArg->getAsString(Args));
231231
}
232232

233+
if (Arg *AlignArg = Args.getLastArg(
234+
options::OPT_mstrict_align, options::OPT_mno_strict_align,
235+
options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
236+
if (AlignArg->getOption().matches(options::OPT_mstrict_align) ||
237+
AlignArg->getOption().matches(options::OPT_mno_unaligned_access))
238+
Result.push_back(AlignArg->getAsString(Args));
239+
}
240+
241+
if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian,
242+
options::OPT_mlittle_endian)) {
243+
if (Endian->getOption().matches(options::OPT_mbig_endian))
244+
Result.push_back(Endian->getAsString(Args));
245+
}
246+
233247
const Arg *ABIArg = Args.getLastArgNoClaim(options::OPT_mabi_EQ);
234248
if (ABIArg) {
235249
Result.push_back(ABIArg->getAsString(Args));
@@ -287,6 +301,20 @@ static void getARMMultilibFlags(const Driver &D,
287301
if (BranchProtectionArg) {
288302
Result.push_back(BranchProtectionArg->getAsString(Args));
289303
}
304+
305+
if (Arg *AlignArg = Args.getLastArg(
306+
options::OPT_mstrict_align, options::OPT_mno_strict_align,
307+
options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
308+
if (AlignArg->getOption().matches(options::OPT_mstrict_align) ||
309+
AlignArg->getOption().matches(options::OPT_mno_unaligned_access))
310+
Result.push_back(AlignArg->getAsString(Args));
311+
}
312+
313+
if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian,
314+
options::OPT_mlittle_endian)) {
315+
if (Endian->getOption().matches(options::OPT_mbig_endian))
316+
Result.push_back(Endian->getAsString(Args));
317+
}
290318
}
291319

292320
static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,

clang/test/Driver/print-multi-selection-flags.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@
6868
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mbranch-protection=standard | FileCheck --check-prefix=CHECK-BRANCH-PROTECTION %s
6969
// CHECK-BRANCH-PROTECTION: -mbranch-protection=standard
7070

71+
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s
72+
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s
73+
// CHECK-NO-UNALIGNED-ACCESS: -mno-unaligned-access
74+
75+
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s
76+
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s
77+
// CHECK-BIG-ENDIAN: -mbig-endian
78+
7179
// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=riscv32-none-elf -march=rv32g | FileCheck --check-prefix=CHECK-RV32 %s
7280
// CHECK-RV32: --target=riscv32-unknown-none-elf
7381
// CHECK-RV32: -mabi=ilp32d

lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ DWARFDebugInfoEntry::GetAbbreviationDeclarationPtr(const DWARFUnit *cu) const {
614614
}
615615

616616
bool DWARFDebugInfoEntry::IsGlobalOrStaticScopeVariable() const {
617-
if (Tag() != DW_TAG_variable)
617+
if (Tag() != DW_TAG_variable && Tag() != DW_TAG_member)
618618
return false;
619619
const DWARFDebugInfoEntry *parent_die = GetParent();
620620
while (parent_die != nullptr) {

lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,13 @@ void ManualDWARFIndex::IndexUnitImpl(DWARFUnit &unit,
222222
case DW_TAG_variable:
223223
break;
224224

225+
case DW_TAG_member:
226+
// Only in DWARF 4 and earlier `static const` members of a struct, a class
227+
// or a union have an entry tag `DW_TAG_member`
228+
if (unit.GetVersion() >= 5)
229+
continue;
230+
break;
231+
225232
default:
226233
continue;
227234
}
@@ -362,6 +369,18 @@ void ManualDWARFIndex::IndexUnitImpl(DWARFUnit &unit,
362369
set.namespaces.Insert(ConstString(name), ref);
363370
break;
364371

372+
case DW_TAG_member: {
373+
// In DWARF 4 and earlier `static const` members of a struct, a class or a
374+
// union have an entry tag `DW_TAG_member`, and are also tagged as
375+
// `DW_AT_declaration`, but otherwise follow the same rules as
376+
// `DW_TAG_variable`.
377+
bool parent_is_class_type = false;
378+
if (auto parent = die.GetParent())
379+
parent_is_class_type = DWARFDIE(&unit, parent).IsStructUnionOrClass();
380+
if (!parent_is_class_type || !is_declaration)
381+
break;
382+
[[fallthrough]];
383+
}
365384
case DW_TAG_variable:
366385
if (name && has_location_or_const_value && is_global_or_static_variable) {
367386
set.globals.Insert(ConstString(name), ref);

lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2403,7 +2403,7 @@ void SymbolFileDWARF::FindGlobalVariables(
24032403
sc.module_sp = m_objfile_sp->GetModule();
24042404
assert(sc.module_sp);
24052405

2406-
if (die.Tag() != DW_TAG_variable)
2406+
if (die.Tag() != DW_TAG_variable && die.Tag() != DW_TAG_member)
24072407
return true;
24082408

24092409
auto *dwarf_cu = llvm::dyn_cast<DWARFCompileUnit>(die.GetCU());
@@ -3505,7 +3505,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc,
35053505
ModuleSP module = GetObjectFile()->GetModule();
35063506

35073507
if (tag != DW_TAG_variable && tag != DW_TAG_constant &&
3508-
(tag != DW_TAG_formal_parameter || !sc.function))
3508+
tag != DW_TAG_member && (tag != DW_TAG_formal_parameter || !sc.function))
35093509
return nullptr;
35103510

35113511
DWARFAttributes attributes = die.GetAttributes();
@@ -3811,7 +3811,7 @@ void SymbolFileDWARF::ParseAndAppendGlobalVariable(
38113811
return;
38123812

38133813
dw_tag_t tag = die.Tag();
3814-
if (tag != DW_TAG_variable && tag != DW_TAG_constant)
3814+
if (tag != DW_TAG_variable && tag != DW_TAG_constant && tag != DW_TAG_member)
38153815
return;
38163816

38173817
// Check to see if we have already parsed this variable or constant?

lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -120,17 +120,15 @@ def check_global_var(self, name: str, expect_type, expect_val):
120120
self.assertEqual(varobj.type.name, expect_type)
121121
self.assertEqual(varobj.value, expect_val)
122122

123-
@expectedFailureAll(dwarf_version=["<", "5"])
124-
# On linux this passes due to the manual index
125-
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
126-
def test_inline_static_members(self):
127-
self.build()
123+
def check_inline_static_members(self, flags):
124+
self.build(dictionary={"CXXFLAGS_EXTRAS": flags})
128125
lldbutil.run_to_source_breakpoint(
129126
self, "// break here", lldb.SBFileSpec("main.cpp")
130127
)
131128

132129
self.check_global_var("A::int_val", "const int", "1")
133130
self.check_global_var("A::int_val_with_address", "const int", "2")
131+
self.check_global_var("A::inline_int_val", "const int", "3")
134132
self.check_global_var("A::bool_val", "const bool", "true")
135133
self.check_global_var("A::enum_val", "Enum", "enum_case2")
136134
self.check_global_var("A::enum_bool_val", "EnumBool", "enum_bool_case1")
@@ -144,6 +142,16 @@ def test_inline_static_members(self):
144142
"ClassWithConstexprs::scoped_enum_val", "ScopedEnum", "scoped_enum_case2"
145143
)
146144

145+
# On linux this passes due to the manual index
146+
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
147+
def test_inline_static_members_dwarf5(self):
148+
self.check_inline_static_members("-gdwarf-5")
149+
150+
# On linux this passes due to the manual index
151+
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
152+
def test_inline_static_members_dwarf4(self):
153+
self.check_inline_static_members("-gdwarf-4")
154+
147155
# With older versions of Clang, LLDB fails to evaluate classes with only
148156
# constexpr members when dsymutil is enabled
149157
@expectedFailureAll(
@@ -170,15 +178,12 @@ def test_class_with_only_constexpr_static(self):
170178
"ClassWithEnumAlias::enum_alias_alias", result_value="scoped_enum_case1"
171179
)
172180

173-
@expectedFailureAll(dwarf_version=["<", "5"])
174-
# On linux this passes due to the manual index
175-
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
176-
def test_shadowed_static_inline_members(self):
181+
def check_shadowed_static_inline_members(self, flags):
177182
"""Tests that the expression evaluator and SBAPI can both
178183
correctly determine the requested inline static variable
179184
in the presence of multiple variables of the same name."""
180185

181-
self.build()
186+
self.build(dictionary={"CXXFLAGS_EXTRAS": flags})
182187
lldbutil.run_to_name_breakpoint(self, "bar")
183188

184189
self.check_global_var("ns::Foo::mem", "const int", "10")
@@ -188,6 +193,16 @@ def test_shadowed_static_inline_members(self):
188193
self.expect_expr("ns::Foo::mem", result_value="10")
189194
self.expect_expr("::Foo::mem", result_value="-29")
190195

196+
# On linux this passes due to the manual index
197+
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
198+
def test_shadowed_static_inline_members_dwarf5(self):
199+
self.check_shadowed_static_inline_members("-gdwarf-5")
200+
201+
# On linux this passes due to the manual index
202+
@expectedFailureDarwin(debug_info=no_match(["dsym"]))
203+
def test_shadowed_static_inline_members_dwarf4(self):
204+
self.check_shadowed_static_inline_members("-gdwarf-4")
205+
191206
@expectedFailureAll(bugnumber="target var doesn't honour global namespace")
192207
def test_shadowed_static_inline_members_xfail(self):
193208
self.build()

lldb/test/API/lang/cpp/const_static_integral_member/main.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enum class ScopedLongLongEnum : long long {
2929
struct A {
3030
const static int int_val = 1;
3131
const static int int_val_with_address = 2;
32+
inline const static int inline_int_val = 3;
3233
const static bool bool_val = true;
3334

3435
const static auto char_max = std::numeric_limits<char>::max();

llvm/docs/CommandGuide/llvm-exegesis.rst

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,22 @@ snippets.
3333
SUPPORTED PLATFORMS
3434
-------------------
3535

36-
:program:`llvm-exegesis` currently only supports X86 (64-bit only), ARM (AArch64
37-
only), MIPS, and PowerPC (PowerPC64LE only) on Linux for benchmarking. Not all
38-
benchmarking functionality is guaranteed to work on every platform.
39-
:program:`llvm-exegesis` also has a separate analysis mode that is supported
40-
on every platform that LLVM is.
36+
:program:`llvm-exegesis` currently only supports X86 (64-bit only), ARM
37+
(AArch64 only, snippet generation is sparse), MIPS, and PowerPC (PowerPC64LE
38+
only) on Linux for benchmarking. Not all benchmarking functionality is
39+
guaranteed to work on every platform. :program:`llvm-exegesis` also has a
40+
separate analysis mode that is supported on every platform that LLVM is.
41+
42+
To enable benchmarking in llvm-exegesis, LLVM must be configured and built with
43+
``LLVM_ENABLE_LIBPFM`` enabled, as :program:`llvm-exegesis` depends on libpfm4
44+
for accessing performance counters. Benchmarking may fail if the target CPU is
45+
unsupported by libpfm. This can be verified by setting the ``LIBPFM_VERBOSE`` and
46+
``LIBPFM_DEBUG`` environment variables to enable verbose or debug mode for
47+
libpfm. If libpfm is installed in a non-standard directory, LLVM can be
48+
configured to locate the necessary library and header files by setting
49+
``LIBRARY_PATH``, ``C_INCLUDE_PATH``, and ``CPLUS_INCLUDE_PATH`` environment
50+
variables. Additionally, ``LD_LIBRARY_PATH`` should be set so that
51+
:program:`llvm-exegesis` can locate the libpfm library during execution.
4152

4253
SNIPPET ANNOTATIONS
4354
-------------------

llvm/docs/LangRef.rst

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2498,11 +2498,6 @@ For example:
24982498
function with a tail call. The prototype of a thunk should not be used for
24992499
optimization purposes. The caller is expected to cast the thunk prototype to
25002500
match the thunk target prototype.
2501-
2502-
``"tls-load-hoist"``
2503-
This attribute indicates that the function will try to reduce redundant
2504-
tls address calculation by hoisting tls variable.
2505-
25062501
``uwtable[(sync|async)]``
25072502
This attribute indicates that the ABI being targeted requires that
25082503
an unwind table entry be produced for this function even if we can

llvm/docs/NVPTXUsage.rst

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
462462
the result is the shift amount needed to right-shift the found bit into the
463463
least-significant bit position. 0xffffffff is returned if no 1 bit is found.
464464

465+
TMA family of Intrinsics
466+
------------------------
467+
468+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
469+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
470+
471+
Syntax:
472+
"""""""
473+
474+
.. code-block:: llvm
475+
476+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
477+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
478+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
479+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
480+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
481+
482+
Overview:
483+
"""""""""
484+
485+
The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' intrinsics
486+
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
487+
These instructions initiate an asynchronous copy of tensor data from
488+
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
489+
in ``tile`` mode. In tile mode, the multi-dimensional layout of the
490+
source tensor is preserved at the destination. The dimension of the
491+
tensor data ranges from 1d to 5d with the coordinates specified
492+
by the ``i32 %d0 ... i32 %d4`` arguments.
493+
494+
* The last two arguments to these intrinsics are boolean flags
495+
indicating support for cache_hint and/or multicast modifiers.
496+
These flag arguments must be compile-time constants. The backend
497+
looks through these flags and lowers the intrinsics appropriately.
498+
499+
* The last argument (denoted by ``i1 %flag_ch``), when set, indicates
500+
a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
501+
variant of the PTX instruction.
502+
503+
* The second-to-last argument (denoted by ``i1 %flag_mc``), when set, indicates
504+
the presence of a multicast mask (``i16 %mc``) and generates the PTX
505+
instruction with the ``.multicast::cluster`` modifier.
506+
507+
For more information, refer to the PTX ISA
508+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
509+
510+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``'
511+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
512+
513+
Syntax:
514+
"""""""
515+
516+
.. code-block:: llvm
517+
518+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
519+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
520+
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
521+
522+
Overview:
523+
"""""""""
524+
525+
The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``' intrinsics
526+
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
527+
These instructions initiate an asynchronous copy of tensor data from
528+
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
529+
in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
530+
are unrolled into a single dimensional column at the destination. In this
531+
mode, the tensor has to be at least three-dimensional. Along with the tensor
532+
coordinates, im2col offsets are also specified (denoted by
533+
``i16 %im2col0 ... i16 %im2col2``). The number of im2col offsets is two less
534+
than the number of dimensions of the tensor operation. The last two arguments
535+
to these intrinsics are boolean flags, with the same functionality as described
536+
in the ``tile`` mode intrinsics above.
537+
538+
For more information, refer to the PTX ISA
539+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
540+
541+
'``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``'
542+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
543+
544+
Syntax:
545+
"""""""
546+
547+
.. code-block:: llvm
548+
549+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
550+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
551+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
552+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
553+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
554+
555+
Overview:
556+
"""""""""
557+
558+
The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``' intrinsics
559+
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
560+
These instructions initiate an asynchronous copy of tensor data from
561+
shared::cta to global memory (indicated by the ``s2g`` prefix)
562+
in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d
563+
with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
564+
565+
* The last argument to these intrinsics is a boolean flag
566+
indicating support for cache_hint. This flag argument must
567+
be a compile-time constant. When set, it indicates a valid
568+
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
569+
variant of the PTX instruction.
570+
571+
For more information, refer to the PTX ISA
572+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
573+
574+
'``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``'
575+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
576+
577+
Syntax:
578+
"""""""
579+
580+
.. code-block:: llvm
581+
582+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
583+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
584+
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
585+
586+
Overview:
587+
"""""""""
588+
589+
The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``' intrinsics
590+
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
591+
These instructions initiate an asynchronous copy of tensor data from
592+
shared::cta to global memory (indicated by the ``s2g`` prefix)
593+
in ``im2col`` mode. In this mode, the tensor has to be at least
594+
three-dimensional. Unlike the ``g2s`` variants, there are no
595+
im2col_offsets for these intrinsics. The last argument to these
596+
intrinsics is a boolean flag, with the same functionality as
597+
described in the ``s2g.tile`` mode intrinsics above.
598+
599+
For more information, refer to the PTX ISA
600+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
601+
465602
Other Intrinsics
466603
----------------
467604

0 commit comments

Comments
 (0)