llvm
diff --git a/‎clang/lib/Driver/ToolChain.cpp‎
Lines changed: 28 additions & 0 deletions b/‎clang/lib/Driver/ToolChain.cpp‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎clang/test/Driver/print-multi-selection-flags.c‎
Lines changed: 8 additions & 0 deletions b/‎clang/test/Driver/print-multi-selection-flags.c‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp‎
Lines changed: 1 addition & 1 deletion b/‎lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp‎
Lines changed: 19 additions & 0 deletions b/‎lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp‎
Lines changed: 3 additions & 3 deletions b/‎lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py‎
Lines changed: 25 additions & 10 deletions b/‎lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py‎
Lines changed: 25 additions & 10 deletions
diff --git a/‎lldb/test/API/lang/cpp/const_static_integral_member/main.cpp‎
Lines changed: 1 addition & 0 deletions b/‎lldb/test/API/lang/cpp/const_static_integral_member/main.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎llvm/docs/CommandGuide/llvm-exegesis.rst‎
Lines changed: 16 additions & 5 deletions b/‎llvm/docs/CommandGuide/llvm-exegesis.rst‎
Lines changed: 16 additions & 5 deletions
diff --git a/‎llvm/docs/LangRef.rst‎
Lines changed: 0 additions & 5 deletions b/‎llvm/docs/LangRef.rst‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎llvm/docs/NVPTXUsage.rst‎
Lines changed: 137 additions & 0 deletions b/‎llvm/docs/NVPTXUsage.rst‎
Lines changed: 137 additions & 0 deletions
@@ -230,6 +230,20 @@ static void getAArch64MultilibFlags(const Driver &D,
     Result.push_back(BranchProtectionArg->getAsString(Args));
   }
 
+  if (Arg *AlignArg = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (AlignArg->getOption().matches(options::OPT_mstrict_align) ||
+        AlignArg->getOption().matches(options::OPT_mno_unaligned_access))
+      Result.push_back(AlignArg->getAsString(Args));
+  }
+
+  if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian,
+                                    options::OPT_mlittle_endian)) {
+    if (Endian->getOption().matches(options::OPT_mbig_endian))
+      Result.push_back(Endian->getAsString(Args));
+  }
+
   const Arg *ABIArg = Args.getLastArgNoClaim(options::OPT_mabi_EQ);
   if (ABIArg) {
     Result.push_back(ABIArg->getAsString(Args));
@@ -287,6 +301,20 @@ static void getARMMultilibFlags(const Driver &D,
   if (BranchProtectionArg) {
     Result.push_back(BranchProtectionArg->getAsString(Args));
   }
+
+  if (Arg *AlignArg = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (AlignArg->getOption().matches(options::OPT_mstrict_align) ||
+        AlignArg->getOption().matches(options::OPT_mno_unaligned_access))
+      Result.push_back(AlignArg->getAsString(Args));
+  }
+
+  if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian,
+                                    options::OPT_mlittle_endian)) {
+    if (Endian->getOption().matches(options::OPT_mbig_endian))
+      Result.push_back(Endian->getAsString(Args));
+  }
 }
 
 static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple,
 
@@ -68,6 +68,14 @@
 // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mbranch-protection=standard | FileCheck --check-prefix=CHECK-BRANCH-PROTECTION %s
 // CHECK-BRANCH-PROTECTION: -mbranch-protection=standard
 
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s
+// CHECK-NO-UNALIGNED-ACCESS: -mno-unaligned-access
+
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s
+// CHECK-BIG-ENDIAN: -mbig-endian
+
 // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=riscv32-none-elf -march=rv32g | FileCheck --check-prefix=CHECK-RV32 %s
 // CHECK-RV32: --target=riscv32-unknown-none-elf
 // CHECK-RV32: -mabi=ilp32d
 
@@ -614,7 +614,7 @@ DWARFDebugInfoEntry::GetAbbreviationDeclarationPtr(const DWARFUnit *cu) const {
 }
 
 bool DWARFDebugInfoEntry::IsGlobalOrStaticScopeVariable() const {
-  if (Tag() != DW_TAG_variable)
+  if (Tag() != DW_TAG_variable && Tag() != DW_TAG_member)
     return false;
   const DWARFDebugInfoEntry *parent_die = GetParent();
   while (parent_die != nullptr) {
 
@@ -222,6 +222,13 @@ void ManualDWARFIndex::IndexUnitImpl(DWARFUnit &unit,
     case DW_TAG_variable:
       break;
 
+    case DW_TAG_member:
+      // Only in DWARF 4 and earlier `static const` members of a struct, a class
+      // or a union have an entry tag `DW_TAG_member`
+      if (unit.GetVersion() >= 5)
+        continue;
+      break;
+
     default:
       continue;
     }
@@ -362,6 +369,18 @@ void ManualDWARFIndex::IndexUnitImpl(DWARFUnit &unit,
         set.namespaces.Insert(ConstString(name), ref);
       break;
 
+    case DW_TAG_member: {
+      // In DWARF 4 and earlier `static const` members of a struct, a class or a
+      // union have an entry tag `DW_TAG_member`, and are also tagged as
+      // `DW_AT_declaration`, but otherwise follow the same rules as
+      // `DW_TAG_variable`.
+      bool parent_is_class_type = false;
+      if (auto parent = die.GetParent())
+        parent_is_class_type = DWARFDIE(&unit, parent).IsStructUnionOrClass();
+      if (!parent_is_class_type || !is_declaration)
+        break;
+      [[fallthrough]];
+    }
     case DW_TAG_variable:
       if (name && has_location_or_const_value && is_global_or_static_variable) {
         set.globals.Insert(ConstString(name), ref);
 
@@ -2403,7 +2403,7 @@ void SymbolFileDWARF::FindGlobalVariables(
       sc.module_sp = m_objfile_sp->GetModule();
     assert(sc.module_sp);
 
-    if (die.Tag() != DW_TAG_variable)
+    if (die.Tag() != DW_TAG_variable && die.Tag() != DW_TAG_member)
       return true;
 
     auto *dwarf_cu = llvm::dyn_cast<DWARFCompileUnit>(die.GetCU());
@@ -3505,7 +3505,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc,
   ModuleSP module = GetObjectFile()->GetModule();
 
   if (tag != DW_TAG_variable && tag != DW_TAG_constant &&
-      (tag != DW_TAG_formal_parameter || !sc.function))
+      tag != DW_TAG_member && (tag != DW_TAG_formal_parameter || !sc.function))
     return nullptr;
 
   DWARFAttributes attributes = die.GetAttributes();
@@ -3811,7 +3811,7 @@ void SymbolFileDWARF::ParseAndAppendGlobalVariable(
     return;
 
   dw_tag_t tag = die.Tag();
-  if (tag != DW_TAG_variable && tag != DW_TAG_constant)
+  if (tag != DW_TAG_variable && tag != DW_TAG_constant && tag != DW_TAG_member)
     return;
 
   // Check to see if we have already parsed this variable or constant?
 
@@ -120,17 +120,15 @@ def check_global_var(self, name: str, expect_type, expect_val):
         self.assertEqual(varobj.type.name, expect_type)
         self.assertEqual(varobj.value, expect_val)
 
-    @expectedFailureAll(dwarf_version=["<", "5"])
-    # On linux this passes due to the manual index
-    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
-    def test_inline_static_members(self):
-        self.build()
+    def check_inline_static_members(self, flags):
+        self.build(dictionary={"CXXFLAGS_EXTRAS": flags})
         lldbutil.run_to_source_breakpoint(
             self, "// break here", lldb.SBFileSpec("main.cpp")
         )
 
         self.check_global_var("A::int_val", "const int", "1")
         self.check_global_var("A::int_val_with_address", "const int", "2")
+        self.check_global_var("A::inline_int_val", "const int", "3")
         self.check_global_var("A::bool_val", "const bool", "true")
         self.check_global_var("A::enum_val", "Enum", "enum_case2")
         self.check_global_var("A::enum_bool_val", "EnumBool", "enum_bool_case1")
@@ -144,6 +142,16 @@ def test_inline_static_members(self):
             "ClassWithConstexprs::scoped_enum_val", "ScopedEnum", "scoped_enum_case2"
         )
 
+    # On linux this passes due to the manual index
+    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
+    def test_inline_static_members_dwarf5(self):
+        self.check_inline_static_members("-gdwarf-5")
+
+    # On linux this passes due to the manual index
+    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
+    def test_inline_static_members_dwarf4(self):
+        self.check_inline_static_members("-gdwarf-4")
+
     # With older versions of Clang, LLDB fails to evaluate classes with only
     # constexpr members when dsymutil is enabled
     @expectedFailureAll(
@@ -170,15 +178,12 @@ def test_class_with_only_constexpr_static(self):
             "ClassWithEnumAlias::enum_alias_alias", result_value="scoped_enum_case1"
         )
 
-    @expectedFailureAll(dwarf_version=["<", "5"])
-    # On linux this passes due to the manual index
-    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
-    def test_shadowed_static_inline_members(self):
+    def check_shadowed_static_inline_members(self, flags):
         """Tests that the expression evaluator and SBAPI can both
         correctly determine the requested inline static variable
         in the presence of multiple variables of the same name."""
 
-        self.build()
+        self.build(dictionary={"CXXFLAGS_EXTRAS": flags})
         lldbutil.run_to_name_breakpoint(self, "bar")
 
         self.check_global_var("ns::Foo::mem", "const int", "10")
@@ -188,6 +193,16 @@ def test_shadowed_static_inline_members(self):
         self.expect_expr("ns::Foo::mem", result_value="10")
         self.expect_expr("::Foo::mem", result_value="-29")
 
+    # On linux this passes due to the manual index
+    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
+    def test_shadowed_static_inline_members_dwarf5(self):
+        self.check_shadowed_static_inline_members("-gdwarf-5")
+
+    # On linux this passes due to the manual index
+    @expectedFailureDarwin(debug_info=no_match(["dsym"]))
+    def test_shadowed_static_inline_members_dwarf4(self):
+        self.check_shadowed_static_inline_members("-gdwarf-4")
+
     @expectedFailureAll(bugnumber="target var doesn't honour global namespace")
     def test_shadowed_static_inline_members_xfail(self):
         self.build()
 
@@ -29,6 +29,7 @@ enum class ScopedLongLongEnum : long long {
 struct A {
   const static int int_val = 1;
   const static int int_val_with_address = 2;
+  inline const static int inline_int_val = 3;
   const static bool bool_val = true;
 
   const static auto char_max = std::numeric_limits<char>::max();
 
@@ -33,11 +33,22 @@ snippets.
 SUPPORTED PLATFORMS
 -------------------
 
-:program:`llvm-exegesis` currently only supports X86 (64-bit only), ARM (AArch64
-only), MIPS, and PowerPC (PowerPC64LE only) on Linux for benchmarking. Not all
-benchmarking functionality is guaranteed to work on every platform.
-:program:`llvm-exegesis` also has a separate analysis mode that is supported
-on every platform that LLVM is.
+:program:`llvm-exegesis` currently only supports X86 (64-bit only), ARM
+(AArch64 only, snippet generation is sparse), MIPS, and PowerPC (PowerPC64LE
+only) on Linux for benchmarking. Not all benchmarking functionality is
+guaranteed to work on every platform. :program:`llvm-exegesis` also has a
+separate analysis mode that is supported on every platform that LLVM is.
+
+To enable benchmarking in :program:`llvm-exegesis`, LLVM must be configured and
+built with ``LLVM_ENABLE_LIBPFM`` enabled, as :program:`llvm-exegesis` depends
+on libpfm4 for accessing performance counters. Benchmarking may fail if the
+target CPU is unsupported by libpfm. This can be verified by setting the
+``LIBPFM_VERBOSE`` and ``LIBPFM_DEBUG`` environment variables to enable verbose
+or debug mode for libpfm. If libpfm is installed in a non-standard directory,
+LLVM can be configured to locate the necessary library and header files by
+setting the ``LIBRARY_PATH``, ``C_INCLUDE_PATH``, and ``CPLUS_INCLUDE_PATH``
+environment variables. Additionally, ``LD_LIBRARY_PATH`` should be set so that
+:program:`llvm-exegesis` can locate the libpfm library during execution.
 
 SNIPPET ANNOTATIONS
 -------------------
 
@@ -2498,11 +2498,6 @@ For example:
     function with a tail call. The prototype of a thunk should not be used for
     optimization purposes. The caller is expected to cast the thunk prototype to
     match the thunk target prototype.
-
-``"tls-load-hoist"``
-    This attribute indicates that the function will try to reduce redundant
-    tls address calculation by hoisting tls variable.
-
 ``uwtable[(sync|async)]``
     This attribute indicates that the ABI being targeted requires that
     an unwind table entry be produced for this function even if we can
 
@@ -462,6 +462,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
 the result is the shift amount needed to right-shift the found bit into the
 least-significant bit position. 0xffffffff is returned if no 1 bit is found.
 
+TMA family of Intrinsics
+------------------------
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
+in ``tile`` mode. In tile mode, the multi-dimensional layout of the
+source tensor is preserved at the destination. The dimension of the
+tensor data ranges from 1d to 5d with the coordinates specified
+by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last two arguments to these intrinsics are boolean flags
+  indicating support for cache_hint and/or multicast modifiers.
+  These flag arguments must be compile-time constants. The backend
+  looks through these flags and lowers the intrinsics appropriately.
+
+* The Nth argument (denoted by ``i1 %flag_ch``) when set, indicates
+  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+* The [N-1]th argument (denoted by ``i1 %flag_mc``) when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates the PTX
+  instruction with the ``.multicast::cluster`` modifier.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
+in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
+are unrolled into a single dimensional column at the destination. In this
+mode, the tensor has to be at least three-dimensional. Along with the tensor
+coordinates, im2col offsets are also specified (denoted by
+``i16 %im2col0...i16 %im2col2``). The number of im2col offsets is two less
+than the number of dimensions of the tensor operation. The last two arguments
+to these intrinsics are boolean flags, with the same functionality as described
+in the ``tile`` mode intrinsics above.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+shared::cta to global memory (indicated by the ``s2g`` prefix)
+in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d
+with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
+These instructions initiate an asynchronous copy of tensor data from
+shared::cta to global memory (indicated by the ``s2g`` prefix)
+in ``im2col`` mode. In this mode, the tensor has to be at least
+three-dimensional. Unlike the ``g2s`` variants, there are no
+im2col_offsets for these intrinsics. The last argument to these
+intrinsics is a boolean flag, with the same functionality as
+described in the ``s2g.tile`` mode intrinsics above.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
 Other Intrinsics
 ----------------
Original file line number	Diff line number	Diff line change
`@@ -614,7 +614,7 @@ DWARFDebugInfoEntry::GetAbbreviationDeclarationPtr(const DWARFUnit *cu) const {`
`614`	`614`	`}`
`615`	`615`
`616`	`616`	`bool DWARFDebugInfoEntry::IsGlobalOrStaticScopeVariable() const {`
`617`		`- if (Tag() != DW_TAG_variable)`
	`617`	`+ if (Tag() != DW_TAG_variable && Tag() != DW_TAG_member)`
`618`	`618`	`return false;`
`619`	`619`	`const DWARFDebugInfoEntry *parent_die = GetParent();`
`620`	`620`	`while (parent_die != nullptr) {`