ROCm
diff --git a/‎clang/lib/Sema/SemaLookup.cpp‎
Lines changed: 18 additions & 5 deletions b/‎clang/lib/Sema/SemaLookup.cpp‎
Lines changed: 18 additions & 5 deletions
diff --git a/‎clang/lib/Sema/SemaModule.cpp‎
Lines changed: 7 additions & 6 deletions b/‎clang/lib/Sema/SemaModule.cpp‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎clang/test/Modules/pr143788.cppm‎
Lines changed: 28 additions & 0 deletions b/‎clang/test/Modules/pr143788.cppm‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎compiler-rt/test/lit.common.configured.in‎
Lines changed: 0 additions & 1 deletion b/‎compiler-rt/test/lit.common.configured.in‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎llvm/docs/NVPTXUsage.rst‎
Lines changed: 20 additions & 12 deletions b/‎llvm/docs/NVPTXUsage.rst‎
Lines changed: 20 additions & 12 deletions
diff --git a/‎llvm/include/llvm/Analysis/VectorUtils.h‎
Lines changed: 6 additions & 0 deletions b/‎llvm/include/llvm/Analysis/VectorUtils.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎llvm/include/llvm/IR/BasicBlock.h‎
Lines changed: 0 additions & 9 deletions b/‎llvm/include/llvm/IR/BasicBlock.h‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎llvm/include/llvm/IR/Function.h‎
Lines changed: 0 additions & 9 deletions b/‎llvm/include/llvm/IR/Function.h‎
Lines changed: 0 additions & 9 deletions
diff --git a/‎llvm/include/llvm/IR/IntrinsicsNVVM.td‎
Lines changed: 19 additions & 13 deletions b/‎llvm/include/llvm/IR/IntrinsicsNVVM.td‎
Lines changed: 19 additions & 13 deletions
diff --git a/‎llvm/include/llvm/IR/Module.h‎
Lines changed: 0 additions & 20 deletions b/‎llvm/include/llvm/IR/Module.h‎
Lines changed: 0 additions & 20 deletions
@@ -1978,6 +1978,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   if (D->isModulePrivate())
     return false;
 
+  Module *DeclTopModule = DeclModule->getTopLevelModule();
+
   // [module.reach]/p1
   //   A translation unit U is necessarily reachable from a point P if U is a
   //   module interface unit on which the translation unit containing P has an
@@ -1996,17 +1998,28 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   //
   // Here we only check for the first condition. Since we couldn't see
   // DeclModule if it isn't (transitively) imported.
-  if (DeclModule->getTopLevelModule()->isModuleInterfaceUnit())
+  if (DeclTopModule->isModuleInterfaceUnit())
     return true;
 
-  // [module.reach]/p2
+  // [module.reach]/p1,2
+  //   A translation unit U is necessarily reachable from a point P if U is a
+  //   module interface unit on which the translation unit containing P has an
+  //   interface dependency, or the translation unit containing P imports U, in
+  //   either case prior to P
+  //
   //   Additional translation units on
   //   which the point within the program has an interface dependency may be
   //   considered reachable, but it is unspecified which are and under what
   //   circumstances.
-  //
-  // The decision here is to treat all additional tranditional units as
-  // unreachable.
+  Module *CurrentM = SemaRef.getCurrentModule();
+
+  // Directly imported modules are necessarily reachable.
+  // Since a module implementation partition unit cannot be export-imported,
+  // we don't need to account for Exports here.
+  if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule))
+    return true;
+
+  // Then we treat all module implementation partition units as unreachable.
   return false;
 }
 
 
@@ -712,19 +712,20 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
       Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) {
     Diag(ExportLoc, diag::err_export_partition_impl)
         << SourceRange(ExportLoc, Path.back().getLoc());
-  } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) {
+  } else if (ExportLoc.isValid() &&
+             (ModuleScopes.empty() || currentModuleIsImplementation())) {
+    // [module.interface]p1:
+    // An export-declaration shall inhabit a namespace scope and appear in the
+    // purview of a module interface unit.
+    Diag(ExportLoc, diag::err_export_not_in_module_interface);
+  } else if (!ModuleScopes.empty()) {
     // Re-export the module if the imported module is exported.
     // Note that we don't need to add re-exported module to Imports field
     // since `Exports` implies the module is imported already.
     if (ExportLoc.isValid() || getEnclosingExportDecl(Import))
       getCurrentModule()->Exports.emplace_back(Mod, false);
     else
       getCurrentModule()->Imports.insert(Mod);
-  } else if (ExportLoc.isValid()) {
-    // [module.interface]p1:
-    // An export-declaration shall inhabit a namespace scope and appear in the
-    // purview of a module interface unit.
-    Diag(ExportLoc, diag::err_export_not_in_module_interface);
   }
 
   return Import;
 
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/P.pcm
+// RUN: %clang_cc1 -std=c++20 %t/I.cpp -fmodule-file=M:P=%t/P.pcm -fmodule-file=M=%t/M.pcm -fsyntax-only -verify
+
+//--- H.hpp
+struct S{};
+
+//--- M.cppm
+export module M;
+
+
+//--- P.cppm
+module;
+#include "H.hpp"
+module M:P;
+
+using T = S;
+
+//--- I.cpp
+// expected-no-diagnostics
+module M;
+import :P;
+
+T f() { return {}; }
@@ -25,7 +25,6 @@ set_default("gold_executable", "@GOLD_EXECUTABLE@")
 set_default("clang", "@COMPILER_RT_RESOLVED_TEST_COMPILER@")
 set_default("compiler_id", "@COMPILER_RT_TEST_COMPILER_ID@")
 set_default("python_executable", "@Python3_EXECUTABLE@")
-set_default("python_root_dir", "@Python3_ROOT_DIR@")
 set_default("compiler_rt_debug", @COMPILER_RT_DEBUG_PYBOOL@)
 set_default("compiler_rt_intercept_libdispatch", @COMPILER_RT_INTERCEPT_LIBDISPATCH_PYBOOL@)
 set_default("compiler_rt_output_dir", "@COMPILER_RT_RESOLVED_OUTPUT_DIR@")
 
@@ -1016,7 +1016,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
@@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. The dimension of the
 tensor data ranges from 1d to 5d with the coordinates specified
 by the ``i32 %d0 ... i32 %d4`` arguments.
 
-* The last two arguments to these intrinsics are boolean flags
-  indicating support for cache_hint and/or multicast modifiers.
-  These flag arguments must be compile-time constants. The backend
-  looks through these flags and lowers the intrinsics appropriately.
+* The last three arguments to these intrinsics are flags
+  indicating support for multicast, cache_hint and cta_group::1/2
+  modifiers. These flag arguments must be compile-time constants.
+  The backend looks through these flags and lowers the intrinsics
+  appropriately.
 
-* The Nth argument (denoted by ``i1 flag_ch``) when set, indicates
+* The argument denoted by ``i1 %flag_ch``, when set, indicates
   a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
   variant of the PTX instruction.
 
-* The [N-1]th argument (denoted by ``i1 flag_mc``) when set, indicates
-  the presence of a multicast mask (``i16 %mc``) and generates the PTX
-  instruction with the ``.multicast::cluster`` modifier.
+* The argument denoted by ``i1 %flag_mc``, when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates
+  the PTX instruction with the ``.multicast::cluster`` modifier.
+
+* The argument denoted by ``i32 %flag_cta_group`` takes values within
+  the range [0, 3), i.e. {0, 1, 2}. A value of ``%flag_cta_group``
+  outside this range may raise an error from the Verifier.
+  The default value '0' emits no cta_group modifier in the
+  instruction. The values '1' and '2' lower to the ``cta_group::1``
+  and ``cta_group::2`` variants of the PTX instruction, respectively.
 
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
@@ -1058,7 +1066,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
@@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this
 mode, the tensor has to be at least three-dimensional. Along with the tensor
 coordinates, im2col offsets are also specified (denoted by
 ``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last two arguments
-to these intrinsics are boolean flags, with the same functionality as described
+than the number of dimensions of the tensor operation. The last three arguments
+to these intrinsics are flags, with the same functionality as described
 in the ``tile`` mode intrinsics above.
 
 For more information, refer PTX ISA
 
@@ -176,6 +176,12 @@ LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(
 LLVM_ABI Intrinsic::ID
 getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI);
 
+/// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
+
+/// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
 
@@ -63,9 +63,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
 public:
   using InstListType = SymbolTableList<Instruction, ilist_iterator_bits<true>,
                                        ilist_parent<BasicBlock>>;
-  /// Flag recording whether or not this block stores debug-info in the form
-  /// of intrinsic instructions (false) or non-instruction records (true).
-  bool IsNewDbgInfoFormat;
 
 private:
   // Allow Function to renumber blocks.
@@ -95,12 +92,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   /// IsNewDbgInfoFormat = false.
   LLVM_ABI void convertFromNewDbgValues();
 
-  /// Ensure the block is in "old" dbg.value format (\p NewFlag == false) or
-  /// in the new format (\p NewFlag == true), converting to the desired format
-  /// if necessary.
-  LLVM_ABI void setIsNewDbgInfoFormat(bool NewFlag);
-  LLVM_ABI void setNewDbgInfoFormatFlag(bool NewFlag);
-
   unsigned getNumber() const {
     assert(getParent() && "only basic blocks in functions have valid numbers");
     return Number;
 
@@ -111,11 +111,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   friend class SymbolTableListTraits<Function>;
 
 public:
-  /// Is this function using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// hasLazyArguments/CheckLazyArguments - The argument list of a function is
   /// built on demand, so that the list isn't allocated until the first client
   /// needs it.  The hasLazyArguments predicate returns true if the arg list
@@ -130,9 +125,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   /// \see BasicBlock::convertFromNewDbgValues.
   void convertFromNewDbgValues();
 
-  void setIsNewDbgInfoFormat(bool NewVal);
-  void setNewDbgInfoFormatFlag(bool NewVal);
-
 private:
   friend class TargetLibraryInfoImpl;
 
@@ -760,7 +752,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   /// to the newly inserted BB.
   Function::iterator insert(Function::iterator Position, BasicBlock *BB) {
     Function::iterator FIt = BasicBlocks.insert(Position, BB);
-    BB->setIsNewDbgInfoFormat(IsNewDbgInfoFormat);
     return FIt;
   }
 
 
@@ -2020,20 +2020,26 @@ foreach dim = 1...5 in {
     defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0);
     defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets);
 
+    defvar g2s_params = !listconcat(
+                          [llvm_shared_cluster_ptr_ty, // dst_ptr
+                           llvm_shared_ptr_ty,  // mbarrier_ptr
+                           llvm_ptr_ty],        // tensormap_ptr
+                          tensor_dim_args,      // actual tensor dims
+                          im2col_offsets_args,  // im2col offsets
+                          [llvm_i16_ty,         // cta_mask
+                           llvm_i64_ty]);       // cache_hint
+    defvar g2s_flags = [llvm_i1_ty,             // Flag for cta_mask
+                        llvm_i1_ty,             // Flag for cache_hint
+                        llvm_i32_ty];           // Flag for cta_group
+    defvar cta_group_idx = !add(
+                             !size(g2s_params),
+                             !sub(!size(g2s_flags), 1));
+    defvar g2s_props = [IntrConvergent,
+                        WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+                        // Allowed values for cta_group are {0,1,2}, i.e. [0, 3).
+                        Range<ArgIndex<cta_group_idx>, 0, 3>];
     def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d :
-      DefaultAttrsIntrinsicFlags<[],
-          !listconcat([llvm_shared_cluster_ptr_ty,  // dst_shared_cluster_ptr
-                       llvm_shared_ptr_ty,          // mbarrier_smem_ptr
-                       llvm_ptr_ty],                // tensormap_ptr
-                      tensor_dim_args,              // actual tensor dims
-                      im2col_offsets_args,          // im2col offsets
-                      [llvm_i16_ty,                 // cta_mask
-                       llvm_i64_ty]),               // cache_hint
-          [llvm_i1_ty,                              // Flag for cta_mask
-           llvm_i1_ty],                             // Flag for cache_hint
-          [IntrConvergent,
-           WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>]>;
+      DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>;
 
     def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
       DefaultAttrsIntrinsicFlags<[],
 
@@ -215,11 +215,6 @@ class LLVM_ABI Module {
 /// @name Constructors
 /// @{
 public:
-  /// Is this Module using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// Used when printing this module in the new debug info format; removes all
   /// declarations of debug intrinsics that are replaced by non-intrinsic
   /// records in the new format.
@@ -230,28 +225,13 @@ class LLVM_ABI Module {
     for (auto &F : *this) {
       F.convertToNewDbgValues();
     }
-    IsNewDbgInfoFormat = true;
   }
 
   /// \see BasicBlock::convertFromNewDbgValues.
   void convertFromNewDbgValues() {
     for (auto &F : *this) {
       F.convertFromNewDbgValues();
     }
-    IsNewDbgInfoFormat = false;
-  }
-
-  void setIsNewDbgInfoFormat(bool UseNewFormat) {
-    if (UseNewFormat && !IsNewDbgInfoFormat)
-      convertToNewDbgValues();
-    else if (!UseNewFormat && IsNewDbgInfoFormat)
-      convertFromNewDbgValues();
-  }
-  void setNewDbgInfoFormatFlag(bool NewFlag) {
-    for (auto &F : *this) {
-      F.setNewDbgInfoFormatFlag(NewFlag);
-    }
-    IsNewDbgInfoFormat = NewFlag;
   }
 
   /// The Module constructor. Note that there is no default constructor. You