llvm
diff --git a/‎clang/lib/CodeGen/Targets/NVPTX.cpp‎
Lines changed: 2 additions & 6 deletions b/‎clang/lib/CodeGen/Targets/NVPTX.cpp‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎clang/test/CodeGenCUDA/launch-bounds.cu‎
Lines changed: 23 additions & 36 deletions b/‎clang/test/CodeGenCUDA/launch-bounds.cu‎
Lines changed: 23 additions & 36 deletions
diff --git a/‎clang/test/Driver/hip-gz-options.hip‎
Lines changed: 1 addition & 1 deletion b/‎clang/test/Driver/hip-gz-options.hip‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎clang/test/Driver/print-supported-extensions-riscv.c‎
Lines changed: 1 addition & 0 deletions b/‎clang/test/Driver/print-supported-extensions-riscv.c‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎clang/test/OpenMP/ompx_attributes_codegen.cpp‎
Lines changed: 16 additions & 9 deletions b/‎clang/test/OpenMP/ompx_attributes_codegen.cpp‎
Lines changed: 16 additions & 9 deletions
diff --git a/‎clang/test/OpenMP/thread_limit_nvptx.c‎
Lines changed: 10 additions & 8 deletions b/‎clang/test/OpenMP/thread_limit_nvptx.c‎
Lines changed: 10 additions & 8 deletions
diff --git a/‎flang/lib/Semantics/mod-file.cpp‎
Lines changed: 2 additions & 19 deletions b/‎flang/lib/Semantics/mod-file.cpp‎
Lines changed: 2 additions & 19 deletions
diff --git a/‎flang/test/Semantics/Inputs/modfile72.f90‎
Lines changed: 13 additions & 0 deletions b/‎flang/test/Semantics/Inputs/modfile72.f90‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎flang/test/Semantics/modfile72.f90‎
Lines changed: 28 additions & 0 deletions b/‎flang/test/Semantics/modfile72.f90‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎libcxx/docs/ReleaseNotes/21.rst‎
Lines changed: 3 additions & 0 deletions b/‎libcxx/docs/ReleaseNotes/21.rst‎
Lines changed: 3 additions & 0 deletions
@@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
                                                int32_t *MaxThreadsVal,
                                                int32_t *MinBlocksVal,
                                                int32_t *MaxClusterRankVal) {
-  // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
   llvm::APSInt MaxThreads(32);
   MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
   if (MaxThreads > 0) {
     if (MaxThreadsVal)
       *MaxThreadsVal = MaxThreads.getExtValue();
-    if (F) {
-      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
-      NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
-                                              MaxThreads.getExtValue());
-    }
+    if (F)
+      F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
   }
 
   // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
 
@@ -10,23 +10,30 @@
 #endif
 
 // CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
+// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
+// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
 // CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
-// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
-// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
-
-// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
-// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
-// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
-
-// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
-// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
-
-// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
-// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
+// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
+// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
+// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
+// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
+
+// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
+// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
+
+// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
+// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
+
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
+// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
 
 // Test both max threads per block and Min cta per sm.
 extern "C" {
@@ -37,8 +44,6 @@ Kernel1()
 }
 }
 
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
-
 #ifdef USE_MAX_BLOCKS
 // Test max threads per block and min/max cta per sm.
 extern "C" {
@@ -48,8 +53,6 @@ Kernel1_sm_90()
 {
 }
 }
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
 #endif // USE_MAX_BLOCKS
 
 // Test only max threads per block. Min cta per sm defaults to 0, and
@@ -62,8 +65,6 @@ Kernel2()
 }
 }
 
-// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
-
 template <int max_threads_per_block>
 __global__ void
 __launch_bounds__(max_threads_per_block)
@@ -72,7 +73,6 @@ Kernel3()
 }
 
 template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
 
 template <int max_threads_per_block, int min_blocks_per_mp>
 __global__ void
@@ -82,7 +82,6 @@ Kernel4()
 }
 template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
 
 #ifdef USE_MAX_BLOCKS
 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -93,7 +92,6 @@ Kernel4_sm_90()
 }
 template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
 #endif //USE_MAX_BLOCKS
 
 const int constint = 100;
@@ -106,8 +104,6 @@ Kernel5()
 }
 template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 
-// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
-
 #ifdef USE_MAX_BLOCKS
 
 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -120,7 +116,6 @@ Kernel5_sm_90()
 }
 template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
 #endif //USE_MAX_BLOCKS
 
 // Make sure we don't emit negative launch bounds values.
@@ -129,33 +124,25 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
 Kernel6()
 {
 }
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
 
 __global__ void
 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
 Kernel7()
 {
 }
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
-// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
 
 #ifdef USE_MAX_BLOCKS
 __global__ void
 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
 Kernel7_sm_90()
 {
 }
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
 #endif // USE_MAX_BLOCKS
 
 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
-// CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
 
 #ifdef USE_MAX_BLOCKS
 const char constchar_2 = 14;
 __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
 #endif // USE_MAX_BLOCKS
@@ -9,6 +9,6 @@
 // RUN:   -ggdb -gz=zlib 2>&1 | FileCheck %s
 
 // CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
-// CHECK-DAG: {{".*lld" .* "--compress-debug-sections=zlib"}}
+// CHECK-DAG: {{".*lld.*" .* "--compress-debug-sections=zlib"}}
 // CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
 // CHECK: "--compress-debug-sections=zlib"
@@ -204,6 +204,7 @@
 // CHECK-NEXT:     xqcilo               0.2       'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)
 // CHECK-NEXT:     xqcilsm              0.2       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
 // CHECK-NEXT:     xqcisls              0.2       'Xqcisls' (Qualcomm uC Scaled Load Store Extension)
+// CHECK-NEXT:     xrivosvisni          0.1       'XRivosVisni' (Rivos Vector Integer Small New)
 // CHECK-NEXT:     xrivosvizip          0.1       'XRivosVizip' (Rivos Vector Register Zips)
 // CHECK-EMPTY:
 // CHECK-NEXT: Supported Profiles
 
@@ -11,9 +11,13 @@
 
 // Check that the target attributes are set on the generated kernel
 void func() {
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
-  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
+  // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
+
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
+  // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
 
   #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
   {}
@@ -34,9 +38,12 @@ void func() {
 // AMD-SAME: "omp_target_thread_limit"="17"
 
 // It is unclear if we should use the AMD annotations for other targets, we do for now.
-// NVIDIA: "omp_target_thread_limit"="20"
-// NVIDIA: "omp_target_thread_limit"="45"
-// NVIDIA: "omp_target_thread_limit"="17"
-// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
-// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
-// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
+// NVIDIA: attributes #[[ATTR0]]
+// NVIDIA-SAME: "nvvm.maxntid"="20"
+// NVIDIA-SAME: "omp_target_thread_limit"="20"
+// NVIDIA: attributes #[[ATTR1]]
+// NVIDIA-SAME: "nvvm.maxntid"="45"
+// NVIDIA-SAME: "omp_target_thread_limit"="45"
+// NVIDIA: attributes #[[ATTR2]]
+// NVIDIA-SAME: "nvvm.maxntid"="17"
+// NVIDIA-SAME: "omp_target_thread_limit"="17"
@@ -7,27 +7,29 @@
 #define HEADER
 
 void foo(int N) {
-// CHECK: l11, !"maxntidx", i32 128}
+// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
 #pragma omp target teams distribute parallel for simd
   for (int i = 0; i < N; ++i)
     ;
-// CHECK: l15, !"maxntidx", i32 4}
+// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
 #pragma omp target teams distribute parallel for simd thread_limit(4)
   for (int i = 0; i < N; ++i)
     ;
-// CHECK-NOT: l21, !"maxntidx", i32 128}
-// CHECK: l21, !"maxntidx", i32 42}
-// CHECK-NOT: l21, !"maxntidx", i32 128}
+
+// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
 #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
   for (int i = 0; i < N; ++i)
     ;
-// CHECK-NOT: l27, !"maxntidx", i32 42}
-// CHECK: l27, !"maxntidx", i32 22}
-// CHECK-NOT: l27, !"maxntidx", i32 42}
+
+// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
 #pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
   for (int i = 0; i < N; ++i)
     ;
 }
 
 #endif
 
+// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
+// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
+// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
+// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}
@@ -836,18 +836,6 @@ void ModFileWriter::PutUseExtraAttr(
   }
 }
 
-static inline SourceName NameInModuleFile(const Symbol &symbol) {
-  if (const auto *use{symbol.detailsIf<UseDetails>()}) {
-    if (use->symbol().attrs().test(Attr::PRIVATE)) {
-      // Avoid the use in sorting of names created to access private
-      // specific procedures as a result of generic resolution;
-      // they're not in the cooked source.
-      return use->symbol().name();
-    }
-  }
-  return symbol.name();
-}
-
 // Collect the symbols of this scope sorted by their original order, not name.
 // Generics and namelists are exceptions: they are sorted after other symbols.
 void CollectSymbols(const Scope &scope, SymbolVector &sorted,
@@ -882,13 +870,8 @@ void CollectSymbols(const Scope &scope, SymbolVector &sorted,
       sorted.push_back(symbol);
     }
   }
-  // Sort most symbols by name: use of Symbol::ReplaceName ensures the source
-  // location of a symbol's name is the first "real" use.
-  auto sorter{[](SymbolRef x, SymbolRef y) {
-    return NameInModuleFile(*x).begin() < NameInModuleFile(*y).begin();
-  }};
-  std::sort(sorted.begin(), sorted.end(), sorter);
-  std::sort(generics.begin(), generics.end(), sorter);
+  std::sort(sorted.begin(), sorted.end(), SymbolSourcePositionCompare{});
+  std::sort(generics.begin(), generics.end(), SymbolSourcePositionCompare{});
   sorted.insert(sorted.end(), generics.begin(), generics.end());
   sorted.insert(sorted.end(), namelist.begin(), namelist.end());
   for (const auto &pair : scope.commonBlocks()) {
 
@@ -0,0 +1,13 @@
+module foo
+  interface do_foo
+    procedure do_foo_impl
+  end interface
+  interface do_bar
+    procedure do_bar_impl
+  end interface
+contains
+  subroutine do_foo_impl()
+  end
+  subroutine do_bar_impl()
+  end
+end
@@ -0,0 +1,28 @@
+! This test verifies that both invocations produce a consistent order in the
+! generated `.mod` file. Previous versions of Flang exhibited non-deterministic
+! behavior due to pointers outside the cooked source being used to order symbols
+! in the `.mod` file.
+
+! RUN: rm -rf %t && mkdir -p %t
+! RUN: %flang_fc1 -fsyntax-only -J%t %S/Inputs/modfile72.f90
+! RUN: %flang_fc1 -fsyntax-only -J%t %s
+! RUN: cat %t/bar.mod | FileCheck %s
+
+! RUN: rm -rf %t && mkdir -p %t
+! RUN: %flang_fc1 -fsyntax-only -J%t %S/Inputs/modfile72.f90 %s
+! RUN: cat %t/bar.mod | FileCheck %s
+
+module bar
+  use foo, only : do_foo
+  use foo, only : do_bar
+contains
+  subroutine do_baz()
+    call do_foo()
+    call do_bar()
+  end
+end
+
+!      CHECK: use foo,only:do_foo
+! CHECK-NEXT: use foo,only:do_bar
+! CHECK-NEXT: use foo,only:foo$foo$do_bar_impl=>do_bar_impl
+! CHECK-NEXT: use foo,only:foo$foo$do_foo_impl=>do_foo_impl
@@ -47,6 +47,9 @@ Improvements and New Features
 - The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for
   ``std::vector<bool>::iterator``, resulting in a performance improvement of up to 2000x.
 
+- The ``std::ranges::equal`` algorithm has been optimized for ``std::vector<bool>::iterator``, resulting in a performance
+  improvement of up to 188x.
+
 - Updated formatting library to Unicode 16.0.0.
 
 Deprecations and Removals