Skip to content

Commit ce2e756

Browse files
committed
Merge branch 'upstream' into x86-broadcast_load-helper
2 parents 86e52d6 + 722c7c0 commit ce2e756

File tree

79 files changed

+2053
-1573
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+2053
-1573
lines changed

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -357,17 +357,13 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
357357
int32_t *MaxThreadsVal,
358358
int32_t *MinBlocksVal,
359359
int32_t *MaxClusterRankVal) {
360-
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
361360
llvm::APSInt MaxThreads(32);
362361
MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(getContext());
363362
if (MaxThreads > 0) {
364363
if (MaxThreadsVal)
365364
*MaxThreadsVal = MaxThreads.getExtValue();
366-
if (F) {
367-
// Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
368-
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
369-
MaxThreads.getExtValue());
370-
}
365+
if (F)
366+
F->addFnAttr("nvvm.maxntid", llvm::utostr(MaxThreads.getExtValue()));
371367
}
372368

373369
// min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it

clang/test/CodeGenCUDA/launch-bounds.cu

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,30 @@
1010
#endif
1111

1212
// CHECK: @Kernel1() #[[ATTR0:[0-9]+]]
13+
// CHECK: @Kernel2() #[[ATTR1:[0-9]+]]
14+
// CHECK: @{{.*}}Kernel3{{.*}}() #[[ATTR1]]
1315
// CHECK: @{{.*}}Kernel4{{.*}}() #[[ATTR0]]
14-
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR1:[0-9]+]]
15-
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR0]]
16-
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR3:[0-9]+]]
17-
18-
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
19-
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.minctasm"="258" {{.*}}}
20-
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="12" {{.*}}}
21-
22-
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR4:[0-9]+]]
23-
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR4]]
24-
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR5:[0-9]+]]
25-
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR6:[0-9]+]]
26-
27-
// CHECK_MAX_BLOCKS: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.minctasm"="2" {{.*}}}
28-
// CHECK_MAX_BLOCKS: attributes #[[ATTR5]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.minctasm"="258" {{.*}}}
29-
// CHECK_MAX_BLOCKS: attributes #[[ATTR6]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.minctasm"="12" {{.*}}}
16+
// CHECK: @{{.*}}Kernel5{{.*}}() #[[ATTR2:[0-9]+]]
17+
// CHECK: @{{.*}}Kernel6{{.*}}() #[[ATTR3:[0-9]+]]
18+
// CHECK: @{{.*}}Kernel7{{.*}}() #[[ATTR1]]
19+
// CHECK: @{{.*}}Kernel8{{.*}}() #[[ATTR4:[0-9]+]]
20+
21+
// CHECK-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
22+
// CHECK-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
23+
// CHECK-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
24+
// CHECK-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.minctasm"="2" {{.*}}}
25+
// CHECK-DAG: attributes #[[ATTR4]] = {{{.*}} "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
26+
27+
// CHECK_MAX_BLOCKS: @Kernel1_sm_90() #[[ATTR0:[0-9]+]]
28+
// CHECK_MAX_BLOCKS: @{{.*}}Kernel4_sm_90{{.*}} #[[ATTR0]]
29+
// CHECK_MAX_BLOCKS: @{{.*}}Kernel5_sm_90{{.*}} #[[ATTR1:[0-9]+]]
30+
// CHECK_MAX_BLOCKS: @{{.*}}Kernel7_sm_90{{.*}} #[[ATTR2:[0-9]+]]
31+
// CHECK_MAX_BLOCKS: @{{.*}}Kernel8_sm_90{{.*}} #[[ATTR3:[0-9]+]]
32+
33+
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxclusterrank"="4" "nvvm.maxntid"="256" "nvvm.minctasm"="2" {{.*}}}
34+
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxclusterrank"="260" "nvvm.maxntid"="356" "nvvm.minctasm"="258" {{.*}}}
35+
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="256" {{.*}}}
36+
// CHECK_MAX_BLOCKS-DAG: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxclusterrank"="14" "nvvm.maxntid"="100" "nvvm.minctasm"="12" {{.*}}}
3037

3138
// Test both max threads per block and Min cta per sm.
3239
extern "C" {
@@ -37,8 +44,6 @@ Kernel1()
3744
}
3845
}
3946

40-
// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
41-
4247
#ifdef USE_MAX_BLOCKS
4348
// Test max threads per block and min/max cta per sm.
4449
extern "C" {
@@ -48,8 +53,6 @@ Kernel1_sm_90()
4853
{
4954
}
5055
}
51-
52-
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
5356
#endif // USE_MAX_BLOCKS
5457

5558
// Test only max threads per block. Min cta per sm defaults to 0, and
@@ -62,8 +65,6 @@ Kernel2()
6265
}
6366
}
6467

65-
// CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
66-
6768
template <int max_threads_per_block>
6869
__global__ void
6970
__launch_bounds__(max_threads_per_block)
@@ -72,7 +73,6 @@ Kernel3()
7273
}
7374

7475
template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
75-
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
7676

7777
template <int max_threads_per_block, int min_blocks_per_mp>
7878
__global__ void
@@ -82,7 +82,6 @@ Kernel4()
8282
}
8383
template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
8484

85-
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
8685

8786
#ifdef USE_MAX_BLOCKS
8887
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -93,7 +92,6 @@ Kernel4_sm_90()
9392
}
9493
template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
9594

96-
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
9795
#endif //USE_MAX_BLOCKS
9896

9997
const int constint = 100;
@@ -106,8 +104,6 @@ Kernel5()
106104
}
107105
template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
108106

109-
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
110-
111107
#ifdef USE_MAX_BLOCKS
112108

113109
template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
@@ -120,7 +116,6 @@ Kernel5_sm_90()
120116
}
121117
template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
122118

123-
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
124119
#endif //USE_MAX_BLOCKS
125120

126121
// Make sure we don't emit negative launch bounds values.
@@ -129,33 +124,25 @@ __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
129124
Kernel6()
130125
{
131126
}
132-
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
133127

134128
__global__ void
135129
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
136130
Kernel7()
137131
{
138132
}
139-
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
140-
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
141133

142134
#ifdef USE_MAX_BLOCKS
143135
__global__ void
144136
__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
145137
Kernel7_sm_90()
146138
{
147139
}
148-
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
149-
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
150-
// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
151140
#endif // USE_MAX_BLOCKS
152141

153142
const char constchar = 12;
154143
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
155-
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
156144

157145
#ifdef USE_MAX_BLOCKS
158146
const char constchar_2 = 14;
159147
__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
160-
// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
161148
#endif // USE_MAX_BLOCKS

clang/test/Driver/hip-gz-options.hip

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@
99
// RUN: -ggdb -gz=zlib 2>&1 | FileCheck %s
1010

1111
// CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
12-
// CHECK-DAG: {{".*lld" .* "--compress-debug-sections=zlib"}}
12+
// CHECK-DAG: {{".*lld.*" .* "--compress-debug-sections=zlib"}}
1313
// CHECK-DAG: {{".*clang.*" .* "--compress-debug-sections=zlib"}}
1414
// CHECK: "--compress-debug-sections=zlib"

clang/test/Driver/print-supported-extensions-riscv.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@
204204
// CHECK-NEXT: xqcilo 0.2 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)
205205
// CHECK-NEXT: xqcilsm 0.2 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
206206
// CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension)
207+
// CHECK-NEXT: xrivosvisni 0.1 'XRivosVisni' (Rivos Vector Integer Small New)
207208
// CHECK-NEXT: xrivosvizip 0.1 'XRivosVizip' (Rivos Vector Register Zips)
208209
// CHECK-EMPTY:
209210
// CHECK-NEXT: Supported Profiles

clang/test/OpenMP/ompx_attributes_codegen.cpp

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,13 @@
1111

1212
// Check that the target attributes are set on the generated kernel
1313
void func() {
14-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l18(ptr {{[^,]+}}) #0
15-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l20(ptr {{[^,]+}})
16-
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #4
14+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0
15+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}})
16+
// AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4
17+
18+
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]]
19+
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]]
20+
// NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]]
1721

1822
#pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]])
1923
{}
@@ -34,9 +38,12 @@ void func() {
3438
// AMD-SAME: "omp_target_thread_limit"="17"
3539

3640
// It is unclear if we should use the AMD annotations for other targets, we do for now.
37-
// NVIDIA: "omp_target_thread_limit"="20"
38-
// NVIDIA: "omp_target_thread_limit"="45"
39-
// NVIDIA: "omp_target_thread_limit"="17"
40-
// NVIDIA: !{ptr @__omp_offloading[[HASH1:.*]]_l18, !"maxntidx", i32 20}
41-
// NVIDIA: !{ptr @__omp_offloading[[HASH2:.*]]_l20, !"maxntidx", i32 45}
42-
// NVIDIA: !{ptr @__omp_offloading[[HASH3:.*]]_l22, !"maxntidx", i32 17}
41+
// NVIDIA: attributes #[[ATTR0]]
42+
// NVIDIA-SAME: "nvvm.maxntid"="20"
43+
// NVIDIA-SAME: "omp_target_thread_limit"="20"
44+
// NVIDIA: attributes #[[ATTR1]]
45+
// NVIDIA-SAME: "nvvm.maxntid"="45"
46+
// NVIDIA-SAME: "omp_target_thread_limit"="45"
47+
// NVIDIA: attributes #[[ATTR2]]
48+
// NVIDIA-SAME: "nvvm.maxntid"="17"
49+
// NVIDIA-SAME: "omp_target_thread_limit"="17"

clang/test/OpenMP/thread_limit_nvptx.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,29 @@
77
#define HEADER
88

99
void foo(int N) {
10-
// CHECK: l11, !"maxntidx", i32 128}
10+
// CHECK: define {{.*}}l11{{.*}} #[[ATTR0:[0-9]+]]
1111
#pragma omp target teams distribute parallel for simd
1212
for (int i = 0; i < N; ++i)
1313
;
14-
// CHECK: l15, !"maxntidx", i32 4}
14+
// CHECK: define {{.*}}l15{{.*}} #[[ATTR1:[0-9]+]]
1515
#pragma omp target teams distribute parallel for simd thread_limit(4)
1616
for (int i = 0; i < N; ++i)
1717
;
18-
// CHECK-NOT: l21, !"maxntidx", i32 128}
19-
// CHECK: l21, !"maxntidx", i32 42}
20-
// CHECK-NOT: l21, !"maxntidx", i32 128}
18+
19+
// CHECK: define {{.*}}l20{{.*}} #[[ATTR2:[0-9]+]]
2120
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42))))
2221
for (int i = 0; i < N; ++i)
2322
;
24-
// CHECK-NOT: l27, !"maxntidx", i32 42}
25-
// CHECK: l27, !"maxntidx", i32 22}
26-
// CHECK-NOT: l27, !"maxntidx", i32 42}
23+
24+
// CHECK: define {{.*}}l25{{.*}} #[[ATTR3:[0-9]+]]
2725
#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22)
2826
for (int i = 0; i < N; ++i)
2927
;
3028
}
3129

3230
#endif
3331

32+
// CHECK: attributes #[[ATTR0]] = {{{.*}} "nvvm.maxntid"="128" {{.*}}}
33+
// CHECK: attributes #[[ATTR1]] = {{{.*}} "nvvm.maxntid"="4" {{.*}}}
34+
// CHECK: attributes #[[ATTR2]] = {{{.*}} "nvvm.maxntid"="42" {{.*}}}
35+
// CHECK: attributes #[[ATTR3]] = {{{.*}} "nvvm.maxntid"="22" {{.*}}}

flang/lib/Semantics/mod-file.cpp

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -836,18 +836,6 @@ void ModFileWriter::PutUseExtraAttr(
836836
}
837837
}
838838

839-
static inline SourceName NameInModuleFile(const Symbol &symbol) {
840-
if (const auto *use{symbol.detailsIf<UseDetails>()}) {
841-
if (use->symbol().attrs().test(Attr::PRIVATE)) {
842-
// Avoid the use in sorting of names created to access private
843-
// specific procedures as a result of generic resolution;
844-
// they're not in the cooked source.
845-
return use->symbol().name();
846-
}
847-
}
848-
return symbol.name();
849-
}
850-
851839
// Collect the symbols of this scope sorted by their original order, not name.
852840
// Generics and namelists are exceptions: they are sorted after other symbols.
853841
void CollectSymbols(const Scope &scope, SymbolVector &sorted,
@@ -882,13 +870,8 @@ void CollectSymbols(const Scope &scope, SymbolVector &sorted,
882870
sorted.push_back(symbol);
883871
}
884872
}
885-
// Sort most symbols by name: use of Symbol::ReplaceName ensures the source
886-
// location of a symbol's name is the first "real" use.
887-
auto sorter{[](SymbolRef x, SymbolRef y) {
888-
return NameInModuleFile(*x).begin() < NameInModuleFile(*y).begin();
889-
}};
890-
std::sort(sorted.begin(), sorted.end(), sorter);
891-
std::sort(generics.begin(), generics.end(), sorter);
873+
std::sort(sorted.begin(), sorted.end(), SymbolSourcePositionCompare{});
874+
std::sort(generics.begin(), generics.end(), SymbolSourcePositionCompare{});
892875
sorted.insert(sorted.end(), generics.begin(), generics.end());
893876
sorted.insert(sorted.end(), namelist.begin(), namelist.end());
894877
for (const auto &pair : scope.commonBlocks()) {
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
module foo
2+
interface do_foo
3+
procedure do_foo_impl
4+
end interface
5+
interface do_bar
6+
procedure do_bar_impl
7+
end interface
8+
contains
9+
subroutine do_foo_impl()
10+
end
11+
subroutine do_bar_impl()
12+
end
13+
end

flang/test/Semantics/modfile72.f90

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
! This test verifies that both invocations produce a consistent order in the
2+
! generated `.mod` file. Previous versions of Flang exhibited non-deterministic
3+
! behavior due to pointers outside the cooked source being used to order symbols
4+
! in the `.mod` file.
5+
6+
! RUN: rm -rf %t && mkdir -p %t
7+
! RUN: %flang_fc1 -fsyntax-only -J%t %S/Inputs/modfile72.f90
8+
! RUN: %flang_fc1 -fsyntax-only -J%t %s
9+
! RUN: cat %t/bar.mod | FileCheck %s
10+
11+
! RUN: rm -rf %t && mkdir -p %t
12+
! RUN: %flang_fc1 -fsyntax-only -J%t %S/Inputs/modfile72.f90 %s
13+
! RUN: cat %t/bar.mod | FileCheck %s
14+
15+
module bar
16+
use foo, only : do_foo
17+
use foo, only : do_bar
18+
contains
19+
subroutine do_baz()
20+
call do_foo()
21+
call do_bar()
22+
end
23+
end
24+
25+
! CHECK: use foo,only:do_foo
26+
! CHECK-NEXT: use foo,only:do_bar
27+
! CHECK-NEXT: use foo,only:foo$foo$do_bar_impl=>do_bar_impl
28+
! CHECK-NEXT: use foo,only:foo$foo$do_foo_impl=>do_foo_impl

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ Improvements and New Features
4747
- The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for
4848
``std::vector<bool>::iterator``, resulting in a performance improvement of up to 2000x.
4949

50+
- The ``std::ranges::equal`` algorithm has been optimized for ``std::vector<bool>::iterator``, resulting in a performance
51+
improvement of up to 188x.
52+
5053
- Updated formatting library to Unicode 16.0.0.
5154

5255
Deprecations and Removals

0 commit comments

Comments
 (0)