Skip to content

Commit ba6c662

Browse files
Merge branch 'main' into mcmodel_crash
2 parents 28fcb0e + b95a6c7 commit ba6c662

File tree

13 files changed

+515
-257
lines changed

13 files changed

+515
-257
lines changed

llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -967,21 +967,20 @@ void DWARFVerifier::verifyDebugLineStmtOffsets() {
967967
// here because we validate this in the .debug_info verifier.
968968
continue;
969969
}
970-
auto Iter = StmtListToDie.find(LineTableOffset);
971-
if (Iter != StmtListToDie.end()) {
970+
auto [Iter, Inserted] = StmtListToDie.try_emplace(LineTableOffset, Die);
971+
if (!Inserted) {
972972
++NumDebugLineErrors;
973+
const auto &OldDie = Iter->second;
973974
ErrorCategory.Report("Identical DW_AT_stmt_list section offset", [&]() {
974975
error() << "two compile unit DIEs, "
975-
<< format("0x%08" PRIx64, Iter->second.getOffset()) << " and "
976+
<< format("0x%08" PRIx64, OldDie.getOffset()) << " and "
976977
<< format("0x%08" PRIx64, Die.getOffset())
977978
<< ", have the same DW_AT_stmt_list section offset:\n";
978-
dump(Iter->second);
979+
dump(OldDie);
979980
dump(Die) << '\n';
980981
});
981982
// Already verified this line table before, no need to do it again.
982-
continue;
983983
}
984-
StmtListToDie[LineTableOffset] = Die;
985984
}
986985
}
987986

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 18 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
416416
return 1024;
417417
}
418418

419-
// FIXME: Should we use narrower types for local/region, or account for when
420-
// unaligned access is legal?
421419
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
422420
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
423421
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
426424
if (AtomicElementSize)
427425
return Type::getIntNTy(Context, *AtomicElementSize * 8);
428426

429-
Align MinAlign = std::min(SrcAlign, DestAlign);
430-
431-
// A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
432-
// hardware into byte accesses. If you assume all alignments are equally
433-
// probable, it's more efficient on average to use short accesses for this
434-
// case.
435-
if (MinAlign == Align(2))
436-
return Type::getInt16Ty(Context);
437-
438-
// Not all subtargets have 128-bit DS instructions, and we currently don't
439-
// form them by default.
440-
if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
441-
SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
442-
DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
443-
DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
444-
return FixedVectorType::get(Type::getInt32Ty(Context), 2);
445-
}
446-
447-
// Global memory works best with 16-byte accesses.
427+
// 16-byte accesses achieve the highest copy throughput.
448428
// If the operation has a fixed known length that is large enough, it is
449429
// worthwhile to return an even wider type and let legalization lower it into
450-
// multiple accesses, effectively unrolling the memcpy loop. Private memory
451-
// also hits this, although accesses may be decomposed.
430+
// multiple accesses, effectively unrolling the memcpy loop.
431+
// We also rely on legalization to decompose into smaller accesses for
432+
// subtargets and address spaces where it is necessary.
452433
//
453434
// Don't unroll if Length is not a constant, since unrolling leads to worse
454435
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
473454
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
474455
DestAlign, AtomicCpySize);
475456

476-
Align MinAlign = std::min(SrcAlign, DestAlign);
477-
478-
if (MinAlign != Align(2)) {
479-
Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
480-
while (RemainingBytes >= 16) {
481-
OpsOut.push_back(I32x4Ty);
482-
RemainingBytes -= 16;
483-
}
457+
Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
458+
while (RemainingBytes >= 16) {
459+
OpsOut.push_back(I32x4Ty);
460+
RemainingBytes -= 16;
461+
}
484462

485-
Type *I64Ty = Type::getInt64Ty(Context);
486-
while (RemainingBytes >= 8) {
487-
OpsOut.push_back(I64Ty);
488-
RemainingBytes -= 8;
489-
}
463+
Type *I64Ty = Type::getInt64Ty(Context);
464+
while (RemainingBytes >= 8) {
465+
OpsOut.push_back(I64Ty);
466+
RemainingBytes -= 8;
467+
}
490468

491-
Type *I32Ty = Type::getInt32Ty(Context);
492-
while (RemainingBytes >= 4) {
493-
OpsOut.push_back(I32Ty);
494-
RemainingBytes -= 4;
495-
}
469+
Type *I32Ty = Type::getInt32Ty(Context);
470+
while (RemainingBytes >= 4) {
471+
OpsOut.push_back(I32Ty);
472+
RemainingBytes -= 4;
496473
}
497474

498475
Type *I16Ty = Type::getInt16Ty(Context);

llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -287,10 +287,10 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
287287
RegSeqInfo &CompatibleRSI,
288288
std::vector<std::pair<unsigned, unsigned>> &RemapChan) {
289289
unsigned NeededUndefs = 4 - RSI.UndefReg.size();
290-
if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
291-
return false;
292290
std::vector<MachineInstr *> &MIs =
293291
PreviousRegSeqByUndefCount[NeededUndefs];
292+
if (MIs.empty())
293+
return false;
294294
CompatibleRSI = PreviousRegSeq[MIs.back()];
295295
tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
296296
return true;

0 commit comments

Comments (0)