diff --git a/src/cpu/minor/BaseMinorCPU.py b/src/cpu/minor/BaseMinorCPU.py index 545dfeaee56..5484a11a882 100644 --- a/src/cpu/minor/BaseMinorCPU.py +++ b/src/cpu/minor/BaseMinorCPU.py @@ -426,7 +426,10 @@ def support_take_over(cls): ) branchPred = Param.BranchPredictor( - TournamentBP(numThreads=Parent.numThreads), "Branch Predictor" + BranchPredictor( + conditionalBranchPred=TournamentBP(numThreads=Parent.numThreads) + ), + "Branch Predictor", ) def addCheckerCpu(self): diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc index b02294bfe6a..eb16b601d05 100644 --- a/src/cpu/minor/fetch2.cc +++ b/src/cpu/minor/fetch2.cc @@ -204,7 +204,7 @@ Fetch2::predictBranch(MinorDynInstPtr inst, BranchData &branch) DPRINTF(Branch, "Trying to predict for inst: %s\n", *inst); if (branchPredictor.predict(inst->staticInst, - inst->id.fetchSeqNum, *inst_pc, inst->id.threadId)) { + inst->id.fetchSeqNum, *inst_pc, inst->id.threadId).taken) { set(branch.target, *inst_pc); inst->predictedTaken = true; set(inst->predictedTarget, inst_pc); diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index a6906c16437..4535be91485 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -209,7 +209,10 @@ def support_take_over(cls): smtCommitPolicy = Param.CommitPolicy("RoundRobin", "SMT Commit Policy") branchPred = Param.BranchPredictor( - TournamentBP(numThreads=Parent.numThreads), "Branch Predictor" + BranchPredictor( + conditionalBranchPred=TournamentBP(numThreads=Parent.numThreads) + ), + "Branch Predictor", ) needsTSO = Param.Bool(False, "Enable TSO Memory model") diff --git a/src/cpu/o3/bac.cc b/src/cpu/o3/bac.cc index cbb11762693..191b65f32bf 100644 --- a/src/cpu/o3/bac.cc +++ b/src/cpu/o3/bac.cc @@ -75,7 +75,7 @@ BAC::BAC(CPU *_cpu, const BaseO3CPUParams ¶ms) fetchTargetWidth(params.fetchTargetWidth), minInstSize(params.minInstSize), numThreads(params.numThreads), - stats(_cpu,this) + stats(_cpu, this) { fatal_if(decoupledFrontEnd && (fetchTargetWidth < 
params.fetchBufferSize), "Fetch target width should be larger than fetch buffer size!"); @@ -83,6 +83,7 @@ BAC::BAC(CPU *_cpu, const BaseO3CPUParams ¶ms) for (int i = 0; i < MaxThreads; i++) { bacPC[i].reset(params.isa[0]->newPCState()); stalls[i] = {false, false, false}; + branchPredictRemaining[i] = Cycles(0); } assert(bpu!=nullptr); @@ -401,12 +402,24 @@ BAC::checkSignalsAndUpdate(ThreadID tid) return true; } + if (branchPredictRemaining[tid] > Cycles(0)) { + --branchPredictRemaining[tid]; + DPRINTF(BAC, + "[global] Stalling for Branch Predictor for %i more cycles.\n", + branchPredictRemaining[tid] + ); + stalls[tid].bpu = true; + } else { + stalls[tid].bpu = false; + } + if (checkStall(tid)) { // return block(tid); bacStatus[tid] = Blocked; return false; } + // If at this point the FTQ is still invalid we need to wait for // A resteer/squash signal. if (!ftq->isValid(tid) && bacStatus[tid] != Idle) { @@ -446,6 +459,15 @@ BAC::checkSignalsAndUpdate(ThreadID tid) return true; } + if (ftq->isFull(tid)) { + // If the FTQ is full, we need to block the BAC. + if (bacStatus[tid] != FTQFull) { + DPRINTF(BAC, "[tid:%i] FTQ is full. Blocking BAC.\n", tid); + bacStatus[tid] = FTQFull; + } + return true; + } + // Now all stall/squash conditions are checked. // Attempt to run the BAC if not already running. if (ftq->isValid(tid) && @@ -469,7 +491,7 @@ BAC::squashBpuHistories(ThreadID tid) { if (!decoupledFrontEnd) return; - DPRINTF(BAC, "%s(tid:%i): FTQ sz: %i\n", tid, __func__, ftq->size(tid)); + DPRINTF(BAC, "%s(tid:%i): FTQ sz: %i\n", __func__, tid, ftq->size(tid)); unsigned n_fts = ftq->size(tid); if (n_fts == 0) return; @@ -577,14 +599,16 @@ BAC::newFetchTarget(ThreadID tid, const PCStateBase &start_pc) return ft; } -bool +Prediction BAC::predict(ThreadID tid, const StaticInstPtr &inst, const FetchTargetPtr &ft, PCStateBase &pc) { /** Perform the prediction. 
*/ BPredUnit::PredictorHistory* bpu_history = nullptr; - bool taken = bpu->predict(inst, ft->ftNum(), pc, tid, bpu_history); + Prediction pred = bpu->predict( + inst, ft->ftNum(), pc, tid, bpu_history + ); /** Push the prediction history to the fetch target. * The postFetch() function will move the history from the FTQ to the @@ -593,7 +617,7 @@ BAC::predict(ThreadID tid, const StaticInstPtr &inst, ft->bpu_history = static_cast(bpu_history); DPRINTF(Branch,"[tid:%i, ftn:%llu] History added.\n", tid, ft->ftNum()); - return taken; + return pred; } @@ -678,7 +702,9 @@ BAC::generateFetchTargets(ThreadID tid, bool &status_change) // Now make the actual prediction. Note the BPU will advance // the PC to the next instruction. - predict_taken = predict(tid, staticInst, curFT, *next_pc); + Prediction pred = predict(tid, staticInst, curFT, *next_pc); + predict_taken = pred.taken; + branchPredictRemaining[tid] = Cycles(pred.latency); DPRINTF(BAC, "[tid:%i, ftn:%llu] Branch found at PC %#x " "taken?:%i, target:%#x\n", @@ -869,7 +895,7 @@ BAC::updatePreDecode(ThreadID tid, const InstSeqNum seqNum, hist = new BPredUnit::PredictorHistory(tid, seqNum, pc.instAddr(), inst); bpu->branchPlaceholder(tid, pc.instAddr(), inst->isUncondCtrl(), - hist->bpHistory); + hist); hist->predTaken = hist->condPred = false; hist->targetProvider = BPredUnit::TargetProvider::NoTarget; @@ -926,8 +952,12 @@ BAC::updatePC(const DynInstPtr &inst, } else { // With a coupled front-end we need to make the branch prediction // here. 
- predict_taken = bpu->predict(inst->staticInst, inst->seqNum, - fetch_pc, tid); + // + // Latency is ignored in coupled mode + Prediction pred = bpu->predict( + inst->staticInst, inst->seqNum, fetch_pc, tid + ); + predict_taken = pred.taken; } DPRINTF(BAC, "[tid:%i] [sn:%llu] Branch at PC %#x " @@ -989,6 +1019,9 @@ BAC::profileCycle(ThreadID tid) case Squashing: stats.squashCycles++; break; + case Blocked: + stats.blockedCycles++; + break; case FTQFull: stats.ftqFullCycles++; break; @@ -1008,6 +1041,8 @@ BAC::BACStats::BACStats(o3::CPU *cpu, BAC *bac) "Number of cycles BAC is running"), ADD_STAT(squashCycles, statistics::units::Cycle::get(), "Number of cycles BAC is squashing"), + ADD_STAT(blockedCycles, statistics::units::Cycle::get(), + "Number of cycles BAC is blocked"), ADD_STAT(ftqFullCycles, statistics::units::Cycle::get(), "Number of cycles BAC has spent waiting for FTQ to become free"), diff --git a/src/cpu/o3/bac.hh b/src/cpu/o3/bac.hh index 8d6e59072b5..7ae8113f06f 100644 --- a/src/cpu/o3/bac.hh +++ b/src/cpu/o3/bac.hh @@ -53,6 +53,7 @@ namespace gem5 { struct BaseO3CPUParams; +typedef branch_prediction::Prediction Prediction; namespace o3 { @@ -93,6 +94,7 @@ typedef std::shared_ptr FetchTargetPtr; class BAC { typedef branch_prediction::BranchType BranchType; + typedef branch_prediction::BPredUnit BPredUnit; public: /** Overall decoupled BPU stage status. Used to determine if the CPU can @@ -245,9 +247,9 @@ class BAC * @param inst The branch instruction. * @param ft The fetch target that is currently processed. * @param PC The predicted PC is passed back through this parameter. - * @return Returns if the branch is taken or not. + * @return Returns the prediction result from the BPU. 
*/ - bool predict(ThreadID tid, const StaticInstPtr &inst, + Prediction predict(ThreadID tid, const StaticInstPtr &inst, const FetchTargetPtr &ft, PCStateBase &pc); @@ -373,6 +375,9 @@ class BAC */ bool wroteToTimeBuffer; + /** Tracks remaining cycles that the branch predictor stalls BAC */ + Cycles branchPredictRemaining[MaxThreads]; + /** Source of possible stalls. */ struct Stalls { @@ -428,6 +433,8 @@ class BAC statistics::Scalar runCycles; /** Stat for total number of squashing cycles. */ statistics::Scalar squashCycles; + /** Stat for total number of blocked cycles. */ + statistics::Scalar blockedCycles; /** Stat for total number of cycles the FTQ was full. */ statistics::Scalar ftqFullCycles; diff --git a/src/cpu/o3/ftq.cc b/src/cpu/o3/ftq.cc index 27ac69d0476..c1c9ce77830 100644 --- a/src/cpu/o3/ftq.cc +++ b/src/cpu/o3/ftq.cc @@ -205,6 +205,7 @@ FTQ::forAllBackward(ThreadID tid, std::function f) void FTQ::insert(ThreadID tid, FetchTargetPtr fetchTarget) { + assert(ftq[tid].size() < numEntries); ftq[tid].push_back(fetchTarget); ppFTQInsert->notify(fetchTarget); stats.inserts++; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 34ea773b99e..587b52b31da 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -601,6 +601,11 @@ LSQUnit::executeLoad(const DynInstPtr &inst) assert(!inst->isSquashed()); + if (inst->isExecuted()) { + DPRINTF(LSQUnit, "Load [sn:%lli] already executed\n", inst->seqNum); + return NoFault; + } + load_fault = inst->initiateAcc(); if (load_fault == NoFault && !inst->readMemAccPredicate()) { diff --git a/src/cpu/pred/2bit_local.cc b/src/cpu/pred/2bit_local.cc index 7c27355b20f..a14d077b6b5 100644 --- a/src/cpu/pred/2bit_local.cc +++ b/src/cpu/pred/2bit_local.cc @@ -52,7 +52,7 @@ namespace branch_prediction { LocalBP::LocalBP(const LocalBPParams ¶ms) - : BPredUnit(params), + : ConditionalPredictor(params), localPredictorSize(params.localPredictorSize), localCtrBits(params.localCtrBits), 
localPredictorSets(localPredictorSize / localCtrBits), @@ -78,6 +78,12 @@ LocalBP::LocalBP(const LocalBPParams ¶ms) instShiftAmt); } +void LocalBP::branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, void * &bpHistory) +{ +// Placeholder for a function that only returns history items +} + void LocalBP::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, @@ -87,7 +93,7 @@ LocalBP::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, } -bool +Prediction LocalBP::lookup(ThreadID tid, Addr branch_addr, void * &bp_history) { bool taken; @@ -103,7 +109,7 @@ LocalBP::lookup(ThreadID tid, Addr branch_addr, void * &bp_history) taken = getPrediction(counter_val); - return taken; + return staticPrediction(taken); } void diff --git a/src/cpu/pred/2bit_local.hh b/src/cpu/pred/2bit_local.hh index 8d77289f966..d117c922264 100644 --- a/src/cpu/pred/2bit_local.hh +++ b/src/cpu/pred/2bit_local.hh @@ -46,7 +46,8 @@ #include "base/sat_counter.hh" #include "base/types.hh" -#include "cpu/pred/bpred_unit.hh" +#include "cpu/pred/branch_type.hh" +#include "cpu/pred/conditional.hh" #include "params/LocalBP.hh" namespace gem5 @@ -62,7 +63,7 @@ namespace branch_prediction * predictor state that needs to be recorded or updated; the update can be * determined solely by the branch being taken or not taken. 
*/ -class LocalBP : public BPredUnit +class LocalBP : public ConditionalPredictor { public: /** @@ -71,7 +72,10 @@ class LocalBP : public BPredUnit LocalBP(const LocalBPParams ¶ms); // Overriding interface functions - bool lookup(ThreadID tid, Addr pc, void * &bp_history) override; + Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override; + + void branchPlaceholder(ThreadID tid, Addr pc, bool uncond, + void * &bpHistory) override; void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 496b92a02d7..bdc464222fa 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -142,6 +142,21 @@ class SimpleBTB(BranchTargetBuffer): ) +class ConditionalPredictor(ClockedObject): + type = "ConditionalPredictor" + cxx_class = "gem5::branch_prediction::ConditionalPredictor" + cxx_header = "cpu/pred/conditional.hh" + abstract = True + + numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") + instShiftAmt = Param.Unsigned( + Parent.instShiftAmt, "Number of bits to shift instructions by" + ) + latency = Param.Cycles( + 0, "Static (flat) latency of the predictor (in cycles)" + ) + + class IndirectPredictor(SimObject): type = "IndirectPredictor" cxx_class = "gem5::branch_prediction::IndirectPredictor" @@ -179,7 +194,6 @@ class BranchPredictor(SimObject): type = "BranchPredictor" cxx_class = "gem5::branch_prediction::BPredUnit" cxx_header = "cpu/pred/bpred_unit.hh" - abstract = True numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") instShiftAmt = Param.Unsigned(2, "Number of bits to shift instructions by") @@ -197,6 +211,13 @@ class BranchPredictor(SimObject): ras = Param.ReturnAddrStack( ReturnAddrStack(), "Return address stack, set to NULL to disable RAS." 
) + conditionalBranchPred = Param.ConditionalPredictor( + "Conditional branch predictor" + ) + overridingBranchPred = Param.ConditionalPredictor( + NULL, + "Secondary, overriding predictor which corrects the primary predictor", + ) indirectBranchPred = Param.IndirectPredictor( SimpleIndirectPredictor(), "Indirect branch predictor, set to NULL to disable " @@ -212,7 +233,7 @@ class BranchPredictor(SimObject): ) -class LocalBP(BranchPredictor): +class LocalBP(ConditionalPredictor): type = "LocalBP" cxx_class = "gem5::branch_prediction::LocalBP" cxx_header = "cpu/pred/2bit_local.hh" @@ -221,7 +242,7 @@ class LocalBP(BranchPredictor): localCtrBits = Param.Unsigned(2, "Bits per counter") -class TournamentBP(BranchPredictor): +class TournamentBP(ConditionalPredictor): type = "TournamentBP" cxx_class = "gem5::branch_prediction::TournamentBP" cxx_header = "cpu/pred/tournament.hh" @@ -235,7 +256,7 @@ class TournamentBP(BranchPredictor): choiceCtrBits = Param.Unsigned(2, "Bits of choice counters") -class BiModeBP(BranchPredictor): +class BiModeBP(ConditionalPredictor): type = "BiModeBP" cxx_class = "gem5::branch_prediction::BiModeBP" cxx_header = "cpu/pred/bi_mode.hh" @@ -310,7 +331,7 @@ class TAGEBase(SimObject): # TAGE branch predictor as described in https://www.jilp.org/vol8/v8paper1.pdf # The default sizes below are for the 8C-TAGE configuration (63.5 Kbits) -class TAGE(BranchPredictor): +class TAGE(ConditionalPredictor): type = "TAGE" cxx_class = "gem5::branch_prediction::TAGE" cxx_header = "cpu/pred/tage.hh" @@ -661,6 +682,9 @@ class TAGE_SC_L(LTAGE): sc_enabled = Param.Bool( True, "Use the statistical corrector in the branch predictor" ) + loop_enabled = Param.Bool( + True, "Use the loop predictor in the branch predictor" + ) statistical_corrector = Param.StatisticalCorrector("Statistical Corrector") @@ -775,8 +799,68 @@ class TAGE_SC_L_8KB(TAGE_SC_L): loop_predictor = TAGE_SC_L_8KB_LoopPredictor() statistical_corrector = TAGE_SC_L_8KB_StatisticalCorrector() +class 
LLBP_TAGE_64KB(TAGE_SC_L_TAGE_64KB): + type = "LLBP_TAGE_64KB" + cxx_class = "gem5::branch_prediction::LLBP_TAGE_64KB" + cxx_header = "cpu/pred/llbp.hh" + +class LLBP(ConditionalPredictor): + type = "LLBP" + cxx_class = "gem5::branch_prediction::LLBP" + cxx_header = "cpu/pred/llbp.hh" + + base = Param.TAGE_SC_L("Base predictor") + + rcrType = Param.Int(3, "RCR Type of Branches to hash") + rcrWindow = Param.Int(8, "RCR Number of Branches to hash") + rcrDist = Param.Int(8, "RCR Number of Branches to skip") + rcrShift = Param.Int(2, "RCR Number of bits to shift PC by") + rcrTagWidth = Param.Int(14, "RCR Tag Width") + + backingStorageCapacity = Param.Int( + 14000, "Backing Storage Capacity (in number of contexts)" + ) + patterTagBits = Param.Int(14, "Number of bits in the pattern tag (TTWidth)") + backingStorageLatency = Param.Cycles(6, "Backing Storage Latency") + + patternBufferCapacity = Param.Int( + 64, "Pattern Buffer Capacity (in number of contexts)" + ) + patternBufferAssoc = Param.Int(4, "Pattern Buffer Associativity") + + patternSetCapacity = Param.Int( + 64, "Pattern Set Capacity (>> base numTables) [0 for infinite]" + ) + patternSetAssoc = Param.Int( + 4, "Pattern Set Associativity [ignored if cap = 0]" + ) + patternSetBankBits = Param.Int( + 8, "Pattern Set amount of bits reserved for TAGE bank in key" + ) + + patternCounterWidth = Param.Int(3, "Bits in Pattern Direction Counter") + contextCounterWidth = Param.Int(2, "Bits in Context Replacement Counter") + + lightningPredEnabled = Param.Bool( + False, + "Whether to enable lightning predictions: override with 0 latency on high confidence branches", + ) + lightningPredCutoff = Param.Int( + 2, + "Lightning prediction cutoff: if the branch confidence is above this value, a lightning prediction is made", + ) + + +class LLBPRef(ConditionalPredictor): + type = "LLBPRef" + cxx_class = "gem5::branch_prediction::LLBPRef" + cxx_header = "cpu/pred/llbp_ref.hh" + inf = Param.Bool( + False, "Use infinite storage 
capacity for the backing storage" + ) + -class MultiperspectivePerceptron(BranchPredictor): +class MultiperspectivePerceptron(ConditionalPredictor): type = "MultiperspectivePerceptron" cxx_class = "gem5::branch_prediction::MultiperspectivePerceptron" cxx_header = "cpu/pred/multiperspective_perceptron.hh" @@ -1117,7 +1201,7 @@ class MultiperspectivePerceptronTAGE8KB(MultiperspectivePerceptronTAGE): statistical_corrector = MPP_StatisticalCorrector_8KB() -class TageSCLRef(BranchPredictor): +class TageSCLRef(ConditionalPredictor): type = "TageSCLRef" cxx_class = "gem5::branch_prediction::TageSCLRef" cxx_header = "cpu/pred/tagescl_ref.hh" diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript index f52299fecc0..30e973a1388 100644 --- a/src/cpu/pred/SConscript +++ b/src/cpu/pred/SConscript @@ -44,6 +44,7 @@ Import('*') SimObject('BranchPredictor.py', sim_objects=[ 'BranchPredictor', + 'ConditionalPredictor', 'IndirectPredictor', 'SimpleIndirectPredictor', 'BranchTargetBuffer', 'SimpleBTB', 'BTBIndexingPolicy', 'BTBSetAssociative', 'ReturnAddrStack', @@ -51,6 +52,7 @@ SimObject('BranchPredictor.py', 'LocalBP', 'TournamentBP', 'BiModeBP', 'TAGEBase', 'TAGE', 'LoopPredictor', 'TAGE_SC_L_TAGE', 'TAGE_SC_L_TAGE_64KB', 'TAGE_SC_L_TAGE_8KB', 'LTAGE', 'TAGE_SC_L_LoopPredictor', 'StatisticalCorrector', 'TAGE_SC_L', + 'LLBP', 'LLBP_TAGE_64KB', 'LLBPRef', 'TAGE_SC_L_64KB_StatisticalCorrector', 'TAGE_SC_L_8KB_StatisticalCorrector', 'TAGE_SC_L_64KB', 'TAGE_SC_L_8KB', @@ -68,6 +70,7 @@ Source('bpred_unit.cc') Source('2bit_local.cc') Source('simple_indirect.cc') Source('it_tage.cc') +Source('conditional.cc') Source('indirect.cc') Source('ras.cc') Source('tournament.cc') @@ -77,6 +80,11 @@ Source('tage.cc') Source('tagescl_ref.cc') Source('loop_predictor.cc') Source('ltage.cc') +Source('llbp.cc') +Source('llbp_ref.cc') +Source('llbpref/tage.cc') +Source('llbpref/tage_scl.cc') +Source('llbpref/llbp.cc') Source('multiperspective_perceptron.cc') 
Source('multiperspective_perceptron_8KB.cc') Source('multiperspective_perceptron_64KB.cc') @@ -97,3 +105,4 @@ DebugFlag('Branch') DebugFlag('Tage') DebugFlag('LTage') DebugFlag('TageSCL') +DebugFlag('LLBP') diff --git a/src/cpu/pred/bi_mode.cc b/src/cpu/pred/bi_mode.cc index f9f9330b883..5a6557c12c1 100644 --- a/src/cpu/pred/bi_mode.cc +++ b/src/cpu/pred/bi_mode.cc @@ -54,7 +54,7 @@ namespace branch_prediction { BiModeBP::BiModeBP(const BiModeBPParams ¶ms) - : BPredUnit(params), + : ConditionalPredictor(params), globalHistoryReg(params.numThreads, 0), globalHistoryBits(ceilLog2(params.globalPredictorSize)), choicePredictorSize(params.choicePredictorSize), @@ -128,7 +128,7 @@ BiModeBP::squash(ThreadID tid, void * &bp_history) * choice array's prediction is used to select between the two * direction predictors for the final branch prediction. */ -bool +Prediction BiModeBP::lookup(ThreadID tid, Addr branchAddr, void * &bp_history) { unsigned choiceHistoryIdx = ((branchAddr >> instShiftAmt) @@ -163,7 +163,7 @@ BiModeBP::lookup(ThreadID tid, Addr branchAddr, void * &bp_history) history->finalPred = finalPrediction; bp_history = static_cast(history); - return finalPrediction; + return staticPrediction(finalPrediction); } diff --git a/src/cpu/pred/bi_mode.hh b/src/cpu/pred/bi_mode.hh index c0513826730..0f230831f23 100644 --- a/src/cpu/pred/bi_mode.hh +++ b/src/cpu/pred/bi_mode.hh @@ -46,7 +46,8 @@ #define __CPU_PRED_BI_MODE_PRED_HH__ #include "base/sat_counter.hh" -#include "cpu/pred/bpred_unit.hh" +#include "cpu/pred/branch_type.hh" +#include "cpu/pred/conditional.hh" #include "params/BiModeBP.hh" namespace gem5 @@ -69,11 +70,11 @@ namespace branch_prediction * the branch's PC to choose between the two, destructive aliasing is reduced. 
*/ -class BiModeBP : public BPredUnit +class BiModeBP : public ConditionalPredictor { public: BiModeBP(const BiModeBPParams ¶ms); - bool lookup(ThreadID tid, Addr pc, void * &bp_history) override; + Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override; void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, void * &bp_history) override; diff --git a/src/cpu/pred/bpred_unit.cc b/src/cpu/pred/bpred_unit.cc index 2e0daec6760..db50b043ff2 100644 --- a/src/cpu/pred/bpred_unit.cc +++ b/src/cpu/pred/bpred_unit.cc @@ -63,6 +63,8 @@ BPredUnit::BPredUnit(const Params ¶ms) predHist(numThreads), btb(params.btb), ras(params.ras), + cPred(params.conditionalBranchPred), + overridingCPred(params.overridingBranchPred), iPred(params.indirectBranchPred), stats(this) { @@ -94,20 +96,14 @@ BPredUnit::drainSanityCheck() const assert(ph.empty()); } -void -BPredUnit::branchPlaceholder(ThreadID tid, Addr pc, - bool uncond, void * &bp_history) -{ - panic("BPredUnit::branchPlaceholder() not implemented for this BP.\n"); -} -bool +Prediction BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, PCStateBase &pc, ThreadID tid) { /** Perform the prediction. */ PredictorHistory* bpu_history = nullptr; - bool taken = predict(inst, seqNum, pc, tid, bpu_history); + Prediction pred = predict(inst, seqNum, pc, tid, bpu_history); assert(bpu_history!=nullptr); @@ -117,18 +113,19 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, DPRINTF(Branch, "[tid:%i] [sn:%llu] History entry added. " "predHist.size(): %i\n", tid, seqNum, predHist[tid].size()); - return taken; + return pred; } -bool +Prediction BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, PCStateBase &pc, ThreadID tid, PredictorHistory* &hist) { assert(hist == nullptr); + Cycles totalLatency = Cycles(0); // See if branch predictor predicts taken. // If so, get its target addr either from the BTB or the RAS. 
@@ -156,12 +153,38 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, } else { // Conditional branches ------- ++stats.condPredicted; - hist->condPred = lookup(tid, pc.instAddr(), hist->bpHistory); + Prediction condPred = cPred->lookup( + tid, pc.instAddr(), hist->bpHistory + ); + hist->condPred = condPred.taken; + + if (overridingCPred) { + + Prediction secondaryPred = overridingCPred->lookup( + tid, pc.instAddr(), hist->overridingBpHistory + ); + if (secondaryPred.taken != hist->condPred) { + // If the predictors disagree, + // use the result of the overriding predictor + // and incur its latency + totalLatency += secondaryPred.latency; + hist->condPred = secondaryPred.taken; + hist->overridden = true; + } else { + // If the predictors agree, + // use the result of the primary predictor + totalLatency += condPred.latency; + } + } else { + totalLatency += condPred.latency; + } + if (hist->condPred) { ++stats.condPredictedTaken; } } + hist->predTaken = hist->condPred; DPRINTF(Branch, @@ -326,9 +349,16 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, * The actual prediction tables will updated once * we know the correct direction. 
**/ - updateHistories(tid, hist->pc, hist->uncond, hist->predTaken, + cPred->updateHistories(tid, hist->pc, hist->uncond, hist->predTaken, hist->target->instAddr(), hist->inst, hist->bpHistory); + if (overridingCPred) { + overridingCPred->updateHistories( + tid, hist->pc, hist->uncond, hist->predTaken, + hist->target->instAddr(), hist->inst, hist->overridingBpHistory + ); + } + if (iPred) { // Update the indirect predictor with the direction prediction @@ -336,7 +366,10 @@ BPredUnit::predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, *hist->target, brType, hist->indirectHistory); } - return hist->predTaken; + return Prediction { + .taken = hist->predTaken, + .latency = totalLatency, + }; } @@ -383,11 +416,25 @@ BPredUnit::commitBranch(ThreadID tid, PredictorHistory* &hist) hist->target->instAddr()); // Update the branch predictor with the correct results. - update(tid, hist->pc, + cPred->update(tid, hist->pc, hist->actuallyTaken, hist->bpHistory, false, hist->inst, hist->target->instAddr()); + + if (hist->inst->isCondCtrl()) + updateStatsOverriding(hist->condPred, hist->actuallyTaken, hist->overridden); + + // If the overriding predictor was used, + // also update it with the correct result + if (overridingCPred) { + + overridingCPred->update( + tid, hist->pc, hist->actuallyTaken, + hist->overridingBpHistory, false, + hist->inst, hist->target->instAddr() + ); + } // Commit also Indirect predictor and RAS if (iPred) { @@ -410,6 +457,9 @@ BPredUnit::commitBranch(ThreadID tid, PredictorHistory* &hist) hist->seqNum, hist->pc, hist->target->instAddr()); stats.BTBUpdates++; + + stats.uniqueBranches.insert(hist->pc); + btb->update(tid, hist->pc, *hist->target, hist->type, @@ -469,7 +519,14 @@ BPredUnit::squashHistory(ThreadID tid, PredictorHistory* &history) } // This call will delete the bpHistory. 
- squash(tid, history->bpHistory); + cPred->squash(tid, history->bpHistory); + + // If the overriding predictor was used, also squash it + // This call will delete the overridingBpHistory. + if (overridingCPred) { + overridingCPred->squash(tid, history->overridingBpHistory); + assert(history->overridingBpHistory == nullptr); + } delete history; history = nullptr; @@ -548,9 +605,17 @@ BPredUnit::squash(const InstSeqNum &squashed_sn, set(hist->target, corr_target); // Correct Direction predictor ------------------ - update(tid, hist->pc, actually_taken, hist->bpHistory, + cPred->update(tid, hist->pc, actually_taken, hist->bpHistory, true, hist->inst, corr_target.instAddr()); + // If the overriding predictor was used, also update it + if (overridingCPred) { + overridingCPred->update(tid, hist->pc, actually_taken, + hist->overridingBpHistory, + true, hist->inst, + corr_target.instAddr()); + } + // Correct Indirect predictor ------------------- if (iPred) { @@ -619,6 +684,7 @@ BPredUnit::squash(const InstSeqNum &squashed_sn, "PC %#x -> T: %#x\n", tid, hist->seqNum, hist->pc, hist->target->instAddr()); + stats.uniqueBranches.insert(hist->pc); // stats.BTBUpdates++; // btb->update(tid, hist->pc, // *hist->target, @@ -633,6 +699,18 @@ BPredUnit::squash(const InstSeqNum &squashed_sn, } } +void +BPredUnit::branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, PredictorHistory* &hist) +{ + // Delegate to conditional predictor + cPred->branchPlaceholder(tid, pc, uncond, hist->bpHistory); + // If the overriding predictor is used, also call it + if (overridingCPred) { + overridingCPred->branchPlaceholder(tid, pc, uncond, + hist->overridingBpHistory); + } +} void BPredUnit::dump() @@ -658,9 +736,27 @@ BPredUnit::dump() } } +void +BPredUnit::updateStatsOverriding(bool prediction, bool actuallyTaken, bool overridden) { + if (prediction != actuallyTaken) { + if (overridden) { + ++stats.condWrongOverridden; + } else { + ++stats.condWrongBasePred; + } + } else { + if (overridden) { 
+ ++stats.condCorrectOverridden; + } else { + ++stats.condCorrectBasePred; + } + } +} + BPredUnit::BPredUnitStats::BPredUnitStats(BPredUnit *bp) : statistics::Group(bp), + uniqueBranches(), ADD_STAT(lookups, statistics::units::Count::get(), "Number of BP lookups"), ADD_STAT(squashes, statistics::units::Count::get(), @@ -697,6 +793,16 @@ BPredUnit::BPredUnitStats::BPredUnitStats(BPredUnit *bp) "Number of conditional branches incorrect"), ADD_STAT(predTakenBTBMiss, statistics::units::Count::get(), "Number of branches predicted taken but missed in BTB"), + ADD_STAT(condWrongBasePred, statistics::units::Count::get(), + "Number of branches predicted wrong with the base predictor (not overridden)"), + ADD_STAT(condWrongOverridden, statistics::units::Count::get(), + "Number of branches predicted wrong after being overridden"), + ADD_STAT(condCorrectBasePred, statistics::units::Count::get(), + "Number of branches predicted correctly only by the base predictor (not overridden)"), + ADD_STAT(condCorrectOverridden, statistics::units::Count::get(), + "Number of branches predicted correctly after being overridden"), + ADD_STAT(BTBUniqueBranches, statistics::units::Count::get(), + "Number of unique branches encountered by the BTB"), ADD_STAT(BTBLookups, statistics::units::Count::get(), "Number of BTB lookups"), ADD_STAT(BTBUpdates, statistics::units::Count::get(), @@ -772,5 +878,9 @@ BPredUnit::BPredUnitStats::BPredUnitStats(BPredUnit *bp) } +void BPredUnit::BPredUnitStats::preDumpStats() { + BTBUniqueBranches = uniqueBranches.size(); +} + } // namespace branch_prediction } // namespace gem5 diff --git a/src/cpu/pred/bpred_unit.hh b/src/cpu/pred/bpred_unit.hh index c94e5231818..adaaf20a2a8 100644 --- a/src/cpu/pred/bpred_unit.hh +++ b/src/cpu/pred/bpred_unit.hh @@ -49,6 +49,7 @@ #include "cpu/inst_seq.hh" #include "cpu/pred/branch_type.hh" #include "cpu/pred/btb.hh" +#include "cpu/pred/conditional.hh" #include "cpu/pred/indirect.hh" #include "cpu/pred/ras.hh" #include 
"cpu/static_inst.hh" @@ -81,8 +82,6 @@ class BPredUnit : public SimObject /** Branch Predictor Unit (BPU) interface functions */ public: - - /** * @param params The params object, that has the size of the BP and BTB. */ @@ -102,7 +101,7 @@ class BPredUnit : public SimObject * @param tid The thread id. * @return Returns if the branch is taken or not. */ - bool predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, + Prediction predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, PCStateBase &pc, ThreadID tid); /** @@ -141,81 +140,6 @@ class BPredUnit : public SimObject * Interface functions to the conditional branch predictor * */ - - /** - * Looks up a given conditional branch PC of in the BP to see if it - * is taken or not taken. - * @param tid The thread id. - * @param pc The PC to look up. - * @param bp_history Pointer that will be set to an object that - * has the branch predictor state associated with the lookup. - * @return Whether the branch is taken or not taken. - */ - virtual bool lookup(ThreadID tid, Addr pc, void * &bp_history) = 0; - - /** - * Ones done with the prediction this function updates the - * path and global history. All branches call this function - * including unconditional once. - * @param tid The thread id. - * @param pc The branch's pc that will be updated. - * @param uncond Wheather or not this branch is an unconditional branch. - * @param taken Whether or not the branch was taken - * @param target The final target of branch. Some modern - * predictors use the target in their history. - * @param inst Static instruction information - * @param bp_history Pointer that will be set to an object that - * has the branch predictor state associated with the lookup. - * - */ - virtual void updateHistories(ThreadID tid, Addr pc, bool uncond, - bool taken, Addr target, - const StaticInstPtr &inst, void * &bp_history) = 0; - - /** - * @param tid The thread id. - * @param bp_history Pointer to the history object. 
The predictor - * will need to update any state and delete the object. - */ - virtual void squash(ThreadID tid, void * &bp_history) = 0; - - - /** - * Updates the BP with taken/not taken information. - * @param tid The thread id. - * @param pc The branch's PC that will be updated. - * @param taken Whether the branch was taken or not taken. - * @param bp_history Pointer to the branch predictor state that is - * associated with the branch lookup that is being updated. - * @param squashed Set to true when this function is called during a - * squash operation. - * @param inst Static instruction information - * @param target The resolved target of the branch (only needed - * for squashed branches) - * @todo Make this update flexible enough to handle a global predictor. - */ - virtual void update(ThreadID tid, Addr pc, bool taken, - void * &bp_history, bool squashed, - const StaticInstPtr &inst, Addr target) = 0; - - /** - * Special function for the decoupled front-end. In it there can be - * branches which are not detected by the BPU in the first place as it - * requires a BTB hit. This function will generate a placeholder for - * such a branch once it is pre-decoded in the fetch stage. It will - * only create the branch history object but not update any internal state - * of the BPU. - * If the branch turns to be wrong then decode or commit will - * be able to use the normal squash functionality to correct the branch. - * Note that not all branch predictors implement this functionality. - * @param tid The thread id. - * @param pc The branch's PC. - * @param uncond Whether or not this branch is an unconditional branch. - * @param bp_history Pointer that will be set to an branch history object. - */ - virtual void branchPlaceholder(ThreadID tid, Addr pc, - bool uncond, void * &bp_history); - /** * Looks up a given PC in the BTB to see if a matching entry exists. * @param tid The thread id. 
@@ -356,6 +280,7 @@ class BPredUnit : public SimObject inst(inst), type(getBranchType(inst)), call(inst->isCall()), uncond(inst->isUncondCtrl()), predTaken(false), actuallyTaken(false), condPred(false), + overridden(false), btbHit(false), targetProvider(TargetProvider::NoTarget), resteered(false), mispredict(false), target(nullptr), bpHistory(nullptr), @@ -408,6 +333,9 @@ class BPredUnit : public SimObject /** The prediction of the conditional predictor */ bool condPred; + /** Whether the overriding predictor was the provider */ + bool overridden; + /** Was BTB hit at prediction time */ bool btbHit; @@ -431,6 +359,8 @@ class BPredUnit : public SimObject */ void *bpHistory = nullptr; + void *overridingBpHistory = nullptr; + void *indirectHistory = nullptr; void *rasHistory = nullptr; @@ -443,7 +373,7 @@ class BPredUnit : public SimObject /** * Internal prediction function. */ - bool predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, + Prediction predict(const StaticInstPtr &inst, const InstSeqNum &seqNum, PCStateBase &pc, ThreadID tid, PredictorHistory* &bpu_history); /** @@ -462,8 +392,30 @@ class BPredUnit : public SimObject */ void commitBranch(ThreadID tid, PredictorHistory* &bpu_history); + /** + * Special function for the decoupled front-end. In it there can be + * branches which are not detected by the BPU in the first place as it + * requires a BTB hit. This function will generate a placeholder for + * such a branch once it is pre-decoded in the fetch stage. It will + * only create the branch history object but not update any internal state + * of the BPU. + * If the branch turns to be wrong then decode or commit will + * be able to use the normal squash functionality to correct the branch. + * Note that not all branch predictors implement this functionality. + * @param tid The thread id. + * @param pc The branch's PC. + * @param uncond Whether or not this branch is an unconditional branch. 
+ * @param bp_history Pointer that will be set to an branch history object. + */ + void branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, PredictorHistory* &hist); + /** + * Stat collection for overriding + */ + void updateStatsOverriding(bool prediction, bool actuallyTaken, bool overridden); + protected: /** Number of the threads for which the branch history is maintained. */ const unsigned numThreads; @@ -492,6 +444,12 @@ class BPredUnit : public SimObject /** The return address stack. */ ReturnAddrStack * ras; + /** The conditional branch predictor. */ + ConditionalPredictor * cPred; + + /** The overriding conditional branch predictor. */ + ConditionalPredictor * overridingCPred; + /** The indirect target predictor. */ IndirectPredictor * iPred; @@ -500,6 +458,10 @@ class BPredUnit : public SimObject { BPredUnitStats(BPredUnit *bp); + std::unordered_set uniqueBranches; + + void preDumpStats() override; + /** Stats per branch type */ statistics::Vector2d lookups; statistics::Vector2d squashes; @@ -520,7 +482,14 @@ class BPredUnit : public SimObject statistics::Scalar condIncorrect; statistics::Scalar predTakenBTBMiss; + statistics::Scalar condWrongBasePred; + statistics::Scalar condWrongOverridden; + statistics::Scalar condCorrectBasePred; + statistics::Scalar condCorrectOverridden; + + /** BTB stats. 
*/ + statistics::Scalar BTBUniqueBranches; statistics::Scalar BTBLookups; statistics::Scalar BTBUpdates; statistics::Scalar BTBHits; @@ -535,6 +504,7 @@ class BPredUnit : public SimObject } stats; + protected: /** diff --git a/src/cpu/pred/branch_type.hh b/src/cpu/pred/branch_type.hh index dcc6149a9b0..3a8defe54f1 100644 --- a/src/cpu/pred/branch_type.hh +++ b/src/cpu/pred/branch_type.hh @@ -85,6 +85,16 @@ inline std::string toString(BranchType type) } +struct Prediction +{ + /** Whether the branch is predicted taken */ + bool taken; + /** The latency that this prediction would normally take */ + Cycles latency; +}; + + + } // namespace branch_prediction } // namespace gem5 diff --git a/src/cpu/pred/conditional.cc b/src/cpu/pred/conditional.cc new file mode 100644 index 00000000000..75070b7db3c --- /dev/null +++ b/src/cpu/pred/conditional.cc @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2025 Technical University of Munich + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cpu/pred/conditional.hh" + +namespace gem5 +{ + +namespace branch_prediction +{ + +ConditionalPredictor::ConditionalPredictor(const Params ¶ms) + : ClockedObject(params), + instShiftAmt(params.instShiftAmt), + staticLatency(params.latency) +{ +} + + +void +ConditionalPredictor::branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, void * &bp_history) +{ + panic("BPredUnit::branchPlaceholder() not implemented for this BP.\n"); +} + +} // namespace branch_prediction +} // namespace gem5 diff --git a/src/cpu/pred/conditional.hh b/src/cpu/pred/conditional.hh new file mode 100644 index 00000000000..c7d5a6d208a --- /dev/null +++ b/src/cpu/pred/conditional.hh @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2025 Technical University of Munich + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* @file + * Conditional branch predictor interface + */ + +#ifndef __CPU_PRED_CONDITIONAL_BASE_HH__ +#define __CPU_PRED_CONDITIONAL_BASE_HH__ + +#include "arch/generic/pcstate.hh" +#include "cpu/inst_seq.hh" +#include "cpu/pred/branch_type.hh" +#include "params/ConditionalPredictor.hh" +#include "sim/clocked_object.hh" + +namespace gem5 +{ + +namespace branch_prediction +{ + +class ConditionalPredictor : public ClockedObject +{ + public: + + typedef ConditionalPredictorParams Params; + + ConditionalPredictor(const Params ¶ms); + + /** + * Returns the configured prediction latency in cycles + * @return The prediction latency in cycles + */ + Cycles getStaticLatency() const { + return staticLatency; + } + + /** + * Looks up a given conditional branch PC of in the BP to see if it + * is taken or not taken. + * @param tid The thread id. + * @param pc The PC to look up. + * @param bp_history Pointer that will be set to an object that + * has the branch predictor state associated with the lookup. + * @return Whether the branch is taken or not taken. + */ + virtual Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) = 0; + + /** + * Ones done with the prediction this function updates the + * path and global history. All branches call this function + * including unconditional once. + * @param tid The thread id. + * @param pc The branch's pc that will be updated. + * @param uncond Wheather or not this branch is an unconditional branch. + * @param taken Whether or not the branch was taken + * @param target The final target of branch. Some modern + * predictors use the target in their history. + * @param inst Static instruction information + * @param bp_history Pointer that will be set to an object that + * has the branch predictor state associated with the lookup. + * + */ + virtual void updateHistories(ThreadID tid, Addr pc, bool uncond, + bool taken, Addr target, + const StaticInstPtr &inst, void * &bp_history) = 0; + + /** + * @param tid The thread id. 
+ * @param bp_history Pointer to the history object. The predictor + * will need to update any state and delete the object. + */ + virtual void squash(ThreadID tid, void * &bp_history) = 0; + + + /** + * Updates the BP with taken/not taken information. + * @param tid The thread id. + * @param pc The branch's PC that will be updated. + * @param taken Whether the branch was taken or not taken. + * @param bp_history Pointer to the branch predictor state that is + * associated with the branch lookup that is being updated. + * @param squashed Set to true when this function is called during a + * squash operation. + * @param inst Static instruction information + * @param target The resolved target of the branch (only needed + * for squashed branches) + * @todo Make this update flexible enough to handle a global predictor. + */ + virtual void update(ThreadID tid, Addr pc, bool taken, + void * &bp_history, bool squashed, + const StaticInstPtr &inst, Addr target) = 0; + + /** + * Special function for the decoupled front-end. In it there can be + * branches which are not detected by the BPU in the first place as it + * requires a BTB hit. This function will generate a placeholder for + * such a branch once it is pre-decoded in the fetch stage. It will + * only create the branch history object but not update any internal state + * of the BPU. + * If the branch turns to be wrong then decode or commit will + * be able to use the normal squash functionality to correct the branch. + * Note that not all branch predictors implement this functionality. + * @param tid The thread id. + * @param pc The branch's PC. + * @param uncond Whether or not this branch is an unconditional branch. + * @param bp_history Pointer that will be set to an branch history object. + */ + virtual void branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, void * &bp_history); + protected: + + /** Number of bits to shift instructions by for predictor addresses. 
*/ + const unsigned instShiftAmt; + + /** Static latency of the predictor in cycles */ + const Cycles staticLatency; + + /** Return a prediction with only static latency */ + Prediction staticPrediction(bool taken) const + { + return Prediction{taken, staticLatency}; + } +}; + +} // namespace branch_prediction +} // namespace gem5 + +#endif //__CPU_PRED_CONDITIONAL_BASE_HH__ diff --git a/src/cpu/pred/it_tage.cc b/src/cpu/pred/it_tage.cc index 2d11e31b269..c7796e67835 100644 --- a/src/cpu/pred/it_tage.cc +++ b/src/cpu/pred/it_tage.cc @@ -207,8 +207,8 @@ ITTAGE_TAGE::updateHistories(ThreadID tid, bool speculative, DPRINTF(Indirect, "%s(hist:%#x, nbits:%i) pc:%#x; ptr:%d, GHR:%#x\n", __func__, bi->ghist, bi->nGhist, bi->branchPC, tHist.ptGhist, getGHR(tid)); - assert(threadHistory[tid].gHist == - &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); + //assert(threadHistory[tid].gHist == + // &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); } @@ -252,7 +252,7 @@ ITTAGE_TAGE::updateIndirect(ThreadID tid, -bool +int ITTAGE_TAGE::allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) { if (TAGEBase::allocateEntry(idx, bi, taken)) { @@ -264,9 +264,9 @@ ITTAGE_TAGE::allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) assert(idx <= nHistoryTables); assert(b->tableIndices[idx] < (1<<(logTagTableSizes[idx]))); set(tgtTable[idx][b->tableIndices[idx]].target, b->corrTarget); - return true; + return 1; } - return false; + return 0; } diff --git a/src/cpu/pred/it_tage.hh b/src/cpu/pred/it_tage.hh index a75d2c3ccb9..c89dc8de99a 100644 --- a/src/cpu/pred/it_tage.hh +++ b/src/cpu/pred/it_tage.hh @@ -139,7 +139,7 @@ class ITTAGE_TAGE : public TAGEBase void updateIndirect(ThreadID tid, ITTAGEBranchInfo* bi, int nrand, const PCStateBase& target); - bool allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) override; + int allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) override; /** * Handles the update of the TAGE entries 
diff --git a/src/cpu/pred/llbp.cc b/src/cpu/pred/llbp.cc new file mode 100644 index 00000000000..42dbb42ad94 --- /dev/null +++ b/src/cpu/pred/llbp.cc @@ -0,0 +1,989 @@ +/* + +# Copyright (c) 2025 Technical University of Munich +# Copyright (c) 2024 The University of Edinburgh +# All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +Implementation of the last-level branch predictor (LLBP). + +*/ + +#include "cpu/pred/llbp.hh" +#include "debug/LLBP.hh" +#include + +namespace gem5 +{ + +namespace branch_prediction +{ + +LLBP::LLBP(const LLBPParams ¶ms) + : ConditionalPredictor(params), + base(params.base), + backingStorageCapacity(params.backingStorageCapacity), + backingStorageLatency(params.backingStorageLatency), + patternSetCapacity(params.patternSetCapacity), + patternSetAssoc(params.patternSetAssoc), + patternSetBankBits(/* Bits for bucket */ patternSetAssoc ? 
ceilLog2(patternSetAssoc) : 0 + + /* Bits for bank (36) */ ceilLog2(36)), + contextCounterWidth(params.contextCounterWidth), + patternCounterWidth(params.patternCounterWidth), + lightningPredEnabled(params.lightningPredEnabled), + lightningPredCutoff(params.lightningPredCutoff), + stats(this), + backingStorage(), + patternBuffer(params.patternBufferCapacity, + params.patternBufferAssoc, + this->backingStorage, + stats.patternBufferEvictions), + TTWidth(params.patterTagBits), + optimalPrefetching(backingStorageLatency == Cycles(0)), + rcr(params.rcrType, + params.rcrWindow, + params.rcrDist, + params.rcrShift, + params.rcrTagWidth) +{ + // assert(floorLog2(patternSetAssoc) + // TODO: Add assert to check that the base predictor is of type LLBP_TAGE_64KB + static_cast(base->tage)->setParent(this); + DPRINTF(LLBP, "Using experimental LLBP\n"); + DPRINTF(LLBP, "RCR: T=%d, W=%d, D=%d, S=%d, tagWidthBits=%d\n", + rcr.T, rcr.W, rcr.D, rcr.S, params.rcrTagWidth); + DPRINTF(LLBP, "Storage: cap=%d, bits=%d\n", + backingStorageCapacity, contextCounterWidth); +} + +void +LLBP::init() +{ + // First initialize the base predictor + base->tage->init(); + + // for (int i = 1; i <= base->getNumHistoryTables(); i++) { + // auto m = (i%2) ? base->tage->histLengths[i] : base->tage->histLengths[i]+2; + // fghrT1[i].init(m, TTWidth); + // fghrT2[i].init(m, TTWidth - 1); + // printf("T1[%d]: HistLen=%d, Width=%d\n", + // i, m, TTWidth); + // } + + fltTables.resize(base->getNumHistoryTables() + 1, 0); + +#define FILTER_TABLES +#ifdef FILTER_TABLES + // LLBP does not provide for all different history lenghts in + // TAGE a prediction only for the following once which where + // empirically determined. Note this + // are not the actual length but the table indices in TAGE. 
+ auto l = {6,10,13,14,15,16,17,18, 19,20,22,24,26,28,32,36}; + +#else + std::list l; + for (int i = 1; i <= base->getNumHistoryTables(); i++) { + if (base->tage->noSkip[i]) { + l.push_back(i); + } + } +#endif //FILTER_TABLES + + int n = 0; + for (auto i : l) { + // To reduce the complexity of the multiplexer LLBP groups + // always four consecutive history lenght in one bucket. + // As the pattern sets are implemented a set associative + // structure the lower bits determine the set=bucket. + // The `fltTable`-map not only filters the history lengths + // but also maps each length the correct pattern set index. + // E.e. for the four way associativity the following function + // ensures that history length 6,10,13,14 gets assign + // 0,4,8,12 with the lowest two bits 0b00. Thus, the set will + // be the same. + auto pa = patternSetAssoc ? patternSetAssoc : 1; + auto bucket = n / pa; + fltTables[i] = ((i) << ceilLog2(pa) ) | bucket; + printf("%i=>%i:%i:%i ", i, n, bucket, fltTables[i]); + n++; + } + printf("\n"); + +} + +void +LLBP::squash(ThreadID tid, void *&bp_history) +{ + LLBPBranchInfo *bi = static_cast(bp_history); + if (bi->overridden) { + stats.squashedOverrides++; + } + base->squash(tid, bi->ltage_bi); + delete bi; + bp_history = nullptr; +} + +void +LLBP::update(ThreadID tid, Addr pc, bool taken, + void *&bp_history, bool resteer, + const StaticInstPtr &inst, Addr target) +{ + assert(bp_history); + LLBPBranchInfo *bi = static_cast(bp_history); + TAGE_SC_L::TageSCLBranchInfo *tage_bi = static_cast(bi->ltage_bi); + + if (resteer) { + if (bi->rcrBackup.size()) { + rcr.restore(bi->rcrBackup); + rcr.update(pc >> instShiftAmt, inst, taken); + } + + patternBuffer.clearInFlight(curCycle(), backingStorageLatency); + + // base->update(tid, pc, taken, bi->ltage_bi, resteer, inst, target); + base->update(tid, pc, taken, tage_bi, resteer, inst, target); + return; + } + + // This is a bit a hackish way to communicate the LLBP override information + // to the base 
predictor. The base predictor will use the current bi + // in its update and allocation functions + curUpdateBi = bi; + + // Do the base predictor update. + base->update(tid, pc, taken, tage_bi, resteer, inst, target); + + + if (inst->isCondCtrl()) + storageUpdate(tid, pc, taken, bi); + + + std::string rcr_cont = ""; + + for (auto v: rcr.bb) + rcr_cont.append(std::to_string(v) + " | "); + + + DPRINTF(LLBP, "LLBP::%s(pc=%llx, inst=%s, taken=%i, resteer=%i): " + "ccid=%llu, uncond=%i, sz=%d, RCR: %s\n", + __func__, + pc, inst->getName().c_str(), + taken, resteer, + bi->cid, + inst->isUncondCtrl(), + backingStorage.size(), + rcr_cont + ); + + // auto& tHist = base->tage->threadHistory[tid]; + // for (int n = tage_bi->tageBranchInfo->nGhist; n > 0; n--) { + // bool bit = *(tHist.gHist+n-1); + // DPRINTF(LLBP, "B:%i\n", bit); + // for (int i = 1; i <= base->getNumHistoryTables(); i++) { + // if (base->tage->noSkip[i]) { + // fghrT1[i].update(tHist.gHist+n-1); + // fghrT2[i].update(tHist.gHist+n-1); + // } + // } + // } + + branchCount++; + delete tage_bi; + + delete bi; + bp_history = nullptr; +} + +void +LLBP::branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, void * &bpHistory) +{ + LLBPBranchInfo *bi = new LLBPBranchInfo(pc, !uncond); + base->branchPlaceholder(tid, pc, uncond, bi->ltage_bi); + bpHistory = (void*)(bi); +} + +Prediction +LLBP::lookup(ThreadID tid, Addr pc, void *&bp_history) +{ + Prediction retval = predict(tid, pc, true, bp_history); + return retval; +} + +void +LLBP::calculateKeys(Addr pc) +{ + // TODO: Not needed anymore + std::string s = ""; + for (int i = 1; i <= base->getNumHistoryTables(); i++) { + // if (base->tage->noSkip[i]) { + if (fltTables[i] > 0) { + uint64_t key = pc >> instShiftAmt; + key ^= fghrT1[i].comp ^ (fghrT2[i].comp << 1); + key &= ((1ULL << uint64_t(TTWidth)) - 1ULL); + KEY[i] = uint64_t(key) << 10ULL | uint64_t(fltTables[i]); + s.append(std::to_string(i) + ":" + std::to_string(KEY[i]) + " | "); + } + } + DPRINTF(LLBP, 
"LLBP::%s(pc=%#llx): Keys: %s\n", __func__, pc, s); +} + +uint64_t +LLBP::calculateKey(TAGEBase::BranchInfo* tageBi, int tageBank, Addr pc) +{ + uint64_t key = 0; + uint64_t tag = tageBi->tableTags[tageBank]; + uint64_t index = tageBi->tableIndices[tageBank]; + // Align the index to the upper bits of the key + // TODO: Parametrize this + // 10 bits is the number of bits used in the TAGEBase::gindex + index <<= uint64_t(TTWidth - 10); + key ^= tag ^ index; + key &= ((1ULL << uint64_t(TTWidth)) - 1ULL); + return uint64_t(key) << 10 | uint64_t(fltTables[tageBank]); +} + +Prediction +LLBP::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void *&b) +{ + Addr pc = branch_pc; + + LLBPBranchInfo *bi = new LLBPBranchInfo(pc, cond_branch); + + Prediction ltage_prediction = base->predict( + tid, branch_pc, cond_branch, bi->ltage_bi + ); + + b = (void*)(bi); + + TAGE_SC_L::TageSCLBranchInfo *scltage_bi = static_cast(bi->ltage_bi); + + auto tage_bi = scltage_bi->tageBranchInfo; + bi->overridden = false; + bi->base_pred = ltage_prediction.taken; + + bool lightningOverride = false; + + Cycles latency = ltage_prediction.latency; + + int8_t llbp_confidence = 0; + + if (cond_branch) + { + int tage_bank = 0; + + if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH) + tage_bank = tage_bi->hitBank; + if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH) + tage_bank = tage_bi->altBank; + if (tage_bi->provider == TAGE_SC_L::LOOP || tage_bi->provider == TAGE_SC_L::SC) + tage_bank = base->getNumHistoryTables(); + if (tage_bank) + ++stats.baseHitsTotal; + + auto ccid = rcr.getCCID(); + bi->index = 0; + bi->cid = ccid; + if (backingStorage.count(ccid)) { + auto& context = backingStorage.at(ccid); + PatternBufferEntry* pbe = patternBuffer.get(ccid); + if (pbe) { + auto& entry = *pbe; + int bestPattern = findBestPattern(context, tage_bi, branch_pc); + bi->index = bestPattern >= 0 ? 
bestPattern : 0; + Cycles additionalLatency = calculateRemainingLatency(entry.insertTime); + if (additionalLatency == 0) { + if (bi->index > 0) + { + uint64_t key = calculateKey(tage_bi, bi->index, branch_pc); + auto &pattern = *context.patterns.getEntry(key); + + context.patterns.wasHit(key); + entry.lastUsed = curCycle(); + + ++stats.demandHitsTotal; + llbp_confidence = pattern.counter; + bool llbp_prediction = llbp_confidence >= 0; + + // Override early if lightning is enabled + if (PatternSet::absConfidence(llbp_confidence) > lightningPredCutoff) { + bi->lightningTarget = true; + bi->llbp_pred = llbp_prediction; + if (lightningPredEnabled) { + lightningOverride = true; + bi->overridden = true; + latency = Cycles(0); + ++stats.lightningHitsTotal; + } + } + + if (bi->index >= tage_bank && !lightningOverride) { + ++stats.demandHitsOverride; + bi->overridden = true; + bi->llbp_pred = llbp_prediction; + } else if (!lightningOverride) { + ++stats.demandHitsNoOverride; + } + } else { + ++stats.demandMissesPatternMiss; + ++stats.demandMissesTotal; + } + } else { + ++stats.demandMissesTotal; + } + } else { + ++stats.demandMissesContextNotPrefetched; + ++stats.demandMissesTotal; + } + + } else { + ++stats.demandMissesContextUnknown; + ++stats.demandMissesTotal; + } + DPRINTF(LLBP, "LLBP::%s(pc=%#llx): CID=%lx, Base:[Hit=%i,p=%i] LLBP:[Hit=%i,p=%i] confidence=%d, overridden=%s\n", + __func__, + pc, + bi->cid, + tage_bi->hitBank, bi->base_pred, + bi->index, bi->llbp_pred, + llbp_confidence, + bi->overridden); + } + + if (bi->overridden) { + // Overridden prediction + tage_bi->tagePred = bi->llbp_pred; + tage_bi->longestMatchPred = tage_bi->altTaken = bi->llbp_pred; + tage_bi->hitBank = tage_bi->altBank = 0; + tage_bi->provider = TAGEBase::BIMODAL_ONLY; + scltage_bi->lpBranchInfo->predTaken = bi->llbp_pred; + scltage_bi->lpBranchInfo->loopPredUsed = false; + scltage_bi->scBranchInfo->usedScPred = false; + } + + return Prediction {.taken = bi->getPrediction(), .latency = 
latency}; +} + +void +LLBP::updateHistories( + ThreadID tid, Addr pc, bool uncond, + bool taken, Addr target, + const StaticInstPtr &inst, + void *&bp_history) +{ + LLBPBranchInfo *bi; + if (bp_history == nullptr) { + assert(uncond); + bi = new LLBPBranchInfo(pc, !uncond); + bp_history = (void*)(bi); + } else { + bi = static_cast(bp_history); + } + + + // Backup the RCR state in case we need to restore it. + rcr.backup(bi->rcrBackup); + + // Update the RCR with the current branch + if (rcr.update(pc >> instShiftAmt, inst, taken)) { + + // If the RCR has updated the context ID, we need to + // check whether we have to prefetch the context. + uint64_t pcid = rcr.getPCID(); + if (backingStorage.count(pcid)) + { + if (patternBuffer.get(pcid) == nullptr) + { + ++stats.prefetchesIssued; + patternBuffer.insert(pcid, curCycle()); + } + } + } + + base->updateHistories(tid, pc, uncond, taken, target, inst, bi->ltage_bi); +} + + +int8_t LLBP::absPredCounter(int8_t counter) +{ + return counter >= 0 ? counter : -counter - 1; +} + +/** + * Update LLBP with the real outcome of a branch. + * It is currently assumed that the PB still contains the + * corresponding pattern set (no access latency applied). + * The context is created if it does not exist yet. + * The pattern is updated / a longer pattern is allocated. 
+ * + * @param tid Thread ID + * @param pc Program counter + * @param taken Whether the branch was taken + * @param bi Branch info + */ +void LLBP::storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo *bi) +{ + // TAGE_SC_L::TageSCLBranchInfo *ltage_bi = + // static_cast(bi->ltage_bi); + + uint64_t cid = bi->cid; + + + auto tage_bi = static_cast(bi->ltage_bi)->tageBranchInfo; + + DPRINTF(LLBP, "LLBP::%s(pc=%lx, taken=%i) " + "cid=%llu, index=%d, " + "prediction=%d, mispred=%d, " + "llbp_pred=%d, base_pred=%d, overridden=%i\n", + __func__, pc, taken, + cid, bi->index, + bi->getPrediction(), taken != bi->getPrediction(), + bi->llbp_pred, bi->base_pred, bi->overridden); + + + + /************************************************** + * Allocation + * + * If the branch was mispredicted, we allocate a new pattern + * with longer history in the context. + * The pattern with the weakest confidence is replaced. + ***************************************************/ + // if (bi->getPrediction() != taken) { + auto& alloc_banks = static_cast(base->tage)->alloc_banks; + if (alloc_banks.size() > 0) { + + // Check if the context already exists + if (!backingStorage.count(cid)) { + + // If not, we create a new context + while (backingStorage.size() >= backingStorageCapacity) + { + ++stats.backingStorageEvictions; + uint64_t i = findVictimContext(); + backingStorage.erase(i); + } + DPRINTF(LLBP, "LLBP: CTX Alloc:%llx,\n", cid); + // backingStorage.emplace(cid, Context(PatternSet(64*8, 64*8, 8, stats))); + if (patternSetCapacity == 0) { + backingStorage.emplace(cid, Context(PatternSet(patternSetBankBits, stats))); + } else { + // FIX: The Associativity is actually the set size, not the associativity + backingStorage.emplace(cid, Context(PatternSet( + patternSetCapacity, patternSetAssoc, patternSetBankBits/* TODO Remove! 
*/, stats + ))); + } + ++stats.backingStorageInsertions; + } + + Context& context = backingStorage.at(cid); + + // uint64_t key = context.patterns.calculateKey(tage_bi, tage_bank, pc); + // context.patterns.insertEntry(key, taken); + + for (auto bank : alloc_banks) { + if (fltTables[bank]) { + uint64_t key = calculateKey(tage_bi, bank, pc); + // uint64_t key = KEY[j]; + DPRINTF(LLBP, "LLBP Alloc:%i, %llx\n", bank, key); + ++stats.allocationsTotal; + context.patterns.insertEntry(key, taken); + } + } + } + + + + + + + /************************************************** + * Update + * + */ + + // Update the gem5 statistics for override tracking + if (bi->overridden) { + if (bi->getPrediction() != taken) { + ++stats.wrongOverridesTotal; + if (bi->llbp_pred == bi->base_pred) { + ++stats.wrongOverridesIdentical; + } + } else { + ++stats.correctOverridesTotal; + if (bi->llbp_pred == bi->base_pred) { + ++stats.correctOverridesIdentical; + } + } + } + + // Lightning predictions are counted as regrets if LLBP would have differed from base, + // and LLBP's guess would have been wrong (waiting for base would have saved a misprediction) + if (bi->lightningTarget) { + if (bi->llbp_pred != bi->base_pred + && bi->llbp_pred != taken) { + ++stats.lightningHitsRegret; + } + } + + // Check whether the branch context is known + // If not, we create a new context + if (backingStorage.count(cid)) + { + LLBP::Context& context = backingStorage.at(cid); + + // int i = bi->index; + if (bi->index > 0 && bi->overridden) + { + uint64_t key = calculateKey(tage_bi, bi->index, pc); + // uint64_t key = KEY[i]; + LLBP::Pattern* p = context.patterns.getEntry(key); + + if (p) { + LLBP::Pattern& pattern = *p; + + int8_t conf_before = pattern.counter; + TAGEBase::ctrUpdate(pattern.counter, taken, patternCounterWidth); + int8_t conf_after = pattern.counter; + + DPRINTF(LLBP, "LLBP::%s() CID=%llx key=%llx: %d -> %d (%s)\n", + __func__, cid, key, conf_before, conf_after, taken ? 
"taken" : "not taken"); + // This function updates the context replacement counter + // - If a pattern becomes confident (correct prediction) + // the replacement counter is increased + // - If a pattern becomes low confident (incorrect prediction) + // the replacement counter is decreased + if (pattern.counter == (taken ? 1 : -2)) + { + // Context is now medium confidence + TAGEBase::unsignedCtrUpdate(context.confidence, true, + contextCounterWidth); + } + else if (pattern.counter == (taken ? -1 : 0)) + { + // Context is now low confidence + TAGEBase::unsignedCtrUpdate(context.confidence, false, + contextCounterWidth); + } + + if (bi->getPrediction() == taken) { + context.patterns.wasUseful(key); + } + } + } + + + // // If a misprediction occurs, we allocate a new pattern with longer history + // // in the context. The pattern with the weakest confidence is replaced. + // if (bi->getPrediction() != taken) { + // if (i < base->getNumHistoryTables()) { + // ++stats.allocationsTotal; + // i = i+1; + // while (!base->tage->noSkip[i] && i < base->getNumHistoryTables()) + // i = i+1; + // uint64_t key = context.patterns.calculateKey(tage_bi, i, pc); + // context.patterns.insertEntry(key, taken); + // } + // } + // } + // else + // { + // while (backingStorage.size() >= backingStorageCapacity) + // { + // ++stats.backingStorageEvictions; + // uint64_t i = findVictimContext(); + // backingStorage.erase(i); + // } + + // // TODO: Check if this is a skip table + // int tage_bank = 1; + // if (tage_bi->provider == TAGEBase::TAGE_LONGEST_MATCH) + // tage_bank = tage_bi->hitBank; + // if (tage_bi->provider == TAGEBase::TAGE_ALT_MATCH) + // tage_bank = tage_bi->altBank; + + + // if (patternSetCapacity == 0) { + // backingStorage.emplace(cid, Context(PatternSet(patternSetBankBits, stats))); + // } else { + // backingStorage.emplace(cid, Context(PatternSet( + // patternSetCapacity, patternSetAssoc, patternSetBankBits, stats + // ))); + // } + + } + + + +} + +/** + * Find the best 
(=longest) pattern in the context. + * The context is searched by comparing the pc tag with decreasing length + * (similar to the TAGE predictor). + * + * @param tid Thread ID + * @param pc Program counter + * @param ctx Context to search in + * @return Index of the best pattern, or -1 if not found + */ +int LLBP::findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi, Addr pc) +{ + for (int i = base->getNumHistoryTables(); i > 0; i--) + { + // if (!base->tage->noSkip[i]) continue; + if (fltTables[i] == 0) continue; + uint64_t key = calculateKey(bi, i, pc); + // uint64_t key = KEY[i]; + if (ctx.patterns.getEntry(key)) + { + return i; + } + } + return -1; +} + +/** + * Find the context with the lowest confidence. + * @return Index of the context with the lowest confidence, or -1 if not found + */ +uint64_t LLBP::findVictimContext() +{ + auto elem = std::min_element( + backingStorage.begin(), backingStorage.end(), + [](const auto &a, const auto &b) + { + return a.second.confidence < b.second.confidence; + }); + if (elem == backingStorage.end()) + { + return -1; + } + return elem->first; +} + +Cycles LLBP::calculateRemainingLatency(Cycles insertTime) { + Cycles passedTime = (curCycle() - insertTime); + if (passedTime >= backingStorageLatency) + return Cycles(0); + return backingStorageLatency - passedTime; +} + +/* from LLBP source code: */ + +LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth) + : tagWidthBits(_CTWidth), T(_T), W(_W), D(_D), S(_shift) +{ + bb.resize(maxwindow); + ctxs = {0, 0}; +} + +/** + * Given {n} number of branches starting from the end of the RCR (front of the vec) + * (minus {skip} # of branches) we create the hash function by shifting + * each PC by {shift} number if bits i.e. + * + * 000000000000| PC | :vec[end-skip] + * ^ 0000000000| PC |00 :vec[end-skip-1] + * ^ 00000000| PC |0000 :vec[end-skip-2] + * . . + * . . + * . . 
+ * ^ | PC |000000000000 :vec[end-skip-n-1] + * ---------------------- + * final hash value + * Then, the hash value is wrapped to the size of the context tag: + * @return final hash value % 2^tagWidthBits +*/ +uint64_t +LLBP::RCR::calcHash(int n, int skip, int shift) +{ + uint64_t hash = 0; + if (bb.size() < (skip + n)) + { + return 0; + } + + // Compute the rolling hash in element order (newer branches at the front) + uint64_t sh = 0; + auto it = bb.begin(); + std::advance(it, skip); + for (; (it != bb.end()) && (n > 0); it++, n--) + { + uint64_t val = *it; + + // Shift the value + hash ^= val << uint64_t(sh); + + sh += shift; + if (sh >= tagWidthBits) + { + sh -= uint64_t(tagWidthBits); + } + } + return moduloTwoExp(hash, tagWidthBits); +} + +uint64_t LLBP::RCR::getCCID() +{ + return ctxs.ccid; +} // Hash of all branches + +uint64_t LLBP::RCR::getPCID() +{ + return ctxs.pcid; +} + +bool LLBP::RCR::update(Addr pc, const StaticInstPtr &inst, bool taken) +{ + bool update = false; + + switch (T) + { + case 0: // All branches + update = true; + break; + + case 1: // Only calls + if (inst->isCall()) update = true; + break; + + case 2: // Only calls and returns + if (inst->isCall() || inst->isReturn()) + update = true; + break; + + case 3: // Only unconditional branches + if (inst->isUncondCtrl()) update = true; + break; + + case 4: // All taken branches + if (taken) update = true; + break; + } + + if (update) + { + // Add the new branch to the history + bb.push_front(pc); + + // Remove the oldest branch + bb.pop_back(); + + // The current context. + ctxs.ccid = calcHash(W, D, S); + // The prefetch context. 
+ ctxs.pcid = calcHash(W, 0, S); + + + return true; + } + + return false; +} + +void LLBP::RCR::backup(std::list& vec) +{ + vec.clear(); + int count = D + W; + for (auto it = bb.begin(); + (it != bb.end()) && (count > 0); + ++it, --count) + { + vec.push_back(*it); + } +} + +void LLBP::RCR::restore(std::list& vec) +{ + bb.clear(); + for (auto it = vec.begin(); it != vec.end(); ++it) + { + bb.push_back(*it); + } + // The current context. + ctxs.ccid = calcHash(W, D, S); + // The prefetch context. + ctxs.pcid = calcHash(W, 0, S); +} + +LLBP::LLBPStats::LLBPStats(LLBP *llbp) + : statistics::Group(llbp), + parent(llbp), + ADD_STAT(allocationsTotal, statistics::units::Count::get(), + "Total number of new patterns allocated in any pattern set"), + ADD_STAT(prefetchesIssued, statistics::units::Count::get(), + "Number of prefetches issued to the backing storage"), + ADD_STAT(baseHitsTotal, statistics::units::Count::get(), + "Total on-demand hits of the base predictor"), + ADD_STAT(demandHitsTotal, statistics::units::Count::get(), + "Total on-demand hits to the pattern buffer"), + ADD_STAT(demandHitsOverride, statistics::units::Count::get(), + "On-demand hits to the pattern buffer with LLBP overriding the base predictor"), + ADD_STAT(demandHitsNoOverride, statistics::units::Count::get(), + "On-demand hits to the pattern buffer, using the base predictor (LLBP dropped)"), + ADD_STAT(demandMissesTotal, statistics::units::Count::get(), + "Total on-demand misses to the pattern buffer"), + ADD_STAT(demandMissesPatternMiss, statistics::units::Count::get(), + "On-demand misses to the pattern buffer, the chosen pattern-set did not contain the needed pattern"), + ADD_STAT(demandMissesContextTooLate, statistics::units::Count::get(), + "On-demand misses to the pattern buffer where the context was still delayed from insertion latency"), + ADD_STAT(demandMissesContextNotPrefetched, statistics::units::Count::get(), + "On-demand misses to the pattern buffer where the context was not 
scheduled for insertion"), + ADD_STAT(demandMissesContextUnknown, statistics::units::Count::get(), + "On-demand misses to the pattern buffer where the context was not in the backing storage"), + ADD_STAT(demandMissesPfInflight, statistics::units::Count::get(), + "On-demand misses to the pattern buffer where the context was not scheduled for insertion"), + // TODO fix stats + ADD_STAT(ctxHits, statistics::units::Count::get(), + "Total number of new patterns allocated in any pattern set"), + ADD_STAT(ptrnHits, statistics::units::Count::get(), + "Total number of new patterns allocated in any pattern set"), + ADD_STAT(patternHits, statistics::units::Count::get(), + "Number of times any pattern was hit (distribution)"), + ADD_STAT(patternUseful, statistics::units::Count::get(), + "Number of times any pattern was useful (distribution)"), + ADD_STAT(patternSetOccupancy, statistics::units::Count::get(), + "Number of patterns used in the pattern sets (distribution)"), + ADD_STAT(patternBufferEvictions, statistics::units::Count::get(), + "Number of pattern sets evicted from the pattern buffer due to capacity limits"), + ADD_STAT(backingStorageEvictions, statistics::units::Count::get(), + "Number of pattern sets evicted from the backing storage due to capacity limits"), + ADD_STAT(backingStorageInsertions, statistics::units::Count::get(), + "Number of pattern sets inserted into the backing storage (including replacements)"), + ADD_STAT(correctOverridesTotal, statistics::units::Count::get(), + "Number of branches predicted correctly by LLBP (LLBP was provider)"), + ADD_STAT(correctOverridesIdentical, statistics::units::Count::get(), + "Number of branches predicted correctly by LLBP, but the base predictor would also be correct (neutral)"), + ADD_STAT(correctOverridesUnique, statistics::units::Count::get(), + "Number of branches predicted correctly by LLBP, where the base predictor would be incorrect (good)"), + ADD_STAT(wrongOverridesTotal, statistics::units::Count::get(), + 
"Number of branches predicted wrong by LLBP (LLBP was provider)"), + ADD_STAT(wrongOverridesIdentical, statistics::units::Count::get(), + "Number of branches predicted wrong by LLBP, but the base predictor would also be wrong (neutral)"), + ADD_STAT(wrongOverridesUnique, statistics::units::Count::get(), + "Number of branches predicted correctly by LLBP, where the base predictor would be correct (bad)"), + ADD_STAT(squashedOverrides, statistics::units::Count::get(), + "Number of branches predicted by LLBP, but squashed before the outcome was known"), + ADD_STAT(profitOrLoss, statistics::units::Count::get(), + "Net P/L of (unique correct overrides - unique wrong overrides)"), + ADD_STAT(lightningHitsTotal, statistics::units::Count::get(), + "Number of branches overridden by lightning predictions"), + ADD_STAT(lightningHitsRegret, statistics::units::Count::get(), + "Number of branches (theoretically) overridden by lightning predictions, but turned out incorrect and different from base pred") + { + patternHits.init(16).flags(statistics::pdf); + patternUseful.init(16).flags(statistics::pdf); + if (parent) + patternSetOccupancy.init(parent->patternSetCapacity ? parent->patternSetCapacity + 1 : 16).flags(statistics::pdf); + else + patternSetOccupancy.init(17).flags(statistics::pdf); + + correctOverridesUnique = correctOverridesTotal - correctOverridesIdentical; + wrongOverridesUnique = wrongOverridesTotal - wrongOverridesIdentical; + + profitOrLoss = correctOverridesUnique - wrongOverridesUnique; + } + + +void +LLBP_TAGE_64KB::handleAllocAndUReset(bool alloc, bool taken, TAGEBase::BranchInfo* bi, int nrand) +{ + alloc_banks.clear(); + + // If LLBP has overridden the base predictor and a misprediction occured + // we need to let the base predictor know which length of the history + // has been matched. 
+ bool modified = false; + if (parent->curUpdateBi->overridden) { + // If LLBP was provider we allocate if the prediction was wrong + // and the history length is shorter than the maximum. + alloc = (parent->curUpdateBi->llbp_pred != taken) && (parent->curUpdateBi->index < nHistoryTables); + bi->hitBank = parent->curUpdateBi->index; + modified = true; + } + // Do the actual allocation + TAGE_SC_L_TAGE_64KB::handleAllocAndUReset(alloc, taken, bi, nrand); + + // Afterwards, reset the override otherwise the base predictor + // will update an incorrect entry + if (modified) { + bi->hitBank = 0; + } +} + +int +LLBP_TAGE_64KB::allocateEntry(int bank, TAGEBase::BranchInfo* bi, bool taken) +{ + auto r = TAGE_SC_L_TAGE_64KB::allocateEntry(bank, bi, taken); + + // If the allocation was successful, record the table bank such that + // LLBP can allocate a new pattern with the same history length. + if (r > 0) { + alloc_banks.push_back(bank); + } + return r; +} + +void +LLBP_TAGE_64KB::handleTAGEUpdate(Addr pc, bool taken, TAGEBase::BranchInfo* bi) +{ + // Update the usefulness + if (parent->curUpdateBi->index > 0) { + // If TAGE was provider, it was correct and + // LLBP was incorrect this prediction was useful. + if ((!parent->curUpdateBi->overridden) && (bi->longestMatchPred == taken) && (parent->curUpdateBi->llbp_pred != taken)) { + if (bi->hitBank > 0) { + if(gtable[bi->hitBank][bi->hitBankIndex].u < ((1 << tagTableUBits) -1)) { + gtable[bi->hitBank][bi->hitBankIndex].u++; + } + } + } + } + // Only update the providing component if LLBP has overridden than + // don't update TAGE. The BIM might be updated if the LLBP override + // is weak confidence. 
+ if (parent->curUpdateBi->overridden) { + // if (parent->overridePred != taken) { + + // } + return; + } + // If not overridden do the normal TAGE update + TAGE_SC_L_TAGE_64KB::handleTAGEUpdate(pc, taken, bi); +} + +bool +LLBP_TAGE_64KB::isUseful(bool taken, TAGEBase::BranchInfo* bi) const +{ + // If LLBP overrides we do the usefulness update in `handleTAGEUpdate` + return (parent->curUpdateBi->index > 0) ? false + : TAGE_SC_L_TAGE_64KB::isUseful(taken, bi); +} + +bool +LLBP_TAGE_64KB::isNotUseful(bool taken, TAGEBase::BranchInfo* bi) const +{ + return (parent->curUpdateBi->index > 0) ? false + : TAGE_SC_L_TAGE_64KB::isNotUseful(taken, bi); +} + +} // namespace branch_prediction +} // namespace gem5 diff --git a/src/cpu/pred/llbp.hh b/src/cpu/pred/llbp.hh new file mode 100644 index 00000000000..c79541a3c99 --- /dev/null +++ b/src/cpu/pred/llbp.hh @@ -0,0 +1,614 @@ +/* + +# Copyright (c) 2025 Technical University of Munich +# Copyright (c) 2024 The University of Edinburgh +# All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +Implementation of the last-level branch predictor (LLBP). + +*/ + +#ifndef __CPU_PRED_LLBP_HH__ +#define __CPU_PRED_LLBP_HH__ + +#include +#include +#include +#include + +#include "base/cache/associative_cache.hh" +#include "base/cache/cache_entry.hh" +#include "base/statistics.hh" +#include "base/types.hh" +#include "cpu/pred/tage_sc_l.hh" +#include "cpu/pred/tage_sc_l_64KB.hh" +#include "params/LLBP.hh" +#include "params/LLBP_TAGE_64KB.hh" + +namespace gem5 +{ + +namespace branch_prediction +{ + +class LLBP : public ConditionalPredictor +{ + public: + LLBP(const LLBPParams ¶ms); + + Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override; + + void squash(ThreadID tid, void * &bp_history) override; + void update(ThreadID tid, Addr pc, bool taken, + void * &bp_history, bool squashed, + const StaticInstPtr & inst, Addr target) override; + + void init() override; + void branchPlaceholder(ThreadID tid, Addr pc, + bool uncond, void * &bpHistory) override; + + void updateHistories(ThreadID tid, Addr pc, bool uncond, + bool taken, Addr target, + const StaticInstPtr &inst, + void * &bp_history) override; + protected: + + TAGE_SC_L* base; + + int backingStorageCapacity; + const Cycles backingStorageLatency; + + int patternSetCapacity; + int patternSetAssoc; + int patternSetBankBits; + + int contextCounterWidth; + int patternCounterWidth; + + bool lightningPredEnabled; + int lightningPredCutoff; + struct LLBPStats : public statistics::Group + { + LLBPStats(LLBP *llbp); + + void preDumpStats() override { + if (!parent) return; + for(auto& ctx : parent->backingStorage) { + ctx.second.patterns.commitStats(); + } + } + + LLBP* parent; + + statistics::Scalar allocationsTotal; + statistics::Scalar 
prefetchesIssued; + statistics::Scalar baseHitsTotal; + statistics::Scalar demandHitsTotal; + statistics::Scalar demandHitsOverride; + statistics::Scalar demandHitsNoOverride; + statistics::Scalar demandMissesTotal; + statistics::Scalar demandMissesPatternMiss; + statistics::Scalar demandMissesContextTooLate; + statistics::Scalar demandMissesContextNotPrefetched; + statistics::Scalar demandMissesContextUnknown; + statistics::Scalar demandMissesPfInflight; + statistics::Scalar ctxHits; + statistics::Scalar ptrnHits; + + statistics::SparseHistogram patternHits; + statistics::SparseHistogram patternUseful; + statistics::Histogram patternSetOccupancy; + statistics::Scalar patternBufferEvictions; + statistics::Scalar backingStorageEvictions; + statistics::Scalar backingStorageInsertions; + statistics::Scalar correctOverridesTotal; + statistics::Scalar correctOverridesIdentical; + statistics::Formula correctOverridesUnique; + statistics::Scalar wrongOverridesTotal; + statistics::Scalar wrongOverridesIdentical; + statistics::Formula wrongOverridesUnique; + statistics::Scalar squashedOverrides; + statistics::Formula profitOrLoss; + statistics::Scalar lightningHitsTotal; + statistics::Scalar lightningHitsRegret; + } stats; + + Cycles calculateRemainingLatency(Cycles insertTime); + + Prediction predict(ThreadID tid, Addr pc, + bool cond_branch, void * &bp_history); + + struct LLBPBranchInfo + { + bool overridden; + bool llbp_pred; + bool base_pred; + bool lightningTarget; + Addr pc; + int index; // TODO: rename this to hitIndex of llbpHit + uint64_t key; + bool conditional; + std::list rcrBackup; + uint64_t cid; + void* ltage_bi; + + LLBPBranchInfo(Addr pc, bool conditional) + : overridden(false), + lightningTarget(false), + pc(pc), + index(-1), + conditional(conditional), + cid(-1), + ltage_bi(nullptr) + {} + + bool getPrediction() { + return overridden ? 
llbp_pred : base_pred; + } + + ~LLBPBranchInfo() + {} + }; + + + + struct Pattern + { + uint64_t tag = 0; + int8_t counter = 0; + int hit = 0; + int useful = 0; + bool valid = false; + }; + + class PatternSet { + typedef typename std::vector set_t; + public: + PatternSet( + int numEntries, + int associativity, + int bankBits, + LLBPStats& stats + ): bankBits(bankBits), // TODO: Remove + associativity(associativity), + numSets(numEntries / associativity), + setMask(numSets - 1), + setSize(numEntries / numSets), + unbounded(false), + occupancy(0), + stats(stats) + { + assert(numEntries % setSize == 0); + assert(associativity * numSets == numEntries); + // this->numSets = numEntries / setSize; + sets.resize(numSets); + for (auto& set : sets) { + set.resize(setSize); + } + } + + PatternSet(int bankBits, LLBPStats& stats) + : bankBits(bankBits), + associativity(1), + numSets(1), + setMask(0), + setSize(0), + unbounded(true), + occupancy(0), + stats(stats) + { + } + + ~PatternSet() { + commitStats(); + } + + // key_t index(const key_t& key) { return key & _set_mask; } + + // set_t& getSet(const key_t& key) { + // return _cache[index(key)]; + // } + + + // int getBank(uint64_t key) { + // // FIX: THIS IS BOGUS! + // return bitmaskLowerN(bankBits) & key; + // } + + set_t& getSet(uint64_t key) { + int idx = int(key & setMask); + assert(idx < sets.size()); + return sets[idx]; + } + + // int getID(uint64_t key) { + // uint64_t bank = getBank(key); + // return bank / setSize; + // } + + Pattern* getEntry(uint64_t key) { + if (unbounded) { + Pattern& res = unboundedSet[key]; + if (!res.valid) return nullptr; + return &res; + } + set_t& set = getSet(key); + Pattern* result = findPatternInSet(key, set); + return result; + } + + void insertEntry(uint64_t key, bool taken) { + if (unbounded) { + Pattern& tgt = unboundedSet[key]; + tgt.tag = key; + tgt.counter = taken ? 
0 : -1; + tgt.hit = 0; + tgt.useful = 0; + tgt.valid = true; + } else { + set_t& set = getSet(key); + Pattern& victim = findVictimPattern(set); + if (victim.valid) { + stats.patternUseful.sample(victim.useful); + stats.patternHits.sample(victim.hit); + } + victim.tag = key; + victim.counter = taken ? 0 : -1; + victim.hit = 0; + victim.useful = 0; + victim.valid = true; + } + + saturatingAdd(occupancy, associativity * numSets); + } + + void wasUseful(uint64_t key) { + Pattern* p = getEntry(key); + if (p) { + p->useful++; + } + } + + void wasHit(uint64_t key) { + Pattern* p = getEntry(key); + if (p) { + p->hit++; + } + } + + void commitStats() { + if (unbounded) { + stats.patternSetOccupancy.sample(unboundedSet.size()); + } else { + stats.patternSetOccupancy.sample(occupancy); + } + for (auto& set: sets) { + for (auto& pat: set) { + if (pat.valid) { + stats.patternUseful.sample(pat.useful); + stats.patternHits.sample(pat.hit); + } + } + } + } + + static int absConfidence(int8_t ctr) { + if (ctr < 0) { + return abs(ctr) - 1; + } + return ctr; + } + + static uint64_t bitmaskLowerN(int n) { // TODO unused + return (1 << n) - 1; + } + + static void saturatingSub(int& n) { + if (n > 0) { + --n; + } + } + + static void saturatingAdd(int& n, int max) { + if (n < max) { + ++n; + } + } + + private: + + Pattern* findPatternInSet(uint64_t key, set_t& set) { + auto result = std::find_if(set.begin(), set.end(), [key](Pattern& pat) { + return pat.tag == key && pat.valid; + }); + + if (result == set.end()) + return nullptr; + + return &*result; + } + + Pattern& findVictimPattern(set_t& set) { + auto firstInvalid = std::find_if(set.begin(), set.end(), [](Pattern& e) { + return !e.valid; + }); + + if (firstInvalid != set.end()) + return *firstInvalid; + + auto result = std::min_element(set.begin(), set.end(), [&](const Pattern& a, const Pattern& b) { + return absConfidence(a.counter) < absConfidence(b.counter); + }); + + return *result; + } + + const int bankBits; + const int 
associativity; + const int numSets; + const uint64_t setMask; + const int setSize; + const bool unbounded; + int occupancy; + + LLBPStats& stats; + std::unordered_map unboundedSet; + std::vector sets; + }; + + class Context + { + public: + PatternSet patterns; + /** Confidence counter of the context (guides replacement) */ + uint8_t confidence; + + Context(PatternSet patterns): patterns(patterns), confidence(0) {} + }; + + + // TODO: Make this a set-associative cache + // Use the underlying structure (template) that is used for the pattern set. + typedef std::unordered_map BackingStorage; + + BackingStorage backingStorage; + + struct PatternBufferEntry { + uint64_t cid; + Cycles insertTime; + Cycles lastUsed; + bool valid = false; + }; + + + class PatternBuffer { + public: + PatternBuffer( + int numEntries, + int setSize, + BackingStorage& backingStorage, + statistics::Scalar& patternBufferEvictions + ) + : setSize(setSize), + backingStorage(backingStorage), + patternBufferEvictions(patternBufferEvictions) + { + assert(numEntries % setSize == 0); + this->numSets = numEntries / setSize; + + sets.resize(numSets); + for (auto& set : sets) { + set.resize(setSize); + } + } + + void insert(uint64_t cid, Cycles now) { + auto& set = getSet(cid); + if (!backingStorage.count(cid)) + return; + PatternBufferEntry& victim = findVictim(set); + if (victim.valid) { + ++patternBufferEvictions; + } + victim.cid = cid; + victim.insertTime = now; + victim.lastUsed = now; + victim.valid = true; + } + + PatternBufferEntry* get(uint64_t cid) { + auto& set = getSet(cid); + return findEntry(cid, set); + } + + void clearInFlight(Cycles now, Cycles latency) { + for (auto& set: sets) { + for (auto& e: set) { + Cycles passedTime = (now - e.insertTime); + if (passedTime < latency) { + e.valid = false; + } + } + } + } + + std::vector& getSet(uint64_t cid) { + return sets[cid % numSets]; + } + + private: + PatternBufferEntry* findEntry(uint64_t cid, std::vector& set) { + auto result = 
std::find_if(set.begin(), set.end(), [cid](PatternBufferEntry& e) { + return e.cid == cid && e.valid; + }); + + if (result == set.end()) + return nullptr; + + return &*result; + } + + PatternBufferEntry& findVictim(std::vector& set) { + auto firstInvalid = std::find_if(set.begin(), set.end(), [](PatternBufferEntry& e) { + return !e.valid; + }); + + if (firstInvalid != set.end()) + return *firstInvalid; + + auto worst = std::min_element(set.begin(), set.end(), [&](const PatternBufferEntry& a, const PatternBufferEntry& b) { + return a.insertTime < b.insertTime; + }); + + return *worst; + } + int numSets; + int setSize; + BackingStorage& backingStorage; + statistics::Scalar& patternBufferEvictions; + std::vector> sets; + } patternBuffer; + + TAGEBase::FoldedHistory fghrT1[40]; + TAGEBase::FoldedHistory fghrT2[40]; + + uint64_t KEY[40]; // Key for each history length + void calculateKeys(Addr pc); + + public: + // The branch info of the branch that currently gets updated. + // A bit of a hack to communicate the LLBP prediction information + // to the base predictor. + LLBPBranchInfo *curUpdateBi = nullptr; // The branch info of the current update + + protected: + // TODO: move all parameters together, make them const and add details. Do the same in the python file. + const int TTWidth; // Tag table width in bits + const bool optimalPrefetching; // Ignores prefetching into the PB + // const int patternSetSize; // Number of patterns per context + // const int patternSetAssociativity; // Associativity of the pattern set + // const int patternSetSetSize; // Number of sets in the pattern set + + // A map to filter the used history lengths. 
+ std::vector fltTables; + int branchCount = 0; // Number of branches executed + + uint64_t calculateKey(TAGEBase::BranchInfo* tageBi, int tageBank, Addr pc); + + + + int8_t absPredCounter(int8_t counter); + void storageUpdate(ThreadID tid, Addr pc, bool taken, LLBPBranchInfo* bi); + void storageInvalidate(); + int findBestPattern(Context &ctx, TAGEBase::BranchInfo *bi, Addr pc); + int findVictimPattern(int min, Context& ctx); + uint64_t findVictimContext(); + + + + /* From LLBP Source Code */ + + class RCR + { + public: + const int maxwindow = 120; + + uint64_t calcHash(int n, + int start=0, int shift=0); + + // The context tag width + const int tagWidthBits; + + // A list of previouly taken branches + std::list bb; + + // We compute the context ID and prefetch context ID + // only when the content of the RCR changes. + struct + { + uint64_t ccid = 0; + uint64_t pcid = 0; + } ctxs; + + + // The hash constants + const int T, W, D, S; + + RCR(int _T, int _W, int _D, int _shift, int _CTWidth); + + // Push a new branch into the RCR. + bool update(Addr pc, const StaticInstPtr & inst, bool taken); + + // Save the RCR state into a list + void backup(std::list &vec); + + // Restore the RCR state from a list + void restore(std::list &vec); + + /** + * Computes the modulo of a number val with respect to 2^exp + * i.e. 
val % 2^exp + * + * @param val The value to be wrapped + * @param exp The size as exponent of 2 + * @return The wrapped value + */ + inline static uint64_t moduloTwoExp(uint64_t val, int exp) { + return val & ((1 << (uint64_t) exp) - 1); + } + + // Get the current context ID + uint64_t getCCID(); + + // Get the prefetch context ID + uint64_t getPCID(); + } rcr; +}; + +class LLBP_TAGE_64KB : public TAGE_SC_L_TAGE_64KB +{ + LLBP *parent; + + public: + LLBP_TAGE_64KB(const LLBP_TAGE_64KBParams &p) + : TAGE_SC_L_TAGE_64KB(p) + {} + + void handleAllocAndUReset(bool alloc, bool taken, + TAGEBase::BranchInfo* bi, int nrand) override; + + void handleTAGEUpdate(Addr branch_pc, bool taken, + TAGEBase::BranchInfo* bi) override; + int allocateEntry(int bank, TAGEBase::BranchInfo* bi, bool taken) override; + bool isUseful(bool taken, TAGEBase::BranchInfo* bi) const override; + bool isNotUseful(bool taken, TAGEBase::BranchInfo* bi) const override; + + void setParent(LLBP *p) { + parent = p; + } + + std::vector alloc_banks; +}; + +} // namespace branch_prediction +} // namespace gem5 + + #endif // __CPU_PRED_LLBP_HH__ diff --git a/src/cpu/pred/llbp_ref.cc b/src/cpu/pred/llbp_ref.cc new file mode 100644 index 00000000000..3b6ef73d660 --- /dev/null +++ b/src/cpu/pred/llbp_ref.cc @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025 Technical University of Munich + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. 
+ * + * Copyright (c) 2004-2006 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpu/pred/llbp_ref.hh" + +#include "base/intmath.hh" +#include "base/logging.hh" +#include "base/trace.hh" +#include "debug/Fetch.hh" +#include "cpu/pred/llbpref/llbp.h" + +namespace gem5 +{ + +namespace branch_prediction +{ + +LLBPRef::LLBPRef(const LLBPRefParams ¶ms) + : ConditionalPredictor(params), + predictor(nullptr), + stats(this) +{ + if (params.inf) { + predictor = new LLBP::LLBPInfTageSCL64k(); + } else { + predictor = new LLBP::LLBPTageSCL64k(); + } +} + +LLBPRef::~LLBPRef() +{ + static_cast(predictor)->PrintStat(1.0); + delete predictor; +} + +void +LLBPRef::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, + Addr target, const StaticInstPtr &inst, + void * &bp_history) +{ +// Place holder for a function that is called to update predictor history +} + + +Prediction +LLBPRef::lookup(ThreadID tid, Addr branch_addr, void * &bp_history) +{ + auto pc = branch_addr >> instShiftAmt; + auto pred = predictor->GetPrediction(pc); + return staticPrediction(pred); +} + +void +LLBPRef::update(ThreadID tid, Addr branch_addr, bool taken, void *&bp_history, + bool squashed, const StaticInstPtr & inst, Addr target) +{ + if (squashed) { + return; + } + + auto brtype = getBranchType(inst); + OpType opType = OPTYPE_OP; + switch (brtype) { + case BranchType::DirectUncond: + opType = OPTYPE_JMP_DIRECT_UNCOND; + break; + case BranchType::DirectCond: + opType = OPTYPE_JMP_DIRECT_COND; + break; + case BranchType::IndirectUncond: + opType = OPTYPE_JMP_INDIRECT_UNCOND; + break; + case BranchType::IndirectCond: + opType = OPTYPE_JMP_INDIRECT_COND; + break; + case BranchType::CallDirect: + opType = OPTYPE_CALL_DIRECT_UNCOND; + break; + case BranchType::CallIndirect: + opType = OPTYPE_CALL_INDIRECT_UNCOND; + break; + case BranchType::Return: + opType = OPTYPE_RET_UNCOND; + break; + default: + opType = OPTYPE_OP; + break; + } + + if (opType == OPTYPE_OP) { + return; + } + auto pc = branch_addr >> instShiftAmt; + auto _target = target >> instShiftAmt; + + 
if (brtype == BranchType::DirectCond) { + predictor->UpdatePredictor(pc, taken, false, _target); + } else { + predictor->TrackOtherInst(pc, opType, taken, _target); + } + + + + +} + +void +LLBPRef::LLBPStats::preDumpStats() +{ + // This function is called before the stats are dumped. + // We can use it to print some additional information. + // DPRINTF(LLBP, "LLBPRef: Pre-dump stats for predictor\n"); + parent->predictor->PrintStat(1.0); +} + + +} // namespace branch_prediction +} // namespace gem5 diff --git a/src/cpu/pred/llbp_ref.hh b/src/cpu/pred/llbp_ref.hh new file mode 100644 index 00000000000..c030bf08d34 --- /dev/null +++ b/src/cpu/pred/llbp_ref.hh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2011, 2014 ARM Limited + * Copyright (c) 2022-2023 The University of Edinburgh + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Copyright (c) 2004-2006 The Regents of The University of Michigan + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CPU_PRED_LLBP_REF_PRED_HH__ +#define __CPU_PRED_LLBP_REF_PRED_HH__ + +#include + +#include "base/sat_counter.hh" +#include "base/types.hh" +#include "cpu/pred/branch_type.hh" +#include "cpu/pred/conditional.hh" +#include "params/LLBPRef.hh" + +namespace LLBP +{ + class LLBP; +} + +namespace gem5 +{ + +namespace branch_prediction +{ + + +/** + * Implements a local predictor that uses the PC to index into a table of + * counters. 
 Note that any time a pointer to the bp_history is given, it
+ * should be NULL using this predictor because it does not have any branch
+ * predictor state that needs to be recorded or updated; the update can be
+ * determined solely by the branch being taken or not taken.
+ */
+class LLBPRef : public ConditionalPredictor
+{
+  public:
+    /**
+     * Default branch predictor constructor.
+     */
+    LLBPRef(const LLBPRefParams &params);
+    ~LLBPRef();
+
+    // Overriding interface functions
+    Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override;
+
+    // void branchPlaceholder(ThreadID tid, Addr pc, bool uncond,
+    //                        void * &bpHistory) override;
+
+    void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken,
+                         Addr target, const StaticInstPtr &inst,
+                         void * &bp_history) override;
+
+    void update(ThreadID tid, Addr pc, bool taken,
+                void * &bp_history, bool squashed,
+                const StaticInstPtr & inst, Addr target) override;
+
+    void squash(ThreadID tid, void * &bp_history) override
+    { assert(bp_history == NULL); }
+
+  private:
+    // LLBP::LLBPTageSCL64k *predictor;
+    LLBP::LLBP *predictor;  // Pointer to the LLBP predictor
+
+    struct LLBPStats : public statistics::Group
+    {
+        LLBPRef *parent;
+        LLBPStats(LLBPRef *_parent)
+          : statistics::Group(_parent, "LLBPRef"),
+            parent(_parent) {}
+
+        void preDumpStats() override;
+    } stats;
+};
+
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_LLBP_REF_PRED_HH__
diff --git a/src/cpu/pred/llbpref/base_predictor.h b/src/cpu/pred/llbpref/base_predictor.h
new file mode 100644
index 00000000000..3155b88c69b
--- /dev/null
+++ b/src/cpu/pred/llbpref/base_predictor.h
@@ -0,0 +1,45 @@
+
+#pragma once
+// #ifndef __BASE_PREDICTOR__
+// #define __BASE_PREDICTOR__
+
+#include "common.h"
+
+class BasePredictor {
+    static inline UINT32 SatIncrement(UINT32 x, UINT32 max) {
+        if (x < max) return x + 1;
+        return x;
+    }
+
+    static inline UINT32 SatDecrement(UINT32 x) {
+        if (x > 0) return x - 1;
+        return x;
+    }
+ + public: + BasePredictor() {}; + virtual ~BasePredictor() = default; + + virtual bool GetPrediction(uint64_t PC) = 0; + virtual void FirstTimeUpdate(uint64_t PC, bool taken, + uint64_t branchTarget) {}; + virtual void UpdatePredictor(uint64_t PC, bool resolveDir, + bool predDir, uint64_t branchTarget) = 0; + + virtual void TrackOtherInst(uint64_t PC, OpType opType, bool taken, + uint64_t branchTarget) = 0; + + virtual void PrintStat(double NUMINST) {}; + virtual void DumpTables(std::string filename) {}; + virtual void LoadTables(std::string filename) {}; + virtual void StartTracer(std::string filename) {}; + virtual void tick() {}; + virtual void resetStats() {}; + virtual void btbMiss() {}; + virtual void setState(bool warmup=false) {}; +}; + + +BasePredictor* CreateBP(std::string bp_name); + +// #endif //__BASE_PREDICTOR__ diff --git a/src/cpu/pred/llbpref/cache.h b/src/cpu/pred/llbpref/cache.h new file mode 100644 index 00000000000..39595c9edfb --- /dev/null +++ b/src/cpu/pred/llbpref/cache.h @@ -0,0 +1,179 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#pragma once + +template +class BaseCache { + protected: + typedef typename std::pair key_value_pair_t; + typedef typename std::list::iterator list_iterator_t; + typedef typename std::list set_t; + + std::unordered_map _index; + std::vector> _cache; + const size_t _max_size; + const size_t _assoc; + const uint64_t _sets; + const uint64_t _set_mask; + + public: + BaseCache(size_t max_size, size_t assoc) + : _max_size(max_size), + _assoc(assoc), + _sets(max_size / assoc), + _set_mask(_sets - 1) { + // Check if number of sets is a power of 2 + assert((_sets & (_sets - 1)) == 0); + assert(_assoc * _sets == _max_size); + _cache.resize(_sets); + // for (auto& set : _cache) { + // set.resize(assoc); + // } + } + + void printCfg() { + printf("Max size: %lu, Assoc: %lu, Sets: %lu\n", _max_size, _assoc, + _sets); + } + + size_t size() const { return _index.size(); } + + key_t index(const key_t& key) { return key & _set_mask; } + + set_t& getSet(const key_t& key) { + return _cache[index(key)]; + } + + const std::unordered_map getMap() { return _index; } + + value_t* get(const key_t& key) { + auto it = _index.find(key); + if (it == _index.end()) { + return nullptr; + } + return &it->second->second; + } + + void erase(const key_t& key) { + auto it = _index.find(key); + if (it == _index.end()) { + return; + } + auto& set = getSet(key); + set.erase(it->second); + _index.erase(key); + } + + value_t* getVictim(const key_t& key) { + auto& set = getSet(key); + if (set.size() < _assoc) { + return nullptr; + } + return &set.back().second; + } + + void touch(const key_t& key) { + auto it = _index.find(key); + if (it == _index.end()) { + return; + } + auto& set = getSet(key); + set.splice(set.begin(), set, 
it->second); + } + + void bump(const key_t& key, bool front=true) { + auto it = _index.find(key); + if (it == _index.end()) { + return; + } + auto& set = getSet(key); + if (front) { + set.splice(set.begin(), set, it->second); + } else { + set.splice(set.end(), set, it->second); + } + } + + bool exists(const key_t& key) const { + return _index.find(key) != _index.end(); + } + + int distance(const key_t& key) { + auto it = _index.find(key); + if (it == _index.end()) { + return -1; + } + auto& set = getSet(key); + return std::distance(set.begin(), it->second); + } + + set_t& getResizedSet(const key_t& key) { + auto& set = getSet(key); + + // If this element will exceed the max size, remove the last element + if (set.size() >= _assoc) { + auto last = set.end(); + last--; + _index.erase(last->first); + set.pop_back(); + } + return set; + } + + value_t* insertAt(const key_t& key, int at = 0) { + auto v = get(key); + if (v != nullptr) { + return v; + } + + // Get the set with a free item + auto& set = getResizedSet(key); + + // Move to the insert position + auto it2 = set.begin(); + at = std::min(at, (int)set.size()); + std::advance(it2, at); + + it2 = set.emplace(it2, key_value_pair_t(key, value_t())); + _index[key] = it2; + return &(it2->second); + } + + value_t* insert(const key_t& key) { + auto v = get(key); + if (v != nullptr) { + return v; + } + + // Get the set with a free item + auto& set = getResizedSet(key); + + // Move to the insert position + auto it = set.begin(); + + it = set.emplace(it, key_value_pair_t(key, value_t())); + _index[key] = it; + return &(it->second); + } +}; diff --git a/src/cpu/pred/llbpref/common.h b/src/cpu/pred/llbpref/common.h new file mode 100755 index 00000000000..9b83ba9770c --- /dev/null +++ b/src/cpu/pred/llbpref/common.h @@ -0,0 +1,96 @@ +/////////////////////////////////////////////////////////////////////// +// Copyright 2015 Samsung Austin Semiconductor, LLC. 
// +/////////////////////////////////////////////////////////////////////// + + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include + +using namespace std; + +#define UINT32 unsigned int +#define INT32 int +#define UINT64 unsigned long long +// #define COUNTER unsigned long long + + +// #define NOT_TAKEN 0 +// #define TAKEN 1 + +#define FAILURE 0 +#define SUCCESS 1 + +//JD2_2_2016 +//typedef enum { +// OPTYPE_OP =2, +// OPTYPE_BRANCH_COND =3, +// OPTYPE_RET =4, +// OPTYPE_BRANCH =6, +// OPTYPE_INDIRECT =7, +// OPTYPE_MAX =8 +//}OpType; + +//JD2_17_2016 break down types into COND/UNCOND +typedef enum { + OPTYPE_OP =2, + + OPTYPE_RET_UNCOND, + OPTYPE_JMP_DIRECT_UNCOND, + OPTYPE_JMP_INDIRECT_UNCOND, + OPTYPE_CALL_DIRECT_UNCOND, + OPTYPE_CALL_INDIRECT_UNCOND, + + OPTYPE_RET_COND, + OPTYPE_JMP_DIRECT_COND, + OPTYPE_JMP_INDIRECT_COND, + OPTYPE_CALL_DIRECT_COND, + OPTYPE_CALL_INDIRECT_COND, + + OPTYPE_ERROR, + + OPTYPE_MAX +}OpType; + + + +typedef enum { + NoBranch, + Return, + CallDirect, + CallIndirect, + DirectCond, + DirectUncond, + IndirectCond, + IndirectUncond, + MAX +} BrType; + + +/** Some helper functions */ +OpType convertBrType(BrType type); + +#define PRINTDEBUG 0 + +#define DPRINTFF(...) \ + if (PRINTDEBUG) [[unlikely]] { \ + printf(__VA_ARGS__); \ + } + +#define DPRINTIF(cond, ...) \ + if (PRINTDEBUG && (cond)) [[unlikely]] { \ + printf(__VA_ARGS__); \ + } + +#define PRINTIF(cond, ...) 
\ + if (cond) [[unlikely]] { \ + printf(__VA_ARGS__); \ + } + +#endif + diff --git a/src/cpu/pred/llbpref/counters.h b/src/cpu/pred/llbpref/counters.h new file mode 100644 index 00000000000..55c42f50666 --- /dev/null +++ b/src/cpu/pred/llbpref/counters.h @@ -0,0 +1,104 @@ +#pragma once + +#include + + +// up-down saturating counter +inline void ctrupdate(int8_t& ctr, bool taken, int nbits) { + if (taken) { + if (ctr < ((1 << (nbits - 1)) - 1)) ctr++; + } else { + if (ctr > -(1 << (nbits - 1))) ctr--; + } +} + +// up-down saturating counter +template +inline void ctrupdate(T& ctr, bool up, int nbits) { + if (up) { + if (ctr < ((1 << (T)nbits) - 1)) ctr++; + } else { + if (ctr > 0) ctr--; + } +} + +enum {LowConf = 0, MedConf = 1, HighConf = 2}; +inline unsigned compConf(int8_t ctr, const int cwidth) { + if (cwidth < 2) + return HighConf; + // Two bit counters saturate at +1 and -2 + if (cwidth < 3) + return ((ctr == -2) || (ctr == 1)) ? HighConf : LowConf; + + if (abs (2 * ctr + 1) >= (1 << cwidth) - 1) + return HighConf; + if (abs (2 * ctr + 1) >= (1 << (cwidth - 1)) - 1) + return MedConf; + return LowConf; +} + +inline int8_t saturate(int8_t ctr, int nbits) { + if (ctr > ((1 << (nbits - 1)) - 1)) return ((1 << (nbits - 1)) - 1); + if (ctr < -(1 << (nbits - 1))) return -(1 << (nbits - 1)); + return ctr; +} + +inline int center(int8_t ctr) { + return 2 * ctr + 1; +} + +#define CUMAX(x) ((1 << (x)) - 1) + +//counter +//MAX : max value +//MIN : min value +template +class COUNTER { +private: + T ctr; +public: + T read() { + return ctr; + } + + bool pred() { + return ctr >= 0; + } + bool satmax(){ + return ctr == MAX; + } + bool satmin(){ + return ctr == MIN; + } + void write(T v) { + assert(v <= MAX); + assert(v >= MIN); + ctr = v; + } + void add(T d) { + ctr = ctr + d; + if (ctr > MAX){ + ctr = MAX; + }else if (ctr < MIN){ + ctr = MIN; + } + } + void update(bool incr) { + if (incr) { + if (ctr < MAX) + ctr = ctr + 1; + } else { + if (ctr > MIN) + ctr = ctr - 1; + } + } 
+}; +//signed integer counter +template +class SCOUNTER : public COUNTER{ +}; +//unsigned integer counter +template +class UCOUNTER : public COUNTER{ +}; + diff --git a/src/cpu/pred/llbpref/hist_registers.h b/src/cpu/pred/llbpref/hist_registers.h new file mode 100644 index 00000000000..1c4e19b94a3 --- /dev/null +++ b/src/cpu/pred/llbpref/hist_registers.h @@ -0,0 +1,508 @@ +#pragma once + +#include +#include +#include +#include + + + +//========================================================== +// History Code: +// The code is based on the "Dynamically Sizing the TAGE Branch Predictor" +// paper by Stephen Pruett submitted to the CBP-5 workshop. + + + +// [PPM, page 4] Discusses how to build a low latency FoldedHistory register +struct HistoryRegister { + public: + uint32_t size; + uint32_t head; + std::vector history; + long long history_l; + + void init(uint32_t s) { + size = s; + history.resize(size); + + for (uint32_t i = 0; i < size; ++i) { + history[i] = false; + } + history_l = 0; + + head = 0; + } + + HistoryRegister() {} + + HistoryRegister(uint32_t s) { init(s); } + + void push(bool p) { + head = (head + 1) % size; + history[head] = p; + + history_l <<= 1; + history_l += (p & 0x1); + } + + bool operator[](const uint32_t i) { + uint32_t index = (head + size - i) % size; + assert(index < size); + return history[index]; + } + + void print() { + printf("History"); + for (uint32_t i = 0; i < size; ++i) { + printf("%d, ", (bool)history[(head - i) % size]); + } + printf("\n"); + } + + long long getHistory() { return history_l; } + + uint32_t getSize() { return size; } +}; + +struct PCHistoryRegister { + public: + uint32_t size; + uint32_t head; + typedef std::pair entry_t; + std::vector history; + long long history_l; + + void init(uint32_t s) { + size = s; + history.resize(size); + + for (uint32_t i = 0; i < size; ++i) { + history[i] = entry_t(0,false); + } + history_l = 0; + + head = 0; + } + + PCHistoryRegister() {} + + PCHistoryRegister(uint32_t s) { 
init(s); } + + void push(uint64_t pc, bool t) { + head = (head + 1) % size; + history[head] = entry_t(pc,t); + } + + entry_t operator[](const uint32_t i) { + uint32_t index = (head + size - i) % size; + assert(index < size); + return history[index]; + } + + void print() { + printf("History"); + for (uint32_t i = 0; i < size; ++i) { + auto e = history[(head - i) % size]; + printf("%lu:%d ",e.first, (bool)e.second); + } + printf("\n"); + } + + std::string toStr() { + std::string s = ""; + for (uint32_t i = 0; i < size; ++i) { + auto e = history[(head - i) % size]; + // s += std::to_string(e.first) + "," + std::to_string(e.second) + ","; + s += std::to_string(e.first) + ","; + } + return s; + } + + + uint32_t getSize() { return size; } +}; + + +class FoldedHistory { + private: + uint32_t inputWidth; // size of emulated history register + uint32_t outputWidth; // size of folded register + uint32_t + maxOutputWidth; // first width register is set to. Used to calc size. + int32_t remainder; + int32_t value; + HistoryRegister* ghr; + + FoldedHistory() {} + + public: + FoldedHistory(HistoryRegister* g, uint32_t iw, uint32_t ow) { + inputWidth = iw; + outputWidth = ow; + maxOutputWidth = outputWidth; + ghr = g; + + // using a 32-bit integer as register + // -need an extra bit, so max is 31 bits... + assert(outputWidth < 32); + assert(outputWidth != 0); + remainder = inputWidth % outputWidth; + value = 0; + } + + // Expectation is that FoldedHistory push is called + // after HistoryRegister push + void update() { + // input bit most recent shifted into ghr + bool inBit = (*ghr)[0]; + + // Shift in bit + value = (value << 1) | (inBit ? 0x01 : 0x00); + + // Fold shifted-out bit in + value = value ^ (value >> outputWidth); + value = value & ((1 << outputWidth) - 1); + + // Get bit to shift out from Global History + bool outputBit = (*ghr)[inputWidth]; + int32_t outputValue = (outputBit) ? 
(0x01 << (remainder)) : 0x0; + + // Shift out bit + value = value ^ outputValue; + } + + inline int32_t get() { return value; } + + void reset() { value = 0; } + + uint32_t getSize() { return maxOutputWidth; } +}; + + + + +// [PPM, page 4] Discusses how to build a low latency FoldedHistory register +struct HistoryRegisterFast { + public: + const uint32_t size; + uint32_t head; + + uint8_t* history; + long long history_l; + + HistoryRegisterFast(uint32_t s) + : size(s) + { + history = new uint8_t[size](); + + for (uint32_t i = 0; i < size; ++i) { + history[i] = false; + } + history_l = 0; + + head = 0; + } + + void push(bool p) { + head--; + history[head & (size - 1)] = p; + history_l = (history_l << 1) + (p & 0x1); + } + + inline bool operator[](const uint32_t i) { + return history[(head + i) & (size - 1)]; + } + + void print() { + printf("History"); + for (uint32_t i = 0; i < size; ++i) { + printf("%d, ", (bool)history[(head - i) % size]); + } + printf("\n"); + } + + long long getHistory() { return history_l; } + + uint32_t getSize() { return size; } +}; + + +class FoldedHistoryFast { + private: + const uint32_t inputWidth; // size of emulated history register + const uint32_t outputWidth; // size of folded register + const uint32_t + maxOutputWidth; // first width register is set to. Used to calc size. + const int32_t remainder; + HistoryRegisterFast& ghr; // Reference to global history register + + + public: + int32_t value; + + FoldedHistoryFast(HistoryRegisterFast& g, uint32_t iw, uint32_t ow) + : inputWidth(iw), + outputWidth(ow), + maxOutputWidth(ow), + remainder(iw % ow), + ghr(g) + { + // using a 32-bit integer as register + // -need an extra bit, so max is 31 bits... 
+ assert(outputWidth < 32); + assert(outputWidth != 0); + value = 0; + } + + // Expectation is that FoldedHistory push is called + // after HistoryRegister push + void update() { + + + + // // Shift in new bit + // value = (value << 1) | ghr[0]; + + // // Get bit to shift out from Global History + // value = value ^ ghr[inputWidth] << outputWidth; + + // // Fold and mask + // value = value ^ (value >> outputWidth); + // value = value & ((1 << outputWidth) - 1); + + + + + // input bit most recent shifted into ghr + bool inBit = ghr[0]; + + // Shift in bit + value = (value << 1) | (inBit ? 0x01 : 0x00); + + + + + + + + // Fold shifted-out bit in + value = value ^ (value >> outputWidth); + value = value & ((1 << outputWidth) - 1); + + // Get bit to shift out from Global History + bool outputBit = ghr[inputWidth]; + int32_t outputValue = (outputBit) ? (0x01 << (remainder)) : 0x0; + + // Shift out bit + value = value ^ outputValue; + } + + inline int32_t get() { return value; } + + void reset() { value = 0; } + + uint32_t getSize() { return maxOutputWidth; } +}; + + + + + + + + +// utility class for index computation +// this is the cyclic shift register for folding +// a long global history into a smaller number of bits; see P. 
Michaud's +// PPM-like predictor at CBP-1 +class folded_history { + public: + unsigned comp; + int CLENGTH; + int OLENGTH; + int OUTPOINT; + int histbufferlen; + + folded_history() {} + + void init(int original_length, int compressed_length, int _histbufferlen) { + comp = 0; + OLENGTH = original_length; + CLENGTH = compressed_length; + OUTPOINT = OLENGTH % CLENGTH; + histbufferlen = _histbufferlen; + } + + void update(uint8_t *h, int PT) { + comp = (comp << 1) ^ h[PT & (histbufferlen - 1)]; + comp ^= h[(PT + OLENGTH) & (histbufferlen - 1)] << OUTPOINT; + comp ^= (comp >> CLENGTH); + comp = (comp) & ((1 << CLENGTH) - 1); + } +}; + +class Bentry // TAGE bimodal table entry +{ + public: + int8_t ctr; + int8_t hyst; + int8_t pred; + uint64_t pc; + uint64_t id; + + Bentry() { + ctr = -1; + pred = 0; + hyst = 1; + pc = 0; + id = 0; + } +}; + +class Gentry // TAGE global table entry +{ + public: + int8_t ctr; + uint tag; + int8_t u; + int correct; + int incorrect; + int useful; + uint64_t pc; + int hlen; + int idx; + uint64_t key; + uint64_t ctx; + + Gentry() { + ctr = 0; + u = 0; + tag = 0; + correct = -1; + incorrect = 0; + useful = 0; + pc = 0; + key = 0; + ctx = 0; + } +}; + + + + +// FTL++ + + +//history register +class HISTORY { +private: + int MAXHIST; // constant value + + bool *bhr; // branch history + +public: + void init(int HLENGTH) { + MAXHIST = HLENGTH+1; // set constant value + bhr = new bool[MAXHIST]; + for (int i = 0; i < MAXHIST; i++){ + bhr[i] = false; + } + } + + bool read(int n) { + assert(n < MAXHIST); + return bhr[n]; + } + + uint32_t read(int from, int n) { + assert(n < MAXHIST); + int r = 0; + for (int i = from; i < n; ++i) { + r ^= bhr[i] << ((i - from) % 32); + } + return r; + } + //push + void push(bool t) { + for(int i=MAXHIST-1; i>=0; i--) { + bhr[i] = bhr[i-1]; + } + bhr[0] = t; + } +}; + +//folded history +template +class FOLDHIST { +private: + //path history length + static const int PathHistoryLength = 32; + +private: + uint32_t start[2]; 
// constant value + uint32_t end[2]; // constant value + uint32_t pos[3]; // constant value + + uint32_t comp; // folded history + +public: + + void init(int s, int e,int s2,int e2) { + comp = 0; + start[0] = s; + start[1] = s2; + end[0] = e; + end[1] = e2; + pos[0] = 0; + pos[1] = (pos[0] + end[0] - start[0]) % WIDTH; + pos[2] = (pos[1] + end[1] - start[1]) % WIDTH; + assert(pos[0] < WIDTH); + assert(pos[1] < WIDTH); + assert(pos[2] < WIDTH); + } + void init(int s, int e) { + comp = 0; + start[0] = s; + start[1] = s; + end[0] = e; + end[1] = e; + pos[0] = 0; + pos[1] = (pos[0] + end[0] - start[0]) % WIDTH; + pos[2] = (pos[1] + end[1] - start[1]) % WIDTH; + assert(pos[0] < WIDTH); + assert(pos[1] < WIDTH); + assert(pos[2] < WIDTH); + } + + void init(int l) { + init(0, l,0,(l>=PathHistoryLength)?PathHistoryLength:l); + } + + uint32_t read(uint32_t pc) { + assert(comp >= 0); + assert(comp < (1 << WIDTH)); + + pc &= (1 << WIDTH) - 1; + return pc ^ comp; + } + uint32_t read(uint32_t pc, int rot) { + assert(comp >= 0); + assert(comp < (1 << WIDTH)); + uint32_t r = rot % WIDTH; + uint32_t p = pc & ((1 << WIDTH) - 1); + p = (p << r); + p = p ^ (p >> WIDTH); + p = (p & ((1 << WIDTH) - 1)); + return p ^ comp; + } + void update(class HISTORY &ghr, class HISTORY &phr) { + comp = (comp << 1); + comp |= (comp >> WIDTH); + comp &= ((1 << WIDTH) - 1); + comp ^= (ghr.read(start[0]) ? 1 : 0) << pos[0]; + comp ^= (ghr.read(end[0]) ? 1 : 0) << pos[1]; + comp ^= (phr.read(start[1]) ? 1 : 0) << pos[1]; + comp ^= (phr.read(end[1]) ? 
1 : 0) << pos[2]; + } +}; diff --git a/src/cpu/pred/llbpref/histogram.h b/src/cpu/pred/llbpref/histogram.h new file mode 100644 index 00000000000..f00d8125d50 --- /dev/null +++ b/src/cpu/pred/llbpref/histogram.h @@ -0,0 +1,183 @@ + + +// S +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error.h" + + +template +class Histogram { + + // Define upper and lower bounds + const T lower; + const T upper; + const int bins; + + + + // Bucket struct + struct Bucket { + std::string mark; + T count; + double frequency; + }; + + // Containers to hold the buckets and counts + // The number of buckets is bins + 2 because there + // is one extra bucket on each side to hold values + // that are outside the range + std::vector buckets; + + // Bucket size + const T bs; + + // total number of elements inserted + int samples; + T max; + T min; + T sum; + + + public: + + Histogram(T lower, T upper, int bins) + : lower(lower), upper(upper), bins(bins), + bs((upper - lower) / static_cast(bins)), + samples(0), max(0), min(0), sum(0) + { + + // panic_if(bs <= 0, "Bucket size must be positive"); + + buckets.resize(bins + 2); + { + std::ostringstream oss; + oss << "< " << std::fixed << std::setprecision(1) << lower; + buckets[0] = Bucket{oss.str(), 0, 0}; + } + { + std::ostringstream oss; + oss << "> " << std::fixed << std::setprecision(1) << upper; + buckets[bins + 1] = Bucket{oss.str(), 0, 0}; + } + + for (int i = 1; i <= bins; ++i) { + std::ostringstream oss; + if (bs == 1) { + oss << (lower + bs * static_cast(i-1)); + buckets[i] = Bucket{oss.str(), 0, 0}; + } else { + oss << std::fixed << std::setprecision(1) + << (lower + bs * static_cast(i-1)) << "-" + << (lower + bs * static_cast(i)); + buckets[i] = Bucket{oss.str(), 0, 0}; + } + } + } + + void insert(T value, int count=1) { + samples += count; + sum += value * count; + if (value > max) { + max = value; + } + if (value < min) { + min = value; + } + + if (value < lower) { + 
buckets[0].count += count; + return; + } else if (value > upper) { + buckets[bins + 1].count += count; + return; + } + + int idx = static_cast((value - lower) / bs) + 1; + buckets[idx].count += count; + } + + void insert(const std::vector& values) { + for (auto v : values) { + insert(v); + } + } + + void reset() { + samples = 0; + max = 0; + min = 0; + sum = 0; + for (auto& b : buckets) { + b.count = 0; + } + } + + + std::string print(bool perc=false, bool cdf=false, int width = 40) { + T _max = 0; + for (const auto& b : buckets) { + if (b.count > _max) { + _max = b.count; + } + } + + std::ostringstream res; + + res << "N:" << samples + << " Min:" << min + << " Max:" << max + << " Sum:" << sum + << " Avg:" << std::fixed << std::setprecision(2) << getAvg() + << "\n"; + + + int cum = 0; + for (const auto& b : buckets) { + int barLen = _max > 0 ? static_cast((b.count * width + _max / 2) / _max) : 0; + + res << std::left << std::setw(10) << b.mark + << " [" << std::setw(4) << b.count << "]\t"; + + if (perc) { + res << std::fixed << std::setprecision(1) << (100.0 * b.count / samples) << "%\t"; + } + + if (cdf) { + cum += b.count; + res << std::fixed << std::setprecision(1) << (100.0 * cum / samples) << "%\t"; + } + + res << "|" << std::string(barLen, '*') << "\n"; + + } + return res.str(); + } + + std::string printCDF() { + return print(false, true); + } + + double getAvg() { + return static_cast(sum) / static_cast(samples); + } + + T getMax() { + return max; + } + + T getMin() { + return min; + } + +}; + diff --git a/src/cpu/pred/llbpref/intmath.hh b/src/cpu/pred/llbpref/intmath.hh new file mode 100644 index 00000000000..73ce2d7b13c --- /dev/null +++ b/src/cpu/pred/llbpref/intmath.hh @@ -0,0 +1,304 @@ +/* + * COPY FROM GEM5 + * + * Copyright (c) 2021 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited 
to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Copyright (c) 2001, 2003-2005 The Regents of The University of Michigan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __BASE_INTMATH_HH__ +#define __BASE_INTMATH_HH__ + +#include +#include +#include +#include + +// #include "base/bitfield.hh" + + + +/** + * @ingroup api_base_utils + */ +template +static constexpr std::enable_if_t, int> +floorLog2(T x) +{ + assert(x > 0); + + // A guaranteed unsigned version of x. + uint64_t ux = (typename std::make_unsigned::type)x; + + int y = 0; + constexpr auto ts = sizeof(T); + + if (ts >= 8 && (ux & 0xffffffff00000000ULL)) { y += 32; ux >>= 32; } + if (ts >= 4 && (ux & 0x00000000ffff0000ULL)) { y += 16; ux >>= 16; } + if (ts >= 2 && (ux & 0x000000000000ff00ULL)) { y += 8; ux >>= 8; } + if (ux & 0x00000000000000f0ULL) { y += 4; ux >>= 4; } + if (ux & 0x000000000000000cULL) { y += 2; ux >>= 2; } + if (ux & 0x0000000000000002ULL) { y += 1; } + + return y; +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr int +ceilLog2(const T& n) +{ + assert(n > 0); + if (n == 1) + return 0; + + return floorLog2(n - (T)1) + 1; +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr bool +isPowerOf2(const T& n) +{ + // If n is non-zero, and subtracting one borrows all the way to the MSB + // and flips all bits, then this is a power of 2. 
+ return n && !(n & (n - 1)); +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr T +divCeil(const T& a, const U& b) +{ + return (a + b - 1) / b; +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr std::enable_if_t +mulUnsigned(std::make_unsigned_t &high, std::make_unsigned_t &low, + std::make_unsigned_t val_a, std::make_unsigned_t val_b) +{ + uint64_t product = (uint64_t)val_a * (uint64_t)val_b; + low = product; + high = (product >> (sizeof(low) * 8)); +}; + +/** + * @ingroup api_base_utils + */ +template +static constexpr std::enable_if_t +mulSigned(std::make_signed_t &high, std::make_signed_t &low, + std::make_signed_t val_a, std::make_signed_t val_b) +{ + uint64_t product = (int64_t)val_a * (int64_t)val_b; + low = product; + high = (product >> (sizeof(low) * 8)); +}; + +/** + * Multiply two values with place value p. + * + * (A * p + a) * (B * p + b) = + * (A * B) * p^2 + (a * B + A * b) * p + (a * b) + * + * low result = (a * B + A * b) * p + (a * b) + * high result = (A * B) + carry out from low result. + * + * As long as p is at most half the capacity of the underlying type, no + * individual multiplication will overflow. We just have to carefully manage + * carries to avoid losing any during the addition steps. + */ +template +static constexpr std::enable_if_t +mulUnsignedManual(std::make_unsigned_t &high, std::make_unsigned_t &low, + std::make_unsigned_t val_a, std::make_unsigned_t val_b) +{ + low = val_a * val_b; + + uint64_t A = (uint32_t)(val_a >> 32); + uint64_t a = (uint32_t)val_a; + uint64_t B = (uint32_t)(val_b >> 32); + uint64_t b = (uint32_t)val_b; + + uint64_t c1 = 0, c2 = 0; // Carry between place values. + uint64_t ab = a * b, Ab = A * b, aB = a * B, AB = A * B; + + c1 = (uint32_t)(ab >> 32); + + // Be careful to avoid overflow. 
+ c2 = (c1 >> 1) + (Ab >> 1) + (aB >> 1); + c2 += ((c1 & 0x1) + (Ab & 0x1) + (aB & 0x1)) >> 1; + c2 >>= 31; + + high = AB + c2; +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr std::enable_if_t +mulUnsigned(std::make_unsigned_t &high, std::make_unsigned_t &low, + std::make_unsigned_t val_a, std::make_unsigned_t val_b) +{ +#ifdef __SIZEOF_INT128__ + __uint128_t val = (__uint128_t)val_a * (__uint128_t)val_b; + low = val; + high = (val >> 64); +#else + mulUnsignedManual(high, low, val_a, val_b); +#endif +} + +template +static constexpr std::enable_if_t +mulSignedManual(std::make_signed_t &high, std::make_signed_t &low, + std::make_signed_t val_a, std::make_signed_t val_b) +{ + uint64_t u_high = 0, u_low = 0; + mulUnsigned(u_high, u_low, val_a, val_b); + + if (val_a < 0) + u_high -= val_b; + if (val_b < 0) + u_high -= val_a; + + high = u_high; + low = u_low; +} + +/** + * @ingroup api_base_utils + */ +template +static constexpr std::enable_if_t +mulSigned(std::make_signed_t &high, std::make_signed_t &low, + std::make_signed_t val_a, std::make_signed_t val_b) +{ +#ifdef __SIZEOF_INT128__ + __int128_t val = (__int128_t)val_a * (__int128_t)val_b; + low = val; + high = (val >> 64); +#else + mulSignedManual(high, low, val_a, val_b); +#endif +} + +template +static constexpr std::pair, std::make_unsigned_t> +mulUnsigned(std::make_unsigned_t val_a, std::make_unsigned_t val_b) +{ + std::make_unsigned_t hi{}, low{}; + mulUnsigned(hi, low, val_a, val_b); + return {hi, low}; +}; + +template +static constexpr std::pair, std::make_signed_t> +mulSigned(std::make_signed_t val_a, std::make_signed_t val_b) +{ + std::make_signed_t hi{}, low{}; + mulSigned(hi, low, val_a, val_b); + return {hi, low}; +}; + +/** + * This function is used to align addresses in memory. + * + * @param val is the address to be aligned. + * @param align is the alignment. Can only be a power of 2. + * @return The aligned address. 
The smallest number divisible + * by @param align which is greater than or equal to @param val. + * + * @ingroup api_base_utils + */ +template +static constexpr T +roundUp(const T& val, const U& align) +{ + assert(isPowerOf2(align)); + T mask = (T)align - 1; + return (val + mask) & ~mask; +} + +/** + * This function is used to align addresses in memory. + * + * @param val is the address to be aligned. + * @param align is the alignment. Can only be a power of 2. + * @return The aligned address. The biggest number divisible + * by @param align which is less than or equal to @param val. + * + * @ingroup api_base_utils + */ +template +static constexpr T +roundDown(const T& val, const U& align) +{ + assert(isPowerOf2(align)); + T mask = (T)align - 1; + return val & ~mask; +} + +// /** +// * Calculate the log2 of a power of 2 integer +// * +// * @param An input value +// * @return The base 2 log of value +// * +// * @ingroup api_base_utils +// */ +// static constexpr int +// log2i(int value) +// { +// assert(isPowerOf2(value) && value > 0); +// return ctz32(value); +// } + +// } // namespace gem5 + +#endif // __BASE_INTMATH_HH__ diff --git a/src/cpu/pred/llbpref/llbp.cc b/src/cpu/pred/llbpref/llbp.cc new file mode 100755 index 00000000000..23bfaa1c268 --- /dev/null +++ b/src/cpu/pred/llbpref/llbp.cc @@ -0,0 +1,1248 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "llbp.h" + +#include +#include "intmath.hh" + + +namespace LLBP { + + +#define PRINTDEBUG 0 +// #define COND (stats.total > 3000000) +#define COND (false) +#define COND2 (false) + + +#ifdef LLBP_CONSTRAINED +// Only sort if the constrained version is used +#define L2_SORT_CTX +#define L2_SORT_PTRN +#define FILTER_TABLES + +#endif + +#define OVERWRITE_SCL + +/* The hash function is defined by 4 paramenters + * + * T: Type of history (T). Which branches should be hased + * 0: All branches, 1: Only calls, 2: Calls and returns + * 3: All unconditional branches, 4: All taken branches + * + * W: Number of branches that should be hashed (W in the paper). + * D: Number of most recent branches skipped for CCID. Adds delay which + * is used to prefetch. (D in the paper.) + * S: Number of bits to shift the PC's. Is useful to avoid ping-pong context + * due to the XOR function in case a loop is executed + * + * ********************************************************************* * + * EXAMPLE * + * * + * pb-index (2.) (3.) * + * v v v * + * history buffer : |l|k|j|i|h|g|f|e|d|c|b|a| * + * ^prefetch (2.)^ * + * * + * a is the newest branch PC added to the buffer, l the oldest. * + * (2.) = W = 7; (3.) 
= D = 3 * + * branches used to obtain PB index hash: j to d * + * branches used to obtain hash to prefetch into PB: g to a * + * ********************************************************************* * + */ + + + +LLBP::LLBP(LLBPConfig cfg) + : TageSCL(cfg.tsclConfig), + llbpStorage(cfg.numContexts, cfg.numPatterns, + cfg.ctxAssoc, cfg.ptrnAssoc), + rcr(cfg.T, cfg.W, cfg.D, cfg.S, cfg.CTWidth), + patternBuffer(cfg.pbSize, cfg.pbAssoc), + + numContexts(cfg.numContexts), + numPatterns(cfg.numPatterns), + TTWidth(cfg.TTWidth), + CtrWidth(cfg.CtrWidth), + ReplCtrWidth(cfg.ReplCtrWidth), + CtxReplCtrWidth(cfg.CtxReplCtrWidth), + + simulateTiming(cfg.simulateTiming), + constrained(cfg.constrained), + warmup(false), + accessDelay(cfg.accessDelay), + primMispLength(0,36,36), + llbpMispLength(0,36,36), + primProvLength(0,36,36), + llbpProvLength(0,36,36), + numHistPerContext(1,17,16), + numUsefulHistPerContext(1,17,16) +{ + printf("LLBP branch predictor configs -------\n"); + cfg.print(); + + printf("CD: "); + llbpStorage.printCfg(); + printf("PS: "); + llbpStorage.allocate(0,0)->patterns.printCfg(); + llbpStorage.erase(0); + printf("PB: "); + patternBuffer.printCfg(); + + assert((!simulateTiming || (cfg.pbSize >= rcr.D)) + || "Pattern buffer hold at least all prefetches."); + + int mllbp[MAXNHIST]; + for (int i = 1; i <= nhist; i++) { + mllbp[i] = (i%2) ? m[i] : m[i]+2; + + fghrT1[i] = new FoldedHistoryFast(ghr, mllbp[i], TTWidth); + fghrT2[i] = new FoldedHistoryFast(ghr, mllbp[i], TTWidth - 1); + } + + +#ifdef FILTER_TABLES + // LLBP does not provide for all different history lenghts in + // TAGE a prediction only for the following once which where + // empirically determined. Note this + // are not the actual length but the table indices in TAGE. + auto l = {6,10,13,14,15,16,17,18, 19,20,22,24,26,28,32,36}; + + int n = 0; + for (auto i : l) { + // To reduce the complexity of the multiplexer LLBP groups + // always four consecutive history lenght in one bucket. 
+ // As the pattern sets are implemented a set associative + // structure the lower bits determine the set=bucket. + // The `fltTable`-map not only filters the history lengths + // but also maps each length the correct pattern set index. + // E.e. for the four way associativity the following function + // ensures that history length 6,10,13,14 gets assign + // 0,4,8,12 with the lowest two bits 0b00. Thus, the set will + // be the same. + auto bucket = n / cfg.ptrnAssoc; + fltTables[i] = ((n%cfg.ptrnAssoc) << ceilLog2(cfg.ptrnAssoc) ) | bucket; + printf("%i=>%i:%i:%i:%i ", i, n, bucket, fltTables[i], mllbp[i]); + n++; + } + + + printf("\n"); + +#endif //FILTER_TABLES +} + +LLBP::~LLBP() { + + for (int i = 1; i <= nhist; i++) { + delete[] fghrT1[i]; + delete[] fghrT2[i]; + } +} + + + + +///////////////////////////////////////////////////////////////////////////////// +// Main TAGE and chooser overwrites +///////////////////////////////////////////////////////////////////////////////// + + +bool LLBP::predict(uint64_t pc) { + + // 1. The base prediction + TageSCL::basePredict(pc); + tage_provider = BASE; + bim_pred = base_pred; + bimConf = baseConf; + + // 2. Make the LLBP prediction + llbpPredict(pc); + + // If there was a hit in level 2 mark it. + // Also override the base prediction. The TAGE arbiter + // will decide which prediction to use. + // Note the TAGE chooser will use altConf in its index + baseConf = llbp.hit ? llbp.conf : bimConf; + base_pred = llbp.hit ? llbp.pred : bim_pred; + llbp.isProvider = llbp.hit; + + // 3. The TAGE prediction + // Tage will call the `chooseProvider` function + // Which arbitrates between the TAGE and L2 predictions. + tagePredict(pc); + + DPRINTFF("Prov: [TAGE:%i, L2:%i]\n", tage_pred, llbp.isProvider); + + tage_scl_pred = tage_pred; + scl_provider = tage_provider; + + // 3. SCL prediction + SCLPredict(pc); + + // 4. 
Choose the correct prediction + provider = scl_provider; + +#ifdef OVERWRITE_SCL + if (llbp.isProvider) { + provider = BASE; + return llbp.pred; + } +#endif + + return tage_scl_pred; +} + + +unsigned LLBP::chooseProvider() { + + bool chooseL2 = llbp.hit; + + // If LLBP hits, we don't use LLBP if it has a longer history. + if (chooseL2 && (llbp.histLength < HitBank)) { + chooseL2 = false; + llbp.shorter = true; + } + + // Don't override if the prefetch is too late. + if (simulateTiming && !warmup && !llbp.prefetched) { + chooseL2 = false; + } + + if (chooseL2) { + AltBank = 0; + altConf = baseConf = llbp.conf; + alttaken = base_pred = llbp.pred; + llbp.isProvider = true; + return BASE; + } + + // Clear the provider bit if instead the main TAGE is used. + llbp.isProvider = false; + + + // If the longest is somehow certain use its prediction. + if (tageConf != LowConf) + return LONGEST; + + // Use on low confidence if the USE_ALT_ON_NA is negative. + if (use_alt_on_na[idxChooser()] < 0) { + return LONGEST; + } + + return (AltBank > 0) ? ALT : BASE; +} + +bool LLBP::isNotUseful(bool taken) { + + if (llbp.hit) { + return false; + } + // If there was no hit in level 2 use the default algorithm. + return TageSCL::isNotUseful(taken); +} + +bool LLBP::isUseful(bool taken) { + + if (llbp.hit) { + return false; + } + // If there was no hit in LLBP use the default algorithm. + return TageSCL::isUseful(taken); +} + +bool LLBP::llbpCorrect(bool taken) { + return llbp.hit && (taken == llbp.pred); +} + +bool LLBP::primCorrect(bool taken) { + return (scl_provider == STC) ? (sc_pred == taken) : + HitBank ? (LongestMatchPred == taken) : + AltBank ? (alttaken == taken) : (bim_pred == taken); +} + +bool LLBP::tageCorrect(bool taken) { + return HitBank ? (LongestMatchPred == taken) : + AltBank ? 
(alttaken == taken) : (bim_pred == taken); +} + +bool LLBP::llbpUseful(bool taken) { + return llbp.isProvider && (taken == llbp.pred) && !primCorrect(taken); +} + +void LLBP::updateL2Usefulness(bool taken) { + + if (!llbp.hit) return; + + auto llbp_correct = llbpCorrect(taken); + bool prim_correct = tageCorrect(taken); + + // If Level 1 was provider, it was correct and + // level 2 was incorrect this prediction was useful. + if (!llbp.isProvider && prim_correct && !llbp_correct) { + if (HitBank) { + if (HitEntry->u < (1 << uwidth) - 1) + HitEntry->u++; + } + } +} + + +void LLBP::updateTables(uint64_t pc, bool resolveDir, bool predDir) { + + + DPRINTIF(COND,"%s nM:%i, TAGE:[d:%i, conf:%i, prov:%d HitBank:%d], BASE:[d:%i, conf:%i]\n", + (resolveDir != tage_pred) ? "Misp" : "Corr", + stats.tageMispred, + tage_pred, tageConf, tage_provider, HitBank, base_pred, baseConf); + + + // 1. Table allocation -------------------- + bool ALLOC = false; + if (llbp.isProvider) { + // If LLBP was provider we allocate if the prediction was wrong + // and the history length is shorter than the maximum. + ALLOC = (tage_pred != resolveDir) & (llbp.histLength < nhist); + + } else { + + // If the prediction came from TAGE, it was wrong and the history + // length is shorter than the maximum we allocate. + ALLOC = (tage_pred != resolveDir) & (HitBank < nhist); + + // If LLBP was actually correct, it was longer than TAGE, + // but it was not chosen as provider, then we don't allocate. + if (llbp.hit && (llbp.pred == resolveDir) && !llbp.shorter) { + ALLOC = false; + } + + // This comes from the TAGE update function (alternative prediction) + if (HitBank > 0) { + if ((tageConf == LowConf) && (LongestMatchPred == resolveDir)) { + ALLOC = false; + } + + updateChooser(resolveDir); + } + + } + + // Do the actual allocation + // In case LLBP was the provider we overwrite the history length + // of the TAGE prediction. 
This forces the TAGE allocation + // to start allocating with the history length of the LLBP. + auto tmp2 = HitBank; + if (llbp.isProvider) { + HitBank = llbp.histLength; + } + int nalloc = 0; + if (ALLOC) { + nalloc = 1+nnn; + DPRINTFF("Alloc:%i,%i, HL:%i, L2:[H:%i,S:%i,P:%i,D:%i]\n", + stats.totalAllocInit, stats.total, HitBank, llbp.hit, llbp.shorter, llbp.isProvider, llbp.pred); + } + + allocateTables(nalloc, pc, resolveDir); + HitBank = tmp2; + + // 2. The LLBP + TAGE table updates + // We only update either the TAGE or the LLBP tables. + llbpUpdate(pc, resolveDir, predDir); + + // 3. Finally, the statistical corrector. + SCLUpdate(pc, resolveDir, predDir); +} + + + +///////////////////////////////////////////////////////////////////////////////// +// LLBP PREDICTOR +///////////////////////////////////////////////////////////////////////////////// + +void LLBP::llbpPredict(uint64_t pc) { + + // Calculate indices and tags + // We need to do this explicity because we perform the prediction + // before the TAGE prediction. + calcIndicesAndTags(pc); + + llbpEntry = nullptr; + llbp = {}; + + for (int i = 1; i <= nhist; i++) { + + if (!NOSKIP[i]) continue; + + // We don't use all history lengths. Only 16 + // By using the lower bits for the table number we can + // use it to manage the assocativity of the different history lengths. + auto _i = i; + auto it = fltTables.find(i); + if (it != fltTables.end()) { + _i = it->second; + } + // auto _i = fltTables.contains(i) ? 
fltTables[i] : i; + // Table index (10 bits) + + + auto _key = pc; + _key ^= fghrT1[i]->value ^ (fghrT2[i]->value << 1); + // Mask the patterns bits + _key &= ((1 << TTWidth) - 1); + + KEY[i] = uint64_t(_key) << 10ULL | uint64_t(_i); + + } + + + + // Get the current context (CCID) + auto ctx_key = rcr.getCCID(); + // PRINTIF(COND2,"%llu L2Predict: %lx\n", branchCount, ctx_key); + HitContext = llbpStorage.get(ctx_key); + + + if (HitContext) { + for (int i = nhist; i > 0; i--) { + if (NOSKIP[i]) { + llbpEntry = HitContext->patterns.get(KEY[i]); + + + if (llbpEntry) { + llbp.hit = i; + llbp.pVal = llbpEntry->ctr; + llbp.pred = llbp.pVal >= 0; + llbp.conf = compConf(llbp.pVal, CtrWidth); + llbp.histLength = i; + break; + } + } + } + } + + + if (llbp.hit) { + PRINTIF(COND,"S1Hit:%i,GI:%i,GT:%i,c:%i\n", + llbpEntry->length, GI[llbpEntry->length], GTAG[llbpEntry->length], + llbpEntry->ctr); + } + + // In case of timing simulation we check if the entry was already + // prefetched. + if (simulateTiming) { + pbEntry = patternBuffer.get(ctx_key); + if (pbEntry) { + pbEntry->locked = false; + } + llbp.prefetched = llbp.hit && pbEntry; + } else { + llbp.prefetched = true; + } + +} + + + +// PREDICTOR UPDATE +void LLBP::llbpUpdate(uint64_t pc, bool resolveDir, bool predDir) { + + // Update ---------------------------------------------------- + bool updateBim = false; + bool updateLLBP = llbp.isProvider; + bool updateTAGE = !llbp.isProvider; + + + // Only the providing component is updated. + // If the prediction came from LLBP its pattern gets updated. + if (updateLLBP) { + ctrupdate(llbpEntry->ctr, resolveDir, CtrWidth); + + // This function updates the context replacement counter + // - If a pattern becomes confident (correct prediction) + // the replacement counter is increased + // - If a pattern becomes low confident (incorrect prediction) + // the replacement counter is decreased + if (llbpEntry->ctr == (resolveDir ? 
1 : -2)) { + // entry became medium confident + ctrupdate(HitContext->replace, true, CtxReplCtrWidth); + } + else if (llbpEntry->ctr == (resolveDir ? -1 : 0)) { + // entry became low confident + ctrupdate(HitContext->replace, false, CtxReplCtrWidth); + } + + // If the prediction wrong update also the BIM + if (!llbpCorrect(resolveDir) && (llbp.conf == LowConf)) { + updateBim = true; + } + } + + // If the prediction was from the TAGE predictor update it. + if (updateTAGE) { + updateBim = tageUpdate(pc, resolveDir); + } + + // The base predictor is sometimes updated if the confidence of the + // prediction is low. + if (updateBim) { + + // If the prediction was from the base predictor, update it. + TageSCL::baseUpdate(pc, resolveDir, predDir); + + DPRINTIF(COND,"BIMUp: ctr:%i\n", BIM); + } + + // Usefulness ------------------------------------------------ + if (llbp.hit) { + updateL2Usefulness(resolveDir); + } + + + // Update the pattern buffers statistics + // and dirty bits. + if (simulateTiming && pbEntry) { + if (updateLLBP) { + pbEntry->dirty = true; + } + if (llbp.hit) { + pbEntry->used = true; + } + if (llbpUseful(resolveDir)) { + pbEntry->useful = true; + } + } +} + +bool LLBP::llbpAllocate(int histLen, uint64_t pc, bool taken) { + +#ifdef FILTER_TABLES + auto it = fltTables.find(histLen); + if (it == fltTables.end()) { + // If the history length is not in the filter table + // we don't allocate. + return false; + } + // if (!fltTables.contains(histLen)) { + // return false; + // } +#endif //FILTER_TABLES + + + // Create context key and pattern key + auto ctx_key = rcr.getCCID(); + auto k = KEY[histLen]; + + auto ctx = allocateNewContext(pc, ctx_key); + + // Check if the pattern already exists in LLBP. + auto ptrn = ctx->patterns.get(k); + if (ptrn) { + return true; + } + + // No pattern found. Allocate a new one. 
+ // Sorting before allocation to find the victim + + if (constrained) { +#ifdef L2_SORT_PTRN + ctx->sortPatters(k); +#endif + +#ifdef LLBP_CONSTRAINED + ptrn = ctx->patterns.getVictim(k); +#endif + } + + ptrn = ctx->patterns.insert(k); + + ptrn->length = histLen; + ptrn->useful = 0; + ptrn->correct = 0; + ptrn->key = k; + ptrn->pc = pc; + + ptrn->ctr = taken ? 0 : -1; + ptrn->dir = taken; + + return true; +} + + + +LLBP::Context* LLBP::allocateNewContext(uint64_t pc, uint64_t ctx_key) { + + Context* ctx = llbpStorage.get(ctx_key); + + // If the context does not exist we allocate a new one. + if (!ctx) { + + if (constrained) { +#ifdef L2_SORT_CTX + llbpStorage.sortContexts(ctx_key); +#endif + } + + // Ensure the victim is not in the L2 predictor + ctx = llbpStorage.getVictim(ctx_key); + + // If the victim context is still in pattern buffer + // we need to remove it. + if (simulateTiming && ctx) { + + if (patternBuffer.exists(ctx->key)) { + patternBuffer.erase(ctx->key); + } + + // Also invalidate all entries in the prefetch queue + // with this key. + // auto n = prefetchQueue.size(); + prefetchQueue.erase(std::remove_if( + prefetchQueue.begin(), prefetchQueue.end(), + [ctx_key](auto& e) { return e.key == ctx_key; }), + prefetchQueue.end()); + } + + // Allocate a new context in the LLBP storage. + ctx = llbpStorage.allocate(ctx_key, pc); + + if (simulateTiming && ctx) { + // Put the newly allocated entry into the PB. 
+ PBEntry entry(ctx_key); + entry.valid = true; + entry.dirty = true; + entry.newlyAllocated = true; + installInPB(entry, true); + } + + } + + return ctx; +} + + + +int LLBP::allocate(int Tidx, uint64_t pc, bool taken) { + + int alloc_res = TageBase::allocate(Tidx, pc, taken); + + // Get the newly allocated entry and mark with the context and key + if (alloc_res > 0) { + auto& entry = gtable[Tidx][GI[Tidx]]; + entry.ctx = rcr.getCCID(); + entry.key = KEY[Tidx]; + // DPRINTIF(COND,"L1Alloc:%i,GI:%i,T:%i,K:%#llx,C:%#llx\n", Tidx, GI[Tidx], entry.tag, entry.key, entry.ctx); + } + + if (alloc_res <= 0) { + // Allocation not successful -> we also don't allocate in the LLBP + return alloc_res; + } + + // Try allocating in the LLBP. + if (llbpAllocate(Tidx, pc, taken)) { + stats.allocations[Tidx]++; + stats.totalAllocations++; + return 1; + } + return alloc_res; +} + +// Prefetch functionality + +void LLBP::prefetch() { + + if (warmup) return; + + // Perform the prefetching ----- + // Calculate the hash from the head of the history. + auto ctx_key = rcr.getPCID(); + // PRINTIF(COND2,"%i/%i Prefetch: %lx -> ", ticks, branchCount, ctx_key); + + + + // First check the preload queue if this entry is already enqueued. + auto it = std::find_if( + prefetchQueue.begin(), prefetchQueue.end(), + [ctx_key](auto& e) + { return (e.key == ctx_key) && e.valid;}); + + if (it != prefetchQueue.end()) { + PRINTIF(COND2," Hit in prefetchQueue %lx", it->key); + llbpstats.l2PFHitInQueue++; + } + + // Second check if its already cached. + else if (patternBuffer.exists(ctx_key)) { + // Copy the entry from the cache to the preload queue. + PRINTIF(COND2," Hit in pattern cache"); + llbpstats.l2PFHitInCache++; + patternBuffer.touch(ctx_key); + } + + // Finally check if the context is available in the LLBP + // and needs to be prefetched. 
+ else if (llbpStorage.exists(ctx_key)) { + PRINTIF(COND2," Hit in CI -> prefetch"); + llbpstats.l2PFHitInCI++; + auto& pf_entry = prefetchQueue.emplace_back(ctx_key); + pf_entry.valid = true; + pf_entry.prefetchtime = ticks; + } else { + PRINTIF(COND2," Miss"); + } + PRINTIF(COND2,"\n"); + +} + +void LLBP::squashPrefetchQueue(bool btbMiss) { + if (btbMiss) + llbpstats.pfDroppedBTBMiss += prefetchQueue.size(); + else + llbpstats.pfDroppedMispredict += prefetchQueue.size(); + prefetchQueue.clear(); + + // Once all prefetches are squashed we trigger prefetches + // for an upcoming context. + if (btbMiss) + prefetch(); +} + +void LLBP::tickPrefetchQueue() { + + // Tick should be called before the prediction is made. + + // Install prefetches if the prefetch delay has passed. + if (!prefetchQueue.empty()) { + auto& pf_entry = prefetchQueue.front(); + + // If the prefetch delay has passed + if (ticks - pf_entry.prefetchtime >= (accessDelay)) { + + PRINTIF(COND2," Install in Cache: %lx\n", pf_entry.key); + pf_entry.locked = true; + installInPB(pf_entry); + prefetchQueue.pop_front(); + } + } +} + +void LLBP::installInPB(PBEntry &entry, bool bypass) { + + + // First get the victim from the LLBP predictor + auto victim = patternBuffer.getVictim(entry.key); + + if (victim) { + + // If the entry is locked due to ongoing prefetch. Don't install in + // PB but in LLBP right away. + if (victim->locked && bypass) { + llbpstats.pfDroppedLocked++; + llbpstats.l2cacheDirtyEvict++; + return; + } + + if (victim->dirty) llbpstats.l2cacheDirtyEvict++; + else llbpstats.l2cacheCleanEvict++; + PRINTIF(COND2," Evict: %lx\n", victim->key); + } + + // Copy the prefetched pattern set into the PB + patternBuffer.insert(entry); +} + + +void LLBP::updateStats(bool resolveDir, bool predDir, uint64_t pc) { + + TageSCL::updateStats(resolveDir, predDir, pc); + + + // Check if storing the last history would have been useful. 
+ auto correct = predDir == resolveDir; + + auto llbp_correct = llbp.isProvider && (resolveDir == llbp.pred); + + bool prim_correct = (scl_provider == STC) ? (sc_pred == resolveDir) : + HitBank ? (LongestMatchPred == resolveDir) : + AltBank ? (alttaken == resolveDir) : (bim_pred == resolveDir); + + + bool llbp_useful = llbp_correct && !prim_correct; + + + if (llbp.hit) { + if (llbp.isProvider) { + if (llbp_correct) { + if (prim_correct) + llbpstats.l2OverrideSameCorr++; + else + llbpstats.l2OverrideGood++; + } else { + if (prim_correct) + llbpstats.l2OverrideBad++; + else + llbpstats.l2OverrideSameWrong++; + } + if (llbpEntry->pc != pc) { + if (llbp_correct) + llbpstats.ovrPosAlias++; + else { + llbpstats.ovrNegAlias++; + } + } + } else { + llbpstats.l2NoOverride++; + } + } + + + + + // Hits for contexts and patterns + if (HitContext) { + llbpstats.l2CtxHit++; + if (llbpEntry) { + llbpstats.l2PtrnHit++; + } + } + + + if (llbp.isProvider) { + llbpstats.l2Prov++; + llbpProvLength.insert(llbp.histLength); + if (llbp_correct) { + llbpstats.l2Correct++; + llbpEntry->correct++; + HitContext->correct++; + if (llbp_useful) { + + llbpEntry->useful++; + HitContext->useful++; + if (llbpEntry->useful == 1) { + HitContext->usefulPtrns++; + } + } + + + } else { + llbpstats.l2Wrong++; + llbpMispLength.insert(llbp.histLength); + llbpEntry->incorrect++; + HitContext->incorrect++; + } + } else { + switch (provider) { + case LONGEST: + case ALT: + { + llbpstats.tageProv++; + auto l = (provider == LONGEST) ? 
HitBank : AltBank; + primProvLength.insert(l); + if (correct) llbpstats.tageCorrect++; + else { + llbpstats.tageWrong++; + primMispLength.insert(l); + } + } + break; + + case LOOP: + case STC: + llbpstats.sclProv++; + if (correct) llbpstats.sclCorrect++; + else llbpstats.sclWrong++; + break; + + case BASE: + llbpstats.baseProv++; + if (correct) { + llbpstats.baseCorrect++; + } else { + llbpstats.baseWrong++; + } + break; + default: + break; + } + } + + if (llbp.hit && !llbp.isProvider) { + if (llbp.shorter) { + llbpstats.l2notBecauseShorter++; + } + + if (!llbp.prefetched) { + llbpstats.l2notBecauseNotPrefetched++; + } + } +} + + + +// Predictor update ---------------------------------------- +void LLBP::UpdatePredictor(uint64_t PC, bool resolveDir, + bool predDir, uint64_t branchTarget) { + + // Update the TAGE and LLBP predictors via the + // base class. This Will also update the histories and statistics. + Tage::UpdatePredictor(PC, resolveDir, predDir, branchTarget); + + // Only thing left is the update of the prefetch queue and + // the context hash. + bool do_prefetch = false; + if (simulateTiming && (resolveDir != predDir)) { + squashPrefetchQueue(); + do_prefetch = true; + } + + // In the default LLBP predictor there will be no update of the + // runtime hash for conditional branches. However, this model + // supports different types of histories. 
+ do_prefetch |= rcr.update(PC, OPTYPE_JMP_DIRECT_COND, resolveDir); + if (simulateTiming && do_prefetch) { + prefetch(); + } +} + + +void LLBP::TrackOtherInst(uint64_t PC, OpType opType, bool taken, + uint64_t branchTarget) { + + TageSCL::TrackOtherInst(PC, opType, taken, branchTarget); + + auto do_prefetch = rcr.update(PC, opType, taken); + if (simulateTiming && do_prefetch) { + // PRINTIF(COND2,"%i/%i Prefetch: %lx/t:%i from UpdateOther -> ", ticks, branchCount, PC, opType); + prefetch(); + } +} + + + +void LLBP::updateGHist(const bool bit) { + TageSCL::updateGHist(bit); + for (uint32_t i = 1; i <= nhist; ++i) { + fghrT1[i]->update(); + fghrT2[i]->update(); + } +} + + +/************************************************************ + * RCR Functionality + */ + +LLBP::RCR::RCR(int _T, int _W, int _D, int _shift, int _CTWidth) + : CTWidth(_CTWidth), T(_T), W(_W), D(_D), S(_shift) +{ + bb[0].resize(maxwindow); + bb[1].resize(maxwindow); + ctxs = {0, 0}; + printf("\n\nRCR: context hash config: [T:%i, W:%i, D:%i, S:%i, CTWidth:%i]\n", + T, W, D, S, CTWidth); +} + + +/* + * Given the {n} number of branches staring from vec[end-start] + * to vec[end-start-n-1] we create the hash function by shifting + * each PC by {shift} number if bits i.e. + * + * 000000000000| PC | :vec[end-start] + * ^ 0000000000| PC |00 :vec[end-start-1] + * ^ 00000000| PC |0000 :vec[end-start-2] + * . . + * . . + * . . 
+ * ^ | PC |000000000000 :vec[end-start-n-1] + * ---------------------- + * final hash value + * */ +uint64_t LLBP::RCR::calcHash(std::list &vec, int n, int start, int shift) { + uint64_t hash = 0; + if (vec.size() < (start + n)) { + return 0; + } + uint64_t sh = 0; + auto it = vec.begin(); + std::advance(it, start); + for (; (it != vec.end()) && (n > 0); it++, n--) { + uint64_t val = *it; + + // Shift the value + hash ^= val << uint64_t(sh); + + sh += shift; + if (sh >= CTWidth) { + sh -= uint64_t(CTWidth); + } + } + return hash & ((1 << CTWidth) - 1); +} + +uint64_t LLBP::RCR::getCCID() { + return ctxs.ccid & ((1 << CTWidth) - 1); +} + +uint64_t LLBP::RCR::getPCID() { + return ctxs.pcid & ((1 << CTWidth) - 1); +} + + +bool LLBP::RCR::update(uint64_t pc, OpType opType, bool taken) { + + branchCount++; + // Hash of all branches + auto isCall = (opType == OPTYPE_CALL_DIRECT_UNCOND) + || (opType == OPTYPE_CALL_INDIRECT_UNCOND) + || (opType == OPTYPE_CALL_DIRECT_COND); + + + switch (T) { + case 0: // All branches + bb[0].push_front(pc); + bb[1].push_front(branchCount); + break; + + case 1: // Only calls + if (isCall) { + bb[0].push_front(pc); + bb[1].push_front(branchCount); + } + break; + + case 2: // Only calls and returns + if (isCall || (opType == OPTYPE_RET_UNCOND)) { + bb[0].push_front(pc); + bb[1].push_front(branchCount); + } + break; + + case 3: // Only unconditional branches + if (opType != OPTYPE_JMP_DIRECT_COND) { + bb[0].push_front(pc); + bb[1].push_front(branchCount); + } + break; + + case 4: // All taken branches + if (taken) { + bb[0].push_front(pc); + bb[1].push_front(branchCount); + } + break; + } + + + + // PRINTIF(COND,"UH:%llx, %i, %i\n", pc, opType, taken); + // If the size has changed the hash has changed + bool changed = false; + if (bb[0].size() > maxwindow) { + changed = true; + + // Resize the history + bb[0].pop_back(); + bb[1].pop_back(); + + + // The current context. + ctxs.ccid = calcHash(bb[0], W, D, S); + // The prefetch context. 
+ ctxs.pcid = calcHash(bb[0], W, 0, S); + } + return changed; +} + + +void LLBP::tick() { + TageSCL::tick(); + if (simulateTiming) { + tickPrefetchQueue(); + } +} + +void LLBP::btbMiss() { + if (simulateTiming) { + squashPrefetchQueue(true); + } +} + +void LLBP::setState(bool _warmup) { + warmup = _warmup; +} + + + +void LLBP::PrintStat(double instr) { + + TageSCL::PrintStat(instr); + + // Analyze the branch context + numHistPerContext.reset(); + numUsefulHistPerContext.reset(); + + int nPattern = 0, nUseful = 0; + int nCtx = 0, nCtxUseful = 0; + + for (auto& ctx_pair : llbpStorage.getMap()) { + + auto& ctx = ctx_pair.second->second; + + int nuseful = 0; + for (auto& pt : ctx.patterns.getMap()) { + if (pt.second->second.useful > 0) { + nuseful++; + } + } + + int n = ctx.patterns.size(); + numHistPerContext.insert(n); + numUsefulHistPerContext.insert(nuseful); + nPattern += n; + nUseful += nuseful; + if (nuseful) { + nCtxUseful++; + } + } + + + printf("LLBP branch predictor stats -------\n"); + + printf("LLBP:: CtxHit:%i(%.4f), PtrnHit:%i(%.4f)\n", + llbpstats.l2CtxHit, llbpstats.l2CtxHit / (double)stats.total, + llbpstats.l2PtrnHit, llbpstats.l2PtrnHit / (double)stats.total + ); + + + printf("PROVIDER:: BIM:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f] \n", + llbpstats.baseProv, (double)llbpstats.baseProv / (double)stats.total, + llbpstats.baseCorrect, (double)llbpstats.baseCorrect / (double)llbpstats.baseProv, + llbpstats.baseWrong, (double)llbpstats.baseWrong / (double)llbpstats.baseProv, + (double)llbpstats.baseWrong / (double)instr * 1000 + ); + + printf("PROVIDER:: TAGE:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n", + llbpstats.tageProv, (double)llbpstats.tageProv / (double)stats.total, + llbpstats.tageCorrect, (double)llbpstats.tageCorrect / (double)llbpstats.tageProv, + llbpstats.tageWrong, (double)llbpstats.tageWrong / (double)llbpstats.tageProv, + (double)llbpstats.tageWrong / (double)instr * 1000); + + printf("PROVIDER:: SCL:[P:%i(%.4f), 
C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n", + llbpstats.sclProv, (double)llbpstats.sclProv / (double)stats.total, + llbpstats.sclCorrect, (double)llbpstats.sclCorrect / (double)llbpstats.sclProv, + llbpstats.sclWrong, (double)llbpstats.sclWrong / (double)llbpstats.sclProv, + (double)llbpstats.sclWrong / (double)instr * 1000); + + printf("PROVIDER:: LLBP:[P:%i(%.4f), C:%i(%.4f), W:%i(%.4f) MPKI:%.4f], \n", + llbpstats.l2Prov, (double)llbpstats.l2Prov / (double)stats.total, + llbpstats.l2Correct, (double)llbpstats.l2Correct / (double)llbpstats.l2Prov, + llbpstats.l2Wrong, (double)llbpstats.l2Wrong / (double)llbpstats.l2Prov, + (double)llbpstats.l2Wrong / (double)instr * 1000); + + + printf("LLBP:: CtxHit:%i, PtrnHit:%i, Provider:%i(%.4f), NoProvider:[Shorter:%i(%.4f), NoPrefetch:%i(%.4f)]\n", + llbpstats.l2CtxHit, llbpstats.l2PtrnHit, llbpstats.l2Prov, (double)llbpstats.l2Prov / (double)llbpstats.l2PtrnHit, + llbpstats.l2notBecauseShorter, (double)llbpstats.l2notBecauseShorter / (double)llbpstats.l2PtrnHit, + llbpstats.l2notBecauseNotPrefetched, (double)llbpstats.l2notBecauseNotPrefetched / (double)llbpstats.l2PtrnHit + ); + + + printf("LLBP:: PB Prefetch:[HitInPfq:%i, HitInPB:%i, HitInCI:%i], dropped[locked:%i, misp:%i, btbmiss:%i]\n", + llbpstats.l2PFHitInQueue, llbpstats.l2PFHitInCache, llbpstats.l2PFHitInCI, llbpstats.pfDroppedLocked, llbpstats.pfDroppedMispredict, llbpstats.pfDroppedBTBMiss + ); + + auto tot_evicts = llbpstats.l2cacheDirtyEvict + llbpstats.l2cacheCleanEvict; + printf("LLBP:: PB Evict:[Clean:%i(%.3f) Dirty:%i(%.3f)]\n", + llbpstats.l2cacheCleanEvict, (double)llbpstats.l2cacheCleanEvict / (double)tot_evicts, + llbpstats.l2cacheDirtyEvict, (double)llbpstats.l2cacheDirtyEvict / (double)tot_evicts + ); + + + printf("LLBP:: LLBPHits:[NoOv:%i, SameCorr:%i, SameWrong:%i, GoodOv:%i, BadOv:%i] Alias:[P:%i(%.4f),N:%i(%.4f)]\n", + llbpstats.l2NoOverride, llbpstats.l2OverrideSameCorr, llbpstats.l2OverrideSameWrong, llbpstats.l2OverrideGood, 
llbpstats.l2OverrideBad, + llbpstats.ovrPosAlias, llbpstats.ovrPosAlias / (double)llbpstats.l2Prov, + llbpstats.ovrNegAlias, llbpstats.ovrNegAlias / (double)llbpstats.l2Prov + ); + + + auto tot_pattern = (numPatterns * numContexts); + + + nCtx = llbpStorage.getMap().size(); + printf( + "LLBP:: Utilization: Patterns:[Total:%i,Alloc:%i(%.4f),Useful:%i(%.4f)], Ctx:[Total:%i,Alloc:%i(%.4f),Useful:%i(%.4f)]\n", + + tot_pattern, nPattern, nPattern / (double)tot_pattern, + nUseful, nUseful / (double)tot_pattern, + + numContexts, nCtx, nCtx / (double)numContexts, + nCtxUseful, nCtxUseful / (double)numContexts + ); + +#define PRINTHIST + +#ifdef PRINTHIST + + + printf("Hist Histories per context\n"); + printf("%s\n", numHistPerContext.print(true,true).c_str()); + + printf("Hist Useful histories per context\n"); + printf("%s\n", numUsefulHistPerContext.print(true,true).c_str()); + + printf("Hist primary mispredict length (incorrect)\n"); + printf("%s\n", primMispLength.print(true,true).c_str()); + + printf("Hist LLBP mispredict length (incorrect)\n"); + printf("%s\n", llbpMispLength.print(true,true).c_str()); + + printf("Hist primary provider length\n"); + printf("%s\n", primProvLength.print(true,true).c_str()); + + printf("Hist LLBP provider length\n"); + printf("%s\n", llbpProvLength.print(true,true).c_str()); + + +#endif + + +} + +void LLBP::resetStats() { + TageSCL::resetStats(); + llbpstats = {}; + + primMispLength.reset(); + llbpMispLength.reset(); + primProvLength.reset(); + llbpProvLength.reset(); + numHistPerContext.reset(); + numUsefulHistPerContext.reset(); +} + + +}; // namespace LLBP diff --git a/src/cpu/pred/llbpref/llbp.h b/src/cpu/pred/llbpref/llbp.h new file mode 100644 index 00000000000..585236c2d21 --- /dev/null +++ b/src/cpu/pred/llbpref/llbp.h @@ -0,0 +1,641 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated 
documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#pragma once + +#include +// #include +#include +#include +#include +#include +#include +#include + + +#include "tage_scl.h" + +// #include "utils/fileutils.h" +#include "counters.h" +#include "cache.h" +#include "histogram.h" + +namespace LLBP { + + +struct LLBPConfig; + +class LLBP : public TageSCL { + + public: + LLBP(LLBPConfig config); + ~LLBP(); + + void UpdatePredictor(uint64_t PC, bool resolveDir, + bool predDir, uint64_t branchTarget) override; + void TrackOtherInst(uint64_t PC, OpType opType, bool taken, + uint64_t branchTarget) override; + void PrintStat(double NUMINST) override; + void tick() override; + void btbMiss() override; + void setState(bool warmup) override; + + private: + + + // Override some base class functions + bool predict(uint64_t pc) override; + void updateTables(uint64_t pc, bool resolveDir, bool predDir) override; + void updateStats(bool taken, bool predtaken, uint64_t PC) override; + void updateGHist(const bool bit) override; + int allocate(int idx, uint64_t pc, bool 
taken) override; + void resetStats() override; + + + typedef uint64_t Key; + Key KEY[MAXNHIST]; // + + + /******************************************************************** + * LLBP Pattern + * + * Consists of the history length field and the tag. + * In the model we concatenate both to form a key. + * key = (tag << 10) | length + * This simplifies model complexity + ******************************************************************/ + struct Pattern { + int length; + uint tag; + int idx; + int8_t ctr; + uint replace; + bool dir; + int useful = 0; + int correct = 0; + int incorrect = 0; + Key key = 0; + int evicted = 0; + int evicted_ctx = 0; + uint64_t pc = 0; + }; + + + /******************************************************************** + * Pattern Set + * + * The pattern sets are implemented as set associative cache. The + * lower bits of the key - to lookup a pattern in the pattern set + * - are used for the history length which realizes the four way + * associativity. In the constructor we assign each history an + * index + ******************************************************************/ + struct PatternSet : public BaseCache{ + PatternSet(size_t max_size, size_t assoc) : + BaseCache(max_size, assoc) + {} + + Pattern* insert(const uint64_t &key) { + return BaseCache::insert(key); + } + }; + + /******************************************************************** + * Program Context + * + * A program context contains one pattern set and is indexed by + * a key formed by hashing W unconditional branches. + * This struct contains some additional meta data for replacement + * and statistics. + ********************************************************************/ + struct Context { + bool valid; + uint64_t key; + uint64_t pc; + int correct; + int incorrect; + int useful; + int conflict; + uint replace; + int ctr; + int usefulPtrns; + + // The contexts pattern set. 
+ PatternSet patterns; + + Context(uint64_t k, uint64_t p, int n, int assoc) + : valid(true), key(k), pc(p), + correct(0), incorrect(0), useful(0), conflict(0), + replace(0), ctr(0), usefulPtrns(0), + patterns(n, assoc) + {} + + // Before a pattern in the pattern set is replaced, the patterns are + // sorted from the highest to the lowest confidence. This is done to + // determine which pattern should be evicted. + void sortPatters(const uint64_t key) { + auto& set = patterns.getSet(key); + set.sort( + [](const std::pair& a, const std::pair& b) + { + return abs(center(a.second.ctr)) > abs(center(b.second.ctr)); + }); + } + }; + + + /******************************************************************** + * LLBP Storage + * + * LLBPs high-capacity structure to store all pattern sets. + * It's implemented as a set associative cache. + * The Context directory (CD) can be thought of as the tag array while the + * LLBPStorage is the data array. In this simulation model, both LLBP + * and CD are represented with a single data structure. + ********************************************************************/ + class LLBPStorage : public BaseCache{ + typedef typename std::pair key_value_pair_t; + typedef typename std::list::iterator list_iterator_t; + const int n_patterns; + const int _ptrn_assoc; + + public: + + LLBPStorage(int n_ctx, int n_patterns, int ctx_assoc, int ptrn_assoc) + : BaseCache(n_ctx, ctx_assoc), + n_patterns(n_patterns), _ptrn_assoc(ptrn_assoc) + { + } + + // This function creates a new context but does not install it. + Context* createNew(uint64_t key, uint64_t pc) { + return new Context(key, pc, n_patterns, _ptrn_assoc); + } + + // This function will allocate a new context for the + // given key if it does not exist. + // It Will return the created context. + // Note that this function will NOT sort the contexts. 
+ // Therefore, make sure to call the sorting function before + // this function + Context* allocate(uint64_t key, uint64_t pc) { + + auto c = this->get(key); + if (c != nullptr) { + return c; + } + + auto& set = this->getResizedSet(key); + + set.push_front( + key_value_pair_t(key, Context(key, pc, n_patterns, _ptrn_assoc))); + _index[key] = set.begin(); + return &set.front().second; + } + + // Sort the contexts in a set based on the replacement counter. + void sortContexts(uint64_t key) { + auto& set = this->getSet(key); + set.sort( + [](const key_value_pair_t& a, const key_value_pair_t& b) + { + return a.second.replace > b.second.replace; + }); + } + } llbpStorage; + + + + bool bim_pred; + unsigned bimConf; + + + + /******************************************************************** + * Rolling Context Register RCR + * + * The RCR maintains the previous executed branches to compute + * a context ID. + * + * The hash function is defined by 4 paramenters + * + * T: Type of history (T). Which branches should be hased + * 0: All branches, 1: Only calls, 2: Calls and returns + * 3: All unconditional branches, 4: All taken branches + * + * W: Number of branches that should be hashed (W in the paper). + * D: Number of most recent branches skipped for CCID. Adds delay which + * is used to prefetch. (D in the paper.) + * S: Number of bits to shift the PC's. Is useful to avoid ping-pong context + * due to the XOR function in case a loop is executed + * + * ********************************************************************* * + * EXAMPLE * + * * + * pb-index (2.) (3.) * + * v v v * + * history buffer : |l|k|j|i|h|g|f|e|d|c|b|a| * + * ^prefetch (2.)^ * + * * + * a is the newest branch PC added to the buffer, l the oldest. * + * (2.) = W = 7; (3.) 
= D = 3 * + * branches used to obtain PB index hash: j to d * + * branches used to obtain hash to prefetch into PB: g to a * + * ********************************************************************* * + */ + class RCR { + const int maxwindow = 120; + + uint64_t calcHash(std::list &vec, int n, int start=0, int shift=0); + + // The context tag width + const int CTWidth; + + // A list of previouly taken branches + std::list bb[10]; + + // We compute the context ID and prefetch context ID + // only when the content of the RCR changes. + struct { + uint64_t ccid = 0; + uint64_t pcid = 0; + } ctxs; + + int branchCount = 0; + + public: + // The hash constants + const int T, W, D, S; + + RCR(int _T, int _W, int _D, int _shift, int _CTWidth); + + // Push a new branch into the RCR. + bool update(uint64_t pc, OpType type, bool taken); + + // Get the current context ID + uint64_t getCCID(); + + // Get the prefetch context ID + uint64_t getPCID(); + } rcr; + + + /******************************************************************** + * Pattern Buffer + * + * The pattern buffer is a small set associative cache that maintains + * the most recent executed pattern set. Upcomming contexts + * are prefetched into the pattern buffer and predictions are made from + * the pattern buffer. + * + * Note that in the model we don't move the patterns into the pattern + * buffer. Instead we directly modify the patterns in the LLBPStorage. + * The pattern buffer models the caching behaviour and is only used + * in the timing model. 
+ */ + struct PBEntry { + Key key; + bool dirty; + bool newlyAllocated; + bool used; + bool useful; + int origin; + bool valid; + int prefetchtime; + bool locked; + PBEntry(Key c) + : key(c), dirty(false), + newlyAllocated(false), + used(false), useful(false), + origin(0), + valid(false), + prefetchtime(0), + locked(false) + {} + PBEntry() : PBEntry(0) {} + }; + + class PatternBuffer : public BaseCache { + public: + PatternBuffer(int n, int assoc) + : BaseCache(n, assoc) + { + } + + PBEntry* insert(PBEntry &entry) {; + auto v = get(entry.key); + if (v != nullptr) { + return v; + } + + // Get the set with a free item + auto& set = getResizedSet(entry.key); + + set.push_front(key_value_pair_t(entry.key, entry)); + _index[entry.key] = set.begin(); + return &set.front().second; + } + } patternBuffer; + + + // Pointers to context, llbp pattern and PB entry in case of a + // LLBP pattern/context match. + Context* HitContext; + Pattern* llbpEntry; + PBEntry* pbEntry; + + + // A struct to maintain the prediction info from LLBP. + struct LLBPPredInfo { + bool hit = false; + int pVal = 0; + bool pred = false; + unsigned conf = 0; + int histLength = 0; + bool prefetched = false; + bool isProvider = false; + bool shorter = false; + } llbp; + + // The prediction function for the LLBP. + void llbpPredict(uint64_t pc); + + // The LLBP update function. + void llbpUpdate(uint64_t PC, bool resolveDir, bool predDir); + + // The LLBP allocate function. + bool llbpAllocate(int idx, uint64_t pc, bool taken); + + // Function to allocate a new context. + Context* allocateNewContext(uint64_t pc, uint64_t key); + + // A map to filter the used history lengths. + std::unordered_map fltTables; + + // The number of contexts in the CD/LLBP + const int numContexts; + // The number of patterns per pattern set. + const int numPatterns; + // Bit width for pattern tag. 
+ const int TTWidth; + // Constants for the patterns counter widths + const int CtrWidth; + const int ReplCtrWidth; + const int CtxReplCtrWidth; + + // Folded history register. Same as in the TAGE predictor. + FoldedHistoryFast* fghrT1[MAXNHIST]; + FoldedHistoryFast* fghrT2[MAXNHIST]; + + + // Override the chooser functions to arbitrate between + // the baseline TAGE and LLBP + bool isNotUseful(bool taken) override; + bool isUseful(bool taken) override; + void updateL2Usefulness(bool taken); + + unsigned chooseProvider() override; + + inline bool llbpCorrect(bool taken); + inline bool primCorrect(bool taken); + inline bool tageCorrect(bool taken); + inline bool llbpUseful(bool taken); + + + // Timing simulation -------------------------------------- + // Methods, variables and structures for the prefetching + // functionality. Prefetching is only modelled if `simulateTiming` + // is set to true. + const bool simulateTiming; + const bool constrained; + + void prefetch(); + void tickPrefetchQueue(); + void squashPrefetchQueue(bool btbMiss=false); + void installInPB(PBEntry &entry, bool bypass=false); + + // The prefetch queue + std::list prefetchQueue; + bool warmup = false; + const int accessDelay; + + + // Some histograms + Histogram primMispLength; + Histogram llbpMispLength; + Histogram primProvLength; + Histogram llbpProvLength; + Histogram numHistPerContext; + Histogram numUsefulHistPerContext; + + + // Some stats + struct l2_stats + { + + int tageProv = 0; + int tageCorrect = 0; + int tageWrong = 0; + int sclProv = 0; + int sclCorrect = 0; + int sclWrong = 0; + int baseProv = 0; + int baseCorrect = 0; + int baseWrong = 0; + + int l2Prov = 0; + int l2Correct = 0; + int l2Wrong = 0; + int l2CtxHit = 0; + int l2PtrnHit = 0; + int l2notBecauseShorter = 0; + int l2notBecauseNotPrefetched = 0; + + int l2cacheDirtyEvict = 0; + int l2cacheCleanEvict = 0; + int l2PFHitInQueue = 0; + int l2PFHitInCache = 0; + int l2PFHitInCI = 0; + + int pfDroppedLocked = 0; + int 
pfDroppedMispredict = 0; + int pfDroppedBTBMiss = 0; + + int primCorrect = 0; + int primWrong = 0; + int l2OverrideGood = 0; + int l2OverrideBad = 0; + int l2OverrideSameCorr = 0; + int l2OverrideSameWrong = 0; + int l2NoOverride = 0; + int ovrPosAlias = 0; + int ovrNegAlias = 0; + + } llbpstats; + +}; + + +struct LLBPConfig { + TSCLConfig tsclConfig; + + // Context hash function parameters + int T = 3; // Type of history + int W = 8; // Number of branches + int D = 4; // Lookahead delay for prefetching + int S = 1; // Shift + +#define LLBP_CONSTRAINED + +#ifdef LLBP_CONSTRAINED + // Size of pattern sets and CD + int numPatterns = 16; + int numContexts = 1024*14; + + // Associativity of pattern sets and CD + int ctxAssoc = 7; + int ptrnAssoc = 4; + + // Tag widths + int TTWidth = 13; + int CTWidth = 14; + + // PB config + int pbSize = 64; + int pbAssoc = 4; +#else + int numContexts = 1000000; + int numPatterns = 1000000; + + int ctxAssoc = numContexts; + int ptrnAssoc = numPatterns; + + int TTWidth = 20; + int CTWidth = 31; + + int pbSize = 1; + int pbAssoc = pbSize; +#endif + + + int CtrWidth = 3; + int ReplCtrWidth = 16; // unused + int CtxReplCtrWidth = 2; + + bool simulateTiming = false; + bool constrained = true; + int accessDelay = 5; + + + void print() const { + printf("LLBP Config: NumPatterns=%i, NumContexts=%i, ctxAssoc=%i, ptrnAssoc=%i, CtrWidth=%i, ReplCtrWidth=%i, CtxReplCtrWidth=%i, pbSize=%i, TTWidth=%i, CTWidth=%i, simMispFlush=%i, accessDelay=%i\n ", + numPatterns, numContexts, ctxAssoc, ptrnAssoc, CtrWidth, ReplCtrWidth, CtxReplCtrWidth, pbSize, TTWidth, CTWidth, simulateTiming, accessDelay); + } +}; + + + + +///////////////////////// +// LLBP Predictor + +// The LLBP predictor without simulating the prefetch latency. 
+class LLBPTageSCL64k : public LLBP { + public: + LLBPTageSCL64k(void) + : LLBP(LLBPConfig + { + .tsclConfig = TSCLConfig + { + .tageConfig = Tage64kConfig, + .useSC = true, + .useLoop = true + }, + .simulateTiming = false, + }) + {} +}; + + +// The LLBP predictor with simulating the prefetch latency. +class LLBPTageSCL64kTiming : public LLBP { + public: + LLBPTageSCL64kTiming(void) + : LLBP(LLBPConfig + { + .tsclConfig = TSCLConfig + { + .tageConfig = Tage64kConfig, + .useSC = true, + .useLoop = true + }, + .simulateTiming = true, + }) + {} +}; + + + + + + + +// The LLBP predictor with infinite number of patterns. +inline const LLBPConfig LLBPInfConfig = { + .tsclConfig = TSCLConfig + { + .tageConfig = Tage64kConfig, + .useSC = true, + .useLoop = true + }, + + .T = 3, + .W = 8, + .D = 4, + .S = 1, + + // .inf = true, + .numPatterns = 1000000, + .numContexts = 1000000, + .ctxAssoc = 1000000, + .ptrnAssoc = 1000000, + .TTWidth = 20, + .CTWidth = 31, + .pbSize = 1, + .pbAssoc = 1, + + .simulateTiming = false, + .constrained = false, +}; + + + +class LLBPInfTageSCL64k : public LLBP { + public: + LLBPInfTageSCL64k() + : LLBP(LLBPInfConfig) + {} +}; + +}; // namespace LLBP diff --git a/src/cpu/pred/llbpref/tage.cc b/src/cpu/pred/llbpref/tage.cc new file mode 100755 index 00000000000..b96ff499a7a --- /dev/null +++ b/src/cpu/pred/llbpref/tage.cc @@ -0,0 +1,1068 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + 
* copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * The code is based on the 64KiB TAGE-SC-L branch predictor by Andre Seznec + * provided in the CBP-5 competition. + * It was reformated and made easier to configure. Furthermore, the code + * adds a lot of statistics and debugging information to the predictor. + */ + +#include "tage.h" + +#include "common.h" +#include "counters.h" + +using namespace std; + +#define BORNTICK 1024 +#define PRINTDEBUG 0 + +#define COND (stats.total >= 00000) + +#define USEPATH + +namespace LLBP { + + +TageBase::TageBase(TageConfig cfg) + : BasePredictor(), + nhist(cfg.nhist), + nbanklow(cfg.nbanklow), + nbankhigh(cfg.nbankhigh), + born(cfg.born), + assoc_start(cfg.assoc_start), + assoc_end(cfg.assoc_end), + minhist(cfg.minhist), + maxhist(cfg.maxhist), + LogB(cfg.LogB), + LogG(cfg.LogG), + Tbits(cfg.Tbits), + uwidth(cfg.uwidth), + cwidth(cfg.cwidth), + size_use_alt(1 << (cfg.log_size_use_alt)), + ghr(histbufferlen), + disableInterleaving(cfg.disableInterleaving) +{ + assert(minhist <= maxhist); + assert(LogG > 0); + assert(LogB > 0); + assert(Tbits > 0); + assert(nhist <= MAXNHIST); + + cfg.print(); + + // initialize the predictor + reinit(); + predictorsize(); +} + +TageBase::~TageBase() { + delete[] btable; + + delete[] gtable[1]; + delete[] gtable[born]; + + for (int i = 1; i <= nhist; i++) { + delete indexFHist[i]; + delete[] tag1FHist[i]; + delete[] tag2FHist[i]; + } +} + +void TageBase::reinit() { + m[1] = minhist; + m[nhist / 2] = 
maxhist; + for (int i = 2; i <= nhist / 2; i++) { + m[i] = (int)(((double)minhist * + pow((double)(maxhist) / (double)minhist, + (double)(i - 1) / (double)(((nhist / 2) - 1)))) + + 0.5); + // printf("(%d %d)", m[i],i); + } + for (int i = 1; i <= nhist; i++) { + NOSKIP[i] = ((i - 1) & 1) || ((i >= assoc_start) & (i < assoc_end)); + } + + if (nhist > 30) { + NOSKIP[4] = 0; + NOSKIP[nhist - 2] = 0; + NOSKIP[8] = 0; + NOSKIP[nhist - 6] = 0; + // just eliminate some extra tables (very very marginal) + } + + for (int i = nhist; i > 1; i--) { + m[i] = m[(i + 1) / 2]; + } + for (int i = 1; i <= nhist; i++) { + TB[i] = Tbits + 4 * (i >= born); + logg[i] = LogG; + } + + gtable[1] = new Gentry[nbanklow * (1 << LogG)]; + SizeTable[1] = nbanklow * (1 << LogG); + + gtable[born] = new Gentry[nbankhigh * (1 << LogG)]; + SizeTable[born] = nbankhigh * (1 << LogG); + + for (int i = born + 1; i <= nhist; i++) gtable[i] = gtable[born]; + for (int i = 2; i <= born - 1; i++) gtable[i] = gtable[1]; + + + btable = new Bentry[1 << LogB]; + + for (int i = 1; i <= nhist; i++) { + indexFHist[i] = new FoldedHistoryFast(ghr, m[i], logg[i]); + tag1FHist[i] = new FoldedHistoryFast(ghr, m[i], TB[i]); + tag2FHist[i] = new FoldedHistoryFast(ghr, m[i], TB[i] - 1); + } + + + + + + Seed = 0; + + TICK = 0; + phist = 0; + Seed = 0; + + // for (int i = 0; i < histbufferlen; i++) ghist[0] = 0; + // ptghist = 0; + // updatethreshold = 35 << 3; + + for (int i = 0; i < (1 << LogB); i++) { + btable[i].pred = 0; + btable[i].hyst = 1; + } + for (int i = 0; i < size_use_alt; i++) { + use_alt_on_na[i] = 0; + } + + // ptghist = 0; + phist = 0; +} + +int TageBase::predictorsize() { + int STORAGESIZE = 0; + + STORAGESIZE += + nbankhigh * (1 << (logg[born])) * (cwidth + uwidth + TB[born]); + STORAGESIZE += nbanklow * (1 << (logg[1])) * (cwidth + uwidth + TB[1]); + + STORAGESIZE += (size_use_alt)*alt_width; + STORAGESIZE += (1 << LogB) + (1 << (LogB - hystshift)); + STORAGESIZE += m[nhist]; + STORAGESIZE += phistwidth; 
+ STORAGESIZE += 10; // the TICK counter + + printf("LogG:%i, TBITS:%i, UWIDTH:%i, CWIDTH:%i, ALTWIDTH:%i, LogB:%i, Hyst:%i\n", + LogG, Tbits, uwidth, cwidth, alt_width, LogB, hystshift); + + + printf(" (TAGE %d) ", STORAGESIZE); + + printf(" (TOTAL %d bits %d Kbits) ", STORAGESIZE, + STORAGESIZE / 1024); + + // for printing predictor characteristics + int NBENTRY = 0; + + STORAGESIZE = 0; + for (int i = 1; i <= nhist; i++) { + if (NOSKIP[i]) { + printf("%dx%d ", TB[i], (1 << logg[i])); + + STORAGESIZE += (1 << logg[i]) * (5 + TB[i]); + NBENTRY += (1 << logg[i]); + } + } + + printf("\n"); + + for (int i = 1; i <= nhist; i++) { + if (NOSKIP[i]) printf("%d ", m[i]); + } + printf("\n"); + + printf("TAGE: N:%d -> SIZE:%d,%iK\n", NBENTRY, STORAGESIZE, + STORAGESIZE / 1024); + + int BIMSIZE = (1 << LogB) + (1 << (LogB - hystshift)); + printf("BASE: Dir:%i, Hyst:%i -> SIZE: %d, %dK\n", (1 << LogB), + (1 << (LogB - hystshift)), BIMSIZE, BIMSIZE / 1024); + STORAGESIZE += BIMSIZE; + + printf("nhist= %d; MInhist= %d; MAXHIST= %d; STORAGESIZE= %d, %dKB; " + "NBENTRY= %d\n", + nhist, minhist, maxhist, STORAGESIZE, STORAGESIZE / 1024, NBENTRY); + + return (STORAGESIZE); +} + +// Base Predictions ---------------------------------------- +bool TageBase::basePredict(const uint64_t pc) { + // + tage_provider = BASE; + BI = (pc ^ (pc >> 2)) & ((1 << LogB) - 1); + + provVal = BIM = (btable[BI].pred << 1) + (btable[BI >> hystshift].hyst); + baseConf = ((BIM == 0) || (BIM == 3)) ? 
HighConf : LowConf; + base_pred = BIM > 1; + + return base_pred; +} + +void TageBase::baseUpdate(uint64_t pc, bool resolveDir, bool predDir) { + int inter = BIM; + if (resolveDir) { + if (inter < 3) inter += 1; + } else if (inter > 0) + inter--; + btable[BI].pred = inter >> 1; + btable[BI >> hystshift].hyst = (inter & 1); + + // ctrupdate(btable[BI].ctr, resolveDir, 2); + btable[BI].pc = pc; +} + + +int TageBase::F(long long A, int size, int bank) { + int A1, A2; + A = A & ((1 << size) - 1); + A1 = (A & ((1 << logg[bank]) - 1)); + A2 = (A >> logg[bank]); + + if (bank < logg[bank]) + A2 = ((A2 << bank) & ((1 << logg[bank]) - 1)) + + (A2 >> (logg[bank] - bank)); + A = A1 ^ A2; + if (bank < logg[bank]) + A = ((A << bank) & ((1 << logg[bank]) - 1)) + + (A >> (logg[bank] - bank)); + return (A); +} + +// gindex computes a full hash of PC, ghist and phist +int TageBase::gindex(unsigned int PC, int bank) { + int index; + index = PC ^ (PC >> (abs(logg[bank] - bank) + 1)); + + index ^= indexFHist[bank]->value; + + +#ifdef USEPATH + int M = (m[bank] > phistwidth) ? 
phistwidth : m[bank]; + index ^= F(phist, M, bank); +#endif + + return (index & ((1 << (logg[bank])) - 1)); +} + +// tag computation +uint16_t TageBase::gtag(unsigned int PC, int bank) { + + int tag = 0; + tag = PC; + tag ^= tag1FHist[bank]->value ^ (tag2FHist[bank]->value << 1); + + return (tag & ((1 << (TB[bank])) - 1)); +} + +// just a simple pseudo random number generator: use available information +// to allocate entries in the loop predictor +int TageBase::MYRANDOM() { + Seed++; + Seed ^= phist; + Seed = (Seed >> 21) + (Seed << 11); + Seed = (Seed >> 10) + (Seed << 22); + return (Seed); +}; + + + +// Prediction ---------------------------------------------- +bool TageBase::GetPrediction(uint64_t PC) { + + DPRINTIF(COND,"---- %i PC:%lx -------\n", stats.total, PC); + // computes the TAGE table addresses and the partial tags + pred_taken = predict(PC); + return pred_taken; +} + +bool TageBase::predict(uint64_t PC) { + + // 1. The base prediction + basePredict(PC); + tage_provider = BASE; + + // 2. The TAGE prediction + tagePredict(PC); + provider = tage_provider; + return tage_pred; +} + + +void TageBase::calcIndicesAndTags(uint64_t PC) { + + // 1. Compute indices and tags + for (int i = 1; i <= nhist; i += 2) { + GI[i] = gindex(PC, i); + GTAG[i] = gtag(PC, i); + GTAG[i + 1] = GTAG[i]; + GI[i + 1] = GI[i] ^ (GTAG[i] & ((1 << LogG) - 1)); + } + + + int T = (PC ^ (phist & ((1 << m[born]) - 1))) % nbankhigh; + T = disableInterleaving ? 1 : T; + + for (int i = born; i <= nhist; i++) + if (NOSKIP[i]) { + GI[i] += (T << LogG); + T++; + T = T % nbankhigh; + } + + T = (PC ^ (phist & ((1 << m[1]) - 1))) % nbanklow; + T = disableInterleaving ? 1 : T; + + for (int i = 1; i <= born - 1; i++) + if (NOSKIP[i]) { + GI[i] += (T << LogG); + T++; + T = T % nbanklow; + } + +} + +// TAGE PREDICTION: same code at fetch or retire time but the index and +// tags must recomputed +void TageBase::tagePredict(uint64_t PC) { + + // 1. 
Compute indices and tags + calcIndicesAndTags(PC); + + HitBank = AltBank = 0; + HitEntry = AltEntry = nullptr; + #define TAGBRANCH (PC == 94321081556116) + + // 2. Perform the table lookup + // Look for the bank with longest matching history + for (int i = nhist; i > 0; i--) { + if (NOSKIP[i]) { + if (gtable[i][GI[i]].tag == GTAG[i]) { + + HitBank = i; + HitEntry = >able[i][GI[i]]; + break; + } + } + } + + // Look for the alternate bank + for (int i = HitBank - 1; i > 0; i--) { + if (NOSKIP[i]) { + if (gtable[i][GI[i]].tag == GTAG[i]) { + + AltBank = i; + AltEntry = >able[i][GI[i]]; + break; + } + } + } + + // If there was no hit in the tables use the base prediction + // Initialize the prediction to the base prediction + tageConf = altConf = baseConf; + LongestMatchPred = tage_pred = alttaken = base_pred; + + // 3. Read the predictions and choose between + // longest matching and alternate matching + if (HitBank > 0) { + + // Read the longest match prediction and its confidence. + LongestMatchPred = (HitEntry->ctr >= 0); + tageConf = compConf(HitEntry->ctr, cwidth); + + if (AltBank > 0) { + // For a second hit read also the alternate match prediction. + alttaken = (AltEntry->ctr >= 0); + altConf = compConf(AltEntry->ctr, cwidth); + } + DPRINTIF(COND,"Hit:%i,GI:%i,GT:%i,c:%i Alt:%i\n", HitBank, GI[HitBank], GTAG[HitBank], HitEntry->ctr, AltBank); + + + // Manage the selection between longest matching and alternate + // matching is done by considering the confidence of longest + // and alternate matching. 
+ tage_provider = chooseProvider(); + switch (tage_provider) { + case LONGEST: + provVal = HitEntry->ctr; + tage_pred = LongestMatchPred; + break; + case ALT: + provVal = AltEntry->ctr; + tage_pred = alttaken; + break; + case BASE: + provVal = BIM; + tage_pred = base_pred; + break; + } + } +} + + +void TageBase::updateHistory(const uint64_t pc, const bool taken, + const OpType opType, const uint64_t target) { + + bool indirect = (opType == OPTYPE_CALL_INDIRECT_COND) || + (opType == OPTYPE_CALL_INDIRECT_UNCOND) || + (opType == OPTYPE_JMP_INDIRECT_COND) || + (opType == OPTYPE_JMP_INDIRECT_UNCOND) || + (opType == OPTYPE_RET_COND) || + (opType == OPTYPE_RET_UNCOND); + + int tbits = indirect ? nHistBits+1 : nHistBits; + + int historyBits = taken ? 1 : 0; + if (nHistBits > 1) { + historyBits ^= pc ^ (pc >> 2); + } + ghr_l = (ghr_l << 1) | (taken & 1); + int PATH = pc ^ (pc >> 2) ^ (pc >> 4); + + for (int t = 0; t < tbits; t++) { + // update history + + bool brDir = (historyBits & 1); + updateGHist(brDir); + historyBits >>= 1; +#ifdef USEPATH + int PATHBIT = (PATH & 127); + PATH >>= 1; + phist = (phist << 1) ^ PATHBIT; + phist &= (1<update(); + tag1FHist[i]->update(); + tag2FHist[i]->update(); + } +} + + +int TageBase::idxChooser() { + bool add1 = (altConf != LowConf); + return ((((HitBank - 1) / 8) << 1) + add1) % (size_use_alt - 1); +} + +unsigned TageBase::chooseProvider() { + // Manage the selection between longest matching and alternate + // matching. + // We take two sources of information to decide. First wheather + // the hit entry is recognized as a newly allocated entry (confidence + // low) and USE_ALT_ON_NA is positive use the alternate prediction + + // If the longest is somehow certain use its prediction. + if (tageConf != LowConf) { + return LONGEST; + } + + // Use on low confidence if the USE_ALT_ON_NA is negative + if (use_alt_on_na[idxChooser()] < 0) { + return LONGEST; + } + + return (AltBank > 0) ? 
ALT : BASE; +} + +void TageBase::updateChooser(bool taken) { + + if (tageConf != LowConf) + return; + + if (LongestMatchPred != alttaken) { + ctrupdate(use_alt_on_na[idxChooser()], (alttaken == taken), alt_width); + } +} + + +int +TageBase::adjustAlloc(bool resolveDir) { + + // TAGE UPDATE + bool ALLOC = ((tage_pred != resolveDir) & (HitBank < nhist)); + + // do not allocate too often if the overall prediction is correct + + if (HitBank > 0) { + // Manage the selection between longest matching and alternate + // matching for "pseudo"-newly allocated longest matching entry this + // is extremely important for TAGE only, not that important when the + // overall predictor is implemented + // An entry is considered as newly allocated if its prediction + // counter is weak + if (tageConf == LowConf) { + + // if the longest match was delivering the correct prediction, + // no need to allocate a new entry even if the overall + // prediction was false + if (LongestMatchPred == resolveDir) ALLOC = false; + } + // Update the chooser policy between longest matching and + // alternate matching. + updateChooser(resolveDir); + } + + if (tage_pred == resolveDir) + if ((MYRANDOM() & 31) != 0) ALLOC = false; + + // Fixed number of allocations for a missprediction. + if (ALLOC) { + return 1 + nnn; + } + return 0; +} + + +void +TageBase::allocateTables(int nalloc, uint64_t PC, bool resolveDir) { + if (nalloc <= 0) return; + + // T is the number of entries additionally allocated to at + // least one entry per missprediction. 
+ int T = nalloc - 1; + + int A = 1; + if ((MYRANDOM() & 127) < 32) A = 2; + int Penalty = 0; + int NA = 0; + // int + int DEP = HitBank + 1; + if (assoc_start < assoc_end) + DEP = ((((HitBank - 1 + 2 * A) & 0xffe)) ^ (MYRANDOM() & 1)); + + // just a complex formula to chose between X and X+1, when X is odd: + // sorry + + for (int I = DEP; I < nhist; I += 2) { + int i = I + 1; + bool Done = false; + if (NOSKIP[i]) { + auto n = allocate(i, PC, resolveDir); + if (n > 0) { + NA+=1; + + if ((T <= 0) || n > 1) { + break; + } + I += 2; + Done = true; + T -= 1; + } else if (n < 0) { + Penalty++; + } + } + + if (!Done) { + i = (I ^ 1) + 1; + if (NOSKIP[i]) { + auto n = allocate(i, PC, resolveDir); + if (n > 0) { + NA+=1; + + if ((T <= 0) || n > 1) { + break; + } + I += 2; + Done = true; + T -= 1; + } else if (n < 0) { + Penalty++; + } + } + } + } + stats.totalAllocTries += Penalty + NA; + stats.totalAllocInit++; + TICK += (Penalty - 2 * NA); + + // just the best formula for the Championship: + // In practice when one out of two entries are useful + if (TICK < 0) TICK = 0; + if (TICK >= BORNTICK) { + for (int i = 1; i <= born; i += born - 1) // born=11 => T:1,11 + for (int j = 0; j < SizeTable[i]; j++) { + gtable[i][j].u >>= 1; + } + TICK = 0; + stats.uResets++; + } +} + +Gentry& TageBase::getEntry(int idx) { + return gtable[idx][GI[idx]]; +} + +int TageBase::allocate(int idx, uint64_t pc, bool taken) { + auto& entry = getEntry(idx); + + if (entry.u != 0) { + return -1; + } + + +#define OPTREMP +// the replacement is optimized with a single u bit: 0.2 % +#ifdef OPTREMP + if (abs(2 * entry.ctr + 1) > 3) { + if (entry.ctr > 0) + entry.ctr--; + else + entry.ctr++; + return 0; + } +#endif + + evict(entry, idx); + + DPRINTIF(COND,"Alloc:%i,GI:%i,GT:%i\n", idx, GI[idx], GTAG[idx]); + + entry.tag = GTAG[idx]; + entry.pc = pc; + entry.hlen = idx; + entry.idx = GI[idx]; + + entry.ctr = (taken) ? 
0 : -1; + + // entry.u = 0; + stats.allocations[idx]++; + stats.totalAllocations++; + + if (entry.correct < 0) entry.correct = 0; + + return 1; +} + + +// PREDICTOR UPDATE + +bool TageBase::tageUpdate(uint64_t pc, bool resolveDir) { + + bool update_base = false; + + // update predictions + if (HitBank > 0) { + if (tageConf == LowConf) { + if (LongestMatchPred != resolveDir) { + // acts as a protection + if (AltBank > 0) { + ctrupdate(AltEntry->ctr, resolveDir, cwidth); + } else { + update_base = true; + } + } + } + + // Do the actual counter update + ctrupdate(HitEntry->ctr, resolveDir, cwidth); + // sign changes: no way it can have been useful + if (HitEntry->ctr == (resolveDir ? 0 : -1)) { + HitEntry->u = 0; + } + + // If both the longest and alternate predictions where correct + // we can possible free the longest entry to use it for other + // predictions. + // We clear this entry by clearing the useful bit. + if (isNotUseful(resolveDir)) { + if (HitEntry->u > 0) { + HitEntry->u--; + } + } + // If the longest hit was correct but the alternative prediction + // was not promote this entry to be useful. + if (isUseful(resolveDir)) { + if (HitEntry->u < (1 << uwidth) - 1) + HitEntry->u++; + HitEntry->useful++; + } + DPRINTIF(COND,"TageUpdate: idx:%d, ctr:%i,u:%i,T:%i\n", + HitBank, HitEntry->ctr, HitEntry->u, HitEntry->tag); + + } else { + update_base = true; + } + + // END TAGE UPDATE + return update_base; +} + +bool TageBase::isNotUseful(bool taken) { + + // If both the longest and alternate predictions where correct + // we can possible free the longest entry to use it for other + // predictions. + if ((alttaken == taken) && (LongestMatchPred == taken)) { + // We only clear if the alternate prediction has a + // high confidence. 
+ if (altConf == HighConf) { + if (AltBank > 0) { + return true; + } + } + } + return false; +} + +bool TageBase::isUseful(bool taken) { + + // If the longest prediction is correct but the alternate + // prediction is wrong the longest is useful. + if ((alttaken != taken) && (LongestMatchPred == taken)) { + return true; + } + return false; +} + + +void TageBase::updateTables(uint64_t pc, bool resolveDir, bool predDir) { + + // 1. Allocate tables if necessary + int nalloc = adjustAlloc(resolveDir); + allocateTables(nalloc, pc, resolveDir); + + // 2. the TAGE tables + bool update_base = tageUpdate(pc, resolveDir); + + // If the prediction was from the base predictor, update it. + if (update_base) { + baseUpdate(pc, resolveDir, predDir); + } +} + +void TageBase::FirstTimeUpdate(uint64_t PC, bool taken, + uint64_t branchTarget) { + + branchCount++; + if (taken) { + stats.takenBranches++; + } + // computes the TAGE table addresses and the partial tags + basePredict(PC); + baseUpdate(PC, taken, false); + updateHistory(PC, taken, OPTYPE_JMP_DIRECT_COND, branchTarget); +} + + +// Predictor update ---------------------------------------- +void TageBase::UpdatePredictor(uint64_t PC, bool resolveDir, + bool predDir, uint64_t branchTarget) { + branchCount++; + stats.condBranches++; + if (predDir) { + stats.takenBranches++; + } + updateTables(PC, resolveDir, predDir); + + updateHistory(PC, resolveDir, OPTYPE_JMP_DIRECT_COND, branchTarget); + updateStats(resolveDir, predDir, PC); +} + +void TageBase::TrackOtherInst(uint64_t PC, OpType opType, bool taken, + uint64_t branchTarget) { + branchCount++; + if (taken) { + stats.takenBranches++; + } + updateHistory(PC, taken, opType, branchTarget); +} + + + + + +bool TageBase::isAllias(uint64_t pc, int bank) { + if (bank == 0) { + return (btable[BI].pc != pc) && (btable[BI].pc != 0); + }; + return (gtable[bank][GI[bank]].pc != pc) && (gtable[bank][GI[bank]].pc != 0); +} + +void TageBase::updateStats(bool taken, bool predtaken, uint64_t 
PC) { + stats.total++; + auto correct = taken == tage_pred; + if (correct) { + auto cb = 0; + switch (tage_provider) { + case LONGEST: + stats.longestMatchCorrect++; + cb = HitBank; + break; + case ALT: + stats.altMatchCorrect++; + cb = AltBank; + break; + case BASE: + stats.bimodalCorrect++; + cb = 0; + break; + } + stats.providerCorrect[cb]++; + if (isAllias(PC, cb)) { + stats.positiveAlliasing[cb]++; + } + } else { + auto cb = 0; + switch (tage_provider) { + case LONGEST: + stats.longestMatchWrong++; + cb = HitBank; + break; + case ALT: + stats.altMatchWrong++; + cb = AltBank; + break; + case BASE: + stats.bimodalWrong++; + cb = 0; + break; + } + stats.providerWrong[cb]++; + if (isAllias(PC, cb)) { + stats.negativeAlliasing[cb]++; + } + } + + // If it was new allocation and it was delivering the correct + // prediction update stats + if (correct) { + switch (tage_provider) { + case LONGEST: + HitEntry->correct++; + break; + + case ALT: + AltEntry->correct++; + break; + + case BASE: + break; + } + } else { + switch (tage_provider) { + case LONGEST: + HitEntry->incorrect++; + break; + + case ALT: + AltEntry->incorrect++; + break; + + case BASE: + break; + } + } + + auto tage_correct = taken == tage_pred; + auto base_correct = taken == base_pred; + auto alt_correct = taken == alttaken; + auto long_correct = taken == LongestMatchPred; + + if (HitBank > 0) { + if (AltBank > 0) { + if (tage_provider == ALT) { + + if (alttaken == LongestMatchPred) { + stats.altProvTageSame++; + } + if (alttaken == base_pred) { + stats.altProvBaseSame++; + } + + + if (!alt_correct && long_correct) { + stats.altProvTageWouldHaveCorrect++; + } + if (!alt_correct && base_correct) { + stats.altProvBaseWouldHaveCorrect++; + } + } else if (tage_provider == LONGEST) { + + if (LongestMatchPred == alttaken) { + stats.tageProvAltSame++; + } + if (LongestMatchPred == base_pred) { + stats.tageProvBaseSame++; + } + + if (!long_correct && alt_correct) { + stats.tageProvAltWouldHaveCorrect++; + } + if 
(!long_correct && base_correct) { + stats.tageProvBaseWouldHaveCorrect++; + } + } else { + + if (base_pred == alttaken) { + stats.baseProvAltSame++; + } + if (base_pred == LongestMatchPred) { + stats.baseProvTageSame++; + } + + if (!base_correct && long_correct) { + stats.baseProvTageWouldHaveCorrect++; + } + if (!base_correct && alt_correct) { + stats.baseProvAltWouldHaveCorrect++; + } + } + } else { + if (tage_provider == LONGEST) { + if (LongestMatchPred == base_pred) { + stats.tageProvBaseSame++; + } + + if (!tage_correct && base_correct) { + stats.tageProvBaseWouldHaveCorrect++; + } + + } else { + if (base_pred == LongestMatchPred) { + stats.baseProvTageSame++; + } + if (!base_correct && long_correct) { + stats.baseProvTageWouldHaveCorrect++; + } + } + } + } + + if (!tage_correct) stats.tageMispred++; + if (!base_correct) stats.baseMispred++; +} + + +void TageBase::PrintStat(double instr) { + + printf("AllBr:%lu, CondBr:%i, TakenBr:%i, Ticks:%i\n", + branchCount, stats.condBranches, stats.takenBranches, + ticks + ); + + printf("Bi : %i(%.5f) | BiW : %i(%.5f) | BiC : %i(%.5f)\n", + (stats.bimodalCorrect + stats.bimodalWrong), + (double)(stats.bimodalCorrect + stats.bimodalWrong) / (double)stats.total, + stats.bimodalWrong, (double)stats.bimodalWrong / (double)(stats.bimodalCorrect + stats.bimodalWrong), + stats.bimodalCorrect, (double)stats.bimodalCorrect / (double)(stats.bimodalCorrect + stats.bimodalWrong) + ); + printf("TG : %i(%.5f) | TGW : %i(%.5f) | TGC: %i(%.5f) \n", + (stats.longestMatchCorrect + stats.longestMatchWrong), + (double)(stats.longestMatchCorrect + stats.longestMatchWrong) / (double)stats.total, + stats.longestMatchWrong, (double)stats.longestMatchWrong / (double)(stats.longestMatchCorrect + stats.longestMatchWrong), + stats.longestMatchCorrect, (double)stats.longestMatchCorrect / (double)(stats.longestMatchCorrect + stats.longestMatchWrong)); + printf("ATG: %i(%.5f) | ATW : %i(%.5f) | ATC: %i(%.5f)\n", + (stats.altMatchCorrect + 
stats.altMatchWrong), + (double)(stats.altMatchCorrect + stats.altMatchWrong) / (double)stats.total, + stats.altMatchWrong, (double)stats.altMatchWrong / (double)(stats.altMatchCorrect + stats.altMatchWrong), + stats.altMatchCorrect, (double)stats.altMatchCorrect / (double)(stats.altMatchCorrect + stats.altMatchWrong)); + + + printf("Prov: T:[AS:%i AC:%i, BS:%i AC:%i], A:[TS:%i TC:%i, BS:%i BC:%i], B:[TS:%i TC:%i, AS:%i AC:%i]\n", + stats.tageProvAltSame, stats.tageProvAltWouldHaveCorrect, + stats.tageProvBaseSame, stats.tageProvBaseWouldHaveCorrect, + stats.altProvTageSame, stats.altProvTageWouldHaveCorrect, + stats.altProvBaseSame, stats.altProvBaseWouldHaveCorrect, + stats.baseProvTageSame, stats.baseProvTageWouldHaveCorrect, + stats.baseProvAltSame, stats.baseProvAltWouldHaveCorrect + ); + printf("MPKI:: BASE:%.4f, TAGE:%.4f Red:%i (%.4f) Instr:%i\n", + (double)stats.baseMispred / (double)instr * 1000, + (double)stats.tageMispred / (double)instr * 1000, + int(stats.baseMispred-stats.tageMispred), + (double)(stats.baseMispred-stats.tageMispred) / (double)stats.baseMispred * 100, + int(instr)); + + + for (int i = 0; i <= nhist; i++) { + if (NOSKIP[i] || i == 0) { + + + printf( + "P%d: %i, %.5f | C: %i, %.5f | W: %i, %.5f | Alloc:%d | Allias: p:%i,n:%i\n", + i, (stats.providerCorrect[i] + stats.providerWrong[i]), + (double)(stats.providerCorrect[i] + stats.providerWrong[i]) / (double)stats.total, + stats.providerCorrect[i], + (double)stats.providerCorrect[i] / (double)stats.total, + stats.providerWrong[i], + (double)stats.providerWrong[i] / (double)stats.total, + stats.allocations[i], + stats.positiveAlliasing[i], stats.negativeAlliasing[i] + ); + } + } + printf( + "Allocations:%i, Init:%i, AperI:%.2f, TriesPerInit:%.3f | Useful: %i | TC: %i, TW: %i, uReset:%i\n", + stats.totalAllocations, + stats.totalAllocInit, + (double)stats.totalAllocations / (double)stats.totalAllocInit, + (double)stats.totalAllocTries / (double)stats.totalAllocInit, + stats.totalUseful, + 
(stats.longestMatchCorrect + stats.altMatchCorrect + + stats.bimodalCorrect), + (stats.longestMatchWrong + stats.altMatchWrong + stats.bimodalWrong), + stats.uResets); + +} + + + + +void TageBase::resetStats() { + stats = {}; + + for (int i = 0; i < (nbanklow * (1 << LogG)); i++) { + Gentry &entry = gtable[1][i]; + entry.correct = 0; + entry.incorrect = 0; + entry.useful = 0; + } + for (int i = 0; i < (nbankhigh * (1 << LogG)); i++) { + Gentry &entry = gtable[born][i]; + entry.correct = 0; + entry.incorrect = 0; + entry.useful = 0; + } +}; + + +}; // namespace LLBP \ No newline at end of file diff --git a/src/cpu/pred/llbpref/tage.h b/src/cpu/pred/llbpref/tage.h new file mode 100644 index 00000000000..7920e0c4a27 --- /dev/null +++ b/src/cpu/pred/llbpref/tage.h @@ -0,0 +1,368 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Credit: + * The code is based on the 64KiB TAGE-SC-L branch predictor by Andre Seznec + * provided in the CBP-5 competition. + * It was reformated and made easier to configure. Furthermore, the code + * adds a lot of statistics and debugging information to the predictor. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "base_predictor.h" +#include "hist_registers.h" + + + +namespace LLBP { + + + +struct TageConfig; + +class TageBase : public BasePredictor { + public: + TageBase(TageConfig config); + ~TageBase(); + + bool GetPrediction(uint64_t PC) override; + void FirstTimeUpdate(uint64_t PC, bool taken, + uint64_t branchTarget) override; + void UpdatePredictor(uint64_t PC, bool resolveDir, + bool predDir, uint64_t branchTarget) override; + void TrackOtherInst(uint64_t PC, OpType opType, bool taken, + uint64_t branchTarget) override; + + virtual void PrintStat(double instr) override; + void tick() override { + ticks++; + }; + void resetStats() override; + + + + protected: + static const int histbufferlen = + 8192; // we use a 4K entries history buffer to store the branch history + // (this allows us to explore using history length up to 4K) + + static const int MAXNHIST = 40; // Constant limit for the number of tables + + const int nhist; // twice the number of different histories + + const int nbanklow; // number of banks in the shared bank-interleaved for the low + // history lengths + const int nbankhigh; // number of banks in the shared bank-interleaved for the history + // lengths + const int born; // below born in the table for low history lengths, >= born in the + // table for high history lengths, + // we use 2-way associativity for the medium history lengths + const int assoc_start; // 2 -way assoc for those banks 0.4 % + const int assoc_end; + + int SizeTable[MAXNHIST]; + + /*in practice 2 bits or 3 bits par branch: around 1200 cond. 
branchs*/ + + const int minhist; // 6 // not optimized so far + const int maxhist; // 3000 + + const int hystshift = 2; // bimodal hysteresis shared by 4 entries + const int LogB; // 13 // log of number of entries in bimodal predictor + const int + LogG; // 10 /* logsize of the banks in the tagged TAGE tables */ + const int Tbits; // 8 + // minimum width of the tags (low history lengths), +4 + // for high history + // lengths + + bool NOSKIP[MAXNHIST]; // to manage the associativity for different + // history lengths + + const int nnn = + 1; // number of extra entries allocated on a TAGE misprediction (1+nnn) + + static const int phistwidth = 27; // width of the path history used in TAGE + const int uwidth; // u counter width on TAGE (2 bits not worth + // the effort for a 512 Kbits predictor 0.2 %) + const int cwidth; // predictor counter width on the TAGE tagged tables + + // the counter(s) to chose between longest match and alternate prediction on + // TAGE when weak counters + unsigned tageConf; // Different confidence levels + unsigned baseConf; + unsigned altConf; + int provVal; + // For chooser between alt and longest match + const int alt_width = 5; + const int size_use_alt; + int8_t use_alt_on_na[10]; // 10 is not the actual size, but the maximum. + + int TICK; // for the reset of the u counter + + long long phist; // path history + + // History --------------------------- + const int nHistBits = 2; // Number of history bits per branch + const bool takenHistory = false; // Use taken history instead of direction history + + // The global history register + HistoryRegisterFast ghr; + uint64_t ghr_l; + + // The folded global history registers per table implemented as + // Circular Shift Registers (CSRs). Each table has three CSRs. + // One to compute the index and two for the tag. Two are required as any + // periodic history pattern matching the length of the CSR will XOR to + // all zero. Therefore the second CSR has a width n-1. 
+ FoldedHistoryFast* indexFHist[MAXNHIST]; + FoldedHistoryFast* tag1FHist[MAXNHIST]; + FoldedHistoryFast* tag2FHist[MAXNHIST]; + + // For the TAGE predictor + Bentry *btable; // bimodal TAGE table + Gentry *gtable[MAXNHIST]; // tagged TAGE tables + int m[MAXNHIST]; + int TB[MAXNHIST]; + int logg[MAXNHIST]; + + int GI[MAXNHIST]; // indexes to the different tables are computed only once + + uint GTAG[MAXNHIST]; // tags for the different tables are computed only once + bool pred_taken; // prediction + bool alttaken; // alternate TAGEprediction + bool tage_pred; // TAGE prediction + bool LongestMatchPred; + int HitBank; // longest matching bank + int AltBank; // alternate matching bank + Gentry* HitEntry; // pointer to the HitBank entry + Gentry* AltEntry; // pointer to the AltBank entry + int Seed; // for the pseudo-random number generator + bool pred_inter; + enum { + BASE = 0b001, + ALT = 0b010, + LONGEST = 0b100, + ALT_B = 0b011, + ALT_T = 0b110, + LAST_TAGE_PROVIDER_TYPE = 0b111 + }; + + unsigned provider, tage_provider; + + void reinit(); + int predictorsize(); + + public: + void getPredInfo(unsigned& provider, unsigned& conf) { + provider = tage_provider; + conf = (tage_provider == LONGEST) ? tageConf : + (tage_provider == ALT) ? 
altConf : baseConf; + } + protected: + // int THRES; + + // Base predictor functions --------------------------- + // Can be overridden by derived classes + virtual bool basePredict(const uint64_t pc); + virtual void baseUpdate(uint64_t pc, bool resolveDir, bool predDir); + + int BI; // index of the bimodal table + int8_t BIM; + bool base_pred; // prediction of the base table + + // the index functions for the tagged tables uses path history as in the + // OGEHL predictor + // F serves to mix path history: not very important impact + int F(long long A, int size, int bank); + + // gindex computes a full hash of PC, ghist and phist + int gindex(unsigned int PC, int bank); + + // tag computation + uint16_t gtag(unsigned int PC, int bank); + + // Calculate indices and tags for the TAGE predictor + void calcIndicesAndTags(uint64_t pc); + + // just a simple pseudo random number generator: use available information + // to allocate entries in the loop predictor + int MYRANDOM(); + + // The overall prediction function --------------------------- + virtual bool predict(uint64_t pc); + + // TAGE PREDICTION: the actual computation and lookup in the tage tables + virtual void tagePredict(uint64_t pc); + + // Update the predictor + virtual void updateTables(uint64_t pc, bool resolveDir, bool predDir); + + // Update of the tagged tables. Will return whether to update the base + bool tageUpdate(uint64_t pc, bool resolveDir); + + // New table allocations + virtual int allocate(int idx, uint64_t pc, bool taken); + virtual int adjustAlloc(bool taken); + void allocateTables(int nalloc, uint64_t pc, bool taken); + virtual Gentry& getEntry(int bank); + + int idxChooser(); + virtual unsigned chooseProvider(); + virtual void updateChooser(bool taken); + + // Function to determine when a entry is not useful + // anymore. 
+ virtual bool isNotUseful(bool taken); + virtual bool isUseful(bool taken); + + + // History update function + virtual void updateHistory(const uint64_t pc, const bool taken, + const OpType opType, const uint64_t branchTarget); + virtual void updateGHist(const bool bit); + + virtual bool isAllias(uint64_t pc, int bank); + + virtual void evict(Gentry& entry, int idx) {}; + + // Disable bank interleaving. + const bool disableInterleaving; + + uint64_t branchCount = 0; + int ticks = 0; + + struct tage_stats { + int bimodalCorrect = 0; + int bimodalWrong = 0; + int longestMatchCorrect = 0; + int longestMatchWrong = 0; + int altMatchCorrect = 0; + int altMatchWrong = 0; + int total = 0; + int providerCorrect[MAXNHIST] = {0}; + int providerWrong[MAXNHIST] = {0}; + int allocations[MAXNHIST] = {0}; + int totalAllocations = 0; + int useful[MAXNHIST] = {0}; + int totalUseful = 0; + int positiveAlliasing[MAXNHIST] = {0}; + int negativeAlliasing[MAXNHIST] = {0}; + int uResets = 0; + int totalAlloc = 0; + int totalAllocInit = 0; + int totalAllocTries = 0; + + int baseProvTageWouldHaveCorrect = 0; + int baseProvAltWouldHaveCorrect = 0; + int baseProvTageSame = 0; + int baseProvAltSame = 0; + int altProvTageWouldHaveCorrect = 0; + int altProvBaseWouldHaveCorrect = 0; + int altProvTageSame = 0; + int altProvBaseSame = 0; + int tageProvBaseWouldHaveCorrect = 0; + int tageProvAltWouldHaveCorrect = 0; + int tageProvBaseSame = 0; + int tageProvAltSame = 0; + + + int baseMispred = 0; + int tageMispred = 0; + + int condBranches = 0; + int takenBranches = 0; + } stats; + + virtual void updateStats(bool taken, bool predtaken, uint64_t PC); +}; + + +// Configuration for the TAGE predictor +// Default configuration is for the 64k TAGE predictor +struct TageConfig { + + int nhist = 36; + int minhist = 6; + int maxhist = 3000; + int LogG = 10; + int LogB = 13; + int Tbits = 8; + int nbanklow = 10; + int nbankhigh = 20; + int born = 13; + int assoc_start = 9; + int assoc_end = 23; + + int 
uwidth = 1; + int cwidth = 3; + int log_size_use_alt = 4; + + bool tage8k = false; + bool disableInterleaving = false; + bool overwriteNotUseful = true; + bool removeAlliasing = false; + bool tagContext = false; + + void print() const { + printf("TAGE Config: nhist=%d minhist=%d maxhist=%d LogG=%d LogB=%d Tbits=%d nbanklow=%d nbankhigh=%d born=%d assoc_start=%d assoc_end=%d\n", + nhist, minhist, maxhist, LogG, LogB, Tbits, nbanklow, nbankhigh, born, assoc_start, assoc_end); + } +}; + +inline const TageConfig Tage64kConfig = {}; + + +inline const TageConfig TageLargeConfig = { + .LogG = 21, + .LogB = 21, + .Tbits = 15 +}; + +inline const TageConfig TageLargeBimConfig = { + .LogB = 21, +}; + + +class Tage64k : public TageBase { + public: + Tage64k(void) + : TageBase(Tage64kConfig) {} +}; + + + +}; // namespace LLBP \ No newline at end of file diff --git a/src/cpu/pred/llbpref/tage_scl.cc b/src/cpu/pred/llbpref/tage_scl.cc new file mode 100755 index 00000000000..73157877a86 --- /dev/null +++ b/src/cpu/pred/llbpref/tage_scl.cc @@ -0,0 +1,933 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * The code is based on the 64KiB TAGE-SC-L branch predictor by Andre Seznec + * provided in the CBP-5 competition. + * It was reformated and made easier to configure. Furthermore, the code + * adds a lot of statistics and debugging information to the predictor. + */ + +#include "tage_scl.h" + +#include "counters.h" +#include "common.h" + +using namespace std; + +namespace LLBP { + + + +////////////////////////////////////////////////////////////////////////// +// DEFINES + +// --- Global --- // +// #define SC // 8.2 % if TAGE alone +#define IMLI // 0.2 % +#define LOCALH // 2.7 % +#define GLOBALH + + +#define LOOPPREDICTOR // loop predictor enable + +// --- SC --- // + +// The statistical corrector components + +#define PERCWIDTH 6 // Statistical corrector counter width 5 -> 6 : 0.6 % + + +#ifdef TAGE8k +#define BWH + +//The two BIAS tables in the SC component +//We play with confidence here +#define LOGBIAS 7 + +#define BIAS +#define BIASSK +#define BIASBANK + +#else // TAGE64k + +// The three BIAS tables in the SC component +// We play with the TAGE confidence here, with the number of the hitting bank +#define BIAS +#define BIASSK +#define BIASBANK + +#define LOGBIAS 8 +#define LOCALS // enable the 2nd local history +#define LOCALT // enables the 3rd local history + +#endif + +#define INDBIAS \ + ( \ + ((((PC ^ (PC >> 2)) << 1) \ + ^ ((tageConf==LowConf) & (LongestMatchPred != alttaken))) << 1) \ + + pred_inter \ + ) & ((1 << LOGBIAS) - 1) + +#define INDBIASSK \ + (((((PC ^ (PC >> (LOGBIAS - 2))) << 1) ^ ((tageConf==HighConf))) << 1) + pred_inter) & \ + ((1 << LOGBIAS) - 1) + +#define INDBIASBANK \ + (pred_inter + (((HitBank + 1) / 4) << 4) + ((tageConf==HighConf) << 1) + \ + 
((tageConf==LowConf) << 2) + ((AltBank != 0) << 3) + ((PC ^ (PC >> 2)) << 7)) & \ + ((1 << LOGBIAS) - 1) + +// playing with putting more weights (x2) on some of the SC components +// playing on using different update thresholds on SC +// update threshold for the statistical corrector +#define VARTHRES +#define WIDTHRES 12 +#define WIDTHRESP 8 +#ifdef VARTHRES +#define LOGSIZEUP 6 // not worth increasing +#else +#define LOGSIZEUP 0 +#endif +#define LOGSIZEUPS (LOGSIZEUP / 2) +#define INDUPD (PC ^ (PC >> 2)) & ((1 << LOGSIZEUP) - 1) +#define INDUPDS ((PC ^ (PC >> 2)) & ((1 << (LOGSIZEUPS)) - 1)) +#define EWIDTH 6 + + + +// --- Loop Predictor --- // + +#ifdef LOOPPREDICTOR +// parameters of the loop predictor +#define WIDTHNBITERLOOP \ + 10 // we predict only loops with less than 1K iterations +#define LOOPTAG 10 // tag width in the loop predictor + +#endif + +/////////////////////////////////////////////////////////////////////////// + +TageSCL::TageSCL(TSCLConfig cfg) + : TageBase(cfg.tageConfig) + , LogL(cfg.LogL) + , useSC(cfg.useSC) + , useLoop(cfg.useLoop) + , disableConfCounter(false) +{ + cfg.print(); + init_predictor(); + predictorsize(); +} + +TageSCL::~TageSCL() { + delete[] ltable; +} + +void TageSCL::predictorsize() { + int inter, STORAGESIZE = 0; + + if (useLoop) { + + inter = (1 << LogL) * (2 * WIDTHNBITERLOOP + LOOPTAG + 4 + 4 + 1); + printf(" (LOOP %d) ", inter); + STORAGESIZE += inter; + } + + if (!useSC) { + printf(" (SC %d) ", 0); + return; + } + + inter += WIDTHRES; + inter = WIDTHRESP * ((1 << LOGSIZEUP)); // the update threshold counters + inter += + 3 * EWIDTH * (1 << LOGSIZEUPS); // the extra weight of the partial sums + inter += (PERCWIDTH) * 3 * (1 << (LOGBIAS)); + + inter += (GNB - 2) * (1 << (LOGGNB)) * (PERCWIDTH) + + (1 << (LOGGNB - 1)) * (2 * PERCWIDTH); + inter += Gm[0]; // global histories for SC + inter += (PNB - 2) * (1 << (LOGPNB)) * (PERCWIDTH) + + (1 << (LOGPNB - 1)) * (2 * PERCWIDTH); + // we use phist already counted for these 
tables + +#ifdef BWH + inter += BWNB * (1 << LOGBWNB) * PERCWIDTH; + inter += EWIDTH * (1 << LOGSIZEUPS); // the extra weight of the partial sums + inter += BWm[0]; +#endif + +#ifdef LOCALH + inter += (LNB - 2) * (1 << (LOGLNB)) * (PERCWIDTH) + + (1 << (LOGLNB - 1)) * (2 * PERCWIDTH); + inter += NLOCAL * Lm[0]; + inter += EWIDTH * (1 << LOGSIZEUPS); +#ifdef LOCALS + inter += (SNB - 2) * (1 << (LOGSNB)) * (PERCWIDTH) + + (1 << (LOGSNB - 1)) * (2 * PERCWIDTH); + inter += NSECLOCAL * (Sm[0]); + inter += EWIDTH * (1 << LOGSIZEUPS); + +#endif +#ifdef LOCALT + inter += (TNB - 2) * (1 << (LOGTNB)) * (PERCWIDTH) + + (1 << (LOGTNB - 1)) * (2 * PERCWIDTH); + inter += NTLOCAL * Tm[0]; + inter += EWIDTH * (1 << LOGSIZEUPS); +#endif + +#endif + +#ifdef IMLI + + inter += (1 << (LOGINB - 1)) * PERCWIDTH; + inter += Im[0]; + + inter += IMNB * (1 << (LOGIMNB - 1)) * PERCWIDTH; + inter += + 2 * EWIDTH * (1 << LOGSIZEUPS); // the extra weight of the partial sums + inter += 256 * IMm[0]; +#endif + inter += 2 * ConfWidth; // the 2 counters in the choser + STORAGESIZE += inter; + + printf(" (SC %d) ", inter); + +} + +void TageSCL::init_predictor() { + + ltable = new lentry[1 << (LogL)]; + + +#ifdef LOOPPREDICTOR + LVALID = false; + WITHLOOP = -1; + _lSeed = 0; +#endif + + updatethreshold = 35 << 3; + + for (int i = 0; i < (1 << LOGSIZEUP); i++) Pupdatethreshold[i] = 0; + for (int i = 0; i < GNB; i++) GGEHL[i] = &GGEHLA[i][0]; + for (int i = 0; i < LNB; i++) LGEHL[i] = &LGEHLA[i][0]; + + for (int i = 0; i < GNB; i++) + for (int j = 0; j < ((1 << LOGGNB) - 1); j++) { + if (!(j & 1)) { + GGEHL[i][j] = -1; + } + } + for (int i = 0; i < LNB; i++) + for (int j = 0; j < ((1 << LOGLNB) - 1); j++) { + if (!(j & 1)) { + LGEHL[i][j] = -1; + } + } + + for (int i = 0; i < SNB; i++) SGEHL[i] = &SGEHLA[i][0]; + for (int i = 0; i < TNB; i++) TGEHL[i] = &TGEHLA[i][0]; + for (int i = 0; i < PNB; i++) PGEHL[i] = &PGEHLA[i][0]; + + for (int i = 0; i < BWNB; i++) BWGEHL[i] = &BWGEHLA[i][0]; + for (int i = 
0; i < BWNB; i++) + for (int j = 0; j < ((1 << LOGBWNB) - 1); j++) { + if (!(j & 1)) { + BWGEHL[i][j] = -1; + } + } + +#ifdef IMLI +#ifdef IMLIOH + for (int i = 0; i < FNB; i++) FGEHL[i] = &FGEHLA[i][0]; + + for (int i = 0; i < FNB; i++) + for (int j = 0; j < ((1 << LOGFNB) - 1); j++) { + if (!(j & 1)) { + FGEHL[i][j] = -1; + } + } +#endif + for (int i = 0; i < INB; i++) IGEHL[i] = &IGEHLA[i][0]; + for (int i = 0; i < INB; i++) + for (int j = 0; j < ((1 << LOGINB) - 1); j++) { + if (!(j & 1)) { + IGEHL[i][j] = -1; + } + } + for (int i = 0; i < IMNB; i++) IMGEHL[i] = &IMGEHLA[i][0]; + for (int i = 0; i < IMNB; i++) + for (int j = 0; j < ((1 << LOGIMNB) - 1); j++) { + if (!(j & 1)) { + IMGEHL[i][j] = -1; + } + } + +#endif + for (int i = 0; i < SNB; i++) + for (int j = 0; j < ((1 << LOGSNB) - 1); j++) { + if (!(j & 1)) { + SGEHL[i][j] = -1; + } + } + for (int i = 0; i < TNB; i++) + for (int j = 0; j < ((1 << LOGTNB) - 1); j++) { + if (!(j & 1)) { + TGEHL[i][j] = -1; + } + } + for (int i = 0; i < PNB; i++) + for (int j = 0; j < ((1 << LOGPNB) - 1); j++) { + if (!(j & 1)) { + PGEHL[i][j] = -1; + } + } + + for (int j = 0; j < (1 << LOGBIAS); j++) { + switch (j & 3) { + case 0: + BiasSK[j] = -8; + break; + case 1: + BiasSK[j] = 7; + break; + case 2: + BiasSK[j] = -32; + + break; + case 3: + BiasSK[j] = 31; + break; + } + } + for (int j = 0; j < (1 << LOGBIAS); j++) { + switch (j & 3) { + case 0: + Bias[j] = -32; + + break; + case 1: + Bias[j] = 31; + break; + case 2: + Bias[j] = -1; + break; + case 3: + Bias[j] = 0; + break; + } + } + for (int j = 0; j < (1 << LOGBIAS); j++) { + switch (j & 3) { + case 0: + BiasBank[j] = -32; + + break; + case 1: + BiasBank[j] = 31; + break; + case 2: + BiasBank[j] = -1; + break; + case 3: + BiasBank[j] = 0; + break; + } + } + + for (int i = 0; i < (1 << LOGSIZEUPS); i++) { + WG[i] = 7; + WL[i] = 7; + WS[i] = 7; + WT[i] = 7; + WP[i] = 7; + WI[i] = 7; + WB[i] = 4; + WBW[i] = 7; + } + TICK = 0; + for (int i = 0; i < NLOCAL; i++) { + 
L_shist[i] = 0; + } + for (int i = 0; i < NSECLOCAL; i++) { + S_slhist[i] = 0; + } + IMLIcount = 0; +} + + + + +void TageSCL::updateHistory(const uint64_t PC, const bool taken, + const OpType opType, const uint64_t branchTarget) { + Tage::updateHistory(PC, taken, opType, branchTarget); + + if (opType != OPTYPE_JMP_DIRECT_COND) return; + +#ifdef IMLI + IMHIST[IMLIcount] = (IMHIST[IMLIcount] << 1) + taken; + if (branchTarget < PC) { + // This branch corresponds to a loop + if (!taken) { + // exit of the "loop" + IMLIcount = 0; + } + if (taken) { + if (IMLIcount < ((1 << Im[0]) - 1)) IMLIcount++; + } + } +#endif + GHIST = (GHIST << 1) + (taken & (branchTarget < PC)); + BWHIST = (BWHIST << 1) + ((branchTarget < PC) & taken); + L_shist[INDLOCAL] = (L_shist[INDLOCAL] << 1) + (taken); + S_slhist[INDSLOCAL] = ((S_slhist[INDSLOCAL] << 1) + taken) ^ (PC & 15); + T_slhist[INDTLOCAL] = (T_slhist[INDTLOCAL] << 1) + taken; +} + +bool TageSCL::predict(uint64_t pc) { + + // 1. The base prediction + basePredict(pc); + tage_provider = BASE; + + // 2. The TAGE prediction + tagePredict(pc); + tage_scl_pred = tage_pred; + scl_provider = tage_provider; + + // 3. SCL prediction + SCLPredict(pc); + + // 4. 
Choose the correct prediction + provider = scl_provider; + return tage_scl_pred; +} + +void TageSCL::SCLPredict(uint64_t pc) { + + // Loop prediction + if (useLoop) { + loop_pred = getloop(pc); // loop prediction + if ((WITHLOOP >= 0) && (LVALID)) { + tage_scl_pred = loop_pred; + scl_provider = LOOP; + } + } + // Store the prediction without the SC + pred_inter = tage_scl_pred; + + if (useSC) { + int prov_inter = scl_provider; + + // Make the SC prediction + SCpredict(pc); + sc_pred = (LSUM >= 0); + + + + if (pred_inter != sc_pred) { + // Chooser uses TAGE confidence and |LSUM| + scl_provider = STC; + + // Minimal benefit in trying to avoid accuracy loss on low confidence SC + // prediction and high/medium confidence on TAGE + // but just uses 2 counters 0.3 % MPKI reduction + if (!disableConfCounter){ + if ((tageConf==HighConf)) { + if ((abs(LSUM) < THRES / 4)) { + scl_provider = prov_inter; + } + + else if ((abs(LSUM) < THRES / 2)) + scl_provider = (SecondH < 0) ? STC : prov_inter; + } + + if (tageConf==MedConf) + if ((abs(LSUM) < THRES / 4)) { + scl_provider = (FirstH < 0) ? 
STC : prov_inter; + } + } + } + + if (scl_provider == STC) { + tage_scl_pred = sc_pred; + } + } +} + +// PREDICTOR UPDATE +void TageSCL::updateTables(uint64_t PC, bool resolveDir, bool predDir) +{ + // TAGE update ----------------- + Tage::updateTables(PC, resolveDir, predDir); + + SCLUpdate(PC, resolveDir, predDir); +} + +// PREDICTOR UPDATE +void TageSCL::SCLUpdate(uint64_t PC, bool resolveDir, bool predDir) +{ + // Loop update ----------------- + if (useLoop) { + if (LVALID) { + if (tage_scl_pred != loop_pred) + ctrupdate(WITHLOOP, (loop_pred == resolveDir), 7); + } + loopupdate(PC, resolveDir, (tage_scl_pred != resolveDir)); + } + + // SC update ----------------- + if (useSC) { + SCUpdate(PC, resolveDir, predDir); + } +} + + + +void TageSCL::SCpredict(uint64_t PC) { + + LSUM = 0; + int8_t ctr = 0; + + LSUM += compPartial(PC); + + // integrate BIAS prediction ------- +#ifdef BIAS + ctr = Bias[INDBIAS]; + LSUM += (2 * ctr + 1); +#endif +#ifdef BIASSK + ctr = BiasSK[INDBIASSK]; + LSUM += (2 * ctr + 1); +#endif +#ifdef BIASBANK + ctr = BiasBank[INDBIASBANK]; + LSUM += (2 * ctr + 1); +#endif + + // Threshold for the statistical corrector +#ifdef VARTHRES + LSUM = (1 + (WB[INDUPDS] >= 0)) * LSUM; +#endif + +#ifdef GLOBALH +#ifdef TAGE8k + LSUM += Gpredict (PC, GHIST, Gm, GGEHL, GNB, LOGGNB, WG); + LSUM += Gpredict (PC, BWHIST, BWm, BWGEHL, BWNB, LOGBWNB, WBW); +#else // TAGE64k + // integrate the GEHL predictions + LSUM += Gpredict((PC << 1) + pred_inter, GHIST, Gm, GGEHL, GNB, LOGGNB, WG); + LSUM += Gpredict(PC, phist, Pm, PGEHL, PNB, LOGPNB, WP); +#endif +#endif + + // Local history based components +#ifdef LOCALH + LSUM += Gpredict(PC, L_shist[INDLOCAL], Lm, LGEHL, LNB, LOGLNB, WL); +#ifdef LOCALS + LSUM += Gpredict(PC, S_slhist[INDSLOCAL], Sm, SGEHL, SNB, LOGSNB, WS); +#endif +#ifdef LOCALT + LSUM += Gpredict(PC, T_slhist[INDTLOCAL], Tm, TGEHL, TNB, LOGTNB, WT); +#endif +#endif + +#ifdef IMLI +#ifndef TAGE8k + LSUM += Gpredict(PC, IMHIST[(IMLIcount)], IMm, 
IMGEHL, IMNB, LOGIMNB, WIM); +#endif + LSUM += Gpredict(PC, IMLIcount, Im, IGEHL, INB, LOGINB, WI); +#endif + + // bool SCPRED = (LSUM >= 0); + // Read the threshold for the statistical corrector + // just an heuristic if the respective contribution of component groups + // can be multiplied by 2 or not + THRES = (updatethreshold >> 3) + Pupdatethreshold[INDUPD] +#ifdef TAGE8k +#ifdef VARTHRES + + 6 * ((WB[INDUPDS] >= 0) +#ifdef LOCALH + + (WL[INDUPDS] >= 0) +#endif +// #ifdef GSC + + (WG[INDUPDS] >= 0) + (WBW[INDUPDS] >= 0) +// #endif +#ifdef IMLI + + (WI[INDUPDS] >= 0) +#endif + ) +#endif + +#else // TAGE64k +#ifdef VARTHRES + + + 12 * ((WB[INDUPDS] >= 0) + (WP[INDUPDS] >= 0) +#ifdef LOCALH + + (WS[INDUPDS] >= 0) + (WT[INDUPDS] >= 0) + (WL[INDUPDS] >= 0) +#endif + + (WG[INDUPDS] >= 0) +#ifdef IMLI + + (WI[INDUPDS] >= 0) +#endif + ) +#endif +#endif + ; + +} + +void TageSCL::SCUpdate(uint64_t PC, bool resolveDir, bool predDir) +{ + + bool SCPRED = (LSUM >= 0); + + // Chooser update ----------------- + if (!disableConfCounter) { + if (pred_inter != SCPRED) { + if ((abs(LSUM) < THRES)) + if (tageConf==HighConf) { + if ((abs(LSUM) < THRES / 2)) + if ((abs(LSUM) >= THRES / 4)) + ctrupdate(SecondH, (pred_inter == resolveDir), + ConfWidth); + } + if (tageConf==MedConf) + if ((abs(LSUM) < THRES / 4)) { + ctrupdate(FirstH, (pred_inter == resolveDir), ConfWidth); + } + } + } + + + if ((SCPRED != resolveDir) || ((abs(LSUM) < THRES))) { + + updatePartial(PC, resolveDir); + + { + if (SCPRED != resolveDir) { + Pupdatethreshold[INDUPD] += 1; + updatethreshold += 1; + } + + else { + Pupdatethreshold[INDUPD] -= 1; + updatethreshold -= 1; + } + + if (Pupdatethreshold[INDUPD] >= (1 << (WIDTHRESP - 1))) + Pupdatethreshold[INDUPD] = (1 << (WIDTHRESP - 1)) - 1; + // Pupdatethreshold[INDUPD] could be negative + if (Pupdatethreshold[INDUPD] < -(1 << (WIDTHRESP - 1))) + Pupdatethreshold[INDUPD] = -(1 << (WIDTHRESP - 1)); + if (updatethreshold >= (1 << (WIDTHRES - 1))) + updatethreshold = 
(1 << (WIDTHRES - 1)) - 1; + // updatethreshold could be negative + if (updatethreshold < -(1 << (WIDTHRES - 1))) + updatethreshold = -(1 << (WIDTHRES - 1)); + } + +#ifdef VARTHRES + { + int XSUM = + LSUM - ((WB[INDUPDS] >= 0) * + ((2 * Bias[INDBIAS] + 1) + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1))); + if ((XSUM + ((2 * Bias[INDBIAS] + 1) + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1)) >= + 0) != (XSUM >= 0)) + ctrupdate( + WB[INDUPDS], + (((2 * Bias[INDBIAS] + 1) + (2 * BiasSK[INDBIASSK] + 1) + + (2 * BiasBank[INDBIASBANK] + 1) >= + 0) == resolveDir), + EWIDTH); + } +#endif + // Bias update ----------------- + ctrupdate(Bias[INDBIAS], resolveDir, PERCWIDTH); + ctrupdate(BiasSK[INDBIASSK], resolveDir, PERCWIDTH); + ctrupdate(BiasBank[INDBIASBANK], resolveDir, PERCWIDTH); + + // Global history based components +#ifdef TAGE8k + Gupdate(PC, resolveDir, GHIST, Gm, GGEHL, GNB, LOGGNB, WG); + Gupdate(PC, resolveDir, BWHIST, BWm, BWGEHL, BWNB, LOGBWNB, WBW); +#else // TAGE64k + Gupdate((PC << 1) + pred_inter, resolveDir, GHIST, Gm, GGEHL, GNB, + LOGGNB, WG); + Gupdate(PC, resolveDir, phist, Pm, PGEHL, PNB, LOGPNB, WP); +#endif + + // Local history based components +#ifdef LOCALH + Gupdate(PC, resolveDir, L_shist[INDLOCAL], Lm, LGEHL, LNB, LOGLNB, WL); +#ifdef LOCALS + Gupdate(PC, resolveDir, S_slhist[INDSLOCAL], Sm, SGEHL, SNB, LOGSNB, + WS); +#endif +#ifdef LOCALT + + Gupdate(PC, resolveDir, T_slhist[INDTLOCAL], Tm, TGEHL, TNB, LOGTNB, + WT); +#endif +#endif + +#ifdef IMLI +#ifndef TAGE8k + Gupdate(PC, resolveDir, IMHIST[(IMLIcount)], IMm, IMGEHL, IMNB, LOGIMNB, + WIM); +#endif + Gupdate(PC, resolveDir, IMLIcount, Im, IGEHL, INB, LOGINB, WI); +#endif + } +} + + + + + +// #define GINDEX +// (((long long)PC) ^ bhist ^ (bhist >> (8 - i)) ^ (bhist >> (16 - 2 * i)) ^ +// (bhist >> (24 - 3 * i)) ^ (bhist >> (32 - 3 * i)) ^ +// (bhist >> (40 - 4 * i))) & +// ((1 << (logs - (i >= (NBR - 2)))) - 1) + +int64_t TageSCL::gIndex(uint64_t PC, 
int64_t bhist, int logs, int nbr, int i) +{ + auto substr = (i >= (nbr - 2)) ? 1 : 0; + return (((int64_t) PC) ^ bhist ^ (bhist >> (8 - i)) ^ + (bhist >> (16 - 2 * i)) ^ (bhist >> (24 - 3 * i)) ^ + (bhist >> (32 - 3 * i)) ^ (bhist >> (40 - 4 * i))) & + ((1 << (logs - substr)) - 1); +} + + +int TageSCL::Gpredict(uint64_t PC, long long BHIST, int *length, int8_t **tab, + int NBR, int logs, int8_t *W) { + int PERCSUM = 0; + for (int i = 0; i < NBR; i++) { + long long bhist = BHIST & ((long long)((1 << length[i]) - 1)); + long long index = gIndex(PC, bhist, logs, NBR, i); + + int8_t ctr = tab[i][index]; + + PERCSUM += (2 * ctr + 1); + } +#ifdef VARTHRES + PERCSUM = (1 + (W[INDUPDS] >= 0)) * PERCSUM; +#endif + return ((PERCSUM)); +} + +void TageSCL::Gupdate(uint64_t PC, bool taken, long long BHIST, int *length, + int8_t **tab, int NBR, int logs, int8_t *W) { + int PERCSUM = 0; + + for (int i = 0; i < NBR; i++) { + long long bhist = BHIST & ((long long)((1 << length[i]) - 1)); + long long index = gIndex(PC, bhist, logs, NBR, i); + + PERCSUM += (2 * tab[i][index] + 1); + ctrupdate(tab[i][index], taken, PERCWIDTH); + } +#ifdef VARTHRES + { + int XSUM = LSUM - ((W[INDUPDS] >= 0)) * PERCSUM; + if ((XSUM + PERCSUM >= 0) != (XSUM >= 0)) + ctrupdate(W[INDUPDS], ((PERCSUM >= 0) == taken), EWIDTH); + } +#endif +} + +/// ---- LOOP PREDICTOR ---/// + +int TageSCL::lindex(uint64_t PC) { + return (((PC ^ (PC >> 2)) & ((1 << (LogL - 2)) - 1)) << 2); +} + +// loop prediction: only used if high confidence +// skewed associative 4-way +// At fetch time: speculative +#define CONFLOOP 15 +bool TageSCL::getloop(uint64_t PC) { + LHIT = -1; + + LI = lindex(PC); + LIB = ((PC >> (LogL - 2)) & ((1 << (LogL - 2)) - 1)); + LTAG = (PC >> (LogL - 2)) & ((1 << 2 * LOOPTAG) - 1); + LTAG ^= (LTAG >> LOOPTAG); + LTAG = (LTAG & ((1 << LOOPTAG) - 1)); + + for (int i = 0; i < 4; i++) { + int index = (LI ^ ((LIB >> i) << 2)) + i; + + if (ltable[index].TAG == LTAG) { + LHIT = i; + LVALID = 
((ltable[index].confid == CONFLOOP) || + (ltable[index].confid * ltable[index].NbIter > 128)); + + if (ltable[index].CurrentIter + 1 == ltable[index].NbIter) + return (!(ltable[index].dir)); + return ((ltable[index].dir)); + } + } + + LVALID = false; + return (false); +} + +void TageSCL::loopupdate(uint64_t PC, bool Taken, bool ALLOC) { + if (LHIT >= 0) { + int index = (LI ^ ((LIB >> LHIT) << 2)) + LHIT; + // already a hit + if (LVALID) { + if (Taken != loop_pred) { + // free the entry + ltable[index].NbIter = 0; + ltable[index].age = 0; + ltable[index].confid = 0; + ltable[index].CurrentIter = 0; + return; + + } else if ((loop_pred != tage_pred) || ((MYLoopRANDOM() & 7) == 0)) + if (ltable[index].age < CONFLOOP) ltable[index].age++; + } + + ltable[index].CurrentIter++; + ltable[index].CurrentIter &= ((1 << WIDTHNBITERLOOP) - 1); + // loop with more than 2** WIDTHNBITERLOOP iterations are not + // treated correctly; but who cares :-) + if (ltable[index].CurrentIter > ltable[index].NbIter) { + ltable[index].confid = 0; + ltable[index].NbIter = 0; + // treat like the 1st encounter of the loop + } + if (Taken != ltable[index].dir) { + if (ltable[index].CurrentIter == ltable[index].NbIter) { + if (ltable[index].confid < CONFLOOP) ltable[index].confid++; + if (ltable[index].NbIter < 3) + // just do not predict when the loop count is 1 or 2 + { + // free the entry + ltable[index].dir = Taken; + ltable[index].NbIter = 0; + ltable[index].age = 0; + ltable[index].confid = 0; + } + } else { + if (ltable[index].NbIter == 0) { + // first complete nest; + ltable[index].confid = 0; + ltable[index].NbIter = ltable[index].CurrentIter; + } else { + // not the same number of iterations as last time: free + // the entry + ltable[index].NbIter = 0; + ltable[index].confid = 0; + } + } + ltable[index].CurrentIter = 0; + } + + } else if (ALLOC) + + { + uint64_t X = MYLoopRANDOM() & 3; + + if ((MYLoopRANDOM() & 3) == 0) + for (int i = 0; i < 4; i++) { + int LHIT = (X + i) & 3; + int index 
= (LI ^ ((LIB >> LHIT) << 2)) + LHIT; + if (ltable[index].age == 0) { + ltable[index].dir = !Taken; + // most of mispredictions are on last iterations + ltable[index].TAG = LTAG; + ltable[index].NbIter = 0; + ltable[index].age = 7; + ltable[index].confid = 0; + ltable[index].CurrentIter = 0; + break; + + } else + ltable[index].age--; + break; + } + } +} + +void TageSCL::updateStats(bool taken, bool predtaken, uint64_t PC) { + Tage::updateStats(taken, predtaken, PC); + + bool tage_correct = tage_pred == taken; + bool tscl_correct = tage_scl_pred == taken; + if (!tage_correct) sclstats.tageMisses++; + if (!tscl_correct) sclstats.tsclMisses++; + + switch (provider) { + case LONGEST: + case ALT: + case BASE: + sclstats.provTage++; + if (tage_scl_pred == taken) sclstats.tageCorrect++; + else sclstats.tageIncorrect++; + break; + case LOOP: + sclstats.provLoop++; + if (tage_scl_pred == taken) sclstats.loopCorrect++; + else sclstats.loopIncorrect++; + break; + case STC: + sclstats.provSC++; + if (tage_scl_pred == taken) sclstats.scCorrect++; + else sclstats.scIncorrect++; + break; + default: + break; + } +} +void TageSCL::PrintStat(double instr) { + Tage::PrintStat(instr); + printf("SCL:: L:[P:%d(%.4f) C:%d(%.4f), W:%d(%.4f)] SC:[P:%d(%.4f) C:%d(%.4f), W:%d(%.4f)] TAGE:[P:%d(%.4f) C:%d(%.4f), W:%d(%.4f)] \n", + sclstats.provLoop, sclstats.provLoop / (double)stats.total, + sclstats.loopCorrect, sclstats.loopCorrect / (double)sclstats.provLoop, + sclstats.loopIncorrect, sclstats.loopIncorrect / (double)sclstats.provLoop, + sclstats.provSC, sclstats.provSC / (double)stats.total, + sclstats.scCorrect, sclstats.scCorrect / (double)sclstats.provSC, + sclstats.scIncorrect, sclstats.scIncorrect / (double)sclstats.provSC, + sclstats.provTage, sclstats.provTage / (double)stats.total, + sclstats.tageCorrect, sclstats.tageCorrect / (double)sclstats.provTage, + sclstats.tageIncorrect, sclstats.tageIncorrect / (double)sclstats.provTage); + + printf("MPKI:: TAGE:%.5f, SCL:%.5f Red:%.5f 
\n", + (double)sclstats.tageMisses / (double)instr * 1000, + (double)sclstats.tsclMisses / (double)instr * 1000, + (double)(sclstats.tageMisses - sclstats.tsclMisses) / (double)sclstats.tageMisses * 100); +} + +// Dump all tage tables as csv file + +void TageSCL::DumpTables(std::string filename) {} + + +}; // namespace LLBP diff --git a/src/cpu/pred/llbpref/tage_scl.h b/src/cpu/pred/llbpref/tage_scl.h new file mode 100644 index 00000000000..ccd64e3a3b4 --- /dev/null +++ b/src/cpu/pred/llbpref/tage_scl.h @@ -0,0 +1,472 @@ +/* MIT License + * + * Copyright (c) 2024 David Schall and EASE lab + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * The code is based on the 64KiB TAGE-SC-L branch predictor by Andre Seznec + * provided in the CBP-5 competition. + * It was reformated and made easier to configure. Furthermore, the code + * adds a lot of statistics and debugging information to the predictor. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "tage.h" + +//---------------------- +// #define TAGEInf +// #define TAGE8k +//---------------------- + +namespace LLBP { + + +struct TSCLConfig; + +class TageSCL : public TageBase { + public: + TageSCL(TSCLConfig config); + ~TageSCL(); + + void DumpTables(std::string filename) override; + void PrintStat(double NUMINST) override; + + protected: + typedef TageBase Tage; + + // Override some base class functions + bool predict(uint64_t pc) override; + void updateTables(uint64_t pc, bool resolveDir, bool predDir) override; + + void updateStats(bool taken, bool predtaken, uint64_t PC) override; + void updateHistory(const uint64_t pc, const bool taken, const OpType opType, + const uint64_t branchTarget) override; + + void init_predictor(); + void predictorsize(); + void resetStats() override { + Tage::resetStats(); + sclstats = {}; + }; + // Internal prediction of the SCL part + bool SCLpredict(uint64_t pc); + + + + +// --- SC --- // +// The max number is not the actual number of entries just for the +// initialization + +#ifdef TAGEInf + + +// Bias tables +#define LogBiasMax 20 + int8_t Bias[(1 << LogBiasMax)]; + int8_t BiasSK[(1 << LogBiasMax)]; + int8_t BiasBank[(1 << LogBiasMax)]; + + // In all th GEHL components, the two tables with the shortest history + // lengths have only half of the entries. 
+ + // IMLI-SIC -> Micro 2015 paper: a big disappointment on CBP2016 traces + long long IMLIcount; // use to monitor the iteration number + +// Not used by INF SCL ------------------------------ +#define LOGBWNB 7 +#define BWNB 2 + int BWm[BWNB] = { 16, 8 }; + int8_t BWGEHLA[BWNB][(1 << LOGBWNB)] = { {0} }; + int8_t *BWGEHL[BWNB]; + long long BWHIST; + + +#define LOGINB 19 // 128-entry +#define INB 1 + int Im[INB] = {8}; + int8_t IGEHLA[INB][(1 << LOGINB)] = {{0}}; + int8_t *IGEHL[INB]; + +// global branch GEHL +#define LOGGNB 19 // 1 1K + 2 * 512-entry tables +#define GNB 3 + int Gm[GNB] = {40, 24, 10}; + int8_t GGEHLA[GNB][(1 << LOGGNB)] = {{0}}; + int8_t *GGEHL[GNB]; + +// first local history +#define LOGLNB 19 // 1 1K + 2 * 512-entry tables +#define LNB 3 + int Lm[LNB] = {11, 6, 3}; + int8_t LGEHLA[LNB][(1 << LOGLNB)] = {{0}}; + int8_t *LGEHL[LNB]; +#define LOGLOCAL 19 +#define NLOCAL (1 << LOGLOCAL) +#define INDLOCAL ((PC ^ (PC >> 2)) & (NLOCAL - 1)) + long long L_shist[NLOCAL]; // local histories + + +// Variation on the IMLI predictor +#define LOGIMNB 19 // 2* 256 -entry +#define IMNB 2 + int IMm[IMNB] = {10, 4}; + int8_t IMGEHLA[IMNB][(1 << LOGIMNB)] = {{0}}; + int8_t *IMGEHL[IMNB]; + long long IMHIST[256]; + + +// variation on global branch history +#define PNB 3 +#define LOGPNB 19 // 1 1K + 2 * 512-entry tables + int Pm[PNB] = {25, 16, 9}; + int8_t PGEHLA[PNB][(1 << LOGPNB)] = {{0}}; + int8_t *PGEHL[PNB]; + + +// second local history +#define LOGSNB 19 // 1 1K + 2 * 512-entry tables +#define SNB 3 + int Sm[SNB] = {16, 11, 6}; + int8_t SGEHLA[SNB][(1 << LOGSNB)] = {{0}}; + + int8_t *SGEHL[SNB]; +#define LOGSECLOCAL 4 +#define NSECLOCAL (1 << LOGSECLOCAL) // Number of second local histories +#define INDSLOCAL (((PC ^ (PC >> 5))) & (NSECLOCAL - 1)) + long long S_slhist[NSECLOCAL]; + +// third local history +#define LOGTNB 19 // 2 * 512-entry tables +#define TNB 2 + int Tm[TNB] = {9, 4}; + int8_t TGEHLA[TNB][(1 << LOGTNB)] = {{0}}; + + int8_t *TGEHL[TNB]; 
+#define NTLOCAL 19 +#define INDTLOCAL \ + (((PC ^ (PC >> (LOGTNB)))) & \ + (NTLOCAL - 1)) // different hash for the history + long long T_slhist[NTLOCAL]; + + +#else // No TAGEInf ------------------------------ + + + +// Bias tables +#define LogBiasMax 15 + int8_t Bias[(1 << LogBiasMax)]; + int8_t BiasSK[(1 << LogBiasMax)]; + int8_t BiasBank[(1 << LogBiasMax)]; + + // In all th GEHL components, the two tables with the shortest history + // lengths have only half of the entries. + + // IMLI-SIC -> Micro 2015 paper: a big disappointment on CBP2016 traces + long long IMLIcount; // use to monitor the iteration number + + +#ifdef TAGE8k + +#define LOGINB 7 +#define INB 1 + int Im[INB] = { 8 }; + int8_t IGEHLA[INB][(1 << LOGINB)] = { {0} }; + int8_t *IGEHL[INB]; + +//global branch GEHL +#define LOGGNB 7 +#define GNB 2 + int Gm[GNB] = {6,3}; + int8_t GGEHLA[GNB][(1 << LOGGNB)] = { {0} }; + int8_t *GGEHL[GNB]; + + +//large local history +#define LOGLNB 7 +#define LNB 2 + int Lm[LNB] = { 6, 3 }; + int8_t LGEHLA[LNB][(1 << LOGLNB)] = { {0} }; + int8_t *LGEHL[LNB]; + +#define LOGLOCAL 6 +#define NLOCAL (1<>2)) & (NLOCAL-1)) + long long L_shist[NLOCAL]; + + +#else // TAGE64k + +#define LOGINB 8 // 128-entry +#define INB 1 + int Im[INB] = {8}; + int8_t IGEHLA[INB][(1 << LOGINB)] = {{0}}; + int8_t *IGEHL[INB]; + +// global branch GEHL +#define LOGGNB 10 // 1 1K + 2 * 512-entry tables +#define GNB 3 + int Gm[GNB] = {40, 24, 10}; + int8_t GGEHLA[GNB][(1 << LOGGNB)] = {{0}}; + int8_t *GGEHL[GNB]; + +// first local history +#define LOGLNB 10 // 1 1K + 2 * 512-entry tables +#define LNB 3 + int Lm[LNB] = {11, 6, 3}; + int8_t LGEHLA[LNB][(1 << LOGLNB)] = {{0}}; + int8_t *LGEHL[LNB]; +#define LOGLOCAL 8 +#define NLOCAL (1 << LOGLOCAL) +#define INDLOCAL ((PC ^ (PC >> 2)) & (NLOCAL - 1)) + long long L_shist[NLOCAL]; // local histories + +#endif + + +// Only used by 8k TAGE ------------------------------ +// Backward branch history +#define LOGBWNB 7 +#define BWNB 2 + int BWm[BWNB] = { 
16, 8 }; + int8_t BWGEHLA[BWNB][(1 << LOGBWNB)] = { {0} }; + int8_t *BWGEHL[BWNB]; + long long BWHIST; + + + +// Only used by 64k TAGE ------------------------------ +// Variation on the IMLI predictor +#define LOGIMNB 9 // 2* 256 -entry +#define IMNB 2 + int IMm[IMNB] = {10, 4}; + int8_t IMGEHLA[IMNB][(1 << LOGIMNB)] = {{0}}; + int8_t *IMGEHL[IMNB]; + long long IMHIST[256]; + + +// variation on global branch history +#define PNB 3 +#define LOGPNB 9 // 1 1K + 2 * 512-entry tables + int Pm[PNB] = {25, 16, 9}; + int8_t PGEHLA[PNB][(1 << LOGPNB)] = {{0}}; + int8_t *PGEHL[PNB]; + + +// second local history +#define LOGSNB 9 // 1 1K + 2 * 512-entry tables +#define SNB 3 + int Sm[SNB] = {16, 11, 6}; + int8_t SGEHLA[SNB][(1 << LOGSNB)] = {{0}}; + + int8_t *SGEHL[SNB]; +#define LOGSECLOCAL 4 +#define NSECLOCAL (1 << LOGSECLOCAL) // Number of second local histories +#define INDSLOCAL (((PC ^ (PC >> 5))) & (NSECLOCAL - 1)) + long long S_slhist[NSECLOCAL]; + +// third local history +#define LOGTNB 10 // 2 * 512-entry tables +#define TNB 2 + int Tm[TNB] = {9, 4}; + int8_t TGEHLA[TNB][(1 << LOGTNB)] = {{0}}; + + int8_t *TGEHL[TNB]; +#define NTLOCAL 16 +#define INDTLOCAL \ + (((PC ^ (PC >> (LOGTNB)))) & \ + (NTLOCAL - 1)) // different hash for the history + long long T_slhist[NTLOCAL]; + + +#endif + +#define LOGSIZEUPMAX 6 // not worth increasing + int updatethreshold; + int Pupdatethreshold[(1 << LOGSIZEUPMAX)]; // size is fixed by LOGSIZEUP + int8_t WG[(1 << LOGSIZEUPMAX)]; + int8_t WL[(1 << LOGSIZEUPMAX)]; + int8_t WS[(1 << LOGSIZEUPMAX)]; + int8_t WT[(1 << LOGSIZEUPMAX)]; + int8_t WP[(1 << LOGSIZEUPMAX)]; + int8_t WI[(1 << LOGSIZEUPMAX)]; + int8_t WIM[(1 << LOGSIZEUPMAX)]; + int8_t WB[(1 << LOGSIZEUPMAX)]; + int8_t WBW[(1 << LOGSIZEUPMAX)]; + + long long GHIST; + int LSUM; + bool sc_pred, tage_scl_pred; + + // The thereshold for the SC prediction + int THRES; + // + int64_t gIndex(uint64_t PC, int64_t bhist, int logs, int nbr, int i); + + int Gpredict(uint64_t PC, long long 
BHIST, int *length, int8_t **tab, + int NBR, int logs, int8_t *W); + void Gupdate(uint64_t PC, bool taken, long long BHIST, int *length, + int8_t **tab, int NBR, int logs, int8_t *W); + + void SCpredict(uint64_t pc); + void SCUpdate(uint64_t PC, bool resolveDir, bool predDir); + + void SCLPredict(uint64_t pc); + void SCLUpdate(uint64_t PC, bool resolveDir, bool predDir); + + + // Two hooks to add additional information into the SC prediction + virtual int compPartial(uint64_t pc) { return 0; } + virtual void updatePartial(uint64_t PC, bool resolveDir) {} + + // ---- LOOP PREDICTOR --- // + class lentry // loop predictor entry + { + public: + uint16_t NbIter; // 10 bits + uint8_t confid; // 4bits + uint16_t CurrentIter; // 10 bits + + uint16_t TAG; // 10 bits + uint8_t age; // 4 bits + bool dir; // 1 bit + + // 39 bits per entry + lentry() { + confid = 0; + CurrentIter = 0; + NbIter = 0; + TAG = 0; + age = 0; + dir = false; + } + }; + const int LogL; // log of number of entries in the loop predictor + lentry *ltable; // loop predictor table + // variables for the loop predictor + bool loop_pred; // loop predictor prediction + int LIB; + int LI; + int LHIT; // hitting way in the loop predictor + int LTAG; // tag on the loop predictor + bool LVALID; // validity of the loop predictor prediction + int8_t WITHLOOP; // counter to monitor whether or not loop prediction is + // beneficial + + int lindex(uint64_t PC); + bool getloop(uint64_t PC); + void loopupdate(uint64_t PC, bool Taken, bool ALLOC); + int _lSeed; + // just a simple pseudo random number generator: use available information + // to allocate entries in the loop predictor + int MYLoopRANDOM() { + _lSeed++; + // Seed ^= phist; + _lSeed = (_lSeed >> 21) + (_lSeed << 11); + _lSeed = (_lSeed >> 10) + (_lSeed << 22); + return (_lSeed); + }; + + // Chooser --------------------- + enum { + LOOP = Tage::LAST_TAGE_PROVIDER_TYPE +1, + STC = LOOP + 1, + LAST_SCL_PROVIDER_TYPE = STC + }; + int scl_provider; + + // The two 
counters used to choose between TAGE and SC on Low Conf SC + int8_t FirstH, SecondH; + const int ConfWidth = 7; // for the counters in the choser + + const bool useSC; + const bool useLoop; + + // Disable the chooser confidence counter + const bool disableConfCounter; + + // Some stats + struct + { + /* data */ + int provLoop = 0; + int loopCorrect = 0; + int loopIncorrect = 0; + int provSC = 0; + int scCorrect = 0; + int scIncorrect = 0; + int provTage = 0; + int tageCorrect = 0; + int tageIncorrect = 0; + int tageMisses = 0; + int tsclMisses = 0; + } sclstats; + +}; + + +// Configurations +struct TSCLConfig { + TageConfig tageConfig; + bool useSC = true; + bool useLoop = true; + int LogL = 5; + + void print() const { + printf("SCL Config: useSC=%d, useLoop=%d\n", useSC, useLoop); + } +}; +inline const TSCLConfig TSCL64kCfgDefault = {}; + + +// Configurations +class TageSCL64k : public TageSCL { + public: + TageSCL64k(void) + : TageSCL(TSCL64kCfgDefault) + {} +}; + +class TageSCL512k : public TageSCL { + public: + TageSCL512k(void) + : TageSCL(TSCLConfig + { + .tageConfig = TageConfig { + .LogG = 13, // 64K (2**10 entries) * 8 + } + }) + {} +}; + + +}; // namespace LLBP diff --git a/src/cpu/pred/loop_predictor.hh b/src/cpu/pred/loop_predictor.hh index c92c2e78f43..633782ffba7 100644 --- a/src/cpu/pred/loop_predictor.hh +++ b/src/cpu/pred/loop_predictor.hh @@ -156,7 +156,9 @@ class LoopPredictor : public SimObject BranchInfo() : loopTag(0), currentIter(0), loopPred(false), - loopPredValid(false), loopIndex(0), loopIndexB(0), loopHit(0), + loopPredValid(false), + loopPredUsed(false), + loopIndex(0), loopIndexB(0), loopHit(-1), predTaken(false) {} }; diff --git a/src/cpu/pred/ltage.cc b/src/cpu/pred/ltage.cc index 64a749ee4b1..b9d94dc39ff 100644 --- a/src/cpu/pred/ltage.cc +++ b/src/cpu/pred/ltage.cc @@ -82,7 +82,7 @@ LTAGE::branchPlaceholder(ThreadID tid, Addr pc, } //prediction -bool +Prediction LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* 
&b) { LTageBranchInfo *bi = new LTageBranchInfo(*tage, *loopPredictor, @@ -110,7 +110,7 @@ LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) // record final prediction bi->lpBranchInfo->predTaken = pred_taken; - return pred_taken; + return staticPrediction(pred_taken); } // PREDICTOR UPDATE diff --git a/src/cpu/pred/ltage.hh b/src/cpu/pred/ltage.hh index 89bef7b7c92..2da30905b15 100644 --- a/src/cpu/pred/ltage.hh +++ b/src/cpu/pred/ltage.hh @@ -90,7 +90,6 @@ class LTAGE : public TAGE void init() override; - protected: /** The loop predictor object */ LoopPredictor *loopPredictor; @@ -128,7 +127,7 @@ class LTAGE : public TAGE * @param b Reference to wrapping pointer to allow storing * derived class prediction information in the base class. */ - bool predict( + Prediction predict( ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override; }; diff --git a/src/cpu/pred/multiperspective_perceptron.cc b/src/cpu/pred/multiperspective_perceptron.cc index c8284e49a1d..25355093376 100644 --- a/src/cpu/pred/multiperspective_perceptron.cc +++ b/src/cpu/pred/multiperspective_perceptron.cc @@ -128,7 +128,7 @@ MultiperspectivePerceptron::ThreadData::ThreadData(int num_filters, } MultiperspectivePerceptron::MultiperspectivePerceptron( - const MultiperspectivePerceptronParams &p) : BPredUnit(p), + const MultiperspectivePerceptronParams &p) : ConditionalPredictor(p), blockSize(p.block_size), pcshift(p.pcshift), threshold(p.threshold), bias0(p.bias0), bias1(p.bias1), biasmostly0(p.biasmostly0), biasmostly1(p.biasmostly1), nbest(p.nbest), tunebits(p.tunebits), @@ -591,7 +591,7 @@ MultiperspectivePerceptron::updateHistories(ThreadID tid, Addr pc, threadData[tid]->path_history[0] = pc2; } -bool +Prediction MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC, void * &bp_history) { @@ -608,11 +608,11 @@ MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC, if (f.alwaysNotTakenSoFar()) { bi->filtered = true; bi->prediction = false; - return 
false; + return staticPrediction(false); } else if (f.alwaysTakenSoFar()) { bi->filtered = true; bi->prediction = true; - return true; + return staticPrediction(true); } if (f.neverSeen()) { use_static = true; @@ -630,7 +630,7 @@ MultiperspectivePerceptron::lookup(ThreadID tid, Addr instPC, } } - return bi->prediction; + return staticPrediction(bi->prediction); } void diff --git a/src/cpu/pred/multiperspective_perceptron.hh b/src/cpu/pred/multiperspective_perceptron.hh index f1055d5fae3..d21681799bf 100644 --- a/src/cpu/pred/multiperspective_perceptron.hh +++ b/src/cpu/pred/multiperspective_perceptron.hh @@ -55,7 +55,8 @@ #include #include "base/random.hh" -#include "cpu/pred/bpred_unit.hh" +#include "cpu/pred/branch_type.hh" +#include "cpu/pred/conditional.hh" #include "params/MultiperspectivePerceptron.hh" namespace gem5 @@ -64,7 +65,7 @@ namespace gem5 namespace branch_prediction { -class MultiperspectivePerceptron : public BPredUnit +class MultiperspectivePerceptron : public ConditionalPredictor { protected: /** @@ -1064,7 +1065,8 @@ class MultiperspectivePerceptron : public BPredUnit void init() override; // Base class methods. 
- bool lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override; + Prediction + lookup(ThreadID tid, Addr branch_addr, void* &bp_history) override; void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, void * &bp_history) override; diff --git a/src/cpu/pred/multiperspective_perceptron_tage.cc b/src/cpu/pred/multiperspective_perceptron_tage.cc index 6d5f2da76e8..3b9b41028b4 100644 --- a/src/cpu/pred/multiperspective_perceptron_tage.cc +++ b/src/cpu/pred/multiperspective_perceptron_tage.cc @@ -515,7 +515,7 @@ MultiperspectivePerceptronTAGE::updateHistories(ThreadID tid, } } -bool +Prediction MultiperspectivePerceptronTAGE::lookup(ThreadID tid, Addr instPC, void * &bp_history) { @@ -543,7 +543,7 @@ MultiperspectivePerceptronTAGE::lookup(ThreadID tid, Addr instPC, 0 /* altBank: unused */, init_lsum); bi->predictedTaken = pred_taken; bi->lpBranchInfo->predTaken = pred_taken; - return pred_taken; + return staticPrediction(pred_taken); } diff --git a/src/cpu/pred/multiperspective_perceptron_tage.hh b/src/cpu/pred/multiperspective_perceptron_tage.hh index 81d8dcb2d9b..774d5cb2ab1 100644 --- a/src/cpu/pred/multiperspective_perceptron_tage.hh +++ b/src/cpu/pred/multiperspective_perceptron_tage.hh @@ -248,7 +248,7 @@ class MultiperspectivePerceptronTAGE : public MultiperspectivePerceptron void init() override; - bool lookup(ThreadID tid, Addr instPC, void * &bp_history) override; + Prediction lookup(ThreadID tid, Addr instPC, void * &bp_history) override; void update(ThreadID tid, Addr pc, bool taken, void * &bp_history, bool squashed, diff --git a/src/cpu/pred/tage.cc b/src/cpu/pred/tage.cc index dd6ef5ddbe0..bc35d579de1 100644 --- a/src/cpu/pred/tage.cc +++ b/src/cpu/pred/tage.cc @@ -62,7 +62,9 @@ namespace gem5 namespace branch_prediction { -TAGE::TAGE(const TAGEParams ¶ms) : BPredUnit(params), tage(params.tage) +TAGE::TAGE(const TAGEParams ¶ms) : + ConditionalPredictor(params), + tage(params.tage) { } @@ 
-108,20 +110,22 @@ TAGE::squash(ThreadID tid, void * &bp_history) bp_history = nullptr; } -bool +Prediction TAGE::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b) { TageBranchInfo *bi = new TageBranchInfo(*tage, pc, cond_branch); b = (void*)(bi); - return tage->tagePredict(tid, pc, cond_branch, bi->tageBranchInfo); + return staticPrediction( + tage->tagePredict(tid, pc, cond_branch, bi->tageBranchInfo) + ); } -bool +Prediction TAGE::lookup(ThreadID tid, Addr pc, void* &bp_history) { - bool retval = predict(tid, pc, true, bp_history); + Prediction retval = predict(tid, pc, true, bp_history); - DPRINTF(Tage, "Lookup branch: %lx; predict:%d\n", pc, retval); + DPRINTF(Tage, "Lookup branch: %lx; predict:%d\n", pc, retval.taken); return retval; } diff --git a/src/cpu/pred/tage.hh b/src/cpu/pred/tage.hh index 329ba922ad4..8fc087c143d 100644 --- a/src/cpu/pred/tage.hh +++ b/src/cpu/pred/tage.hh @@ -64,7 +64,7 @@ #include "base/random.hh" #include "base/types.hh" -#include "cpu/pred/bpred_unit.hh" +#include "cpu/pred/conditional.hh" #include "cpu/pred/tage_base.hh" #include "params/TAGE.hh" @@ -74,10 +74,12 @@ namespace gem5 namespace branch_prediction { -class TAGE: public BPredUnit +class TAGE: public ConditionalPredictor { - protected: + public: TAGEBase *tage; + + protected: Random::RandomPtr rng = Random::genRandom(); @@ -95,7 +97,7 @@ class TAGE: public BPredUnit } }; - virtual bool predict(ThreadID tid, Addr branch_pc, bool cond_branch, + virtual Prediction predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b); public: @@ -103,7 +105,7 @@ class TAGE: public BPredUnit TAGE(const TAGEParams ¶ms); // Base class methods. 
- bool lookup(ThreadID tid, Addr pc, void* &bp_history) override; + Prediction lookup(ThreadID tid, Addr pc, void* &bp_history) override; virtual void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, void * &bp_history) override; @@ -113,6 +115,15 @@ class TAGE: public BPredUnit virtual void squash(ThreadID tid, void * &bp_history) override; virtual void branchPlaceholder(ThreadID tid, Addr pc, bool uncond, void * &bp_history) override; + + unsigned int getNumHistoryTables() const + { + return tage->nHistoryTables; + } + + uint16_t gtag(ThreadID tid, Addr pc, int bank) const { + return tage->gtag(tid, pc, bank); + } }; } // namespace branch_prediction diff --git a/src/cpu/pred/tage_base.cc b/src/cpu/pred/tage_base.cc index ed83a48fb31..71fe99ca771 100644 --- a/src/cpu/pred/tage_base.cc +++ b/src/cpu/pred/tage_base.cc @@ -47,31 +47,30 @@ namespace gem5 namespace branch_prediction { - TAGEBase::TAGEBase(const TAGEBaseParams &p) - : SimObject(p), - logRatioBiModalHystEntries(p.logRatioBiModalHystEntries), - nHistoryTables(p.nHistoryTables), - tagTableCounterBits(p.tagTableCounterBits), - tagTableUBits(p.tagTableUBits), - histBufferSize(p.histBufferSize), - minHist(p.minHist), - maxHist(p.maxHist), - pathHistBits(p.pathHistBits), - tagTableTagWidths(p.tagTableTagWidths), - logTagTableSizes(p.logTagTableSizes), - threadHistory(p.numThreads), - logUResetPeriod(p.logUResetPeriod), - initialTCounterValue(p.initialTCounterValue), - numUseAltOnNa(p.numUseAltOnNa), - useAltOnNaBits(p.useAltOnNaBits), - maxNumAlloc(p.maxNumAlloc), - takenOnlyHistory(p.takenOnlyHistory), - noSkip(p.noSkip), - speculativeHistUpdate(p.speculativeHistUpdate), - instShiftAmt(p.instShiftAmt), - initialized(false), - stats(this, nHistoryTables) + : SimObject(p), + logRatioBiModalHystEntries(p.logRatioBiModalHystEntries), + nHistoryTables(p.nHistoryTables), + tagTableCounterBits(p.tagTableCounterBits), + tagTableUBits(p.tagTableUBits), + 
histBufferSize(p.histBufferSize), + minHist(p.minHist), + maxHist(p.maxHist), + pathHistBits(p.pathHistBits), + tagTableTagWidths(p.tagTableTagWidths), + logTagTableSizes(p.logTagTableSizes), + threadHistory(p.numThreads), + logUResetPeriod(p.logUResetPeriod), + initialTCounterValue(p.initialTCounterValue), + numUseAltOnNa(p.numUseAltOnNa), + useAltOnNaBits(p.useAltOnNaBits), + maxNumAlloc(p.maxNumAlloc), + takenOnlyHistory(p.takenOnlyHistory), + noSkip(p.noSkip), + speculativeHistUpdate(p.speculativeHistUpdate), + instShiftAmt(p.instShiftAmt), + initialized(false), + stats(this, nHistoryTables) { if (noSkip.empty()) { // Set all the table to enabled by default @@ -111,9 +110,7 @@ TAGEBase::init() for (auto& history : threadHistory) { history.pathHist = 0; history.nonSpecPathHist = 0; - history.globalHistory = new uint8_t[histBufferSize]; - history.gHist = history.globalHistory; - memset(history.gHist, 0, histBufferSize); + history.globalHist.resize(histBufferSize, 0); history.ptGhist = 0; } @@ -310,32 +307,46 @@ TAGEBase::updateGHist(ThreadID tid, uint64_t bv, uint8_t n) ThreadHistory& tHist = threadHistory[tid]; if (tHist.ptGhist < n) { DPRINTF(Tage, "Rolling over the histories\n"); - // Copy beginning of globalHistoryBuffer to end, such that - // the last maxHist outcomes are still reachable - // through globalHistory[0 .. maxHist - 1]. - for (int i = 0; i < maxHist; i++) { - tHist.globalHistory[histBufferSize - maxHist + i] - = tHist.globalHistory[tHist.ptGhist + i]; + // Copy beginning of globalHistoryBuffer to end, such that the last + // maxHist outcomes are still reachable through + // globalHist[0 .. maxHist - 1]. + // The rollover can happen in a speculative state, where multiple + // predictions are in flight. In that case we must be able to + // rollback thus we need to copy more bits than just the `maxHist`. + // We use 1k as this means more than 500 predictions (TAGE-SC-L) + // in flight which is more than realistic. 
We use an additional + // assert in the restore method to catch if there are more + // predictions in flight. + const int rollbackBuffer = 1000; + for (int i = 0; i < (maxHist + rollbackBuffer); i++) { + tHist.globalHist[histBufferSize - maxHist - rollbackBuffer + i] = + tHist.globalHist[tHist.ptGhist + i]; } - tHist.ptGhist = histBufferSize - maxHist; - tHist.gHist = &tHist.globalHistory[tHist.ptGhist]; + tHist.ptGhist = histBufferSize - maxHist - rollbackBuffer; } // Update the global history for (int i = 0; i < n; i++) { - // Shift the next bit of the bit vector into the history + // Use `at` to check for out-of-bounds access. tHist.ptGhist--; - tHist.gHist--; - *(tHist.gHist) = (bv & 1) ? 1 : 0; + if (tHist.ptGhist >= tHist.globalHist.size()) { + DPRINTF(Tage, "BUG: PTGhist out of bounds, resetting"); + warn("BUG: PTGhist out of bounds, resetting @ %lu\n", curTick()); + // tHist.ptGhist = tHist.globalHist.size() - 1; + } + + tHist.globalHist[tHist.ptGhist] = (bv & 1) ? 1 : 0; bv >>= 1; // Update the folded histories with the new bit. + uint8_t *gh_ptr = &(tHist.globalHist[tHist.ptGhist]); + for (int i = 1; i <= nHistoryTables; i++) { - tHist.computeIndices[i].update(tHist.gHist); - tHist.computeTags[0][i].update(tHist.gHist); - tHist.computeTags[1][i].update(tHist.gHist); + tHist.computeIndices[i].update(gh_ptr); + tHist.computeTags[0][i].update(gh_ptr); + tHist.computeTags[1][i].update(gh_ptr); } } } @@ -492,15 +503,17 @@ TAGEBase::handleAllocAndUReset(bool alloc, bool taken, BranchInfo* bi, handleUReset(); } -bool +int TAGEBase::allocateEntry(int idx, BranchInfo* bi, bool taken) { if (gtable[idx][bi->tableIndices[idx]].u != 0) - return false; + return 0; + + ++stats.allocationsTotal; gtable[idx][bi->tableIndices[idx]].tag = bi->tableTags[idx]; gtable[idx][bi->tableIndices[idx]].ctr = (taken) ? 
0 : -1; - return true; + return 1; } void @@ -605,8 +618,8 @@ TAGEBase::updatePathAndGlobalHistory(ThreadID tid, int brtype, bool taken, ThreadHistory& tHist = threadHistory[tid]; // Update path history - tHist.pathHist = calcNewPathHist(tid, branch_pc, tHist.pathHist); - + tHist.pathHist = + calcNewPathHist(tid, branch_pc, tHist.pathHist, taken, brtype, target); if (takenOnlyHistory) { // Taken-only history is implemented after the paper: @@ -616,8 +629,8 @@ TAGEBase::updatePathAndGlobalHistory(ThreadID tid, int brtype, bool taken, // is shifted into the global history in case the branch was taken. // For not-taken branches no history update will happen. if (taken) { - bi->ghist = (((branch_pc >> instShiftAmt) >> 2) - ^ ((target >> instShiftAmt) >> 3)) & 0x3; + bi->ghist = (((branch_pc >> instShiftAmt) >> 2) ^ + ((target >> instShiftAmt) >> 3)); bi->nGhist = 2; } @@ -627,6 +640,7 @@ TAGEBase::updatePathAndGlobalHistory(ThreadID tid, int brtype, bool taken, bi->ghist = taken ? 1 : 0; bi->nGhist = 1; } + // Update the global history updateGHist(tid, bi->ghist, bi->nGhist); } @@ -640,8 +654,9 @@ TAGEBase::updateHistories(ThreadID tid, Addr branch_pc, bool speculative, if (speculative != speculativeHistUpdate) { if (!speculative) { // Save the speculative path history as non-speculative - threadHistory[tid].nonSpecPathHist - = calcNewPathHist(tid, branch_pc, bi->pathHist); + threadHistory[tid].nonSpecPathHist = + calcNewPathHist(tid, branch_pc, bi->pathHist, taken, + branchTypeExtra(inst), target); } return; } @@ -680,18 +695,17 @@ TAGEBase::updateHistories(ThreadID tid, Addr branch_pc, bool speculative, branch_pc, target, bi); bi->modified = true; - DPRINTF(Tage, "Updating global histories with branch:%lx; taken?:%d, " - "path Hist: %x; pointer:%d\n", branch_pc, taken, - threadHistory[tid].pathHist, threadHistory[tid].ptGhist); - assert(threadHistory[tid].gHist == - &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); + DPRINTF(Tage, + "Updating global 
histories with branch:%lx; taken?:%d, " + "path Hist: %x; pointer:%d\n", + branch_pc, taken, threadHistory[tid].pathHist, + threadHistory[tid].ptGhist); } void TAGEBase::recordHistState(ThreadID tid, BranchInfo* bi) { - ThreadHistory& tHist = threadHistory[tid]; - bi->ptGhist = tHist.ptGhist; + ThreadHistory &tHist = threadHistory[tid]; bi->pathHist = tHist.pathHist; for (int i = 1; i <= nHistoryTables; i++) { @@ -718,30 +732,39 @@ TAGEBase::restoreHistState(ThreadID tid, BranchInfo* bi) return; // RESTORE HISTORIES - // Shift out the inserted bits - // from the folded history and the global history vector + // Shift out the inserted bits from the folded history + // and the global history vector for (int n = 0; n < bi->nGhist; n++) { + uint8_t *gh_ptr = &(tHist.globalHist[tHist.ptGhist]); // First revert the folded history for (int i = 1; i <= nHistoryTables; i++) { - tHist.computeIndices[i].restore(tHist.gHist); - tHist.computeTags[0][i].restore(tHist.gHist); - tHist.computeTags[1][i].restore(tHist.gHist); + tHist.computeIndices[i].restore(gh_ptr); + tHist.computeTags[0][i].restore(gh_ptr); + tHist.computeTags[1][i].restore(gh_ptr); } tHist.ptGhist++; - tHist.gHist++; + // Make sure we do not go out of bounds. + // If we do it's likely that there were too many branches in flight + during a rollover. 
Consider increasing the `rollbackBuffer` + assert((tHist.ptGhist + maxHist) < tHist.globalHist.size()); } bi->nGhist = 0; bi->modified = false; } int -TAGEBase::calcNewPathHist(ThreadID tid, Addr pc, int phist) const +TAGEBase::calcNewPathHist(ThreadID tid, Addr pc, int cur_phist, bool taken, + int brtype, Addr target) const { + if (takenOnlyHistory && !taken) { + // For taken-only history we update only if the branch was taken + return cur_phist; + } int pathbit = ((pc >> instShiftAmt) & 1); - phist = (phist << 1) + pathbit; - phist = (phist & ((1ULL << pathHistBits) - 1)); - return phist; + cur_phist = (cur_phist << 1) + pathbit; + cur_phist = (cur_phist & ((1ULL << pathHistBits) - 1)); + return cur_phist; } void @@ -813,10 +836,8 @@ TAGEBase::getGHR(ThreadID tid) const unsigned val = 0; int gh_ptr = threadHistory[tid].ptGhist; for (unsigned i = 0; i < 16; i++) { - // Make sure we don't go out of bounds - assert(&(threadHistory[tid].globalHistory[gh_ptr + i]) < - threadHistory[tid].globalHistory + histBufferSize); - val |= ((threadHistory[tid].globalHistory[gh_ptr + i] & 0x1) << i); + // Make sure we don't go out of bounds with `at`. 
+ val |= ((threadHistory[tid].globalHist.at(gh_ptr + i) & 0x1) << i); } return val; } @@ -825,6 +846,8 @@ TAGEBase::getGHR(ThreadID tid) const TAGEBase::TAGEBaseStats::TAGEBaseStats( statistics::Group *parent, unsigned nHistoryTables) : statistics::Group(parent), + ADD_STAT(allocationsTotal, statistics::units::Count::get(), + "Number of times TAGE inserted a new pattern"), ADD_STAT(longestMatchProviderCorrect, statistics::units::Count::get(), "Number of times TAGE Longest Match is the provider and the " "prediction is correct"), diff --git a/src/cpu/pred/tage_base.hh b/src/cpu/pred/tage_base.hh index 98e218c65f9..58141d1f463 100644 --- a/src/cpu/pred/tage_base.hh +++ b/src/cpu/pred/tage_base.hh @@ -68,7 +68,6 @@ class TAGEBase : public SimObject TAGEBase(const TAGEBaseParams &p); void init() override; - protected: // Prediction Structures // Tage Entry @@ -138,7 +137,6 @@ class TAGEBase : public SimObject const bool condBranch; int pathHist; - int ptGhist; int hitBank; int hitBankIndex; int altBank; @@ -464,7 +462,7 @@ class TAGEBase : public SimObject * Try to allocate an entry at index idx. 
* Returns true if the allocation was successful */ - virtual bool allocateEntry(int idx, BranchInfo* bi, bool taken = false); + virtual int allocateEntry(int idx, BranchInfo* bi, bool taken = false); /** * Extra steps for calculating altTaken @@ -481,11 +479,11 @@ class TAGEBase : public SimObject int8_t getCtr(int hitBank, int hitBankIndex) const; unsigned getTageCtrBits() const; int getPathHist(ThreadID tid, bool speculative=true) const; - int calcNewPathHist(ThreadID tid, Addr pc, int old_phist) const; + virtual int calcNewPathHist(ThreadID tid, Addr pc, int cur_phist, + bool taken, int brtype, Addr target) const; bool isSpeculativeUpdateEnabled() const; size_t getSizeInBits() const; - protected: const unsigned logRatioBiModalHystEntries; const unsigned nHistoryTables; const unsigned tagTableCounterBits; @@ -495,6 +493,7 @@ class TAGEBase : public SimObject const unsigned maxHist; const unsigned pathHistBits; + protected: std::vector tagTableTagWidths; std::vector logTagTableSizes; @@ -514,11 +513,7 @@ class TAGEBase : public SimObject // Speculative branch direction // history (circular buffer) - // @TODO Convert to std::vector - uint8_t *globalHistory; - - // Pointer to most recent branch outcome - uint8_t* gHist; + std::vector globalHist; // Index to most recent branch outcome int ptGhist; @@ -549,11 +544,14 @@ class TAGEBase : public SimObject /** Use taken only history. 
*/ const bool takenOnlyHistory; + public: // Tells which tables are active // (for the base TAGE implementation all are active) // Some other classes use this for handling associativity std::vector noSkip; + protected: + const bool speculativeHistUpdate; const unsigned instShiftAmt; @@ -564,6 +562,7 @@ class TAGEBase : public SimObject { TAGEBaseStats(statistics::Group *parent, unsigned nHistoryTables); // stats + statistics::Scalar allocationsTotal; statistics::Scalar longestMatchProviderCorrect; statistics::Scalar altMatchProviderCorrect; statistics::Scalar bimodalAltMatchProviderCorrect; diff --git a/src/cpu/pred/tage_sc_l.cc b/src/cpu/pred/tage_sc_l.cc index 2e24fc8786d..1ff4f4d3462 100644 --- a/src/cpu/pred/tage_sc_l.cc +++ b/src/cpu/pred/tage_sc_l.cc @@ -78,7 +78,7 @@ TAGE_SC_L_LoopPredictor::optionalAgeInc() const TAGE_SC_L::TAGE_SC_L(const TAGE_SC_LParams &p) : LTAGE(p), statisticalCorrector(p.statistical_corrector), - useSC(p.sc_enabled) + useSC(p.sc_enabled), useLoop(p.loop_enabled) { } @@ -180,6 +180,9 @@ TAGE_SC_L_TAGE::calculateIndicesAndTags( t = t % shortTagsTageFactor; } } + + // TODO: Test with and without this + bi->valid = true; } unsigned @@ -257,31 +260,70 @@ TAGE_SC_L_TAGE::updatePathAndGlobalHistory(ThreadID tid, int brtype, TAGEBase::BranchInfo* bi) { ThreadHistory& tHist = threadHistory[tid]; - // TAGE update + + // Update path history + tHist.pathHist = + calcNewPathHist(tid, branch_pc, tHist.pathHist, taken, brtype, target); + Addr shifted_pc = branch_pc >> instShiftAmt; - int tmp = ((shifted_pc ^ (shifted_pc >> 2))) ^ taken; + Addr shifted_target = target >> instShiftAmt; + + if (takenOnlyHistory) { + // Taken-only history is implemented after the paper: + // https://ieeexplore.ieee.org/document/9246215 + // + // For taken-only history two bits of a hash of pc and its target + // is shifted into the global history in case the branch was taken. + // For not-taken branches no history update will happen. 
+ if (taken) { + bi->ghist = ((shifted_pc >> 2) ^ ((target >> instShiftAmt) >> 3)); + bi->nGhist = 2; + } + + } else { + // Original TAGE-SC-L history uses two or three bits of the + // PC and the taken bit to update the global history + bi->ghist = ((shifted_pc ^ (shifted_pc >> 2))) ^ taken; + if ((brtype == 3) & taken) { + bi->ghist = (bi->ghist ^ (shifted_target >> 2)); + } + // some branch types use 3 bits in global history, the others just 2 + bi->nGhist = (brtype == 2) ? 3 : 2; + } + + // Update the global history + updateGHist(tid, bi->ghist, bi->nGhist); +} + +int +TAGE_SC_L_TAGE::calcNewPathHist(ThreadID tid, Addr pc, int cur_phist, + bool taken, int brtype, Addr target) const +{ + if (takenOnlyHistory && !taken) { + // For taken-only history we update only if the branch was taken + return cur_phist; + } + + Addr shifted_pc = pc >> instShiftAmt; + Addr shifted_target = target >> instShiftAmt; int path = shifted_pc ^ (shifted_pc >> 2) ^ (shifted_pc >> 4); if ((brtype == 3) & taken) { - tmp = (tmp ^ (target >> 2)); - path = path ^ (target >> 2) ^ (target >> 4); + path = path ^ (shifted_target >> 2) ^ (shifted_target >> 4); } // some branch types use 3 bits in global history, the others just 2 int maxt = (brtype == 2) ? 
3 : 2; for (int t = 0; t < maxt; t++) { - int pathbit = (path & 127); + int pathbits = (path & 127); path >>= 1; - tHist.pathHist = (tHist.pathHist << 1) ^ pathbit; + cur_phist = (cur_phist << 1) ^ pathbits; if (truncatePathHist) { // The 8KB implementation does not do this truncation - tHist.pathHist = (tHist.pathHist & ((1ULL << pathHistBits) - 1)); + cur_phist = (cur_phist & ((1ULL << pathHistBits) - 1)); } } - // Record the update to be able to squash it later - bi->ghist = tmp; - bi->nGhist = maxt; - updateGHist(tid, tmp, maxt); + return cur_phist; } void @@ -364,7 +406,7 @@ TAGE_SC_L::branchPlaceholder(ThreadID tid, Addr pc, bp_history = (void*)(bi); } -bool +Prediction TAGE_SC_L::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b) { TageSCLBranchInfo *bi = new TageSCLBranchInfo(*tage, @@ -375,9 +417,11 @@ TAGE_SC_L::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b) bool pred_taken = tage->tagePredict(tid, pc, cond_branch, bi->tageBranchInfo); - pred_taken = loopPredictor->loopPredict(tid, pc, cond_branch, - bi->lpBranchInfo, pred_taken, - instShiftAmt); + if (useLoop) { + pred_taken = loopPredictor->loopPredict(tid, pc, cond_branch, + bi->lpBranchInfo, pred_taken, + instShiftAmt); + } if (bi->lpBranchInfo->loopPredUsed) { bi->tageBranchInfo->provider = LOOP; @@ -412,7 +456,7 @@ TAGE_SC_L::predict(ThreadID tid, Addr pc, bool cond_branch, void* &b) // record final prediction bi->lpBranchInfo->predTaken = pred_taken; - return pred_taken; + return staticPrediction(pred_taken); } void @@ -422,6 +466,21 @@ TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, void *&bp_history, assert(bp_history); TageSCLBranchInfo* bi = static_cast(bp_history); + update(tid, pc, taken, bi, squashed, inst, target); + + if (squashed) + return; + + delete bi; + bp_history = nullptr; +} + +void +TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, TageSCLBranchInfo *&bi, + bool squashed, const StaticInstPtr & inst, Addr target) +{ + assert(bi); + 
TAGE_SC_L_TAGE::BranchInfo* tage_bi = static_cast(bi->tageBranchInfo); @@ -430,7 +489,7 @@ TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, void *&bp_history, // This restores the global history, then update it // and recomputes the folded histories. tage->squash(tid, taken, target, inst, tage_bi); - if (bi->tageBranchInfo->condBranch) { + if (useLoop && bi->tageBranchInfo->condBranch) { loopPredictor->squashLoop(bi->lpBranchInfo); } if (useSC) { @@ -448,7 +507,8 @@ TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, void *&bp_history, pc, taken); tage->updateStats(taken, bi->tageBranchInfo); - loopPredictor->updateStats(taken, bi->lpBranchInfo); + if (useLoop) + loopPredictor->updateStats(taken, bi->lpBranchInfo); if (useSC) { statisticalCorrector->updateStats(taken, bi->scBranchInfo); @@ -460,7 +520,8 @@ TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, void *&bp_history, bi->tageBranchInfo->altBank); } - loopPredictor->condBranchUpdate(tid, pc, taken, + if (useLoop) + loopPredictor->condBranchUpdate(tid, pc, taken, bi->tageBranchInfo->tagePred, bi->lpBranchInfo, instShiftAmt); tage->condBranchUpdate(tid, pc, taken, bi->tageBranchInfo, @@ -475,20 +536,20 @@ TAGE_SC_L::update(ThreadID tid, Addr pc, bool taken, void *&bp_history, bi->scBranchInfo, target, tage->getPathHist(tid, false)); } - - - delete bi; - bp_history = nullptr; } + void TAGE_SC_L::squash(ThreadID tid, void * &bp_history) { TageSCLBranchInfo* bi = static_cast(bp_history); + if (useLoop && bi->tageBranchInfo->condBranch) { + loopPredictor->squash(tid, bi->lpBranchInfo); + } if (useSC) { statisticalCorrector->scRestoreHistState(bi->scBranchInfo); } - LTAGE::squash(tid, bp_history); + TAGE::squash(tid, bp_history); } diff --git a/src/cpu/pred/tage_sc_l.hh b/src/cpu/pred/tage_sc_l.hh index f00ae195ab8..0d864f1cfd2 100644 --- a/src/cpu/pred/tage_sc_l.hh +++ b/src/cpu/pred/tage_sc_l.hh @@ -150,7 +150,8 @@ class TAGE_SC_L_TAGE : public TAGEBase TAGEBase::BranchInfo* tage_bi) const override; void 
extraAltCalc(TAGEBase::BranchInfo* bi) override; - + int calcNewPathHist(ThreadID tid, Addr pc, int cur_phist, bool taken, + int brtype, Addr target) const override; }; class TAGE_SC_L_LoopPredictor : public LoopPredictor @@ -170,7 +171,7 @@ class TAGE_SC_L: public LTAGE public: TAGE_SC_L(const TAGE_SC_LParams ¶ms); - bool predict( + Prediction predict( ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) override; void squash(ThreadID tid, void * &bp_history) override; void update(ThreadID tid, Addr pc, bool taken, void * &bp_history, @@ -182,7 +183,6 @@ class TAGE_SC_L: public LTAGE void branchPlaceholder(ThreadID tid, Addr pc, bool uncond, void * &bp_history) override; - protected: struct TageSCLBranchInfo : public LTageBranchInfo { @@ -200,12 +200,19 @@ class TAGE_SC_L: public LTAGE } }; + void update(ThreadID tid, Addr pc, bool taken, TageSCLBranchInfo * &bi, + bool squashed, const StaticInstPtr & inst, + Addr target); + // more provider types enum { SC = LAST_LTAGE_PROVIDER_TYPE + 1 }; + + protected: const bool useSC; + const bool useLoop; }; diff --git a/src/cpu/pred/tage_sc_l_64KB.cc b/src/cpu/pred/tage_sc_l_64KB.cc index bc9899176f7..8029cd43a50 100644 --- a/src/cpu/pred/tage_sc_l_64KB.cc +++ b/src/cpu/pred/tage_sc_l_64KB.cc @@ -41,6 +41,8 @@ #include "cpu/pred/tage_sc_l_64KB.hh" +#include "debug/TageSCL.hh" + namespace gem5 { @@ -248,23 +250,13 @@ TAGE_SC_L_TAGE_64KB::handleAllocAndUReset( for (int j = 0; j < 2; ++j) { int i = ((j == 0) ? I : (I ^ 1)) + 1; if (noSkip[i]) { - if (gtable[i][bi->tableIndices[i]].u == 0) { - int8_t ctr = gtable[i][bi->tableIndices[i]].ctr; - if (abs (2 * ctr + 1) <= 3) { - gtable[i][bi->tableIndices[i]].tag = bi->tableTags[i]; - gtable[i][bi->tableIndices[i]].ctr = taken ? 
0 : -1; + auto n = allocateEntry(i, bi, taken); + if (n > 0) { numAllocated++; maxAllocReached = (numAllocated == maxNumAlloc); I += 2; break; - } else { - if (gtable[i][bi->tableIndices[i]].ctr > 0) { - gtable[i][bi->tableIndices[i]].ctr--; - } else { - gtable[i][bi->tableIndices[i]].ctr++; - } - } - } else { + } else if (n < 0) { penalty++; } } @@ -279,6 +271,32 @@ TAGE_SC_L_TAGE_64KB::handleAllocAndUReset( handleUReset(); } +int +TAGE_SC_L_TAGE_64KB::allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) +{ + if (gtable[idx][bi->tableIndices[idx]].u != 0) + return -1; + + int8_t ctr = gtable[idx][bi->tableIndices[idx]].ctr; + if (abs (2 * ctr + 1) > 3) { + if (ctr > 0) + gtable[idx][bi->tableIndices[idx]].ctr--; + else + gtable[idx][bi->tableIndices[idx]].ctr++; + return 0; + } + + DPRINTF(TageSCL, "TSL Alloc:%i, %i,%i\n", idx, + bi->tableIndices[idx], bi->tableTags[idx]); + + ++stats.allocationsTotal; + + gtable[idx][bi->tableIndices[idx]].tag = bi->tableTags[idx]; + gtable[idx][bi->tableIndices[idx]].ctr = (taken) ? 
0 : -1; + return 1; +} + + void TAGE_SC_L_TAGE_64KB::handleTAGEUpdate(Addr branch_pc, bool taken, TAGEBase::BranchInfo* bi) @@ -305,28 +323,49 @@ TAGE_SC_L_TAGE_64KB::handleTAGEUpdate(Addr branch_pc, bool taken, gtable[bi->hitBank][bi->hitBankIndex].u = 0; } - if (bi->altTaken == taken) { - if (bi->altBank > 0) { - int8_t ctr = gtable[bi->altBank][bi->altBankIndex].ctr; - if (abs (2 * ctr + 1) == 7) { - if (gtable[bi->hitBank][bi->hitBankIndex].u == 1) { - if (bi->longestMatchPred == taken) { - gtable[bi->hitBank][bi->hitBankIndex].u = 0; - } - } - } + if (isNotUseful(taken, bi)) { + if (gtable[bi->hitBank][bi->hitBankIndex].u > 0) { + gtable[bi->hitBank][bi->hitBankIndex].u--; } } } else { baseUpdate(branch_pc, taken, bi); } - if ((bi->longestMatchPred != bi->altTaken) && - (bi->longestMatchPred == taken) && - (gtable[bi->hitBank][bi->hitBankIndex].u < (1 << tagTableUBits) -1)) { + if (isUseful(taken, bi)) { + if(gtable[bi->hitBank][bi->hitBankIndex].u < ((1 << tagTableUBits) -1)) { gtable[bi->hitBank][bi->hitBankIndex].u++; } } +} + +bool +TAGE_SC_L_TAGE_64KB::isUseful(bool taken, TAGEBase::BranchInfo* bi) const +{ + // If the longest prediction is correct but the alternate + // prediction is wrong the longest is useful. + return (bi->longestMatchPred != bi->altTaken) && + (bi->longestMatchPred == taken); +} + +bool +TAGE_SC_L_TAGE_64KB::isNotUseful(bool taken, TAGEBase::BranchInfo* bi) const +{ + // If both the longest and alternate predictions where correct + // we can possible free the longest entry to use it for other + // predictions. + if ((bi->altTaken == taken) && (bi->longestMatchPred == taken)) { + // We only clear if the alternate prediction has a + // high confidence. 
+ if (bi->altBank > 0) { + int8_t ctr = gtable[bi->altBank][bi->altBankIndex].ctr; + if (abs (2 * ctr + 1) == 7) { + return true; + } + } + } + return false; +} TAGE_SC_L_64KB::TAGE_SC_L_64KB(const TAGE_SC_L_64KBParams ¶ms) : TAGE_SC_L(params) diff --git a/src/cpu/pred/tage_sc_l_64KB.hh b/src/cpu/pred/tage_sc_l_64KB.hh index 633d0f54a05..a187fb1d353 100644 --- a/src/cpu/pred/tage_sc_l_64KB.hh +++ b/src/cpu/pred/tage_sc_l_64KB.hh @@ -69,8 +69,13 @@ class TAGE_SC_L_TAGE_64KB : public TAGE_SC_L_TAGE void handleAllocAndUReset( bool alloc, bool taken, TAGEBase::BranchInfo* bi, int nrand) override; + int allocateEntry(int idx, TAGEBase::BranchInfo* bi, bool taken) override; + void handleTAGEUpdate( Addr branch_pc, bool taken, TAGEBase::BranchInfo* bi) override; + + virtual bool isUseful(bool taken, TAGEBase::BranchInfo* bi) const; + virtual bool isNotUseful(bool taken, TAGEBase::BranchInfo* bi) const; }; class TAGE_SC_L_64KB_StatisticalCorrector : public StatisticalCorrector diff --git a/src/cpu/pred/tagescl_ref.cc b/src/cpu/pred/tagescl_ref.cc index 032bded5285..d61776d09cd 100644 --- a/src/cpu/pred/tagescl_ref.cc +++ b/src/cpu/pred/tagescl_ref.cc @@ -54,7 +54,7 @@ namespace branch_prediction { TageSCLRef::TageSCLRef(const TageSCLRefParams ¶ms) - : BPredUnit(params) + : ConditionalPredictor(params) { predictor = new PREDICTOR(); } @@ -76,11 +76,11 @@ TageSCLRef::updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, } -bool +Prediction TageSCLRef::lookup(ThreadID tid, Addr branch_addr, void * &bp_history) { auto pred = predictor->GetPrediction(branch_addr); - return pred; + return staticPrediction(pred); } void diff --git a/src/cpu/pred/tagescl_ref.hh b/src/cpu/pred/tagescl_ref.hh index 7ae2db85c0d..e9d89e8afaa 100644 --- a/src/cpu/pred/tagescl_ref.hh +++ b/src/cpu/pred/tagescl_ref.hh @@ -46,7 +46,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" -#include "cpu/pred/bpred_unit.hh" +#include "cpu/pred/conditional.hh" #include "params/TageSCLRef.hh" 
namespace gem5 @@ -63,7 +63,7 @@ namespace branch_prediction * predictor state that needs to be recorded or updated; the update can be * determined solely by the branch being taken or not taken. */ -class TageSCLRef : public BPredUnit +class TageSCLRef : public ConditionalPredictor { public: /** @@ -73,7 +73,7 @@ class TageSCLRef : public BPredUnit ~TageSCLRef(); // Overriding interface functions - bool lookup(ThreadID tid, Addr pc, void * &bp_history) override; + Prediction lookup(ThreadID tid, Addr pc, void * &bp_history) override; // void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, // Addr target, void * &bp_history) override; diff --git a/src/cpu/pred/tournament.cc b/src/cpu/pred/tournament.cc index a6428575f28..96ba8a07ecb 100644 --- a/src/cpu/pred/tournament.cc +++ b/src/cpu/pred/tournament.cc @@ -51,7 +51,7 @@ namespace branch_prediction { TournamentBP::TournamentBP(const TournamentBPParams ¶ms) - : BPredUnit(params), + : ConditionalPredictor(params), localPredictorSize(params.localPredictorSize), localCtrBits(params.localCtrBits), localCtrs(localPredictorSize, SatCounter8(localCtrBits)), @@ -149,7 +149,7 @@ TournamentBP::updateLocalHist(unsigned local_history_idx, bool taken) (localHistoryTable[local_history_idx] << 1) | taken; } -bool +Prediction TournamentBP::lookup(ThreadID tid, Addr pc, void * &bp_history) { bool local_prediction; @@ -188,9 +188,9 @@ TournamentBP::lookup(ThreadID tid, Addr pc, void * &bp_history) // Select and return the prediction // History update will be happen in the next function if (choice_prediction) { - return global_prediction; + return staticPrediction(global_prediction); } else { - return local_prediction; + return staticPrediction(local_prediction); } } diff --git a/src/cpu/pred/tournament.hh b/src/cpu/pred/tournament.hh index 36b50c706a4..5ccfb0f8898 100644 --- a/src/cpu/pred/tournament.hh +++ b/src/cpu/pred/tournament.hh @@ -46,7 +46,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" -#include 
"cpu/pred/bpred_unit.hh" +#include "cpu/pred/conditional.hh" #include "params/TournamentBP.hh" namespace gem5 @@ -63,7 +63,7 @@ namespace branch_prediction * predictor chooses between the two. Both the global history register * and the selected local history are speculatively updated. */ -class TournamentBP : public BPredUnit +class TournamentBP : public ConditionalPredictor { public: /** @@ -72,7 +72,7 @@ class TournamentBP : public BPredUnit TournamentBP(const TournamentBPParams ¶ms); // Base class methods. - bool lookup(ThreadID tid, Addr pc, void* &bp_history) override; + Prediction lookup(ThreadID tid, Addr pc, void* &bp_history) override; void updateHistories(ThreadID tid, Addr pc, bool uncond, bool taken, Addr target, const StaticInstPtr &inst, void * &bp_history) override; diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index 4614115d997..2dc3a970f2b 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -396,7 +396,7 @@ BaseSimpleCPU::preExecute() set(t_info.predPC, thread->pcState()); const bool predict_taken( branchPred->predict(curStaticInst, cur_sn, *t_info.predPC, - curThread)); + curThread).taken); if (predict_taken) ++t_info.execContextStats.numPredictedBranches; diff --git a/src/mem/cache/prefetch/fdp.cc b/src/mem/cache/prefetch/fdp.cc index 6eff00a450d..95176a65457 100644 --- a/src/mem/cache/prefetch/fdp.cc +++ b/src/mem/cache/prefetch/fdp.cc @@ -171,23 +171,25 @@ FetchDirectedPrefetcher::translationComplete(PrefetchRequest *pfr, bool failed) assert(cache != nullptr); if (failed) { - DPRINTF(HWPrefetch, "Translation of %#x failed\n", pfr->addr); + DPRINTF(HWPrefetch, "Translation of %#x failed\n", it->addr); stats.translationFail++; } else { - DPRINTF(HWPrefetch, "Translation of %#x succeeded\n", pfr->addr); + DPRINTF(HWPrefetch, "Translation of %#x succeeded\n", it->addr); stats.translationSuccess++; - it->createPkt(curTick() + latency); - stats.pfPacketsCreated++; - - if (cacheSnoop && 
(cache->inCache(pfr->pkt->getAddr(), pfr->pkt->isSecure()) - || (cache->inMissQueue(pfr->pkt->getAddr(), pfr->pkt->isSecure())))) { + if (cacheSnoop && (cache->inCache( + it->req->getPaddr(), it->req->isSecure() + ) || (cache->inMissQueue( + it->req->getPaddr(), it->req->isSecure() + )))) { stats.pfInCache++; DPRINTF(HWPrefetch, "Drop Packet. In Cache / MSHR\n"); } else { - + it->createPkt(curTick() + latency); + stats.pfPacketsCreated++; + DPRINTF(HWPrefetch, "Addr: %#x Add packet to PFQ. pkt PA:%#x, " - "PFQ sz:%i\n", pfr->addr, pfr->pkt->getAddr(), pfq.size()); - + "PFQ sz:%i\n", it->addr, it->pkt->getAddr(), pfq.size()); + stats.pfCandidatesAdded++; pfq.push_back(*it); }