diff --git a/libpimeval/src/libpimeval.cpp b/libpimeval/src/libpimeval.cpp index ccd21330..52187142 100644 --- a/libpimeval/src/libpimeval.cpp +++ b/libpimeval/src/libpimeval.cpp @@ -537,17 +537,17 @@ pimRotateElementsLeft(PimObjId src) //! @brief Shift elements of an obj by one step to the right and fill zero PimStatus -pimShiftElementsRight(PimObjId src) +pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication) { - bool ok = pimSim::get()->pimShiftElementsRight(src); + bool ok = pimSim::get()->pimShiftElementsRight(src, useCrossRegionCommunication); return ok ? PIM_OK : PIM_ERROR; } //! @brief Shift elements of an obj by one step to the left and fill zero PimStatus -pimShiftElementsLeft(PimObjId src) +pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication) { - bool ok = pimSim::get()->pimShiftElementsLeft(src); + bool ok = pimSim::get()->pimShiftElementsLeft(src, useCrossRegionCommunication); return ok ? PIM_OK : PIM_ERROR; } diff --git a/libpimeval/src/libpimeval.h b/libpimeval/src/libpimeval.h index f89dbd48..b54ec659 100644 --- a/libpimeval/src/libpimeval.h +++ b/libpimeval/src/libpimeval.h @@ -229,8 +229,8 @@ PimStatus pimBroadcastUInt(PimObjId dest, uint64_t value); PimStatus pimBroadcastFP(PimObjId dest, float value); PimStatus pimRotateElementsRight(PimObjId src); PimStatus pimRotateElementsLeft(PimObjId src); -PimStatus pimShiftElementsRight(PimObjId src); -PimStatus pimShiftElementsLeft(PimObjId src); +PimStatus pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication = true); +PimStatus pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication = true); PimStatus pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount); PimStatus pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount); diff --git a/libpimeval/src/pimCmd.cpp b/libpimeval/src/pimCmd.cpp index 454607c8..0e7b67c1 100644 --- a/libpimeval/src/pimCmd.cpp +++ b/libpimeval/src/pimCmd.cpp @@ -1222,33 +1222,35 @@ pimCmdRotate::execute() computeAllRegions(numRegions); // handle region boundaries - if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) { - for (unsigned i = 0; i < numRegions; ++i) { - const pimRegion &srcRegion = objSrc.getRegions()[i]; - uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); - uint64_t val = 0; - if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) { - val = m_regionBoundary[numRegions - 1]; - } else if (i > 0) { - val = m_regionBoundary[i - 1]; + if(m_useCrossRegionCommunication) { + if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) { + for (unsigned i = 0; i < numRegions; ++i) { + const pimRegion &srcRegion = objSrc.getRegions()[i]; + uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); + uint64_t val = 0; + if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) { + val = m_regionBoundary[numRegions - 1]; + } else if (i > 0) { + val = m_regionBoundary[i - 1]; + } + objSrc.setElement(elemIdxBegin, val); } - objSrc.setElement(elemIdxBegin, val); - } - } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) { - for (unsigned i = 0; i < numRegions; ++i) { - const pimRegion &srcRegion = objSrc.getRegions()[i]; - unsigned numElementsInRegion = srcRegion.getNumElemInRegion(); - uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); - uint64_t val = 0; - if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) { - val = m_regionBoundary[0]; - } else if (i < numRegions - 1) { - val = m_regionBoundary[i + 1]; + } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) { + for (unsigned i = 0; i < numRegions; ++i) { + const pimRegion &srcRegion = objSrc.getRegions()[i]; + unsigned numElementsInRegion = srcRegion.getNumElemInRegion(); + uint64_t elemIdxBegin = srcRegion.getElemIdxBegin(); + uint64_t val = 0; + if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) { + val = m_regionBoundary[0]; + } else if (i < numRegions - 1) { + val = m_regionBoundary[i + 1]; + } + objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val); } - objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val); + } else { + assert(0); } - } else { - assert(0); } if (pimSim::get()->getDeviceType() != PIM_FUNCTIONAL) { @@ -1323,7 +1325,7 @@ pimCmdRotate::updateStats() const PimDataType dataType = objSrc.getDataType(); bool isVLayout = objSrc.isVLayout(); - pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc); + pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc, m_useCrossRegionCommunication); pimSim::get()->getStatsMgr()->recordCmd(getName(dataType, isVLayout), mPerfEnergy); return true; } diff --git a/libpimeval/src/pimCmd.h b/libpimeval/src/pimCmd.h index 2941d529..5842f7e0 100644 --- a/libpimeval/src/pimCmd.h +++ b/libpimeval/src/pimCmd.h @@ -559,8 +559,8 @@ class pimCmdBroadcast : public pimCmd class pimCmdRotate : public pimCmd { public: - pimCmdRotate(PimCmdEnum cmdType, PimObjId src) - : pimCmd(cmdType), m_src(src) + pimCmdRotate(PimCmdEnum cmdType, PimObjId src, bool useCrossRegionCommunication) + : pimCmd(cmdType), m_src(src), m_useCrossRegionCommunication(useCrossRegionCommunication) { assert(cmdType == PimCmdEnum::ROTATE_ELEM_R || cmdType == PimCmdEnum::ROTATE_ELEM_L || cmdType == PimCmdEnum::SHIFT_ELEM_R || cmdType == PimCmdEnum::SHIFT_ELEM_L); @@ -573,6 +573,7 @@ class pimCmdRotate : public pimCmd protected: PimObjId m_src; std::vector m_regionBoundary; + bool m_useCrossRegionCommunication; }; //! @class pimCmdReadRowToSa diff --git a/libpimeval/src/pimPerfEnergyAim.cpp b/libpimeval/src/pimPerfEnergyAim.cpp index 1dc73fc4..00190e94 100644 --- a/libpimeval/src/pimPerfEnergyAim.cpp +++ b/libpimeval/src/pimPerfEnergyAim.cpp @@ -132,7 +132,7 @@ pimPerfEnergyAim::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo //! @brief Perf energy model of aim for rotate pimeval::perfEnergy -pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; diff --git a/libpimeval/src/pimPerfEnergyAim.h b/libpimeval/src/pimPerfEnergyAim.h index bb00cd25..6b35750a 100644 --- a/libpimeval/src/pimPerfEnergyAim.h +++ b/libpimeval/src/pimPerfEnergyAim.h @@ -26,7 +26,7 @@ class pimPerfEnergyAim : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; virtual pimeval::perfEnergy getPerfEnergyForMac(PimCmdEnum cmdType, const pimObjInfo& obj) const override; protected: diff --git a/libpimeval/src/pimPerfEnergyAquabolt.cpp b/libpimeval/src/pimPerfEnergyAquabolt.cpp index 845727e2..43cfb658 100644 --- a/libpimeval/src/pimPerfEnergyAquabolt.cpp +++ b/libpimeval/src/pimPerfEnergyAquabolt.cpp @@ -259,7 +259,7 @@ pimPerfEnergyAquabolt::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimOb //! @brief Perf energy model of aquabolt PIM for rotate pimeval::perfEnergy -pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; diff --git a/libpimeval/src/pimPerfEnergyAquabolt.h b/libpimeval/src/pimPerfEnergyAquabolt.h index 5dcca145..7645853d 100644 --- a/libpimeval/src/pimPerfEnergyAquabolt.h +++ b/libpimeval/src/pimPerfEnergyAquabolt.h @@ -26,8 +26,8 @@ class pimPerfEnergyAquabolt : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; + protected: unsigned m_aquaboltFPUBitWidth = 16; // TODO: Update for Aquabolt diff --git a/libpimeval/src/pimPerfEnergyBankLevel.cpp b/libpimeval/src/pimPerfEnergyBankLevel.cpp index 2ac82c20..88824e8c 100644 --- a/libpimeval/src/pimPerfEnergyBankLevel.cpp +++ b/libpimeval/src/pimPerfEnergyBankLevel.cpp @@ -355,7 +355,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO // TODO: This needs to be revisited //! @brief Perf energy model of bank-level PIM for rotate pimeval::perfEnergy -pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -366,8 +366,6 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); uint64_t totalOp = 0; - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); // rotate within subarray: // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row @@ -377,9 +375,14 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass msRuntime *= numPass; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; - printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + } return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); } diff --git a/libpimeval/src/pimPerfEnergyBankLevel.h b/libpimeval/src/pimPerfEnergyBankLevel.h index 5f869cb2..3442e2ff 100644 --- a/libpimeval/src/pimPerfEnergyBankLevel.h +++ b/libpimeval/src/pimPerfEnergyBankLevel.h @@ -26,7 +26,7 @@ class pimPerfEnergyBankLevel : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override; protected: diff --git a/libpimeval/src/pimPerfEnergyBase.cpp b/libpimeval/src/pimPerfEnergyBase.cpp index 2a1a5e49..feb3fa4a 100644 --- a/libpimeval/src/pimPerfEnergyBase.cpp +++ b/libpimeval/src/pimPerfEnergyBase.cpp @@ -171,7 +171,7 @@ pimPerfEnergyBase::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInf //! @brief Perf energy model of base class for rotate (placeholder) pimeval::perfEnergy -pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 1e10; double mjEnergy = 999999999.9; diff --git a/libpimeval/src/pimPerfEnergyBase.h b/libpimeval/src/pimPerfEnergyBase.h index dc7f4b79..41095875 100644 --- a/libpimeval/src/pimPerfEnergyBase.h +++ b/libpimeval/src/pimPerfEnergyBase.h @@ -70,7 +70,7 @@ class pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const; virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const; virtual pimeval::perfEnergy getPerfEnergyForMac(PimCmdEnum cmdType, const pimObjInfo& obj) const; diff --git a/libpimeval/src/pimPerfEnergyBitSerial.cpp b/libpimeval/src/pimPerfEnergyBitSerial.cpp index 1724bbd8..7d0a8c82 100644 --- a/libpimeval/src/pimPerfEnergyBitSerial.cpp +++ b/libpimeval/src/pimPerfEnergyBitSerial.cpp @@ -439,7 +439,7 @@ pimPerfEnergyBitSerial::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO //! @brief Perf energy model of bit-serial PIM for rotate pimeval::perfEnergy -pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -451,8 +451,6 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); unsigned numCore = obj.getNumCoreAvailable(); - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); switch (m_simTarget) { case PIM_DEVICE_BITSIMD_V: @@ -465,8 +463,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI totalOp += 3 * bitsPerElement * numPass * numCore; msRuntime = msRead + msWrite + msCompute; mjEnergy = (m_eAP + 3 * m_eL) * bitsPerElement * numPass; // for one pass - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } break; case PIM_DEVICE_SIMDRAM: // todo @@ -481,8 +483,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass msRuntime *= numPass; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + } break; default: assert(0); diff --git a/libpimeval/src/pimPerfEnergyBitSerial.h b/libpimeval/src/pimPerfEnergyBitSerial.h index 86d8b681..f2ce4ee1 100644 --- a/libpimeval/src/pimPerfEnergyBitSerial.h +++ b/libpimeval/src/pimPerfEnergyBitSerial.h @@ -26,7 +26,7 @@ class pimPerfEnergyBitSerial : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override; protected: diff --git a/libpimeval/src/pimPerfEnergyFulcrum.cpp b/libpimeval/src/pimPerfEnergyFulcrum.cpp index ced8b928..b74a46cd 100644 --- a/libpimeval/src/pimPerfEnergyFulcrum.cpp +++ b/libpimeval/src/pimPerfEnergyFulcrum.cpp @@ -307,7 +307,7 @@ pimPerfEnergyFulcrum::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObj //! @brief Perf energy model of Fulcrum for rotate pimeval::perfEnergy -pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const +pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const { double msRuntime = 0.0; double mjEnergy = 0.0; @@ -318,8 +318,6 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL); unsigned numRegions = obj.getRegions().size(); uint64_t totalOp = 0; - // boundary handling - assume two times copying between device and host for boundary elements - pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); // rotate within subarray: // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row @@ -330,9 +328,14 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf msWrite = m_tW * numPass; msRuntime = msRead + msWrite + msCompute; mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass; - msRuntime += 2 * perfEnergyBT.m_msRuntime; - mjEnergy += 2 * perfEnergyBT.m_mjEnergy; - printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + + if(useCrossRegionCommunication) { + // boundary handling - assume two times copying between device and host for boundary elements + pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8); + msRuntime += 2 * perfEnergyBT.m_msRuntime; + mjEnergy += 2 * perfEnergyBT.m_mjEnergy; + // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str()); + } return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp); } diff --git a/libpimeval/src/pimPerfEnergyFulcrum.h b/libpimeval/src/pimPerfEnergyFulcrum.h index 652107b9..f0e2d4a5 100644 --- a/libpimeval/src/pimPerfEnergyFulcrum.h +++ b/libpimeval/src/pimPerfEnergyFulcrum.h @@ -26,7 +26,7 @@ class pimPerfEnergyFulcrum : public pimPerfEnergyBase virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override; virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override; virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override; - virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override; + virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override; virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override; protected: diff --git a/libpimeval/src/pimSim.cpp b/libpimeval/src/pimSim.cpp index 356637f9..b1cb3078 100644 --- a/libpimeval/src/pimSim.cpp +++ b/libpimeval/src/pimSim.cpp @@ -893,7 +893,7 @@ pimSim::pimRotateElementsRight(PimObjId src) { pimPerfMon perfMon("pimRotateElementsRight"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_R, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_R, src, true); return m_device->executeCmd(std::move(cmd)); } @@ -902,25 +902,25 @@ pimSim::pimRotateElementsLeft(PimObjId src) { pimPerfMon perfMon("pimRotateElementsLeft"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_L, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::ROTATE_ELEM_L, src, true); return m_device->executeCmd(std::move(cmd)); } bool -pimSim::pimShiftElementsRight(PimObjId src) +pimSim::pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication) { pimPerfMon perfMon("pimShiftElementsRight"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_R, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_R, src, useCrossRegionCommunication); return m_device->executeCmd(std::move(cmd)); } bool -pimSim::pimShiftElementsLeft(PimObjId src) +pimSim::pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication) { pimPerfMon perfMon("pimShiftElementsLeft"); if (!isValidDevice()) { return false; } - std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_L, src); + std::unique_ptr cmd = std::make_unique(PimCmdEnum::SHIFT_ELEM_L, src, useCrossRegionCommunication); return m_device->executeCmd(std::move(cmd)); } diff --git a/libpimeval/src/pimSim.h b/libpimeval/src/pimSim.h index abdf4dff..3f1d1c58 100644 --- a/libpimeval/src/pimSim.h +++ b/libpimeval/src/pimSim.h @@ -121,8 +121,8 @@ class pimSim template bool pimBroadcast(PimObjId dest, T value); bool pimRotateElementsRight(PimObjId src); bool pimRotateElementsLeft(PimObjId src); - bool pimShiftElementsRight(PimObjId src); - bool pimShiftElementsLeft(PimObjId src); + bool pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication); + bool pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication); bool pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount); bool pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount); bool pimAesSbox(PimObjId src, PimObjId dest, const std::vector& lut); diff --git a/misc-bench/stencil/Makefile b/misc-bench/stencil/Makefile new file mode 100644 index 00000000..c4cdb2e3 --- /dev/null +++ b/misc-bench/stencil/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of stencil +# Copyright (c) 2025 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) \ No newline at end of file diff --git a/misc-bench/stencil/PIM/Makefile b/misc-bench/stencil/PIM/Makefile new file mode 100644 index 00000000..cbeb7aa0 --- /dev/null +++ b/misc-bench/stencil/PIM/Makefile @@ -0,0 +1,30 @@ +# Makefile: C++ version of stencil +# Copyright (c) 2025 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. +include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := stencil.out +SRC := stencil.cpp + +EXEC2 := stencil_alt.out +SRC2 := stencil_alt.cpp + +debug perf dramsim3_integ: $(EXEC) $(EXEC2) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +$(EXEC2): $(SRC2) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) $(EXEC2) *.dSYM \ No newline at end of file diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp new file mode 100644 index 00000000..5ec25dc2 --- /dev/null +++ b/misc-bench/stencil/PIM/stencil.cpp @@ -0,0 +1,518 @@ +// Test: C++ version of the stencil +// Copyright (c) 2025 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" + +constexpr bool isHorizontallyChunked = true; + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t iterations; + uint64_t gridWidth; + uint64_t gridHeight; + uint64_t radius; + const char *configFile; + const char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./stencil.out [options]" + "\n" + "\n -n iterations (default=10 iterations)" + "\n -x grid width (default=2048 elements)" + "\n -y grid height (default=2048 elements)" + "\n -r stencil radius (default=1)" + "\n -c dramsim config file" + "\n -i input file containing a 2d array (default=random)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.iterations = 10; + p.gridWidth = 2048; + p.gridHeight = 2048; + p.radius = 1; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'n': + p.iterations = strtoull(optarg, NULL, 0); + break; + case 'x': + p.gridWidth = strtoull(optarg, NULL, 0); + break; + case 'y': + p.gridHeight = strtoull(optarg, NULL, 0); + break; + case 'r': + p.radius= strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't'); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +//! @brief Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum +//! +//! Sums radius number of elemements to the left and right of center element, including center element +//! Puts each result pimRowSum[i] where i is the center index +//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j] +//! Works by shifting mid to the left and right and adding shifted versions +//! @param[in] mid PIM row to be summed +//! @param[out] pimRowSum The resultant PIM object to place the sum into +//! @param[in,out] shiftBackup Temporary PIM object used for calculations +//! @param[in] radius The stencil radius +void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) { + PimStatus status; + + if(radius == 0) { + return; + } + + status = pimCopyObjectToObject(mid, shiftBackup); + assert (status == PIM_OK); + + status = pimShiftElementsRight(shiftBackup, !isHorizontallyChunked); + assert (status == PIM_OK); + + status = pimAdd(mid, shiftBackup, pimRowSum); + assert (status == PIM_OK); + + for(uint64_t shiftIter=1; shiftIter& workingPimMemory, std::vector& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { + PimStatus status; + + uint64_t circularQueueBot = 0; + uint64_t circularQueueTop = 0; + + sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); + assert (status == PIM_OK); + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2 + // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed + // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed + // runningSum = sum of first two rows horizontally summed + + for(uint64_t i=2; i<2*radius; ++i) { + sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); + assert (status == PIM_OK); + ++circularQueueTop; + } + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2*radius + // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed + // runningSum = sum of rows [0...2*radius] horizontally summed + + uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum + + // Loops over the rest of the rows in the current chunk, vertically + // Each iteration, finds horizontal sum of the next row (nextRowToAdd) + // Places this horizontal sum at the front of the queue (at position circularQueueTop) + // Adds the horizontal sum to the runningSum + // Places runningSum/stencilArea into the workingPimMemory as the final result for the row + // If neccessary, subtracts the row from the back of the queue from the runningSum + + for(uint64_t row=radius; row &vec, PimObjId pimObj, const uint64_t numInvalid, const uint64_t numElementsHorizontal, const bool isToPim) { + PimStatus status; + if constexpr (!isHorizontallyChunked) { + if(isToPim) { + status = pimCopyHostToDevice((void*) vec.data(), pimObj); + } else { + status = pimCopyDeviceToHost(pimObj, (void*) vec.data()); + } + assert (status == PIM_OK); + } else { + //! @brief Total number of usable elements in final result + const uint64_t totalValid = vec.size() - 2*numInvalid; + //! @brief Maximum number of usable elements in a horizontal chunk, will be the number usable for all except for (possibly) the last chunk + const uint64_t maxUsable = numElementsHorizontal - 2*numInvalid; + //! @brief Total number of horizontal chunks + const uint64_t numChunks = (totalValid + maxUsable - 1) / maxUsable; + if(isToPim) { + uint64_t hostStartIdx = 0; + uint64_t pimStartIdx = 0; + for(uint64_t i=0; i> &srcHost, std::vector> &dstHost, const uint64_t numAssociable, + const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) { + + assert(!srcHost.empty()); + assert(!srcHost[0].empty()); + assert(srcHost.size() == dstHost.size()); + assert(srcHost[0].size() == dstHost[0].size()); + + std::vector> tmpGrid; + tmpGrid.resize(srcHost.size(), std::vector(srcHost[0].size())); + + const uint64_t gridWidth = srcHost[0].size(); + + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaFloat = 1.0f / static_cast(stencilAreaInt); + uint32_t tmp; + std::memcpy(&tmp, &stencilAreaFloat, sizeof(float)); + const uint64_t stencilAreaToMultiplyPim = static_cast(tmp); + + // Model assumes that only a finite number of stencil iterations can be computed on the PIM device before transferring back to the host + // In chunked stencil implementations (with cross region computations) this limit is both vertical and horizontal + // In non-chunked stencil implementations, this limit is purely vertical + // TODO: Figure out what to make this number + constexpr uint64_t maxIterationsPerPim = 5; // TODO: what should this number be? + + uint64_t pimAllocWidth; + if constexpr (isHorizontallyChunked) { + // Represents the number of elements on the left/right that aren't part of the final result for a horizontally chunked implementation. Without data movement, each iteration causes number of elements on each side to no longer be valid. + const uint64_t maxInvalidHorizontal = radius * std::min(maxIterationsPerPim, iterations); + const uint64_t maxUsableHorizontal = numElementsHorizontal - 2*maxInvalidHorizontal; + const uint64_t maxChunksHorizontal = (gridWidth + maxUsableHorizontal - 1) / maxUsableHorizontal; + pimAllocWidth = numElementsHorizontal * maxChunksHorizontal; + } else { + pimAllocWidth = gridWidth; + } + + PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, pimAllocWidth, PIM_FP32); + assert(tmpPim != -1); + PimObjId runningSum = pimAllocAssociated(tmpPim, PIM_FP32); + assert(runningSum != -1); + + std::vector rowsInSumCircularQueue(2*radius+1); + for(uint64_t i=0; i workingPimMemory(numAssociable - (rowsInSumCircularQueue.size() + 2)); + for(uint64_t i=0; i= srcHost.size()) { + break; + } + const uint64_t totalRowsThisIter = std::min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc; + const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsEachSide; + uint64_t workingPimMemoryIdx = 0; + for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) { + if(iter == 0) { + copyChunkedVectorPim(const_cast&>(srcHost[srcHostRow]), workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true); + } else { + copyChunkedVectorPim(tmpGrid[srcHostRow], workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true); + } + ++workingPimMemoryIdx; + } + + for(uint64_t iterNum = 0; iterNum < currIterations; ++iterNum) { + computeStencilChunkIteration(workingPimMemory, rowsInSumCircularQueue, tmpPim, runningSum, stencilAreaToMultiplyPim, radius); + } + + workingPimMemoryIdx = invalidResultsEachSide; + for(uint64_t srcHostRow = firstRowUsableSrc; srcHostRow < firstRowUsableSrc + usableRowsThisIter; ++srcHostRow) { + copyChunkedVectorPim(dstHost[srcHostRow], workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, false); + ++workingPimMemoryIdx; + } + + firstRowSrc += usableRowsThisIter; + } + std::swap(tmpGrid, dstHost); + } + std::swap(tmpGrid, dstHost); +} + +void stencilCpu(std::vector>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaInverseFloat = 1.0f / static_cast(stencilAreaInt); + + for(uint64_t iter=1; iter<=iterations; ++iter) { + // Only compute when stencil is fully in range + const uint64_t startY = radius*iter; + const uint64_t endY = src.size() - startY; + const uint64_t startX = radius*iter; + const uint64_t endX = src[0].size() - startX; + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY> x, y; + + if (params.inputFile == nullptr) + { + // Fill in random grid + x.resize(params.gridHeight, std::vector(params.gridWidth)); + + #pragma omp parallel + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(0.0f, 10000.0f); + + #pragma omp for + for(size_t i=0; i(x[0].size())); + + PimDeviceProperties deviceProp; + PimStatus status = pimGetDeviceProperties(&deviceProp); + assert(status == PIM_OK); + + constexpr uint64_t bitsPerElement = 32; + + uint64_t numAssociable = 2 * deviceProp.numRowPerSubarray; + if(!deviceProp.isHLayoutDevice) { + numAssociable /= bitsPerElement; + } + + uint64_t numElementsHorizontal; + if(deviceProp.isHLayoutDevice) { + switch(deviceProp.simTarget) { + case PIM_DEVICE_FULCRUM: + case PIM_DEVICE_BANK_LEVEL: + numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement; + break; + default: + std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl; + std::exit(1); + } + } else { + numElementsHorizontal = deviceProp.numColPerSubarray; + } + + stencil(x, y, numAssociable, numElementsHorizontal, params.iterations, params.radius); + + if (params.shouldVerify) + { + std::vector> cpuY(y.size(), std::vector(y[0].size())); + stencilCpu(x, cpuY, params.iterations, params.radius); + + bool ok = true; + + // Only compute when stencil is fully in range + const uint64_t startY = params.radius * params.iterations; + const uint64_t endY = params.gridHeight - startY; + const uint64_t startX = params.radius * params.iterations; + const uint64_t endX = params.gridWidth - startX; + + std::cout << std::fixed << std::setprecision(10); + + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY acceptableDelta) + { + #pragma omp critical + { + std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl; + ok = false; + } + } + } + } + if(ok) { + std::cout << "Correct for stencil!" << std::endl; + } + } + + pimShowStats(); + + return 0; +} \ No newline at end of file diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp new file mode 100644 index 00000000..9543e637 --- /dev/null +++ b/misc-bench/stencil/PIM/stencil_alt.cpp @@ -0,0 +1,565 @@ +// Test: C++ version of the stencil +// Copyright (c) 2025 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t iterations; + uint64_t gridWidth; + uint64_t gridHeight; + uint64_t radius; + const char *configFile; + const char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./stencil.out [options]" + "\n" + "\n -n iterations (default=10 iterations)" + "\n -x grid width (default=2048 elements)" + "\n -y grid height (default=2048 elements)" + "\n -r stencil radius (default=1)" + "\n -c dramsim config file" + "\n -i input file containing a 2d array (default=random)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.iterations = 10; + p.gridWidth = 2048; + p.gridHeight = 2048; + p.radius = 1; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'n': + p.iterations = strtoull(optarg, NULL, 0); + break; + case 'x': + p.gridWidth = strtoull(optarg, NULL, 0); + break; + case 'y': + p.gridHeight = strtoull(optarg, NULL, 0); + break; + case 'r': + p.radius= strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't'); + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +//! @brief Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum +//! +//! Sums radius number of elemements to the left and right of center element, including center element +//! Puts each result pimRowSum[i] where i is the center index +//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j] +//! Works by shifting mid to the left and right and adding shifted versions +//! @param[in] mid PIM row to be summed +//! @param[out] pimRowSum The resultant PIM object to place the sum into +//! @param[in,out] shiftBackup Temporary PIM object used for calculations +//! @param[in] radius The stencil radius +void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) { + PimStatus status; + + if(radius == 0) { + return; + } + + status = pimCopyObjectToObject(mid, shiftBackup); + assert (status == PIM_OK); + + status = pimShiftElementsRight(shiftBackup, true); + assert (status == PIM_OK); + + status = pimAdd(mid, shiftBackup, pimRowSum); + assert (status == PIM_OK); + + for(uint64_t shiftIter=1; shiftIter rowsInSumCircularQueue; + std::vector workingPimMemory; + uint64_t srcStartX; + uint64_t srcStartY; + uint64_t numX; + uint64_t numY; + PimObjId tmpPim; + PimObjId runningSum; + + StencilTilePim(uint64_t radius, uint64_t srcStartY, uint64_t numY, uint64_t srcStartX, uint64_t numX) + : srcStartX(srcStartX), srcStartY(srcStartY), numX(numX), numY(numY) { + + tmpPim = pimAlloc(PIM_ALLOC_AUTO, numX, PIM_FP32); + assert(tmpPim != -1); + runningSum = pimAllocAssociated(tmpPim, PIM_FP32); + assert(runningSum != -1); + + rowsInSumCircularQueue.resize(2*radius+1); + for(uint64_t i=0; i> &srcHost) { + for(uint64_t idx = 0; idx < workingPimMemory.size(); ++idx) { + PimStatus status = pimCopyHostToDevice((void*) (srcHost[srcStartY + idx].data() + srcStartX), workingPimMemory[idx], 0, numX); + assert (status == PIM_OK); + } + } + + void copyFromPim(std::vector> &dstHost, const uint64_t numOverlap) { + for(uint64_t idx = numOverlap; idx < workingPimMemory.size() - numOverlap; ++idx) { + PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) (dstHost[srcStartY + idx].data() + srcStartX + numOverlap), numOverlap, numX - numOverlap); + assert (status == PIM_OK); + } + } + + //! @brief Computes one iteration of one chunk of the stencil + //! + //! Uses circular queue to compute window sums + //! Adds the next row to the front of the queue and to the sum + //! Takes the sum (divided by the stencil area) as the result from the row + //! Subtracts the back of the queue from the sum + //! Pops from the queue back of the queue + //! Repeats until done + //! @param[in] workingPimMemory PIM rows in the stencil chunk + //! @param[in] rowsInSumCircularQueue Queue used for keeping track of running sum of rows vertically + //! @param[in,out] tmpPim Temporary PIM object used for calculations + //! @param[in,out] runningSum Temporary PIM object used for keeping track of the current running (vertical) sum + //! @param[in] stencilAreaToMultiplyPim This algorithm computes stencil average, thus each element in the result must be divided by the stencil area. This is done by multiplying by the inverse. + //! @param[in] radius The stencil radius + void computeStencilIteration(const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) { + PimStatus status; + + uint64_t circularQueueBot = 0; + uint64_t circularQueueTop = 0; + + sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + ++circularQueueTop; + status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum); + assert (status == PIM_OK); + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2 + // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed + // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed + // runningSum = sum of first two rows horizontally summed + + for(uint64_t i=2; i<2*radius; ++i) { + sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius); + status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum); + assert (status == PIM_OK); + ++circularQueueTop; + } + + // At this point: + // circularQueueBot = 0 + // circularQueueTop = 2*radius + // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed + // runningSum = sum of rows [0...2*radius] horizontally summed + + uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum + + // Loops over the rest of the rows in the current chunk, vertically + // Each iteration, finds horizontal sum of the next row (nextRowToAdd) + // Places this horizontal sum at the front of the queue (at position circularQueueTop) + // Adds the horizontal sum to the runningSum + // Places runningSum/stencilArea into the workingPimMemory as the final result for the row + // If neccessary, subtracts the row from the back of the queue from the runningSum + + for(uint64_t row=radius; row& hostTmpRow, PimObjId pimSrc, PimObjId pimDst, uint64_t srcIdx, uint64_t dstIdx, uint64_t num) { + PimStatus status = pimCopyDeviceToHost(pimSrc, hostTmpRow.data(), srcIdx, srcIdx + num); + assert(status == PIM_OK); + status = pimCopyHostToDevice(hostTmpRow.data(), pimDst, dstIdx, dstIdx + num); + assert(status == PIM_OK); +} + +uint64_t getNumTiles(const uint64_t totalSize, const uint64_t maxChunkSize, const uint64_t numOverlap) { + if (totalSize <= maxChunkSize) { + return 1; + } else if (totalSize <= 2*(maxChunkSize - numOverlap)) { + return 2; + } else { + const uint64_t firstAndLastChunkRows = 2 * (maxChunkSize - numOverlap); + const uint64_t remainingRows = totalSize - firstAndLastChunkRows; + const uint64_t middleChunkSize = maxChunkSize - 2*numOverlap; + const uint64_t numMiddleChunks = (remainingRows + middleChunkSize - 1) / middleChunkSize; + return 2 + numMiddleChunks; + } +} + +//! @brief Computes a stencil pattern over a 2d array +//! @param[in] srcHost The input stencil grid +//! @param[out] dstHost The resultant stencil grid +//! @param[in] numAssociable Number of float 32 PIM objects that can be associated with each other +//! @param[in] numElementsHorizontal Number of float 32 PIM objects that can be placed in a PIM row without creating shifting issues +//! @param[in] iterations Number of iterations to run the stencil pattern for +//! @param[in] radius The radius of the stencil pattern +void stencil(const std::vector> &srcHost, std::vector> &dstHost, const uint64_t numAssociable, + const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) { + + assert(!srcHost.empty()); + assert(!srcHost[0].empty()); + assert(srcHost.size() == dstHost.size()); + assert(srcHost[0].size() == dstHost[0].size()); + + std::vector> tmpGrid; + tmpGrid.resize(srcHost.size(), std::vector(srcHost[0].size())); + + const uint64_t gridWidth = srcHost[0].size(); + + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaFloat = 1.0f / static_cast(stencilAreaInt); + uint32_t tmp; + std::memcpy(&tmp, &stencilAreaFloat, sizeof(float)); + const uint64_t stencilAreaToMultiplyPim = static_cast(tmp); + + const uint64_t maxElemChunkY = numAssociable - (2*radius + 1) - 2; + const uint64_t maxElemChunkX = numElementsHorizontal; + const uint64_t numOverlap = radius; + const uint64_t numTileX = getNumTiles(srcHost[0].size(), maxElemChunkX, numOverlap); + const uint64_t numTileY = getNumTiles(srcHost.size(), maxElemChunkY, numOverlap); + + std::vector> stenTilesPim(numTileY); + for (auto& row : stenTilesPim) { + row.reserve(numTileX); + } + + for(uint64_t tileIdxY=0; tileIdxY hostTmpRow(gridWidth, 0.0f); + + for(uint64_t iter=0; iter& above = tile.workingPimMemory; + std::vector& below = tileBelow.workingPimMemory; + + // only exchange rows with valid data + uint64_t startIdxX = tileIdxX == 0 ? 0 : numOverlap; + uint64_t endIdxX = tileIdxX == numTileX - 1 ? tile.numX : tile.numX - numOverlap; // exclusive + for(uint64_t row=0; row& left = tile.workingPimMemory; + std::vector& right = tileRight.workingPimMemory; + + // only exchange rows with valid data + uint64_t startIdxY = tileIdxY == 0 ? 0 : numOverlap; + uint64_t endIdxY = tileIdxY == numTileY - 1 ? tile.numY : tile.numY - numOverlap; // exclusive + for(uint64_t row=startIdxY; row& topLeft = tile.workingPimMemory; + std::vector& topRight = stenTilesPim[tileIdxY][tileIdxX+1].workingPimMemory; + std::vector& bottomLeft = stenTilesPim[tileIdxY+1][tileIdxX].workingPimMemory; + std::vector& bottomRight = stenTilesPim[tileIdxY+1][tileIdxX+1].workingPimMemory; + for(uint64_t row=0; row>& src, std::vector>& dst, const uint64_t iterations, const uint64_t radius) { + const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1); + const float stencilAreaInverseFloat = 1.0f / static_cast(stencilAreaInt); + + for(uint64_t iter=1; iter<=iterations; ++iter) { + // Only compute when stencil is fully in range + const uint64_t startY = radius*iter; + const uint64_t endY = src.size() - startY; + const uint64_t startX = radius*iter; + const uint64_t endX = src[0].size() - startX; + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY> x, y; + + if (params.inputFile == nullptr) + { + // Fill in random grid + x.resize(params.gridHeight, std::vector(params.gridWidth)); + + #pragma omp parallel + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(0.0f, 10000.0f); + + #pragma omp for + for(size_t i=0; i(x[0].size())); + + PimDeviceProperties deviceProp; + PimStatus status = pimGetDeviceProperties(&deviceProp); + assert(status == PIM_OK); + + constexpr uint64_t bitsPerElement = 32; + + const uint64_t numAssociable = deviceProp.isHLayoutDevice ? deviceProp.numRowPerCore : deviceProp.numRowPerCore / bitsPerElement; + const uint64_t numElementsHorizontal = deviceProp.isHLayoutDevice ? deviceProp.numColPerSubarray / bitsPerElement : deviceProp.numColPerSubarray; + + stencil(x, y, numAssociable, numElementsHorizontal, params.iterations, params.radius); + + if (params.shouldVerify) + { + std::vector> cpuY(y.size(), std::vector(y[0].size())); + stencilCpu(x, cpuY, params.iterations, params.radius); + + bool ok = true; + + // Only compute when stencil is fully in range + const uint64_t startY = params.radius * params.iterations; + const uint64_t endY = params.gridHeight - startY; + const uint64_t startX = params.radius * params.iterations; + const uint64_t endX = params.gridWidth - startX; + + std::cout << std::fixed << std::setprecision(10); + + #pragma omp parallel for collapse(2) + for(uint64_t gridY=startY; gridY acceptableDelta) + { + #pragma omp critical + { + std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl; + ok = false; + } + } + } + } + if(ok) { + std::cout << "Correct for stencil!" << std::endl; + } + } + + pimShowStats(); + + return 0; +} \ No newline at end of file diff --git a/misc-bench/stencil/README.MD b/misc-bench/stencil/README.MD new file mode 100644 index 00000000..55a2fd18 --- /dev/null +++ b/misc-bench/stencil/README.MD @@ -0,0 +1,55 @@ +# Stencil + +Iterative stencil loops consist of repeatedly applying a stencil pattern to a grid. This benchmark iterates a stencil average pattern on a 2D array, with both the number of iterations and the size of the stencil pattern parameterized. An example is below: + +- Input: [[2.0, 2.0, 2.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], radius=1, iterations=1 +- Output: [[#, #, #], [#, 1.66, #], [#, #, #]] +- \# represents an undefined element + +## Directory Structure + +``` +stencil/ +├── PIM/ +│ ├── Makefile +│ ├── stencil.cpp +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains one implementation of the stencil benchmark: + +1. PIM + +### PIM Implementation + +The PIM variant is implemented using C++ with some speedup from OpenMP. Three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### PIM Variant + +To compile for the PIM variant, use: + +```bash +cd PIM +make +``` + +## Execution Instructions + +### Running the Executable + +After compiling, run each executable with the following command that will run it for default parameters: + +```bash +./stencil.out +``` + +To see help text on all usages and how to modify any of the input parameters, use the following command: + +```bash +./stencil.out -h +``` diff --git a/misc-bench/stencil/extra/optimizer.py b/misc-bench/stencil/extra/optimizer.py new file mode 100644 index 00000000..5230dabc --- /dev/null +++ b/misc-bench/stencil/extra/optimizer.py @@ -0,0 +1,90 @@ +# Finds the optimal layout for stencil assuming intra memory-layer transfer cost is consistent +# ie., all subarray to subarray transfers are equivalent in cost (within a bank) +# Note after running for some test values: layout does change based on transfer cost parameters + +subarray_block_width = 100 +subarray_block_height = 100 +subarrays_per_bank = 16 +banks_per_rank = 16 +ranks = 16 +transfer_cost_subarray_to_subarray = 1 +transfer_cost_bank_to_bank = 20 +transfer_cost_rank_to_rank = 100 + + +def get_stats(num_blocks, grid_width, block_width, block_height): + + if num_blocks % grid_width != 0: + raise ValueError("num blocks must be divisible by grid width") + + grid_height = num_blocks/grid_width + to_move_horizontal = (2 * grid_height * (grid_width - 1)) * (block_width - 2) + to_move_vertical = (2 * grid_width * (grid_height - 1)) * (block_height - 2) + to_move_diagonal = (4 * (grid_width-1) * (grid_height - 1)) + to_move_total = to_move_horizontal + to_move_vertical + to_move_diagonal + width_next = grid_width * block_width + height_next = grid_height * block_height + return to_move_total, width_next, height_next + +def total_move_cost(subarray_grid_width, bank_grid_width, rank_grid_width): + to_move_s2s, bank_block_width, bank_block_height = get_stats(subarrays_per_bank, subarray_grid_width, subarray_block_width, subarray_block_height) + to_move_b2b, rank_block_width, rank_block_height = get_stats(banks_per_rank, bank_grid_width, bank_block_width, bank_block_height) + to_move_r2r, final_block_width, final_block_height = get_stats(ranks, rank_grid_width, rank_block_width, rank_block_height) + cost = transfer_cost_subarray_to_subarray*to_move_s2s + cost += transfer_cost_bank_to_bank*to_move_b2b + cost += transfer_cost_rank_to_rank*to_move_r2r + return cost + +def get_divisors(n): + """Get all divisors of n""" + divisors = [] + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + divisors.append(i) + if i != n // i: + divisors.append(n // i) + return sorted(divisors) + + +# Find all valid divisors +subarray_divisors = get_divisors(subarrays_per_bank) +bank_divisors = get_divisors(banks_per_rank) +rank_divisors = get_divisors(ranks) + +print("Valid divisors:") +print(f" subarrays_per_bank ({subarrays_per_bank}): {subarray_divisors}") +print(f" banks_per_rank ({banks_per_rank}): {bank_divisors}") +print(f" ranks ({ranks}): {rank_divisors}") +print() + +# Find optimal configuration +min_cost = float('inf') +best_config = None + +for sgw in subarray_divisors: + for bgw in bank_divisors: + for rgw in rank_divisors: + cost = total_move_cost(sgw, bgw, rgw) + if cost < min_cost: + min_cost = cost + best_config = (sgw, bgw, rgw) + +print("OPTIMAL CONFIGURATION:") +print(f" subarray_grid_width = {best_config[0]}") +print(f" bank_grid_width = {best_config[1]}") +print(f" rank_grid_width = {best_config[2]}") +print(f" Total move cost = {min_cost:,.0f}") +print() + +# Show top 10 configurations +print("Top 10 configurations:") +results = [] +for sgw in subarray_divisors: + for bgw in bank_divisors: + for rgw in rank_divisors: + cost = total_move_cost(sgw, bgw, rgw) + results.append((cost, sgw, bgw, rgw)) + +results.sort() +for i, (cost, sgw, bgw, rgw) in enumerate(results[:10], 1): + print(f"{i:2}. Cost={cost:12,.0f} subarray={sgw:2}, bank={bgw:2}, rank={rgw:2}") \ No newline at end of file