diff --git a/libpimeval/src/libpimeval.cpp b/libpimeval/src/libpimeval.cpp
index ccd21330..52187142 100644
--- a/libpimeval/src/libpimeval.cpp
+++ b/libpimeval/src/libpimeval.cpp
@@ -537,17 +537,17 @@ pimRotateElementsLeft(PimObjId src)
 
 //! @brief  Shift elements of an obj by one step to the right and fill zero
 PimStatus
-pimShiftElementsRight(PimObjId src)
+pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication)
 {
-  bool ok = pimSim::get()->pimShiftElementsRight(src);
+  bool ok = pimSim::get()->pimShiftElementsRight(src, useCrossRegionCommunication);
   return ok ? PIM_OK : PIM_ERROR;
 }
 
 //! @brief  Shift elements of an obj by one step to the left and fill zero
 PimStatus
-pimShiftElementsLeft(PimObjId src)
+pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication)
 {
-  bool ok = pimSim::get()->pimShiftElementsLeft(src);
+  bool ok = pimSim::get()->pimShiftElementsLeft(src, useCrossRegionCommunication);
   return ok ? PIM_OK : PIM_ERROR;
 }
 
diff --git a/libpimeval/src/libpimeval.h b/libpimeval/src/libpimeval.h
index f89dbd48..b54ec659 100644
--- a/libpimeval/src/libpimeval.h
+++ b/libpimeval/src/libpimeval.h
@@ -229,8 +229,8 @@ PimStatus pimBroadcastUInt(PimObjId dest, uint64_t value);
 PimStatus pimBroadcastFP(PimObjId dest, float value);
 PimStatus pimRotateElementsRight(PimObjId src);
 PimStatus pimRotateElementsLeft(PimObjId src);
-PimStatus pimShiftElementsRight(PimObjId src);
-PimStatus pimShiftElementsLeft(PimObjId src);
+PimStatus pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication = true);
+PimStatus pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication = true);
 PimStatus pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount);
 PimStatus pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount);
 
diff --git a/libpimeval/src/pimCmd.cpp b/libpimeval/src/pimCmd.cpp
index 454607c8..0e7b67c1 100644
--- a/libpimeval/src/pimCmd.cpp
+++ b/libpimeval/src/pimCmd.cpp
@@ -1222,33 +1222,35 @@ pimCmdRotate::execute()
   computeAllRegions(numRegions);
 
   // handle region boundaries
-  if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) {
-    for (unsigned i = 0; i < numRegions; ++i) {
-      const pimRegion &srcRegion = objSrc.getRegions()[i];
-      uint64_t elemIdxBegin = srcRegion.getElemIdxBegin();
-      uint64_t val = 0;
-      if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) {
-        val = m_regionBoundary[numRegions - 1];
-      } else if (i > 0) {
-        val = m_regionBoundary[i - 1];
+  if(m_useCrossRegionCommunication) {
+    if (m_cmdType == PimCmdEnum::ROTATE_ELEM_R || m_cmdType == PimCmdEnum::SHIFT_ELEM_R) {
+      for (unsigned i = 0; i < numRegions; ++i) {
+        const pimRegion &srcRegion = objSrc.getRegions()[i];
+        uint64_t elemIdxBegin = srcRegion.getElemIdxBegin();
+        uint64_t val = 0;
+        if (i == 0 && m_cmdType == PimCmdEnum::ROTATE_ELEM_R) {
+          val = m_regionBoundary[numRegions - 1];
+        } else if (i > 0) {
+          val = m_regionBoundary[i - 1];
+        }
+        objSrc.setElement(elemIdxBegin, val);
       }
-      objSrc.setElement(elemIdxBegin, val);
-    }
-  } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) {
-    for (unsigned i = 0; i < numRegions; ++i) {
-      const pimRegion &srcRegion = objSrc.getRegions()[i];
-      unsigned numElementsInRegion = srcRegion.getNumElemInRegion();
-      uint64_t elemIdxBegin = srcRegion.getElemIdxBegin();
-      uint64_t val = 0;
-      if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) {
-        val = m_regionBoundary[0];
-      } else if (i < numRegions - 1) {
-        val = m_regionBoundary[i + 1];
+    } else if (m_cmdType == PimCmdEnum::ROTATE_ELEM_L || m_cmdType == PimCmdEnum::SHIFT_ELEM_L) {
+      for (unsigned i = 0; i < numRegions; ++i) {
+        const pimRegion &srcRegion = objSrc.getRegions()[i];
+        unsigned numElementsInRegion = srcRegion.getNumElemInRegion();
+        uint64_t elemIdxBegin = srcRegion.getElemIdxBegin();
+        uint64_t val = 0;
+        if (i == numRegions - 1 && m_cmdType == PimCmdEnum::ROTATE_ELEM_L) {
+          val = m_regionBoundary[0];
+        } else if (i < numRegions - 1) {
+          val = m_regionBoundary[i + 1];
+        }
+        objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val);
       }
-      objSrc.setElement(elemIdxBegin + numElementsInRegion - 1, val);
+    } else {
+      assert(0);
     }
-  } else {
-    assert(0);
   }
 
   if (pimSim::get()->getDeviceType() != PIM_FUNCTIONAL) {
@@ -1323,7 +1325,7 @@ pimCmdRotate::updateStats() const
   PimDataType dataType = objSrc.getDataType();
   bool isVLayout = objSrc.isVLayout();
 
-  pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc);
+  pimeval::perfEnergy mPerfEnergy = pimSim::get()->getPerfEnergyModel()->getPerfEnergyForRotate(m_cmdType, objSrc, m_useCrossRegionCommunication);
   pimSim::get()->getStatsMgr()->recordCmd(getName(dataType, isVLayout), mPerfEnergy);
   return true;
 }
diff --git a/libpimeval/src/pimCmd.h b/libpimeval/src/pimCmd.h
index 2941d529..5842f7e0 100644
--- a/libpimeval/src/pimCmd.h
+++ b/libpimeval/src/pimCmd.h
@@ -559,8 +559,8 @@ class pimCmdBroadcast : public pimCmd
 class pimCmdRotate : public pimCmd
 {
 public:
-  pimCmdRotate(PimCmdEnum cmdType, PimObjId src)
-    : pimCmd(cmdType), m_src(src)
+  pimCmdRotate(PimCmdEnum cmdType, PimObjId src, bool useCrossRegionCommunication)
+    : pimCmd(cmdType), m_src(src), m_useCrossRegionCommunication(useCrossRegionCommunication)
   {
     assert(cmdType == PimCmdEnum::ROTATE_ELEM_R || cmdType == PimCmdEnum::ROTATE_ELEM_L ||
            cmdType == PimCmdEnum::SHIFT_ELEM_R || cmdType == PimCmdEnum::SHIFT_ELEM_L);
@@ -573,6 +573,7 @@ class pimCmdRotate : public pimCmd
 protected:
   PimObjId m_src;
   std::vector<uint64_t> m_regionBoundary;
+  bool m_useCrossRegionCommunication;
 };
 
 //! @class  pimCmdReadRowToSa
diff --git a/libpimeval/src/pimPerfEnergyAim.cpp b/libpimeval/src/pimPerfEnergyAim.cpp
index 1dc73fc4..00190e94 100644
--- a/libpimeval/src/pimPerfEnergyAim.cpp
+++ b/libpimeval/src/pimPerfEnergyAim.cpp
@@ -132,7 +132,7 @@ pimPerfEnergyAim::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo
 
 //! @brief  Perf energy model of aim for rotate
 pimeval::perfEnergy
-pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyAim::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 0.0;
   double mjEnergy = 0.0;
diff --git a/libpimeval/src/pimPerfEnergyAim.h b/libpimeval/src/pimPerfEnergyAim.h
index bb00cd25..6b35750a 100644
--- a/libpimeval/src/pimPerfEnergyAim.h
+++ b/libpimeval/src/pimPerfEnergyAim.h
@@ -26,7 +26,7 @@ class pimPerfEnergyAim : public pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override;
   virtual pimeval::perfEnergy getPerfEnergyForMac(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
   
 protected:
diff --git a/libpimeval/src/pimPerfEnergyAquabolt.cpp b/libpimeval/src/pimPerfEnergyAquabolt.cpp
index 845727e2..43cfb658 100644
--- a/libpimeval/src/pimPerfEnergyAquabolt.cpp
+++ b/libpimeval/src/pimPerfEnergyAquabolt.cpp
@@ -259,7 +259,7 @@ pimPerfEnergyAquabolt::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimOb
 
 //! @brief  Perf energy model of aquabolt PIM for rotate
 pimeval::perfEnergy
-pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyAquabolt::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 0.0;
   double mjEnergy = 0.0;
diff --git a/libpimeval/src/pimPerfEnergyAquabolt.h b/libpimeval/src/pimPerfEnergyAquabolt.h
index 5dcca145..7645853d 100644
--- a/libpimeval/src/pimPerfEnergyAquabolt.h
+++ b/libpimeval/src/pimPerfEnergyAquabolt.h
@@ -26,8 +26,8 @@ class pimPerfEnergyAquabolt : public pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override;
+
 protected:
   unsigned m_aquaboltFPUBitWidth = 16;
   // TODO: Update for Aquabolt
diff --git a/libpimeval/src/pimPerfEnergyBankLevel.cpp b/libpimeval/src/pimPerfEnergyBankLevel.cpp
index 2ac82c20..88824e8c 100644
--- a/libpimeval/src/pimPerfEnergyBankLevel.cpp
+++ b/libpimeval/src/pimPerfEnergyBankLevel.cpp
@@ -355,7 +355,7 @@ pimPerfEnergyBankLevel::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO
 // TODO: This needs to be revisited
 //! @brief  Perf energy model of bank-level PIM for rotate
 pimeval::perfEnergy
-pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 0.0;
   double mjEnergy = 0.0;
@@ -366,8 +366,6 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI
   unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL);
   unsigned numRegions = obj.getRegions().size();
   uint64_t totalOp = 0;
-  // boundary handling - assume two times copying between device and host for boundary elements
-  pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
 
   // rotate within subarray:
   // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row
@@ -377,9 +375,14 @@ pimPerfEnergyBankLevel::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI
   msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass
   msRuntime *= numPass;
   mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass;
-  msRuntime += 2 * perfEnergyBT.m_msRuntime;
-  mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
-  printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str());
+
+  if(useCrossRegionCommunication) {
+    // boundary handling - assume two times copying between device and host for boundary elements
+    pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
+    msRuntime += 2 * perfEnergyBT.m_msRuntime;
+    mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+    // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str());
+  }
 
   return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp);
 }
diff --git a/libpimeval/src/pimPerfEnergyBankLevel.h b/libpimeval/src/pimPerfEnergyBankLevel.h
index 5f869cb2..3442e2ff 100644
--- a/libpimeval/src/pimPerfEnergyBankLevel.h
+++ b/libpimeval/src/pimPerfEnergyBankLevel.h
@@ -26,7 +26,7 @@ class pimPerfEnergyBankLevel : public pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override;
   virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
 
 protected:
diff --git a/libpimeval/src/pimPerfEnergyBase.cpp b/libpimeval/src/pimPerfEnergyBase.cpp
index 2a1a5e49..feb3fa4a 100644
--- a/libpimeval/src/pimPerfEnergyBase.cpp
+++ b/libpimeval/src/pimPerfEnergyBase.cpp
@@ -171,7 +171,7 @@ pimPerfEnergyBase::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInf
 
 //! @brief  Perf energy model of base class for rotate (placeholder)
 pimeval::perfEnergy
-pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyBase::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 1e10;
   double mjEnergy = 999999999.9;
diff --git a/libpimeval/src/pimPerfEnergyBase.h b/libpimeval/src/pimPerfEnergyBase.h
index dc7f4b79..41095875 100644
--- a/libpimeval/src/pimPerfEnergyBase.h
+++ b/libpimeval/src/pimPerfEnergyBase.h
@@ -70,7 +70,7 @@ class pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const;
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const;
   virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const;
   virtual pimeval::perfEnergy getPerfEnergyForMac(PimCmdEnum cmdType, const pimObjInfo& obj) const;
 
diff --git a/libpimeval/src/pimPerfEnergyBitSerial.cpp b/libpimeval/src/pimPerfEnergyBitSerial.cpp
index 1724bbd8..7d0a8c82 100644
--- a/libpimeval/src/pimPerfEnergyBitSerial.cpp
+++ b/libpimeval/src/pimPerfEnergyBitSerial.cpp
@@ -439,7 +439,7 @@ pimPerfEnergyBitSerial::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimO
 
 //! @brief  Perf energy model of bit-serial PIM for rotate
 pimeval::perfEnergy
-pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 0.0;
   double mjEnergy = 0.0;
@@ -451,8 +451,6 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI
   unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL);
   unsigned numRegions = obj.getRegions().size();
   unsigned numCore = obj.getNumCoreAvailable();
-  // boundary handling - assume two times copying between device and host for boundary elements
-  pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
 
   switch (m_simTarget) {
     case PIM_DEVICE_BITSIMD_V:
@@ -465,8 +463,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI
       totalOp += 3 * bitsPerElement * numPass * numCore;
       msRuntime = msRead + msWrite + msCompute;
       mjEnergy = (m_eAP + 3 * m_eL) * bitsPerElement * numPass; // for one pass
-      msRuntime += 2 * perfEnergyBT.m_msRuntime;
-      mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+      if(useCrossRegionCommunication) {
+        // boundary handling - assume two times copying between device and host for boundary elements
+        pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
+        msRuntime += 2 * perfEnergyBT.m_msRuntime;
+        mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+      }
       break;
     case PIM_DEVICE_SIMDRAM:
       // todo
@@ -481,8 +483,12 @@ pimPerfEnergyBitSerial::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjI
       msRuntime = (m_tR + (bitsPerElement + 2) * m_tL + m_tW); // for one pass
       msRuntime *= numPass;
       mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass;
-      msRuntime += 2 * perfEnergyBT.m_msRuntime;
-      mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+      if(useCrossRegionCommunication) {
+        // boundary handling - assume two times copying between device and host for boundary elements
+        pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
+        msRuntime += 2 * perfEnergyBT.m_msRuntime;
+        mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+      }
       break;
     default:
       assert(0);
diff --git a/libpimeval/src/pimPerfEnergyBitSerial.h b/libpimeval/src/pimPerfEnergyBitSerial.h
index 86d8b681..f2ce4ee1 100644
--- a/libpimeval/src/pimPerfEnergyBitSerial.h
+++ b/libpimeval/src/pimPerfEnergyBitSerial.h
@@ -26,7 +26,7 @@ class pimPerfEnergyBitSerial : public pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override;
   virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
 
 protected:
diff --git a/libpimeval/src/pimPerfEnergyFulcrum.cpp b/libpimeval/src/pimPerfEnergyFulcrum.cpp
index ced8b928..b74a46cd 100644
--- a/libpimeval/src/pimPerfEnergyFulcrum.cpp
+++ b/libpimeval/src/pimPerfEnergyFulcrum.cpp
@@ -307,7 +307,7 @@ pimPerfEnergyFulcrum::getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObj
 
 //! @brief  Perf energy model of Fulcrum for rotate
 pimeval::perfEnergy
-pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const
+pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const
 {
   double msRuntime = 0.0;
   double mjEnergy = 0.0;
@@ -318,8 +318,6 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf
   unsigned bitsPerElement = obj.getBitsPerElement(PimBitWidth::ACTUAL);
   unsigned numRegions = obj.getRegions().size();
   uint64_t totalOp = 0;
-  // boundary handling - assume two times copying between device and host for boundary elements
-  pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
 
   // rotate within subarray:
   // For every bit: Read row to SA; move SA to R1; Shift R1 by N steps; Move R1 to SA; Write SA to row
@@ -330,9 +328,14 @@ pimPerfEnergyFulcrum::getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInf
   msWrite = m_tW * numPass;
   msRuntime = msRead + msWrite + msCompute;
   mjEnergy = (m_eAP + (bitsPerElement + 2) * m_eL) * numPass;
-  msRuntime += 2 * perfEnergyBT.m_msRuntime;
-  mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
-  printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str());
+
+  if(useCrossRegionCommunication) {
+    // boundary handling - assume two times copying between device and host for boundary elements
+    pimeval::perfEnergy perfEnergyBT = getPerfEnergyForBytesTransfer(PimCmdEnum::COPY_D2H, numRegions * bitsPerElement / 8);
+    msRuntime += 2 * perfEnergyBT.m_msRuntime;
+    mjEnergy += 2 * perfEnergyBT.m_mjEnergy;
+    // printf("PIM-Warning: Perf energy model is not precise for PIM command %s\n", pimCmd::getName(cmdType, "").c_str());
+  }
 
   return pimeval::perfEnergy(msRuntime, mjEnergy, msRead, msWrite, msCompute, totalOp);
 }
diff --git a/libpimeval/src/pimPerfEnergyFulcrum.h b/libpimeval/src/pimPerfEnergyFulcrum.h
index 652107b9..f0e2d4a5 100644
--- a/libpimeval/src/pimPerfEnergyFulcrum.h
+++ b/libpimeval/src/pimPerfEnergyFulcrum.h
@@ -26,7 +26,7 @@ class pimPerfEnergyFulcrum : public pimPerfEnergyBase
   virtual pimeval::perfEnergy getPerfEnergyForFunc2(PimCmdEnum cmdType, const pimObjInfo& objSrc1, const pimObjInfo& objSrc2, const pimObjInfo& objDest) const override;
   virtual pimeval::perfEnergy getPerfEnergyForReduction(PimCmdEnum cmdType, const pimObjInfo& obj, unsigned numPass) const override;
   virtual pimeval::perfEnergy getPerfEnergyForBroadcast(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
-  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
+  virtual pimeval::perfEnergy getPerfEnergyForRotate(PimCmdEnum cmdType, const pimObjInfo& obj, bool useCrossRegionCommunication) const override;
   virtual pimeval::perfEnergy getPerfEnergyForPrefixSum(PimCmdEnum cmdType, const pimObjInfo& obj) const override;
 
 protected:
diff --git a/libpimeval/src/pimSim.cpp b/libpimeval/src/pimSim.cpp
index 356637f9..b1cb3078 100644
--- a/libpimeval/src/pimSim.cpp
+++ b/libpimeval/src/pimSim.cpp
@@ -893,7 +893,7 @@ pimSim::pimRotateElementsRight(PimObjId src)
 {
   pimPerfMon perfMon("pimRotateElementsRight");
   if (!isValidDevice()) { return false; }
-  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::ROTATE_ELEM_R, src);
+  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::ROTATE_ELEM_R, src, true);
   return m_device->executeCmd(std::move(cmd));
 }
 
@@ -902,25 +902,25 @@ pimSim::pimRotateElementsLeft(PimObjId src)
 {
   pimPerfMon perfMon("pimRotateElementsLeft");
   if (!isValidDevice()) { return false; }
-  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::ROTATE_ELEM_L, src);
+  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::ROTATE_ELEM_L, src, true);
   return m_device->executeCmd(std::move(cmd));
 }
 
 bool
-pimSim::pimShiftElementsRight(PimObjId src)
+pimSim::pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication)
 {
   pimPerfMon perfMon("pimShiftElementsRight");
   if (!isValidDevice()) { return false; }
-  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::SHIFT_ELEM_R, src);
+  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::SHIFT_ELEM_R, src, useCrossRegionCommunication);
   return m_device->executeCmd(std::move(cmd));
 }
 
 bool
-pimSim::pimShiftElementsLeft(PimObjId src)
+pimSim::pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication)
 {
   pimPerfMon perfMon("pimShiftElementsLeft");
   if (!isValidDevice()) { return false; }
-  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::SHIFT_ELEM_L, src);
+  std::unique_ptr<pimCmd> cmd = std::make_unique<pimCmdRotate>(PimCmdEnum::SHIFT_ELEM_L, src, useCrossRegionCommunication);
   return m_device->executeCmd(std::move(cmd));
 }
 
diff --git a/libpimeval/src/pimSim.h b/libpimeval/src/pimSim.h
index abdf4dff..3f1d1c58 100644
--- a/libpimeval/src/pimSim.h
+++ b/libpimeval/src/pimSim.h
@@ -121,8 +121,8 @@ class pimSim
   template <typename T> bool pimBroadcast(PimObjId dest, T value);
   bool pimRotateElementsRight(PimObjId src);
   bool pimRotateElementsLeft(PimObjId src);
-  bool pimShiftElementsRight(PimObjId src);
-  bool pimShiftElementsLeft(PimObjId src);
+  bool pimShiftElementsRight(PimObjId src, bool useCrossRegionCommunication);
+  bool pimShiftElementsLeft(PimObjId src, bool useCrossRegionCommunication);
   bool pimShiftBitsRight(PimObjId src, PimObjId dest, unsigned shiftAmount);
   bool pimShiftBitsLeft(PimObjId src, PimObjId dest, unsigned shiftAmount);
   bool pimAesSbox(PimObjId src, PimObjId dest, const std::vector<uint8_t>& lut); 
diff --git a/misc-bench/stencil/Makefile b/misc-bench/stencil/Makefile
new file mode 100644
index 00000000..c4cdb2e3
--- /dev/null
+++ b/misc-bench/stencil/Makefile
@@ -0,0 +1,16 @@
+# Makefile: C++ version of stencil
+# Copyright (c) 2025 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+
+SUBDIRS := PIM
+
+.PHONY: debug perf dramsim3_integ clean $(SUBDIRS)
+.DEFAULT_GOAL := perf
+
+USE_OPENMP ?= 0
+
+debug perf dramsim3_integ clean: $(SUBDIRS)
+
+$(SUBDIRS):
+	$(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP)
\ No newline at end of file
diff --git a/misc-bench/stencil/PIM/Makefile b/misc-bench/stencil/PIM/Makefile
new file mode 100644
index 00000000..cbeb7aa0
--- /dev/null
+++ b/misc-bench/stencil/PIM/Makefile
@@ -0,0 +1,30 @@
+# Makefile: C++ version of stencil
+# Copyright (c) 2025 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+
+PROJ_ROOT = ../../..
+include ${PROJ_ROOT}/Makefile.common
+
+# make USE_OPENMP=1
+USE_OPENMP ?= 0
+ifeq ($(USE_OPENMP),1)
+	CXXFLAGS += -fopenmp
+endif
+
+EXEC := stencil.out
+SRC := stencil.cpp
+
+EXEC2 := stencil_alt.out
+SRC2 := stencil_alt.cpp
+
+debug perf dramsim3_integ: $(EXEC) $(EXEC2)
+
+$(EXEC): $(SRC) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+$(EXEC2): $(SRC2) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+clean:
+	rm -rf $(EXEC) $(EXEC2) *.dSYM
\ No newline at end of file
diff --git a/misc-bench/stencil/PIM/stencil.cpp b/misc-bench/stencil/PIM/stencil.cpp
new file mode 100644
index 00000000..5ec25dc2
--- /dev/null
+++ b/misc-bench/stencil/PIM/stencil.cpp
@@ -0,0 +1,518 @@
+// Test: C++ version of the stencil
+// Copyright (c) 2025 University of Virginia
+// This file is licensed under the MIT License.
+// See the LICENSE file in the root of this repository for more details.
+
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <stdint.h>
+#include <iomanip>
+#include <cassert>
+#include <type_traits>
+#include <queue>
+#include <random>
+#include <limits>
+#include <algorithm>
+#include <list>
+#include <cstring>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+
+constexpr bool isHorizontallyChunked = true;
+
+// Params ---------------------------------------------------------------------
+typedef struct Params
+{
+  uint64_t iterations;
+  uint64_t gridWidth;
+  uint64_t gridHeight;
+  uint64_t radius;
+  const char *configFile;
+  const char *inputFile;
+  bool shouldVerify;
+} Params;
+
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./stencil.out [options]"
+          "\n"
+          "\n    -n    iterations (default=10 iterations)"
+          "\n    -x    grid width (default=2048 elements)"
+          "\n    -y    grid height (default=2048 elements)"
+          "\n    -r    stencil radius (default=1)"
+          "\n    -c    dramsim config file"
+          "\n    -i    input file containing a 2d array (default=random)"
+          "\n    -v    t = verifies PIM output with host output. (default=false)"
+          "\n");
+}
+
+struct Params getInputParams(int argc, char **argv)
+{
+  struct Params p;
+  p.iterations = 10;
+  p.gridWidth = 2048;
+  p.gridHeight = 2048;
+  p.radius = 1;
+  p.configFile = nullptr;
+  p.inputFile = nullptr;
+  p.shouldVerify = false;
+
+  int opt;
+  while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0)
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+      break;
+    case 'n':
+      p.iterations = strtoull(optarg, NULL, 0);
+      break;
+    case 'x':
+      p.gridWidth = strtoull(optarg, NULL, 0);
+      break;
+    case 'y':
+      p.gridHeight = strtoull(optarg, NULL, 0);
+      break;
+    case 'r':
+      p.radius= strtoull(optarg, NULL, 0);
+      break;
+    case 'c':
+      p.configFile = optarg;
+      break;
+    case 'i':
+      p.inputFile = optarg;
+      break;
+    case 'v':
+      p.shouldVerify = (*optarg == 't');
+      break;
+    default:
+      fprintf(stderr, "\nUnrecognized option!\n");
+      usage();
+      exit(0);
+    }
+  }
+  return p;
+}
+
+//! @brief  Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum
+//!
+//! Sums radius number of elemements to the left and right of center element, including center element
+//! Puts each result pimRowSum[i] where i is the center index
+//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j]
+//! Works by shifting mid to the left and right and adding shifted versions
+//! @param[in]  mid  PIM row to be summed
+//! @param[out]  pimRowSum  The resultant PIM object to place the sum into
+//! @param[in,out]  shiftBackup  Temporary PIM object used for calculations
+//! @param[in]  radius  The stencil radius
+void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) {
+  PimStatus status;
+
+  if(radius == 0) {
+    return;
+  }
+  
+  status = pimCopyObjectToObject(mid, shiftBackup);
+  assert (status == PIM_OK);
+
+  status = pimShiftElementsRight(shiftBackup, !isHorizontallyChunked);
+  assert (status == PIM_OK);
+
+  status = pimAdd(mid, shiftBackup, pimRowSum);
+  assert (status == PIM_OK);
+
+  for(uint64_t shiftIter=1; shiftIter<radius; ++shiftIter) {
+    status = pimShiftElementsRight(shiftBackup, !isHorizontallyChunked);
+    assert (status == PIM_OK);
+
+    status = pimAdd(pimRowSum, shiftBackup, pimRowSum);
+    assert (status == PIM_OK);
+  }
+
+  status = pimCopyObjectToObject(mid, shiftBackup);
+  assert (status == PIM_OK);
+
+  for(uint64_t shiftIter=0; shiftIter<radius; ++shiftIter) {
+    status = pimShiftElementsLeft(shiftBackup, !isHorizontallyChunked);
+    assert (status == PIM_OK);
+
+    status = pimAdd(pimRowSum, shiftBackup, pimRowSum);
+    assert (status == PIM_OK);
+  }
+}
+
+//! @brief  Computes one iteration of one chunk of the stencil
+//!
+//! Uses circular queue to compute window sums
+//! Adds the next row to the front of the queue and to the sum
+//! Takes the sum (divided by the stencil area) as the result from the row
+//! Subtracts the back of the queue from the sum
+//! Pops from the queue back of the queue
+//! Repeats until done
+//! @param[in]  workingPimMemory  PIM rows in the stencil chunk
+//! @param[in]  rowsInSumCircularQueue  Queue used for keeping track of running sum of rows vertically
+//! @param[in,out]  tmpPim  Temporary PIM object used for calculations
+//! @param[in,out]  runningSum Temporary PIM object used for keeping track of the current running (vertical) sum
+//! @param[in]  stencilAreaToMultiplyPim This algorithm computes stencil average, thus each element in the result must be divided by the stencil area. This is done by multiplying by the inverse.
+//! @param[in]  radius  The stencil radius
+void computeStencilChunkIteration(std::vector<PimObjId>& workingPimMemory, std::vector<PimObjId>& rowsInSumCircularQueue, PimObjId tmpPim, PimObjId runningSum, const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) {
+  PimStatus status;
+
+  uint64_t circularQueueBot = 0;
+  uint64_t circularQueueTop = 0;
+
+  sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+  ++circularQueueTop;
+  sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+  ++circularQueueTop;
+  status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum);
+  assert (status == PIM_OK);
+
+  // At this point:
+  // circularQueueBot = 0
+  // circularQueueTop = 2
+  // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed
+  // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed
+  // runningSum = sum of first two rows horizontally summed
+
+  for(uint64_t i=2; i<2*radius; ++i) {
+    sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+    status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum);
+    assert (status == PIM_OK);
+    ++circularQueueTop;
+  }
+
+  // At this point:
+  // circularQueueBot = 0
+  // circularQueueTop = 2*radius
+  // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed
+  // runningSum = sum of rows [0...2*radius] horizontally summed
+
+  uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum
+
+  // Loops over the rest of the rows in the current chunk, vertically
+  // Each iteration, finds horizontal sum of the next row (nextRowToAdd)
+  // Places this horizontal sum at the front of the queue (at position circularQueueTop)
+  // Adds the horizontal sum to the runningSum
+  // Places runningSum/stencilArea into the workingPimMemory as the final result for the row
+  // If neccessary, subtracts the row from the back of the queue from the runningSum
+
+  for(uint64_t row=radius; row<workingPimMemory.size()-radius; ++row) {
+    sumStencilRow(workingPimMemory[nextRowToAdd], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+
+    status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum);
+    assert (status == PIM_OK);
+
+    circularQueueTop = (1+circularQueueTop) % rowsInSumCircularQueue.size();
+    ++nextRowToAdd;
+
+    status = pimMulScalar(runningSum, workingPimMemory[row], stencilAreaToMultiplyPim);
+    assert (status == PIM_OK);
+    
+    if(row+1<workingPimMemory.size()-radius) {
+      status = pimSub(runningSum, rowsInSumCircularQueue[circularQueueBot], runningSum);
+      assert (status == PIM_OK);
+      circularQueueBot = (1+circularQueueBot) % rowsInSumCircularQueue.size();
+    }
+  }
+}
+
+
+//! @brief  Copies data to/from PIM, accounting for chunking in necessary
+//! @param[in]  vec  Host vector to copy to PIM
+//! @param[in]  pimObj  PIM Object to copy vector into
+//! @param[in]  numInvalid  Number of elements on each side (left/right) of row that will not be included in final result
+//! @param[in]  numElementsHorizontal  Number of PIM fp32 elements that can be placed in a row without shifting issues. If stencil is horizontally chunked, this means that elements cannot shift accross these boundaries, e.g. if numElementsHorizontal=100, then pimShiftElementsLeft cannot shift element 100 into position 99.
+//! @param[in]  isToPim Direction to copy. If true, then copy from host to PIM, if false then PIM to host
+void copyChunkedVectorPim(std::vector<float> &vec, PimObjId pimObj, const uint64_t numInvalid, const uint64_t numElementsHorizontal, const bool isToPim) {
+  PimStatus status;
+  if constexpr (!isHorizontallyChunked) {
+    if(isToPim) {
+      status = pimCopyHostToDevice((void*) vec.data(), pimObj);
+    } else {
+      status = pimCopyDeviceToHost(pimObj, (void*) vec.data());
+    }
+    assert (status == PIM_OK);
+  } else {
+    //! @brief  Total number of usable elements in final result
+    const uint64_t totalValid = vec.size() - 2*numInvalid;
+    //! @brief  Maximum number of usable elements in a horizontal chunk, will be the number usable for all except for (possibly) the last chunk
+    const uint64_t maxUsable = numElementsHorizontal - 2*numInvalid;
+    //! @brief  Total number of horizontal chunks
+    const uint64_t numChunks = (totalValid + maxUsable - 1) / maxUsable;
+    if(isToPim) {
+      uint64_t hostStartIdx = 0;
+      uint64_t pimStartIdx = 0;
+      for(uint64_t i=0; i<numChunks; ++i) {
+        const uint64_t hostEndIdx = std::min(vec.size(), hostStartIdx+numElementsHorizontal);
+        const uint64_t len = hostEndIdx - hostStartIdx;
+
+        status = pimCopyHostToDevice((void*) &vec[hostStartIdx], pimObj, pimStartIdx, pimStartIdx+len);
+        assert (status == PIM_OK);
+
+        hostStartIdx += maxUsable;
+        pimStartIdx += numElementsHorizontal;
+      }
+    } else {
+      uint64_t hostStartIdx = numInvalid;
+      uint64_t pimStartIdx = numInvalid;
+      for(uint64_t i=0; i<numChunks; ++i) {
+        const uint64_t currValid = i+1==numChunks ? (totalValid%maxUsable) : maxUsable; // TODO: Is it better to evenly spread out horizontal chunks?
+
+        // Copy pim[pimStartIdx, pimStartIdx+currValid) into host[hostStartIdx,hostStartIdx+currValid)
+        status = pimCopyDeviceToHost(pimObj, (void*) &vec[hostStartIdx], pimStartIdx, pimStartIdx+currValid);
+        assert (status == PIM_OK);
+        
+        hostStartIdx += currValid;
+        pimStartIdx += currValid + 2*numInvalid;
+      }
+    }
+  }
+}
+
+//! @brief  Computes a stencil pattern over a 2d array
+//! @param[in]  srcHost  The input stencil grid
+//! @param[out]  dstHost  The resultant stencil grid
+//! @param[in]  numAssociable  Number of float 32 PIM objects that can be associated with each other
+//! @param[in]  numElementsHorizontal  Number of float 32 PIM objects that can be placed in a PIM row without creating shifting issues
+//! @param[in]  iterations  Number of iterations to run the stencil pattern for
+//! @param[in]  radius  The radius of the stencil pattern
+void stencil(const std::vector<std::vector<float>> &srcHost, std::vector<std::vector<float>> &dstHost, const uint64_t numAssociable,
+              const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) {
+  
+  assert(!srcHost.empty());
+  assert(!srcHost[0].empty());
+  assert(srcHost.size() == dstHost.size());
+  assert(srcHost[0].size() == dstHost[0].size());
+
+  std::vector<std::vector<float>> tmpGrid;
+  tmpGrid.resize(srcHost.size(), std::vector<float>(srcHost[0].size()));
+
+  const uint64_t gridWidth = srcHost[0].size();
+
+  const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1);
+  const float stencilAreaFloat = 1.0f / static_cast<float>(stencilAreaInt);
+  uint32_t tmp;
+  std::memcpy(&tmp, &stencilAreaFloat, sizeof(float));
+  const uint64_t stencilAreaToMultiplyPim = static_cast<uint64_t>(tmp);
+
+  // Model assumes that only a finite number of stencil iterations can be computed on the PIM device before transferring back to the host
+  // In chunked stencil implementations (with cross region computations) this limit is both vertical and horizontal
+  // In non-chunked stencil implementations, this limit is purely vertical
+  // TODO: Figure out what to make this number
+  constexpr uint64_t maxIterationsPerPim = 5; // TODO: what should this number be?
+
+  uint64_t pimAllocWidth;
+  if constexpr (isHorizontallyChunked) {
+    // Represents the number of elements on the left/right that aren't part of the final result for a horizontally chunked implementation. Without data movement, each iteration causes <radius> number of elements on each side to no longer be valid.
+    const uint64_t maxInvalidHorizontal = radius * std::min(maxIterationsPerPim, iterations);
+    const uint64_t maxUsableHorizontal = numElementsHorizontal - 2*maxInvalidHorizontal;
+    const uint64_t maxChunksHorizontal = (gridWidth + maxUsableHorizontal - 1) / maxUsableHorizontal;
+    pimAllocWidth = numElementsHorizontal * maxChunksHorizontal;
+  } else {
+    pimAllocWidth = gridWidth;
+  }
+
+  PimObjId tmpPim = pimAlloc(PIM_ALLOC_AUTO, pimAllocWidth, PIM_FP32);
+  assert(tmpPim != -1);
+  PimObjId runningSum = pimAllocAssociated(tmpPim, PIM_FP32);
+  assert(runningSum != -1);
+
+  std::vector<PimObjId> rowsInSumCircularQueue(2*radius+1);
+  for(uint64_t i=0; i<rowsInSumCircularQueue.size(); ++i) {
+    rowsInSumCircularQueue[i] = pimAllocAssociated(tmpPim, PIM_FP32);
+    assert(rowsInSumCircularQueue[i] != -1);
+  }
+
+  std::vector<PimObjId> workingPimMemory(numAssociable - (rowsInSumCircularQueue.size() + 2));
+  for(uint64_t i=0; i<workingPimMemory.size(); ++i) {
+    workingPimMemory[i] = pimAllocAssociated(tmpPim, PIM_FP32);
+    assert(workingPimMemory[i] != -1);
+  }
+
+  const uint64_t numLoops = (iterations + maxIterationsPerPim - 1)/maxIterationsPerPim;
+  for(uint64_t iter=0; iter<numLoops; ++iter) {
+    const uint64_t currIterations = iter+1==numLoops ? (iterations - maxIterationsPerPim*(numLoops-1)) : maxIterationsPerPim;
+    const uint64_t invalidResultsEachSide = radius * currIterations;
+
+    uint64_t firstRowSrc = 0;
+    for(;;) {
+      const uint64_t firstRowUsableSrc = firstRowSrc + invalidResultsEachSide;
+      if(firstRowUsableSrc + invalidResultsEachSide >= srcHost.size()) {
+        break;
+      }
+      const uint64_t totalRowsThisIter = std::min(srcHost.size(), firstRowSrc + workingPimMemory.size()) - firstRowSrc;
+      const uint64_t usableRowsThisIter = totalRowsThisIter - 2*invalidResultsEachSide;
+      uint64_t workingPimMemoryIdx = 0;
+      for(uint64_t srcHostRow = firstRowSrc; srcHostRow < firstRowSrc + totalRowsThisIter; ++srcHostRow) {
+        if(iter == 0) {
+          copyChunkedVectorPim(const_cast<std::vector<float>&>(srcHost[srcHostRow]), workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true);
+        } else {
+          copyChunkedVectorPim(tmpGrid[srcHostRow], workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, true);
+        }
+        ++workingPimMemoryIdx;
+      }
+
+      for(uint64_t iterNum = 0; iterNum < currIterations; ++iterNum) {
+        computeStencilChunkIteration(workingPimMemory, rowsInSumCircularQueue, tmpPim, runningSum, stencilAreaToMultiplyPim, radius);
+      }
+
+      workingPimMemoryIdx = invalidResultsEachSide;
+      for(uint64_t srcHostRow = firstRowUsableSrc; srcHostRow < firstRowUsableSrc + usableRowsThisIter; ++srcHostRow) {
+        copyChunkedVectorPim(dstHost[srcHostRow], workingPimMemory[workingPimMemoryIdx], invalidResultsEachSide, numElementsHorizontal, false);
+        ++workingPimMemoryIdx;
+      }
+
+      firstRowSrc += usableRowsThisIter;
+    }
+    std::swap(tmpGrid, dstHost);
+  }
+  std::swap(tmpGrid, dstHost);
+}
+
+void stencilCpu(std::vector<std::vector<float>>& src, std::vector<std::vector<float>>& dst, const uint64_t iterations, const uint64_t radius) {
+  const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1);
+  const float stencilAreaInverseFloat = 1.0f / static_cast<float>(stencilAreaInt);
+
+  for(uint64_t iter=1; iter<=iterations; ++iter) {
+    // Only compute when stencil is fully in range
+    const uint64_t startY = radius*iter;
+    const uint64_t endY = src.size() - startY;
+    const uint64_t startX = radius*iter;
+    const uint64_t endX = src[0].size() - startX;
+    #pragma omp parallel for collapse(2)
+    for(uint64_t gridY=startY; gridY<endY; ++gridY) {
+      for(uint64_t gridX=startX; gridX<endX; ++gridX) {
+        float resCPU = 0.0f;
+        for(uint64_t stencilY=gridY-radius; stencilY<=gridY+radius; ++stencilY) {
+          for(uint64_t stencilX=gridX-radius; stencilX<=gridX+radius; ++stencilX) {
+            resCPU += src[stencilY][stencilX];
+          }
+        }
+        dst[gridY][gridX] = resCPU * stencilAreaInverseFloat;
+      }
+    }
+    std::swap(src, dst);
+  }
+  std::swap(src, dst);
+}
+
+int main(int argc, char* argv[])
+{
+  struct Params params = getInputParams(argc, argv);
+
+  std::cout << "Running PIM stencil for grid: " << params.gridHeight << "x" << params.gridWidth << std::endl;
+  std::cout << "Stencil Radius: " << params.radius << ", Number of Iterations: " << params.iterations << std::endl;
+  if constexpr(isHorizontallyChunked) {
+    std::cout << "Stencil does not use cross region communication" << std::endl;
+  } else {
+    std::cout << "Stencil uses cross region communication" << std::endl;
+  }
+
+  std::vector<std::vector<float>> x, y;
+
+  if (params.inputFile == nullptr)
+  {
+    // Fill in random grid
+    x.resize(params.gridHeight, std::vector<float>(params.gridWidth));
+
+    #pragma omp parallel
+    {
+      std::random_device rd;
+      std::mt19937 gen(rd());
+      std::uniform_real_distribution<float> dist(0.0f, 10000.0f);
+      
+      #pragma omp for
+      for(size_t i=0; i<params.gridHeight; ++i) {
+        for(size_t j=0; j<params.gridWidth; ++j) {
+          x[i][j] = dist(gen);
+        }
+      }
+    }
+  } 
+  else 
+  {
+    std::cout << "Reading from input file is not implemented yet." << std::endl;
+    return 1;
+  }
+  
+  if (!createDevice(params.configFile))
+  {
+    return 1;
+  }
+
+  y.resize(x.size(), std::vector<float>(x[0].size()));
+
+  PimDeviceProperties deviceProp;
+  PimStatus status = pimGetDeviceProperties(&deviceProp);
+  assert(status == PIM_OK);
+
+  constexpr uint64_t bitsPerElement = 32;
+
+  uint64_t numAssociable = 2 * deviceProp.numRowPerSubarray;
+  if(!deviceProp.isHLayoutDevice) {
+    numAssociable /= bitsPerElement;
+  }
+
+  uint64_t numElementsHorizontal;
+  if(deviceProp.isHLayoutDevice) {
+    switch(deviceProp.simTarget) {
+      case PIM_DEVICE_FULCRUM:
+      case PIM_DEVICE_BANK_LEVEL:
+        numElementsHorizontal = deviceProp.numColPerSubarray / bitsPerElement;
+        break;
+      default:
+        std::cerr << "Stencil unimplemented for simulation target: " << deviceProp.simTarget << std::endl;
+        std::exit(1);
+    }
+  } else {
+    numElementsHorizontal = deviceProp.numColPerSubarray;
+  }
+
+  stencil(x, y, numAssociable, numElementsHorizontal, params.iterations, params.radius);
+
+  if (params.shouldVerify) 
+  {
+    std::vector<std::vector<float>> cpuY(y.size(), std::vector<float>(y[0].size()));
+    stencilCpu(x, cpuY, params.iterations, params.radius);
+
+    bool ok = true;
+
+    // Only compute when stencil is fully in range
+    const uint64_t startY = params.radius * params.iterations;
+    const uint64_t endY = params.gridHeight - startY;
+    const uint64_t startX = params.radius * params.iterations;
+    const uint64_t endX = params.gridWidth - startX;
+
+    std::cout << std::fixed << std::setprecision(10);
+
+    #pragma omp parallel for collapse(2)
+    for(uint64_t gridY=startY; gridY<endY; ++gridY) {
+      for(uint64_t gridX=startX; gridX<endX; ++gridX) {
+        constexpr float acceptableDelta = 0.1f;
+        if (std::abs(cpuY[gridY][gridX] - y[gridY][gridX]) > acceptableDelta)
+        {
+          #pragma omp critical
+          {
+            std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl;
+            ok = false;
+          }
+        }
+      }
+    }
+    if(ok) {
+      std::cout << "Correct for stencil!" << std::endl;
+    }
+  }
+
+  pimShowStats();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/misc-bench/stencil/PIM/stencil_alt.cpp b/misc-bench/stencil/PIM/stencil_alt.cpp
new file mode 100644
index 00000000..9543e637
--- /dev/null
+++ b/misc-bench/stencil/PIM/stencil_alt.cpp
@@ -0,0 +1,565 @@
+// Test: C++ version of the stencil
+// Copyright (c) 2025 University of Virginia
+// This file is licensed under the MIT License.
+// See the LICENSE file in the root of this repository for more details.
+
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <stdint.h>
+#include <iomanip>
+#include <cassert>
+#include <type_traits>
+#include <queue>
+#include <random>
+#include <utility>
+#include <limits>
+#include <algorithm>
+#include <list>
+#include <cstring>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+
+// Params ---------------------------------------------------------------------
+typedef struct Params
+{
+  uint64_t iterations;
+  uint64_t gridWidth;
+  uint64_t gridHeight;
+  uint64_t radius;
+  const char *configFile;
+  const char *inputFile;
+  bool shouldVerify;
+} Params;
+
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./stencil.out [options]"
+          "\n"
+          "\n    -n    iterations (default=10 iterations)"
+          "\n    -x    grid width (default=2048 elements)"
+          "\n    -y    grid height (default=2048 elements)"
+          "\n    -r    stencil radius (default=1)"
+          "\n    -c    dramsim config file"
+          "\n    -i    input file containing a 2d array (default=random)"
+          "\n    -v    t = verifies PIM output with host output. (default=false)"
+          "\n");
+}
+
+struct Params getInputParams(int argc, char **argv)
+{
+  struct Params p;
+  p.iterations = 10;
+  p.gridWidth = 2048;
+  p.gridHeight = 2048;
+  p.radius = 1;
+  p.configFile = nullptr;
+  p.inputFile = nullptr;
+  p.shouldVerify = false;
+
+  int opt;
+  while ((opt = getopt(argc, argv, "h:n:x:y:r:c:i:v:")) >= 0)
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+      break;
+    case 'n':
+      p.iterations = strtoull(optarg, NULL, 0);
+      break;
+    case 'x':
+      p.gridWidth = strtoull(optarg, NULL, 0);
+      break;
+    case 'y':
+      p.gridHeight = strtoull(optarg, NULL, 0);
+      break;
+    case 'r':
+      p.radius= strtoull(optarg, NULL, 0);
+      break;
+    case 'c':
+      p.configFile = optarg;
+      break;
+    case 'i':
+      p.inputFile = optarg;
+      break;
+    case 'v':
+      p.shouldVerify = (*optarg == 't');
+      break;
+    default:
+      fprintf(stderr, "\nUnrecognized option!\n");
+      usage();
+      exit(0);
+    }
+  }
+  return p;
+}
+
+//! @brief  Sums the neighbors of each element in a stencil row to compute the horizontal stencil sum
+//!
+//! Sums radius number of elemements to the left and right of center element, including center element
+//! Puts each result pimRowSum[i] where i is the center index
+//! Formula: pimRowSum[i] = Σ (j ∈ [i-radius, i+radius]) mid[j]
+//! Works by shifting mid to the left and right and adding shifted versions
+//! @param[in]  mid  PIM row to be summed
+//! @param[out]  pimRowSum  The resultant PIM object to place the sum into
+//! @param[in,out]  shiftBackup  Temporary PIM object used for calculations
+//! @param[in]  radius  The stencil radius
+void sumStencilRow(PimObjId mid, PimObjId pimRowSum, PimObjId shiftBackup, const uint64_t radius) {
+  PimStatus status;
+
+  if(radius == 0) {
+    return;
+  }
+  
+  status = pimCopyObjectToObject(mid, shiftBackup);
+  assert (status == PIM_OK);
+
+  status = pimShiftElementsRight(shiftBackup, true);
+  assert (status == PIM_OK);
+
+  status = pimAdd(mid, shiftBackup, pimRowSum);
+  assert (status == PIM_OK);
+
+  for(uint64_t shiftIter=1; shiftIter<radius; ++shiftIter) {
+    status = pimShiftElementsRight(shiftBackup, true);
+    assert (status == PIM_OK);
+
+    status = pimAdd(pimRowSum, shiftBackup, pimRowSum);
+    assert (status == PIM_OK);
+  }
+
+  status = pimCopyObjectToObject(mid, shiftBackup);
+  assert (status == PIM_OK);
+
+  for(uint64_t shiftIter=0; shiftIter<radius; ++shiftIter) {
+    status = pimShiftElementsLeft(shiftBackup, true);
+    assert (status == PIM_OK);
+
+    status = pimAdd(pimRowSum, shiftBackup, pimRowSum);
+    assert (status == PIM_OK);
+  }
+}
+
+struct StencilTilePim {
+  std::vector<PimObjId> rowsInSumCircularQueue;
+  std::vector<PimObjId> workingPimMemory;
+  uint64_t srcStartX;
+  uint64_t srcStartY;
+  uint64_t numX;
+  uint64_t numY;
+  PimObjId tmpPim;
+  PimObjId runningSum;
+
+  StencilTilePim(uint64_t radius, uint64_t srcStartY, uint64_t numY, uint64_t srcStartX, uint64_t numX)
+      : srcStartX(srcStartX), srcStartY(srcStartY), numX(numX), numY(numY) {
+
+    tmpPim = pimAlloc(PIM_ALLOC_AUTO, numX, PIM_FP32);
+    assert(tmpPim != -1);
+    runningSum = pimAllocAssociated(tmpPim, PIM_FP32);
+    assert(runningSum != -1);
+
+    rowsInSumCircularQueue.resize(2*radius+1);
+    for(uint64_t i=0; i<rowsInSumCircularQueue.size(); ++i) {
+      rowsInSumCircularQueue[i] = pimAllocAssociated(tmpPim, PIM_FP32);
+      assert(rowsInSumCircularQueue[i] != -1);
+    }
+
+    workingPimMemory.resize(numY);
+    for(uint64_t i=0; i<workingPimMemory.size(); ++i) {
+      workingPimMemory[i] = pimAllocAssociated(tmpPim, PIM_FP32);
+      assert(workingPimMemory[i] != -1);
+    }
+  }
+
+  void copyToPim(const std::vector<std::vector<float>> &srcHost) {
+    for(uint64_t idx = 0; idx < workingPimMemory.size(); ++idx) {
+      PimStatus status = pimCopyHostToDevice((void*) (srcHost[srcStartY + idx].data() + srcStartX), workingPimMemory[idx], 0, numX);
+      assert (status == PIM_OK);
+    }
+  }
+
+  void copyFromPim(std::vector<std::vector<float>> &dstHost, const uint64_t numOverlap) {
+    for(uint64_t idx = numOverlap; idx < workingPimMemory.size() - numOverlap; ++idx) {
+      PimStatus status = pimCopyDeviceToHost(workingPimMemory[idx], (void*) (dstHost[srcStartY + idx].data() + srcStartX + numOverlap), numOverlap, numX - numOverlap);
+      assert (status == PIM_OK);
+    }
+  }
+
+  //! @brief  Computes one iteration of one chunk of the stencil
+  //!
+  //! Uses circular queue to compute window sums
+  //! Adds the next row to the front of the queue and to the sum
+  //! Takes the sum (divided by the stencil area) as the result from the row
+  //! Subtracts the back of the queue from the sum
+  //! Pops from the queue back of the queue
+  //! Repeats until done
+  //! @param[in]  workingPimMemory  PIM rows in the stencil chunk
+  //! @param[in]  rowsInSumCircularQueue  Queue used for keeping track of running sum of rows vertically
+  //! @param[in,out]  tmpPim  Temporary PIM object used for calculations
+  //! @param[in,out]  runningSum Temporary PIM object used for keeping track of the current running (vertical) sum
+  //! @param[in]  stencilAreaToMultiplyPim This algorithm computes stencil average, thus each element in the result must be divided by the stencil area. This is done by multiplying by the inverse.
+  //! @param[in]  radius  The stencil radius
+  void computeStencilIteration(const uint64_t stencilAreaToMultiplyPim, const uint64_t radius) {
+    PimStatus status;
+
+    uint64_t circularQueueBot = 0;
+    uint64_t circularQueueTop = 0;
+
+    sumStencilRow(workingPimMemory[0], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+    ++circularQueueTop;
+    sumStencilRow(workingPimMemory[1], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+    ++circularQueueTop;
+    status = pimAdd(rowsInSumCircularQueue[0], rowsInSumCircularQueue[1], runningSum);
+    assert (status == PIM_OK);
+
+    // At this point:
+    // circularQueueBot = 0
+    // circularQueueTop = 2
+    // rowsInSumCircularQueue[0] = workingPimMemory[0] horizontally summed
+    // rowsInSumCircularQueue[1] = workingPimMemory[1] horizontally summed
+    // runningSum = sum of first two rows horizontally summed
+
+    for(uint64_t i=2; i<2*radius; ++i) {
+      sumStencilRow(workingPimMemory[i], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+      status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum);
+      assert (status == PIM_OK);
+      ++circularQueueTop;
+    }
+
+    // At this point:
+    // circularQueueBot = 0
+    // circularQueueTop = 2*radius
+    // rowsInSumCircularQueue[0...2*radius] are occupied with workingPimMemory[0...2*radius] horizontally summed
+    // runningSum = sum of rows [0...2*radius] horizontally summed
+
+    uint64_t nextRowToAdd = 2*radius; // The index of the next row to add to the queue and to the running sum
+
+    // Loops over the rest of the rows in the current chunk, vertically
+    // Each iteration, finds horizontal sum of the next row (nextRowToAdd)
+    // Places this horizontal sum at the front of the queue (at position circularQueueTop)
+    // Adds the horizontal sum to the runningSum
+    // Places runningSum/stencilArea into the workingPimMemory as the final result for the row
+    // If neccessary, subtracts the row from the back of the queue from the runningSum
+
+    for(uint64_t row=radius; row<workingPimMemory.size()-radius; ++row) {
+      sumStencilRow(workingPimMemory[nextRowToAdd], rowsInSumCircularQueue[circularQueueTop], tmpPim, radius);
+
+      status = pimAdd(runningSum, rowsInSumCircularQueue[circularQueueTop], runningSum);
+      assert (status == PIM_OK);
+
+      circularQueueTop = (1+circularQueueTop) % rowsInSumCircularQueue.size();
+      ++nextRowToAdd;
+
+      status = pimMulScalar(runningSum, workingPimMemory[row], stencilAreaToMultiplyPim);
+      assert (status == PIM_OK);
+      
+      if(row+1<workingPimMemory.size()-radius) {
+        status = pimSub(runningSum, rowsInSumCircularQueue[circularQueueBot], runningSum);
+        assert (status == PIM_OK);
+        circularQueueBot = (1+circularQueueBot) % rowsInSumCircularQueue.size();
+      }
+    }
+  }
+};
+
+void pimMove(std::vector<float>& hostTmpRow, PimObjId pimSrc, PimObjId pimDst, uint64_t srcIdx, uint64_t dstIdx, uint64_t num) {
+  PimStatus status = pimCopyDeviceToHost(pimSrc, hostTmpRow.data(), srcIdx, srcIdx + num);
+  assert(status == PIM_OK);
+  status = pimCopyHostToDevice(hostTmpRow.data(), pimDst, dstIdx, dstIdx + num);
+  assert(status == PIM_OK);
+}
+
+uint64_t getNumTiles(const uint64_t totalSize, const uint64_t maxChunkSize, const uint64_t numOverlap) {
+  if (totalSize <= maxChunkSize) {
+    return 1;
+  } else if (totalSize <= 2*(maxChunkSize - numOverlap)) {
+    return 2;
+  } else {
+    const uint64_t firstAndLastChunkRows = 2 * (maxChunkSize - numOverlap);
+    const uint64_t remainingRows = totalSize - firstAndLastChunkRows;
+    const uint64_t middleChunkSize = maxChunkSize - 2*numOverlap;
+    const uint64_t numMiddleChunks = (remainingRows + middleChunkSize - 1) / middleChunkSize;
+    return 2 + numMiddleChunks;
+  }
+}
+
+//! @brief  Computes a stencil pattern over a 2d array
+//! @param[in]  srcHost  The input stencil grid
+//! @param[out]  dstHost  The resultant stencil grid
+//! @param[in]  numAssociable  Number of float 32 PIM objects that can be associated with each other
+//! @param[in]  numElementsHorizontal  Number of float 32 PIM objects that can be placed in a PIM row without creating shifting issues
+//! @param[in]  iterations  Number of iterations to run the stencil pattern for
+//! @param[in]  radius  The radius of the stencil pattern
+void stencil(const std::vector<std::vector<float>> &srcHost, std::vector<std::vector<float>> &dstHost, const uint64_t numAssociable,
+              const uint64_t numElementsHorizontal, const uint64_t iterations, const uint64_t radius) {
+  
+  assert(!srcHost.empty());
+  assert(!srcHost[0].empty());
+  assert(srcHost.size() == dstHost.size());
+  assert(srcHost[0].size() == dstHost[0].size());
+
+  std::vector<std::vector<float>> tmpGrid;
+  tmpGrid.resize(srcHost.size(), std::vector<float>(srcHost[0].size()));
+
+  const uint64_t gridWidth = srcHost[0].size();
+
+  const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1);
+  const float stencilAreaFloat = 1.0f / static_cast<float>(stencilAreaInt);
+  uint32_t tmp;
+  std::memcpy(&tmp, &stencilAreaFloat, sizeof(float));
+  const uint64_t stencilAreaToMultiplyPim = static_cast<uint64_t>(tmp);
+
+  const uint64_t maxElemChunkY = numAssociable - (2*radius + 1) - 2;
+  const uint64_t maxElemChunkX = numElementsHorizontal;
+  const uint64_t numOverlap = radius;
+  const uint64_t numTileX = getNumTiles(srcHost[0].size(), maxElemChunkX, numOverlap);
+  const uint64_t numTileY = getNumTiles(srcHost.size(), maxElemChunkY, numOverlap);
+
+  std::vector<std::vector<StencilTilePim>> stenTilesPim(numTileY);
+  for (auto& row : stenTilesPim) {
+    row.reserve(numTileX);
+  }
+
+  for(uint64_t tileIdxY=0; tileIdxY<numTileY; ++tileIdxY) {
+    const uint64_t srcStartY = tileIdxY*(maxElemChunkY - 2*numOverlap);
+    const uint64_t lastElemIdxSrcY = std::min(srcHost.size(), srcStartY + maxElemChunkY);
+    const uint64_t numY = lastElemIdxSrcY - srcStartY;
+    for(uint64_t tileIdxX=0; tileIdxX<numTileX; ++tileIdxX) {
+      const uint64_t srcStartX = tileIdxX*(maxElemChunkX - 2*numOverlap);
+      const uint64_t lastElemIdxSrcX = std::min(srcHost[0].size(), srcStartX + maxElemChunkX);
+      const uint64_t numX = lastElemIdxSrcX - srcStartX;
+      stenTilesPim[tileIdxY].emplace_back(radius, srcStartY, numY, srcStartX, numX);
+      stenTilesPim[tileIdxY].back().copyToPim(srcHost);
+    }
+  }
+
+  std::vector<float> hostTmpRow(gridWidth, 0.0f);
+
+  for(uint64_t iter=0; iter<iterations; ++iter) {
+    for(auto& tileRow : stenTilesPim) {
+      for(auto& tile : tileRow) {
+        tile.computeStencilIteration(stencilAreaToMultiplyPim, radius);
+      }
+    }
+    if(iter+1<iterations) {
+      for(uint64_t tileIdxY=0; tileIdxY<numTileY; ++tileIdxY) {
+        for(uint64_t tileIdxX=0; tileIdxX<numTileX; ++tileIdxX) {
+          StencilTilePim& tile = stenTilesPim[tileIdxY][tileIdxX];
+
+          // handle vertical communication
+          if(tileIdxY+1 < numTileY) {
+            StencilTilePim& tileBelow = stenTilesPim[tileIdxY+1][tileIdxX];
+            std::vector<PimObjId>& above = tile.workingPimMemory;
+            std::vector<PimObjId>& below = tileBelow.workingPimMemory;
+
+            // only exchange rows with valid data
+            uint64_t startIdxX = tileIdxX == 0 ? 0 : numOverlap;
+            uint64_t endIdxX = tileIdxX == numTileX - 1 ? tile.numX : tile.numX - numOverlap; // exclusive
+            for(uint64_t row=0; row<numOverlap; ++row) {
+              PimObjId pimSrc;
+              PimObjId pimDst;
+              
+              pimSrc = above[above.size() - 2 * numOverlap + row];
+              pimDst = below[row];
+              pimMove(hostTmpRow, pimSrc, pimDst, startIdxX, startIdxX, endIdxX - startIdxX);
+    
+              pimSrc = below[numOverlap + row];
+              pimDst = above[above.size() - numOverlap + row];
+              pimMove(hostTmpRow, pimSrc, pimDst, startIdxX, startIdxX, endIdxX - startIdxX);
+            }
+          }
+
+          // handle horizontal communication
+          if(tileIdxX+1 < numTileX) {
+            StencilTilePim& tileRight = stenTilesPim[tileIdxY][tileIdxX+1];
+            std::vector<PimObjId>& left = tile.workingPimMemory;
+            std::vector<PimObjId>& right = tileRight.workingPimMemory;
+
+            // only exchange rows with valid data
+            uint64_t startIdxY = tileIdxY == 0 ? 0 : numOverlap;
+            uint64_t endIdxY = tileIdxY == numTileY - 1 ? tile.numY : tile.numY - numOverlap; // exclusive
+            for(uint64_t row=startIdxY; row<endIdxY; ++row) {
+              PimObjId pimSrc;
+              PimObjId pimDst;
+              
+              pimSrc = left[row];
+              pimDst = right[row];
+              pimMove(hostTmpRow, pimSrc, pimDst, tile.numX-2*numOverlap, 0, numOverlap);
+    
+              std::swap(pimSrc, pimDst);
+              pimMove(hostTmpRow, pimSrc, pimDst, numOverlap, tile.numX-numOverlap, numOverlap);
+            }
+          }
+
+          // handle corner communication
+          if(tileIdxX+1 < numTileX && tileIdxY+1 < numTileY) {
+            std::vector<PimObjId>& topLeft = tile.workingPimMemory;
+            std::vector<PimObjId>& topRight = stenTilesPim[tileIdxY][tileIdxX+1].workingPimMemory;
+            std::vector<PimObjId>& bottomLeft = stenTilesPim[tileIdxY+1][tileIdxX].workingPimMemory;
+            std::vector<PimObjId>& bottomRight = stenTilesPim[tileIdxY+1][tileIdxX+1].workingPimMemory;
+            for(uint64_t row=0; row<numOverlap; ++row) {
+              PimObjId pimSrc;
+              PimObjId pimDst;
+              
+              // top-left to bottom-right
+              pimSrc = topLeft[topLeft.size() - 2 * numOverlap + row];
+              pimDst = bottomRight[row];
+              pimMove(hostTmpRow, pimSrc, pimDst, tile.numX - 2*numOverlap, 0, numOverlap);
+
+              // bottom-right to top-left
+              pimSrc = bottomRight[numOverlap + row];
+              pimDst = topLeft[topLeft.size() - numOverlap + row];
+              pimMove(hostTmpRow, pimSrc, pimDst, numOverlap, tile.numX - numOverlap, numOverlap);
+
+              // top-right to bottom-left
+              pimSrc = topRight[topRight.size() - 2 * numOverlap + row];
+              pimDst = bottomLeft[row];
+              pimMove(hostTmpRow, pimSrc, pimDst, numOverlap, tile.numX - numOverlap, numOverlap);
+
+              // bottom-left to top-right
+              pimSrc = bottomLeft[numOverlap + row];
+              pimDst = topRight[topRight.size() - numOverlap + row];
+              pimMove(hostTmpRow, pimSrc, pimDst, tile.numX - 2*numOverlap, 0, numOverlap);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for(auto& tileRow : stenTilesPim) {
+    for(auto& tile : tileRow) {
+      tile.copyFromPim(dstHost, numOverlap);
+    }
+  }
+}
+
+void stencilCpu(std::vector<std::vector<float>>& src, std::vector<std::vector<float>>& dst, const uint64_t iterations, const uint64_t radius) {
+  const uint64_t stencilAreaInt = (2 * radius + 1) * (2 * radius + 1);
+  const float stencilAreaInverseFloat = 1.0f / static_cast<float>(stencilAreaInt);
+
+  for(uint64_t iter=1; iter<=iterations; ++iter) {
+    // Only compute when stencil is fully in range
+    const uint64_t startY = radius*iter;
+    const uint64_t endY = src.size() - startY;
+    const uint64_t startX = radius*iter;
+    const uint64_t endX = src[0].size() - startX;
+    #pragma omp parallel for collapse(2)
+    for(uint64_t gridY=startY; gridY<endY; ++gridY) {
+      for(uint64_t gridX=startX; gridX<endX; ++gridX) {
+        float resCPU = 0.0f;
+        for(uint64_t stencilY=gridY-radius; stencilY<=gridY+radius; ++stencilY) {
+          for(uint64_t stencilX=gridX-radius; stencilX<=gridX+radius; ++stencilX) {
+            resCPU += src[stencilY][stencilX];
+          }
+        }
+        dst[gridY][gridX] = resCPU * stencilAreaInverseFloat;
+      }
+    }
+    std::swap(src, dst);
+  }
+  std::swap(src, dst);
+}
+
+int main(int argc, char* argv[])
+{
+  struct Params params = getInputParams(argc, argv);
+
+  std::cout << "Running PIM stencil for grid: " << params.gridHeight << "x" << params.gridWidth << std::endl;
+  std::cout << "Stencil Radius: " << params.radius << ", Number of Iterations: " << params.iterations << std::endl;
+  if constexpr(true) {
+    std::cout << "Stencil does not use cross region communication" << std::endl;
+  } else {
+    std::cout << "Stencil uses cross region communication" << std::endl;
+  }
+
+  std::vector<std::vector<float>> x, y;
+
+  if (params.inputFile == nullptr)
+  {
+    // Fill in random grid
+    x.resize(params.gridHeight, std::vector<float>(params.gridWidth));
+
+    #pragma omp parallel
+    {
+      std::random_device rd;
+      std::mt19937 gen(rd());
+      std::uniform_real_distribution<float> dist(0.0f, 10000.0f);
+      
+      #pragma omp for
+      for(size_t i=0; i<params.gridHeight; ++i) {
+        for(size_t j=0; j<params.gridWidth; ++j) {
+          x[i][j] = dist(gen);
+        }
+      }
+    }
+  } 
+  else 
+  {
+    std::cout << "Reading from input file is not implemented yet." << std::endl;
+    return 1;
+  }
+  
+  if (!createDevice(params.configFile))
+  {
+    return 1;
+  }
+
+  y.resize(x.size(), std::vector<float>(x[0].size()));
+
+  PimDeviceProperties deviceProp;
+  PimStatus status = pimGetDeviceProperties(&deviceProp);
+  assert(status == PIM_OK);
+
+  constexpr uint64_t bitsPerElement = 32;
+
+  const uint64_t numAssociable = deviceProp.isHLayoutDevice ? deviceProp.numRowPerCore : deviceProp.numRowPerCore / bitsPerElement;
+  const uint64_t numElementsHorizontal = deviceProp.isHLayoutDevice ? deviceProp.numColPerSubarray / bitsPerElement : deviceProp.numColPerSubarray;
+
+  stencil(x, y, numAssociable, numElementsHorizontal, params.iterations, params.radius);
+
+  if (params.shouldVerify) 
+  {
+    std::vector<std::vector<float>> cpuY(y.size(), std::vector<float>(y[0].size()));
+    stencilCpu(x, cpuY, params.iterations, params.radius);
+
+    bool ok = true;
+
+    // Only compute when stencil is fully in range
+    const uint64_t startY = params.radius * params.iterations;
+    const uint64_t endY = params.gridHeight - startY;
+    const uint64_t startX = params.radius * params.iterations;
+    const uint64_t endX = params.gridWidth - startX;
+
+    std::cout << std::fixed << std::setprecision(10);
+
+    #pragma omp parallel for collapse(2)
+    for(uint64_t gridY=startY; gridY<endY; ++gridY) {
+      for(uint64_t gridX=startX; gridX<endX; ++gridX) {
+        constexpr float acceptableDelta = 0.1f;
+        if (std::abs(cpuY[gridY][gridX] - y[gridY][gridX]) > acceptableDelta)
+        {
+          #pragma omp critical
+          {
+            std::cout << "Wrong answer: " << y[gridY][gridX] << " (expected " << cpuY[gridY][gridX] << ") at position (" << gridX << ", " << gridY << ")" << std::endl;
+            ok = false;
+          }
+        }
+      }
+    }
+    if(ok) {
+      std::cout << "Correct for stencil!" << std::endl;
+    }
+  }
+
+  pimShowStats();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/misc-bench/stencil/README.MD b/misc-bench/stencil/README.MD
new file mode 100644
index 00000000..55a2fd18
--- /dev/null
+++ b/misc-bench/stencil/README.MD
@@ -0,0 +1,55 @@
+# Stencil
+
+Iterative stencil loops consist of repeatedly applying a stencil pattern to a grid. This benchmark iterates a stencil average pattern on a 2D array, with both the number of iterations and the size of the stencil pattern parameterized. An example is below:
+
+- Input: [[2.0, 2.0, 2.0], [1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], radius=1, iterations=1
+- Output: [[#, #, #], [#, 1.66, #], [#, #, #]]
+- \# represents an undefined element
+
+## Directory Structure
+
+```
+stencil/
+├── PIM/
+│   ├── Makefile
+│   ├── stencil.cpp
+├── README.md
+├── Makefile
+```
+
+## Implementation Description
+
+This repository contains one implementation of the stencil benchmark:
+
+1. PIM
+
+### PIM Implementation
+
+The PIM variant is implemented using C++ with some speedup from OpenMP. Three different PIM architectures can be tested with this.
+
+## Compilation Instructions for Specific Variants
+
+### PIM Variant
+
+To compile for the PIM variant, use:
+
+```bash
+cd PIM
+make
+```
+
+## Execution Instructions
+
+### Running the Executable
+
+After compiling, run each executable with the following command that will run it for default parameters:
+
+```bash
+./stencil.out
+```
+
+To see help text on all usages and how to modify any of the input parameters, use the following command:
+
+```bash
+./stencil.out -h
+```
diff --git a/misc-bench/stencil/extra/optimizer.py b/misc-bench/stencil/extra/optimizer.py
new file mode 100644
index 00000000..5230dabc
--- /dev/null
+++ b/misc-bench/stencil/extra/optimizer.py
@@ -0,0 +1,90 @@
+# Finds the optimal layout for stencil assuming intra memory-layer transfer cost is consistent
+#   ie., all subarray to subarray transfers are equivalent in cost (within a bank)
+# Note after running for some test values: layout does change based on transfer cost parameters
+
+subarray_block_width = 100
+subarray_block_height = 100
+subarrays_per_bank = 16
+banks_per_rank = 16
+ranks = 16
+transfer_cost_subarray_to_subarray = 1
+transfer_cost_bank_to_bank = 20
+transfer_cost_rank_to_rank = 100
+
+
+def get_stats(num_blocks, grid_width, block_width, block_height):
+
+    if num_blocks % grid_width != 0:
+        raise ValueError("num blocks must be divisible by grid width")
+
+    grid_height = num_blocks/grid_width
+    to_move_horizontal = (2 * grid_height * (grid_width - 1)) * (block_width - 2)
+    to_move_vertical = (2 * grid_width * (grid_height - 1)) * (block_height - 2)
+    to_move_diagonal = (4 * (grid_width-1) * (grid_height - 1))
+    to_move_total = to_move_horizontal + to_move_vertical + to_move_diagonal
+    width_next = grid_width * block_width
+    height_next = grid_height * block_height
+    return to_move_total, width_next, height_next
+
+def total_move_cost(subarray_grid_width, bank_grid_width, rank_grid_width):
+    to_move_s2s, bank_block_width, bank_block_height = get_stats(subarrays_per_bank, subarray_grid_width, subarray_block_width, subarray_block_height)
+    to_move_b2b, rank_block_width, rank_block_height = get_stats(banks_per_rank, bank_grid_width, bank_block_width, bank_block_height)
+    to_move_r2r, final_block_width, final_block_height = get_stats(ranks, rank_grid_width, rank_block_width, rank_block_height)
+    cost = transfer_cost_subarray_to_subarray*to_move_s2s
+    cost += transfer_cost_bank_to_bank*to_move_b2b
+    cost += transfer_cost_rank_to_rank*to_move_r2r
+    return cost
+
+def get_divisors(n):
+    """Get all divisors of n"""
+    divisors = []
+    for i in range(1, int(n**0.5) + 1):
+        if n % i == 0:
+            divisors.append(i)
+            if i != n // i:
+                divisors.append(n // i)
+    return sorted(divisors)
+
+
+# Find all valid divisors
+subarray_divisors = get_divisors(subarrays_per_bank)
+bank_divisors = get_divisors(banks_per_rank)
+rank_divisors = get_divisors(ranks)
+
+print("Valid divisors:")
+print(f"  subarrays_per_bank ({subarrays_per_bank}): {subarray_divisors}")
+print(f"  banks_per_rank ({banks_per_rank}): {bank_divisors}")
+print(f"  ranks ({ranks}): {rank_divisors}")
+print()
+
+# Find optimal configuration
+min_cost = float('inf')
+best_config = None
+
+for sgw in subarray_divisors:
+    for bgw in bank_divisors:
+        for rgw in rank_divisors:
+            cost = total_move_cost(sgw, bgw, rgw)
+            if cost < min_cost:
+                min_cost = cost
+                best_config = (sgw, bgw, rgw)
+
+print("OPTIMAL CONFIGURATION:")
+print(f"  subarray_grid_width = {best_config[0]}")
+print(f"  bank_grid_width = {best_config[1]}")
+print(f"  rank_grid_width = {best_config[2]}")
+print(f"  Total move cost = {min_cost:,.0f}")
+print()
+
+# Show top 10 configurations
+print("Top 10 configurations:")
+results = []
+for sgw in subarray_divisors:
+    for bgw in bank_divisors:
+        for rgw in rank_divisors:
+            cost = total_move_cost(sgw, bgw, rgw)
+            results.append((cost, sgw, bgw, rgw))
+
+results.sort()
+for i, (cost, sgw, bgw, rgw) in enumerate(results[:10], 1):
+    print(f"{i:2}. Cost={cost:12,.0f}  subarray={sgw:2}, bank={bgw:2}, rank={rgw:2}")
\ No newline at end of file