diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H
index 47dc17413f..10f2fb26e5 100644
--- a/Src/Particle/AMReX_NeighborParticles.H
+++ b/Src/Particle/AMReX_NeighborParticles.H
@@ -9,6 +9,7 @@
 #include <AMReX_NeighborList.H>
 #include <AMReX_OpenMP.H>
 #include <AMReX_ParticleTile.H>
+#include <AMReX_ParticleTileRT.H>
 
 namespace amrex {
 
diff --git a/Src/Particle/AMReX_ParIter.H b/Src/Particle/AMReX_ParIter.H
index 459ff31757..d37bef0db5 100644
--- a/Src/Particle/AMReX_ParIter.H
+++ b/Src/Particle/AMReX_ParIter.H
@@ -18,6 +18,9 @@ struct Particle;
 template <int NArrayReal, int NArrayInt>
 struct SoAParticle;
 
+template <class RType, class IType>
+struct RTSoAParticle;
+
 struct DefaultAssignor;
 
 // for backwards compatibility
@@ -87,11 +90,11 @@ public:
 
     [[nodiscard]] SoARef GetStructOfArrays () const { return GetParticleTile().GetStructOfArrays(); }
 
-    [[nodiscard]] int numParticles () const { return GetParticleTile().numParticles(); }
+    [[nodiscard]] auto numParticles () const { return GetParticleTile().numParticles(); }
 
-    [[nodiscard]] int numRealParticles () const { return GetParticleTile().numRealParticles(); }
+    [[nodiscard]] auto numRealParticles () const { return GetParticleTile().numRealParticles(); }
 
-    [[nodiscard]] int numNeighborParticles () const { return GetParticleTile().numNeighborParticles(); }
+    [[nodiscard]] auto numNeighborParticles () const { return GetParticleTile().numNeighborParticles(); }
 
     [[nodiscard]] Long capacity () const { return GetParticleTile().capacity(); }
 
@@ -226,6 +229,9 @@ using ParConstIter = ParConstIter_impl<Particle<T_NStructReal, T_NStructInt>, T_
 template <int T_NArrayReal, int T_NArrayInt, template<class> class Allocator=DefaultAllocator, class CellAssignor=DefaultAssignor>
 using ParConstIterSoA = ParConstIter_impl<SoAParticle<T_NArrayReal, T_NArrayInt>, T_NArrayReal, T_NArrayInt, Allocator, CellAssignor>;
 
+template <class RType=ParticleReal, class IType=int, class CellAssignor=DefaultAssignor>
+using ParConstIterRTSoA = ParConstIter_impl<RTSoAParticle<RType, IType>, 0, 0, PolymorphicArenaAllocator, CellAssignor>;
+
 template <int T_NStructReal, int T_NStructInt=0, int T_NArrayReal=0, int T_NArrayInt=0,
           template<class> class Allocator=DefaultAllocator, class CellAssignor=DefaultAssignor>
 using ParIter = ParIter_impl<Particle<T_NStructReal, T_NStructInt>, T_NArrayReal, T_NArrayInt, Allocator, CellAssignor>;
@@ -233,6 +239,9 @@ using ParIter = ParIter_impl<Particle<T_NStructReal, T_NStructInt>, T_NArrayReal
 template <int T_NArrayReal, int T_NArrayInt, template<class> class Allocator=DefaultAllocator, class CellAssignor=DefaultAssignor>
 using ParIterSoA = ParIter_impl<SoAParticle<T_NArrayReal, T_NArrayInt>, T_NArrayReal, T_NArrayInt, Allocator, CellAssignor>;
 
+template <class RType=ParticleReal, class IType=int, class CellAssignor=DefaultAssignor>
+using ParIterRTSoA = ParIter_impl<RTSoAParticle<RType, IType>, 0, 0, PolymorphicArenaAllocator, CellAssignor>;
+
 }
 
 #endif
diff --git a/Src/Particle/AMReX_Particle.H b/Src/Particle/AMReX_Particle.H
index 215e037122..4a1b84cdb0 100644
--- a/Src/Particle/AMReX_Particle.H
+++ b/Src/Particle/AMReX_Particle.H
@@ -389,6 +389,7 @@ struct SoAParticleBase
     static constexpr int NReal=0;
     static constexpr int NInt=0;
     static constexpr bool is_soa_particle = true;
+    static constexpr bool is_rtsoa_particle = false;
 };
 
 /** \brief The struct used to store particles.
@@ -401,6 +402,7 @@ struct alignas(sizeof(double)) Particle
     : ParticleBase<ParticleReal,T_NReal,T_NInt>
 {
     static constexpr bool is_soa_particle = false;
+    static constexpr bool is_rtsoa_particle = false;
     using StorageParticleType = Particle;
     using ConstType = Particle const;
 
diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H
index 2397a7d482..67130a54d1 100644
--- a/Src/Particle/AMReX_ParticleContainer.H
+++ b/Src/Particle/AMReX_ParticleContainer.H
@@ -23,6 +23,7 @@
 #include <AMReX_ArrayOfStructs.H>
 #include <AMReX_Particle.H>
 #include <AMReX_ParticleTile.H>
+#include <AMReX_ParticleTileRT.H>
 #include <AMReX_TypeTraits.H>
 #include <AMReX_GpuContainers.H>
 #include <AMReX_ParticleUtil.H>
@@ -179,8 +180,14 @@ public:
     RealDescriptor ParticleRealDescriptor = FPC::Native64RealDescriptor();
 #endif
 
+    static constexpr bool is_rtsoa_pc = ParticleType::is_rtsoa_particle;
+
     using ParticleContainerType = ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor>;
-    using ParticleTileType = ParticleTile<ParticleType, NArrayReal, NArrayInt, Allocator>;
+    using ParticleTileType = std::conditional_t<
+        is_rtsoa_pc,
+        ParticleTileRT<typename ParticleType::RealType, typename ParticleType::IntType>,
+        ParticleTile<ParticleType, NArrayReal, NArrayInt, Allocator>
+    >;
     using ParticleInitData = ParticleInitType<NStructReal, NStructInt, NArrayReal, NArrayInt>;
 
     //! A single level worth of particles is indexed (grid id, tile id)
@@ -1461,11 +1468,13 @@ protected:
                                int lev_min = 0, int lev_max = -1, int local_grid=-1) const;
 
 public:
+    using FlagsVector = PODVector<int, Allocator<int>>;
+
     void
     WriteParticles (int level, std::ofstream& ofs, int fnum,
                     Vector<int>& which, Vector<int>& count, Vector<Long>& where,
                     const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
-                    const Vector<std::map<std::pair<int, int>,IntVector>>& particle_io_flags, bool is_checkpoint) const;
+                    const Vector<std::map<std::pair<int, int>,FlagsVector>>& particle_io_flags, bool is_checkpoint) const;
 #ifdef AMREX_USE_HDF5
 #include "AMReX_ParticlesHDF5.H"
 #endif
@@ -1555,6 +1564,9 @@ using ParticleContainer = ParticleContainer_impl<Particle<T_NStructReal, T_NStru
 template <int T_NArrayReal, int T_NArrayInt, template<class> class Allocator=DefaultAllocator, class CellAssignor=DefaultAssignor>
 using ParticleContainerPureSoA = ParticleContainer_impl<SoAParticle<T_NArrayReal, T_NArrayInt>, T_NArrayReal, T_NArrayInt, Allocator, CellAssignor>;
 
+template <class RType=ParticleReal, class IType=int, class CellAssignor=DefaultAssignor>
+using ParticleContainerRTSoA = ParticleContainer_impl<RTSoAParticle<RType, IType>, 0, 0, PolymorphicArenaAllocator, CellAssignor>;
+
 }
 
 #include "AMReX_ParticleInit.H"
diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H
index aac96dad87..3e6975bb50 100644
--- a/Src/Particle/AMReX_ParticleContainerI.H
+++ b/Src/Particle/AMReX_ParticleContainerI.H
@@ -1145,7 +1145,7 @@ void
 ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor>::
 copyParticles (const PCType& other, bool local)
 {
-    using PData = ConstParticleTileData<typename ParticleType::StorageParticleType, NArrayReal, NArrayInt>;
+    using PData = typename ParticleTileType::ConstParticleTileDataType;
     copyParticles(other, [] AMREX_GPU_HOST_DEVICE (const PData& /*data*/, int /*i*/) { return 1; }, local);
 }
 
@@ -1156,7 +1156,7 @@ void
 ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssignor>::
 addParticles (const PCType& other, bool local)
 {
-    using PData = ConstParticleTileData<typename ParticleType::StorageParticleType, NArrayReal, NArrayInt>;
+    using PData = typename ParticleTileType::ConstParticleTileDataType;
     addParticles(other, [] AMREX_GPU_HOST_DEVICE (const PData& /*data*/, int /*i*/) { return 1; }, local);
 }
 
@@ -1219,7 +1219,8 @@ addParticles (const PCType& other, F const& f, bool local)
             auto dst_index = ptile.numParticles();
             ptile.resize(dst_index + np);
 
-            auto count = amrex::filterParticles(ptile, ptile_other, f, 0, dst_index, np);
+            auto count = amrex::filterParticles(ptile, ptile_other, f,
+                static_cast<decltype(np)>(0), dst_index, np);
 
             ptile.resize(dst_index + count);
         }
@@ -1267,85 +1268,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
     const size_t np_total = np + ptile.numNeighborParticles();
 
     if (memEfficientSort) {
-#if defined(AMREX_USE_CUDA) && defined(_WIN32)
-        if           (!ParticleType::is_soa_particle) {
-#else
-        if constexpr (!ParticleType::is_soa_particle) {
-#endif
-            static_assert(sizeof(ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
-            using tmp_t = std::conditional_t<sizeof(ParticleType)%8 == 0,
-                                             uint64_t, uint32_t>;
-            constexpr std::size_t nchunks = sizeof(ParticleType) / sizeof(tmp_t);
-            Gpu::DeviceVector<tmp_t> tmp(np);
-            auto* ptmp = tmp.data();
-            auto* paos = (tmp_t*)(ptile.getParticleTileData().m_aos);
-            for (std::size_t ichunk = 0; ichunk < nchunks; ++ichunk) {
-                // Do not need to reorder neighbor particles
-                AMREX_HOST_DEVICE_FOR_1D(np, i,
-                {
-                    ptmp[i] = paos[permutations[i]*nchunks+ichunk];
-                });
-                AMREX_HOST_DEVICE_FOR_1D(np, i,
-                {
-                    paos[i*nchunks+ichunk] = ptmp[i];
-                });
-            }
-            Gpu::streamSynchronize();
-        } else {
-            typename SoA::IdCPU tmp_idcpu;
-            if constexpr (has_polymorphic_allocator) {
-                tmp_idcpu.setArena(arena());
-            }
-            tmp_idcpu.resize(np_total);
-            auto src = ptile.GetStructOfArrays().GetIdCPUData().data();
-            uint64_t* dst = tmp_idcpu.data();
-            AMREX_HOST_DEVICE_FOR_1D( np_total, i,
-            {
-                dst[i] = i < np ? src[permutations[i]] : src[i];
-            });
-
-            Gpu::streamSynchronize();
-
-            ptile.GetStructOfArrays().GetIdCPUData().swap(tmp_idcpu);
-        }
-
-        { // Create a scope for the temporary vector below
-            RealVector tmp_real;
-            if constexpr (has_polymorphic_allocator) {
-                tmp_real.setArena(arena());
-            }
-            tmp_real.resize(np_total);
-            for (int comp = 0; comp < NArrayReal + m_num_runtime_real; ++comp) {
-                auto src = ptile.GetStructOfArrays().GetRealData(comp).data();
-                ParticleReal* dst = tmp_real.data();
-                AMREX_HOST_DEVICE_FOR_1D( np_total, i,
-                {
-                    dst[i] = i < np ? src[permutations[i]] : src[i];
-                });
-
-                Gpu::streamSynchronize();
-
-                ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real);
-            }
-        }
-
-        IntVector tmp_int;
-        if constexpr (has_polymorphic_allocator) {
-            tmp_int.setArena(arena());
-        }
-        tmp_int.resize(np_total);
-        for (int comp = 0; comp < NArrayInt + m_num_runtime_int; ++comp) {
-            auto src = ptile.GetStructOfArrays().GetIntData(comp).data();
-            int* dst = tmp_int.data();
-            AMREX_HOST_DEVICE_FOR_1D( np_total , i,
-            {
-                dst[i] = i < np ? src[permutations[i]] : src[i];
-            });
-
-            Gpu::streamSynchronize();
-
-            ptile.GetStructOfArrays().GetIntData(comp).swap(tmp_int);
-        }
+        amrex::ReorderParticles(ptile, permutations);
     } else {
         ParticleTileType ptile_tmp;
         ptile_tmp.define(m_num_runtime_real, m_num_runtime_int,
@@ -1679,34 +1602,18 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
 
     // these are temporary buffers for each thread
     std::map<int, Vector<Vector<char> > > tmp_remote;
-    Vector<std::map<std::pair<int, int>, Vector<ParticleVector> > > tmp_local;
-    Vector<std::map<std::pair<int, int>, Vector<StructOfArrays<NArrayReal, NArrayInt, Allocator> > > > soa_local;
-    tmp_local.resize(theEffectiveFinestLevel+1);
-    soa_local.resize(theEffectiveFinestLevel+1);
+    Vector<std::map<std::pair<int, int>, Vector<ParticleTileType> > > ptile_local;
+    ptile_local.resize(theEffectiveFinestLevel+1);
 
     // we resize these buffers outside the parallel region
     for (int lev = lev_min; lev <= lev_max; lev++) {
         for (MFIter mfi(*m_dummy_mf[lev], this->do_tiling ? this->tile_size : IntVect::TheZeroVector());
              mfi.isValid(); ++mfi) {
             auto index = std::make_pair(mfi.index(), mfi.LocalTileIndex());
-            tmp_local[lev][index].resize(num_threads);
-            soa_local[lev][index].resize(num_threads);
+            ptile_local[lev][index].resize(num_threads);
             for (int t = 0; t < num_threads; ++t) {
-                soa_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int,
-                    &m_soa_rdata_names, &m_soa_idata_names);
-                if constexpr (has_polymorphic_allocator) {
-                    if constexpr (ParticleType::is_soa_particle) {
-                        soa_local[lev][index][t].GetIdCPUData().setArena(arena());
-                    } else {
-                        tmp_local[lev][index][t].setArena(arena());
-                    }
-                    for (int j = 0; j < soa_local[lev][index][t].NumRealComps(); ++j) {
-                        soa_local[lev][index][t].GetRealData(j).setArena(arena());
-                    }
-                    for (int j = 0; j < soa_local[lev][index][t].NumIntComps(); ++j) {
-                        soa_local[lev][index][t].GetIntData(j).setArena(arena());
-                    }
-                }
+                ptile_local[lev][index][t].define(m_num_runtime_real, m_num_runtime_int,
+                     &m_soa_rdata_names, &m_soa_idata_names, arena());
             }
         }
     }
@@ -1741,262 +1648,92 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
             int thread_num = OpenMP::get_thread_num();
             int grid = grid_tile_ids[pmap_it].first;
             int tile = grid_tile_ids[pmap_it].second;
-            auto& soa = ptile_ptrs[pmap_it]->GetStructOfArrays();
-            auto& aos = ptile_ptrs[pmap_it]->GetArrayOfStructs();
 
-            // AMREX_ASSERT_WITH_MESSAGE((NumRealComps() == 0 && NumIntComps() == 0)
-            //                           || aos.size() == soa.size(),
-            //     "The AoS and SoA data on this tile are different sizes - "
-            //     "perhaps particles have not been initialized correctly?");
             unsigned npart = ptile_ptrs[pmap_it]->numParticles();
             ParticleLocData pld;
 
-            if constexpr (!ParticleType::is_soa_particle){
-
-                if (npart != 0) {
-                    Long last = npart - 1;
-                    Long pindex = 0;
-                    while (pindex <= last) {
-                        ParticleType& p = aos[pindex];
-
-                        if ((remove_negative == false) && (!p.id().is_valid())) {
-                            ++pindex;
-                            continue;
-                        }
-
-                        if (!p.id().is_valid())
-                        {
-                            aos[pindex] = aos[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, aos[pindex]);
-                            --last;
-                            continue;
-                        }
-
-                        locateParticle(p, pld, lev_min, lev_max, nGrow, local ? grid : -1);
-
-                        particlePostLocate(p, pld, lev);
-
-                        if (!p.id().is_valid())
-                        {
-                            aos[pindex] = aos[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, aos[pindex]);
-                            --last;
-                            continue;
-                        }
-
-                        const int who = ParallelContext::global_to_local_rank(ParticleDistributionMap(pld.m_lev)[pld.m_grid]);
-                        if (who == MyProc) {
-                            if (pld.m_lev != lev || pld.m_grid != grid || pld.m_tile != tile) {
-                                // We own it but must shift it to another place.
-                                auto index = std::make_pair(pld.m_grid, pld.m_tile);
-                                AMREX_ASSERT(tmp_local[pld.m_lev][index].size() == num_threads);
-                                tmp_local[pld.m_lev][index][thread_num].push_back(p);
-                                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                                    RealVector& arr = soa_local[pld.m_lev][index][thread_num].GetRealData(comp);
-                                    arr.push_back(soa.GetRealData(comp)[pindex]);
-                                }
-                                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                                    IntVector& arr = soa_local[pld.m_lev][index][thread_num].GetIntData(comp);
-                                    arr.push_back(soa.GetIntData(comp)[pindex]);
-                                }
-
-                                p.id().make_invalid(); // Invalidate the particle
-                            }
-                        }
-                        else {
-                            auto& particles_to_send = tmp_remote[who][thread_num];
-                            auto old_size = particles_to_send.size();
-                            auto new_size = old_size + superparticle_size;
-                            particles_to_send.resize(new_size);
-                            std::memcpy(&particles_to_send[old_size], &p, particle_size);
-                            char* dst = &particles_to_send[old_size] + particle_size;
-                            int array_comp_start = 0;
-                            if constexpr (!ParticleType::is_soa_particle) {
-                                array_comp_start = AMREX_SPACEDIM + NStructReal;
-                            }
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                if (h_redistribute_real_comp[array_comp_start + comp]) {
-                                    std::memcpy(dst, &soa.GetRealData(comp)[pindex], sizeof(ParticleReal));
-                                    dst += sizeof(ParticleReal);
-                                }
-                            }
-                            array_comp_start = 2 + NStructInt;
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                if (h_redistribute_int_comp[array_comp_start + comp]) {
-                                    std::memcpy(dst, &soa.GetIntData(comp)[pindex], sizeof(int));
-                                    dst += sizeof(int);
-                                }
-                            }
-
-                            p.id().make_invalid(); // Invalidate the particle
-                        }
+            auto particle_tile = ptile_ptrs[pmap_it];
+            auto ptd = particle_tile->getParticleTileData();
 
-                        if (!p.id().is_valid())
-                        {
-                            aos[pindex] = aos[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, aos[pindex]);
-                            --last;
-                            continue;
-                        }
+            if (npart != 0) {
+                Long last = npart - 1;
+                Long pindex = 0;
+                while (pindex <= last) {
+                    decltype(auto) p = ptd[pindex];
 
+                    if ((remove_negative == false) && (!ptd.id(pindex).is_valid())) {
                         ++pindex;
+                        continue;
                     }
 
-                    aos().erase(aos().begin() + last + 1, aos().begin() + npart);
-                    for (int comp = 0; comp < NumRealComps(); comp++) {
-                        RealVector& rdata = soa.GetRealData(comp);
-                        rdata.erase(rdata.begin() + last + 1, rdata.begin() + npart);
-                    }
-                    for (int comp = 0; comp < NumIntComps(); comp++) {
-                        IntVector& idata = soa.GetIntData(comp);
-                        idata.erase(idata.begin() + last + 1, idata.begin() + npart);
+                    if (!ptd.id(pindex).is_valid()) {
+                        copyParticle(ptd, ptd, last, pindex);
+                        correctCellVectors(last, pindex, grid, p);
+                        --last;
+                        continue;
                     }
-                }
 
-            } else { // soa particle
+                    locateParticle(p, pld, lev_min, lev_max, nGrow, local ? grid : -1);
 
-                auto particle_tile = ptile_ptrs[pmap_it];
-                if (npart != 0) {
-                    Long last = npart - 1;
-                    Long pindex = 0;
-                    auto ptd = particle_tile->getParticleTileData();
-                    while (pindex <= last) {
-                        ParticleType p = ptd[pindex];
+                    particlePostLocate(p, pld, lev);
 
-                        if ((remove_negative == false) && (!ptd.id(pindex).is_valid())) {
-                            ++pindex;
-                            continue;
-                        }
+                    if (!ptd.id(pindex).is_valid()) {
+                        copyParticle(ptd, ptd, last, pindex);
+                        correctCellVectors(last, pindex, grid, p);
+                        --last;
+                        continue;
+                    }
 
-                        if (!ptd.id(pindex).is_valid()){
-                            soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, ptd[pindex]);
-                            --last;
-                            continue;
-                        }
+                    const int who = ParallelContext::global_to_local_rank(ParticleDistributionMap(pld.m_lev)[pld.m_grid]);
+                    if (who == MyProc) {
+                        if (pld.m_lev != lev || pld.m_grid != grid || pld.m_tile != tile) {
+                            // We own it but must shift it to another place.
+                            auto index = std::make_pair(pld.m_grid, pld.m_tile);
+                            AMREX_ASSERT(ptile_local[pld.m_lev][index].size() == num_threads);
 
-                        locateParticle(p, pld, lev_min, lev_max, nGrow, local ? grid : -1);
-
-                        particlePostLocate(p, pld, lev);
-
-                        if (!ptd.id(pindex).is_valid()) {
-                            soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, ptd[pindex]);
-                            --last;
-                            continue;
-                        }
+                            auto& dst = ptile_local[pld.m_lev][index][thread_num];
+                            auto old_size = dst.size();
+                            auto new_size = old_size + 1;
+                            dst.resize(new_size);
+                            auto dst_ptd = dst.getParticleTileData();
+
+                            copyParticle(dst_ptd, ptd, pindex, old_size);
 
-                        const int who = ParallelContext::global_to_local_rank(ParticleDistributionMap(pld.m_lev)[pld.m_grid]);
-                        if (who == MyProc) {
-                            if (pld.m_lev != lev || pld.m_grid != grid || pld.m_tile != tile) {
-                                // We own it but must shift it to another place.
-                                auto index = std::make_pair(pld.m_grid, pld.m_tile);
-                                AMREX_ASSERT(soa_local[pld.m_lev][index].size() == num_threads);
-                                {
-                                    auto& arr = soa_local[pld.m_lev][index][thread_num].GetIdCPUData();
-                                    arr.push_back(soa.GetIdCPUData()[pindex]);
-                                }
-                                for (int comp = 0; comp < NumRealComps(); ++comp) {
-                                    RealVector& arr = soa_local[pld.m_lev][index][thread_num].GetRealData(comp);
-                                    arr.push_back(soa.GetRealData(comp)[pindex]);
-                                }
-                                for (int comp = 0; comp < NumIntComps(); ++comp) {
-                                    IntVector& arr = soa_local[pld.m_lev][index][thread_num].GetIntData(comp);
-                                    arr.push_back(soa.GetIntData(comp)[pindex]);
-                                }
-
-                                ptd.id(pindex).make_invalid(); // Invalidate the particle
-                            }
-                        }
-                        else {
-                            auto& particles_to_send = tmp_remote[who][thread_num];
-                            auto old_size = particles_to_send.size();
-                            auto new_size = old_size + superparticle_size;
-                            particles_to_send.resize(new_size);
-
-                            char* dst = &particles_to_send[old_size];
-                            {
-                                std::memcpy(dst, &soa.GetIdCPUData()[pindex], sizeof(uint64_t));
-                                dst += sizeof(uint64_t);
-                            }
-                            int array_comp_start = 0;
-                            if constexpr (!ParticleType::is_soa_particle) {
-                                array_comp_start = AMREX_SPACEDIM + NStructReal;
-                            }
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                if (h_redistribute_real_comp[array_comp_start + comp]) {
-                                    std::memcpy(dst, &soa.GetRealData(comp)[pindex], sizeof(ParticleReal));
-                                    dst += sizeof(ParticleReal);
-                                }
-                            }
-                            array_comp_start = 2 + NStructInt;
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                if (h_redistribute_int_comp[array_comp_start + comp]) {
-                                    std::memcpy(dst, &soa.GetIntData(comp)[pindex], sizeof(int));
-                                    dst += sizeof(int);
-                                }
-                            }
                             ptd.id(pindex).make_invalid(); // Invalidate the particle
                         }
+                    } else {
+                        auto& particles_to_send = tmp_remote[who][thread_num];
+                        auto old_size = particles_to_send.size();
+                        auto new_size = old_size + superparticle_size;
+                        particles_to_send.resize(new_size);
 
-                        if (!ptd.id(pindex).is_valid()){
-                            soa.GetIdCPUData()[pindex] = soa.GetIdCPUData()[last];
-                            for (int comp = 0; comp < NumRealComps(); comp++) {
-                                soa.GetRealData(comp)[pindex] = soa.GetRealData(comp)[last];
-                            }
-                            for (int comp = 0; comp < NumIntComps(); comp++) {
-                                soa.GetIntData(comp)[pindex] = soa.GetIntData(comp)[last];
-                            }
-                            correctCellVectors(last, pindex, grid, ptd[pindex]);
-                            --last;
-                            continue;
-                        }
+                        ptd.packParticleData(particles_to_send.data(), pindex, old_size,
+                            h_redistribute_real_comp.data(), h_redistribute_int_comp.data());
 
-                        ++pindex;
+                        ptd.id(pindex).make_invalid(); // Invalidate the particle
                     }
 
-                    {
-                        auto& iddata = soa.GetIdCPUData();
-                        iddata.erase(iddata.begin() + last + 1, iddata.begin() + npart);
-                    }
-                    for (int comp = 0; comp < NumRealComps(); comp++) {
-                        RealVector& rdata = soa.GetRealData(comp);
-                        rdata.erase(rdata.begin() + last + 1, rdata.begin() + npart);
+                    if (!ptd.id(pindex).is_valid()) {
+                        copyParticle(ptd, ptd, last, pindex);
+                        correctCellVectors(last, pindex, grid, p);
+                        --last;
+                        continue;
                     }
-                    for (int comp = 0; comp < NumIntComps(); comp++) {
-                        IntVector& idata = soa.GetIntData(comp);
-                        idata.erase(idata.begin() + last + 1, idata.begin() + npart);
+
+                    ++pindex;
+                }
+
+                auto tot_npart = ptile_ptrs[pmap_it]->numTotalParticles();
+
+                if (last != npart - 1) {
+                    pindex = last + 1;
+                    last = npart;
+                    while (last < tot_npart) {
+                        copyParticle(ptd, ptd, last, pindex);
+                        ++pindex;
+                        ++last;
                     }
+                    particle_tile->resize(pindex);
                 }
             }
         }
@@ -2008,87 +1745,40 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
 
     // Second pass - for each tile in parallel, collect the particles we are owed from all thread's buffers.
     for (int lev = lev_min; lev <= lev_max; lev++) {
-        typename std::map<std::pair<int, int>, Vector<ParticleVector > >::iterator pmap_it;
 
-        if constexpr(!ParticleType::is_soa_particle) {
-            Vector<std::pair<int, int> > grid_tile_ids;
-            Vector<Vector<ParticleVector>* > pvec_ptrs;
+        Vector<std::pair<int, int> > grid_tile_ids;
 
-            // we need to create any missing map entries in serial here
-            for (pmap_it=tmp_local[lev].begin(); pmap_it != tmp_local[lev].end(); pmap_it++)
-            {
-                DefineAndReturnParticleTile(lev, pmap_it->first.first, pmap_it->first.second);
-                grid_tile_ids.push_back(pmap_it->first);
-                pvec_ptrs.push_back(&(pmap_it->second));
-            }
+        // we need to create any missing map entries in serial here
+        for (auto pmap_it = ptile_local[lev].begin(); pmap_it != ptile_local[lev].end(); pmap_it++)
+        {
+            DefineAndReturnParticleTile(lev, pmap_it->first.first, pmap_it->first.second);
+            grid_tile_ids.push_back(pmap_it->first);
+        }
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel for
 #endif
-            for (int pit = 0; pit < static_cast<int>(pvec_ptrs.size()); ++pit)
-            {
-                auto index = grid_tile_ids[pit];
-                auto& ptile = ParticlesAt(lev, index.first, index.second);
-                auto& aos = ptile.GetArrayOfStructs();
-                auto& soa = ptile.GetStructOfArrays();
-                auto& aos_tmp = *(pvec_ptrs[pit]);
-                auto& soa_tmp = soa_local[lev][index];
-                for (int i = 0; i < num_threads; ++i) {
-                    aos.insert(aos.end(), aos_tmp[i].begin(), aos_tmp[i].end());
-                    aos_tmp[i].erase(aos_tmp[i].begin(), aos_tmp[i].end());
-                    for (int comp = 0; comp < NumRealComps(); ++comp) {
-                        RealVector& arr = soa.GetRealData(comp);
-                        RealVector& tmp = soa_tmp[i].GetRealData(comp);
-                        arr.insert(arr.end(), tmp.begin(), tmp.end());
-                        tmp.erase(tmp.begin(), tmp.end());
-                    }
-                    for (int comp = 0; comp < NumIntComps(); ++comp) {
-                        IntVector& arr = soa.GetIntData(comp);
-                        IntVector& tmp = soa_tmp[i].GetIntData(comp);
-                        arr.insert(arr.end(), tmp.begin(), tmp.end());
-                        tmp.erase(tmp.begin(), tmp.end());
-                    }
-                }
-            }
-        } else { // soa particle
-            Vector<std::pair<int, int> > grid_tile_ids;
+        for (int pit = 0; pit < static_cast<int>(grid_tile_ids.size()); ++pit) // NOLINT(modernize-loop-convert)
+        {
+            auto index = grid_tile_ids[pit];
+            auto& dst_ptile = ParticlesAt(lev, index.first, index.second);
+            auto& src_ptile = ptile_local[lev][index];
 
-            // we need to create any missing map entries in serial here
-            for (auto soa_map_it=soa_local[lev].begin(); soa_map_it != soa_local[lev].end(); soa_map_it++)
-            {
-                DefineAndReturnParticleTile(lev, soa_map_it->first.first, soa_map_it->first.second);
-                grid_tile_ids.push_back(soa_map_it->first);
-            }
+            for (int i = 0; i < num_threads; ++i) {
 
-#ifdef AMREX_USE_OMP
-#pragma omp parallel for
-#endif
-            for (int pit = 0; pit < static_cast<int>(grid_tile_ids.size()); ++pit) // NOLINT(modernize-loop-convert)
-            {
-                auto index = grid_tile_ids[pit];
-                auto& ptile = ParticlesAt(lev, index.first, index.second);
-                auto& soa = ptile.GetStructOfArrays();
-                auto& soa_tmp = soa_local[lev][index];
-                for (int i = 0; i < num_threads; ++i) {
-                    {
-                        auto& arr = soa.GetIdCPUData();
-                        auto& tmp = soa_tmp[i].GetIdCPUData();
-                        arr.insert(arr.end(), tmp.begin(), tmp.end());
-                        tmp.erase(tmp.begin(), tmp.end());
-                    }
-                    for (int comp = 0; comp < NumRealComps(); ++comp) {
-                        RealVector& arr = soa.GetRealData(comp);
-                        RealVector& tmp = soa_tmp[i].GetRealData(comp);
-                        arr.insert(arr.end(), tmp.begin(), tmp.end());
-                        tmp.erase(tmp.begin(), tmp.end());
-                    }
-                    for (int comp = 0; comp < NumIntComps(); ++comp) {
-                        IntVector& arr = soa.GetIntData(comp);
-                        IntVector& tmp = soa_tmp[i].GetIntData(comp);
-                        arr.insert(arr.end(), tmp.begin(), tmp.end());
-                        tmp.erase(tmp.begin(), tmp.end());
-                    }
+                auto to_copy = src_ptile[i].numParticles();
+                auto old_size = dst_ptile.numTotalParticles();
+                auto new_size = old_size + to_copy;
+                dst_ptile.resize(new_size);
+
+                auto dst_ptd = dst_ptile.getParticleTileData();
+                auto src_ptd = src_ptile[i].getParticleTileData();
+
+                for (Long j = 0; j < to_copy; ++j) {
+                    copyParticle(dst_ptd, src_ptd, j, j + old_size);
                 }
+
+                src_ptile[i].resize(0);
             }
         }
     }
@@ -2339,18 +2029,22 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
             {
                 auto& ptile = m_particles[rcv_levs[ipart]][std::make_pair(rcv_grid[ipart],
                                                                           rcv_tile[ipart])];
+                auto old_size = ptile.numTotalParticles();
+                auto new_size = old_size + 1;
+                ptile.resize(new_size);
+
                 char* pbuf = ((char*) &recvdata[offset]) + j*superparticle_size;
 
                 if constexpr (ParticleType::is_soa_particle) {
                     uint64_t idcpudata;
                     std::memcpy(&idcpudata, pbuf, sizeof(uint64_t));
                     pbuf += sizeof(uint64_t);
-                    ptile.GetStructOfArrays().GetIdCPUData().push_back(idcpudata);
+                    ptile.GetStructOfArrays().GetIdCPUData()[old_size] = idcpudata;
                 } else {
                     ParticleType p;
                     std::memcpy(&p, pbuf, sizeof(ParticleType));
                     pbuf += sizeof(ParticleType);
-                    ptile.push_back(p);
+                    ptile.GetArrayOfStructs()[old_size] = p;
                 }
 
                 int array_comp_start = 0;
@@ -2362,9 +2056,9 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
                         ParticleReal rdata;
                         std::memcpy(&rdata, pbuf, sizeof(ParticleReal));
                         pbuf += sizeof(ParticleReal);
-                        ptile.push_back_real(comp, rdata);
+                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = rdata;
                     } else {
-                        ptile.push_back_real(comp, 0.0);
+                        ptile.GetStructOfArrays().GetRealData(comp)[old_size] = 0.0;
                     }
                 }
 
@@ -2374,9 +2068,9 @@ RedistributeMPI (std::map<int, Vector<char> >& not_ours,
                         int idata;
                         std::memcpy(&idata, pbuf, sizeof(int));
                         pbuf += sizeof(int);
-                        ptile.push_back_int(comp, idata);
+                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = idata;
                     } else {
-                        ptile.push_back_int(comp, 0);
+                        ptile.GetStructOfArrays().GetIntData(comp)[old_size] = 0;
                     }
                 }
                 ++ipart;
diff --git a/Src/Particle/AMReX_ParticleIO.H b/Src/Particle/AMReX_ParticleIO.H
index 105fa06b46..0744b683ba 100644
--- a/Src/Particle/AMReX_ParticleIO.H
+++ b/Src/Particle/AMReX_ParticleIO.H
@@ -575,7 +575,7 @@ ParticleContainer_impl<ParticleType, NArrayReal, NArrayInt, Allocator, CellAssig
                   Vector<int>& which, Vector<int>& count, Vector<Long>& where,
                   const Vector<int>& write_real_comp,
                   const Vector<int>& write_int_comp,
-                  const Vector<std::map<std::pair<int, int>, IntVector>>& particle_io_flags,
+                  const Vector<std::map<std::pair<int, int>, FlagsVector>>& particle_io_flags,
                   bool is_checkpoint) const
 {
     BL_PROFILE("ParticleContainer::WriteParticles()");
diff --git a/Src/Particle/AMReX_ParticleMesh.H b/Src/Particle/AMReX_ParticleMesh.H
index 8c0d56da48..c10eead740 100644
--- a/Src/Particle/AMReX_ParticleMesh.H
+++ b/Src/Particle/AMReX_ParticleMesh.H
@@ -36,6 +36,21 @@ auto call_f (F const& f,
         return f(p, i, fabarr);
     }
 }
+
+template <typename F, typename T, template<class, class> class PTDType, class RType, class IType>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+auto call_f (F const& f,
+             const PTDType<RType, IType>& p,
+             const int i, Array4<T> const& fabarr,
+             GpuArray<Real,AMREX_SPACEDIM> const& plo,
+             GpuArray<Real,AMREX_SPACEDIM> const& dxi) noexcept
+{
+    if constexpr (IsCallable<F, decltype(p), int, decltype(fabarr), decltype(plo), decltype(dxi)>::value) {
+        return f(p, i, fabarr, plo, dxi);
+    } else {
+        return f(p, i, fabarr);
+    }
+}
 }
 
 template <class PC, class MF, class F, std::enable_if_t<IsParticleContainer<PC>::value, int> foo = 0>
diff --git a/Src/Particle/AMReX_ParticleReduce.H b/Src/Particle/AMReX_ParticleReduce.H
index 6df792be33..b4e7478fd3 100644
--- a/Src/Particle/AMReX_ParticleReduce.H
+++ b/Src/Particle/AMReX_ParticleReduce.H
@@ -33,6 +33,15 @@ auto call_f (F const& f,
         return f(p, i);
     }
 }
+
+template <typename F, template<class, class> class PTDType, class RType, class IType>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+auto call_f (F const& f,
+             const PTDType<RType, IType>& p,
+             const int i) noexcept
+{
+    return f(p, i);
+}
 }
 
 /**
diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H
index 4b9c7135dd..44dcb570a2 100644
--- a/Src/Particle/AMReX_ParticleTile.H
+++ b/Src/Particle/AMReX_ParticleTile.H
@@ -328,6 +328,7 @@ struct alignas(sizeof(double)) ConstSoAParticle : SoAParticleBase
     using StorageParticleType = SoAParticleBase;
     using ConstPTD = ConstParticleTileData<SoAParticleBase, NArrayReal, NArrayInt>;
     static constexpr bool is_soa_particle = true;
+    static constexpr bool is_rtsoa_particle = false;
     static constexpr bool is_constsoa_particle = true;
 
     using RealType = ParticleReal;
@@ -391,6 +392,7 @@ struct alignas(sizeof(double)) SoAParticle : SoAParticleBase
     using StorageParticleType = SoAParticleBase;
     using PTD = ParticleTileData<SoAParticleBase, NArrayReal, NArrayInt>;
     static constexpr bool is_soa_particle = true;
+    static constexpr bool is_rtsoa_particle = false;
     static constexpr bool is_constsoa_particle = false;
 
     using ConstType = ConstSoAParticle<T_NArrayReal, T_NArrayInt>;
diff --git a/Src/Particle/AMReX_ParticleTileRT.H b/Src/Particle/AMReX_ParticleTileRT.H
new file mode 100644
index 0000000000..0be3665f38
--- /dev/null
+++ b/Src/Particle/AMReX_ParticleTileRT.H
@@ -0,0 +1,789 @@
+#ifndef AMREX_PARTICLETILERT_H_
+#define AMREX_PARTICLETILERT_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Extension.H>
+#include <AMReX_Particle.H>
+#include <AMReX_ArrayOfStructs.H>
+#include <AMReX_StructOfArrays.H>
+#include <AMReX_Vector.H>
+#include <AMReX_REAL.H>
+#include <AMReX_RealVect.H>
+
+#include <array>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+
+namespace amrex {
+
+// Forward Declaration
+template <class RType, class IType>
+struct RTSoAParticle;
+
+template <class T>
+struct ArrayView {
+    using size_type = std::size_t;
+
+    T* m_data = nullptr;
+    size_type m_capacity = 0;
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T& operator[] (const size_type i) const {
+        AMREX_ASSERT(i < m_capacity);
+        return m_data[i];
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T* data () const {
+        return m_data;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T* dataPtr () const {
+        return m_data;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T* begin () const {
+        return m_data;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    T* end () const {
+        return m_data + m_capacity;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    explicit operator T* () const {
+        return m_data;
+    }
+};
+
+template <class T>
+ArrayView(T* data, std::size_t capacity) -> ArrayView<T>;
+
+template <class RType, class IType>
+struct ParticleTileDataRT
+{
+    using Self = ParticleTileDataRT<RType, IType>;
+    using size_type = std::size_t;
+
+    using ParticleType = RTSoAParticle<RType, IType>;
+    using RealType = RType;
+    using IntType = IType;
+
+    static constexpr bool is_particle_tile_data = true;
+    static constexpr bool is_const = std::is_const_v<RType>;
+
+    size_type m_capacity = 0;
+    std::conditional_t<is_const,
+        const uint64_t* AMREX_RESTRICT,
+        uint64_t* AMREX_RESTRICT> m_idcpu = nullptr;
+    RType* AMREX_RESTRICT m_rdata = nullptr;
+    IType* AMREX_RESTRICT m_idata = nullptr;
+    int m_n_real = 0;
+    int m_n_int = 0;
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    constexpr ParticleTileDataRT () = default;
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    constexpr ParticleTileDataRT (size_type a_capacity,
+        decltype(m_idcpu) a_idcpu, decltype(m_rdata) a_rdata,
+        decltype(m_idata) a_idata, int a_n_real, int a_n_int) noexcept :
+        m_capacity{a_capacity},
+        m_idcpu{a_idcpu},
+        m_rdata{a_rdata},
+        m_idata{a_idata},
+        m_n_real{a_n_real},
+        m_n_int{a_n_int} {}
+
+    template <class aRType, class aIType>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    constexpr ParticleTileDataRT (ParticleTileDataRT<aRType, aIType> const& rhs) noexcept :
+        m_capacity{rhs.m_capacity},
+        m_idcpu{rhs.m_idcpu},
+        m_rdata{rhs.m_rdata},
+        m_idata{rhs.m_idata},
+        m_n_real{rhs.m_n_real},
+        m_n_int{rhs.m_n_int} {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RType& pos (const int dir, const size_type index) const &
+    {
+        AMREX_ASSERT(index < m_capacity && 0 <= dir && dir < m_n_real);
+        return this->m_rdata[dir * m_capacity + index];
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) id (const size_type index) const &
+    {
+        AMREX_ASSERT(index < m_capacity);
+        if constexpr (is_const) {
+            return ConstParticleIDWrapper(this->m_idcpu[index]);
+        } else {
+            return ParticleIDWrapper(this->m_idcpu[index]);
+        }
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) cpu (const size_type index) const &
+    {
+        AMREX_ASSERT(index < m_capacity);
+        if constexpr (is_const) {
+            return ConstParticleCPUWrapper(this->m_idcpu[index]);
+        } else {
+            return ParticleCPUWrapper(this->m_idcpu[index]);
+        }
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) idcpu (const size_type index) const &
+    {
+        AMREX_ASSERT(index < m_capacity);
+        return this->m_idcpu[index];
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RType * rdata (const int comp_index) const &
+    {
+        AMREX_ASSERT(0 <= comp_index && comp_index < m_n_real);
+        return this->m_rdata + comp_index * m_capacity;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IType * idata (const int comp_index) const &
+    {
+        AMREX_ASSERT(0 <= comp_index && comp_index < m_n_int);
+        return this->m_idata + comp_index * m_capacity;
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RType& rdata (const int comp_index, const size_type partice_index) const &
+    {
+        AMREX_ASSERT(partice_index < m_capacity && 0 <= comp_index && comp_index < m_n_real);
+        return this->m_rdata[comp_index * m_capacity + partice_index];
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IType& idata (const int comp_index, const size_type partice_index) const &
+    {
+        AMREX_ASSERT(partice_index < m_capacity && 0 <= comp_index && comp_index < m_n_int);
+        return this->m_idata[comp_index * m_capacity + partice_index];
+    }
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) operator[] (const size_type index) const
+    {
+        AMREX_ASSERT(index < m_capacity);
+        return RTSoAParticle<RType, IType>(*this, index);
+    }
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    void packParticleData (char* buffer, size_type src_index, std::size_t dst_offset,
+                           const int* comm_real, const int * comm_int) const noexcept
+    {
+        AMREX_ASSERT(src_index < m_capacity);
+        auto* dst = buffer + dst_offset;
+
+        memcpy(dst, m_idcpu + src_index, sizeof(uint64_t));
+        dst += sizeof(uint64_t);
+
+        for (int i = 0; i < m_n_real; ++i) {
+            if (comm_real[i]) {
+                memcpy(dst, m_rdata + i * m_capacity + src_index, sizeof(RType));
+                dst += sizeof(RType);
+            }
+        }
+
+        for (int i = 0; i < m_n_int; ++i) {
+            if (comm_int[i]) {
+                memcpy(dst, m_idata + i * m_capacity + src_index, sizeof(IType));
+                dst += sizeof(IType);
+            }
+        }
+    }
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    void unpackParticleData (const char* buffer, Long src_offset, size_type dst_index,
+                             const int* comm_real, const int* comm_int) const noexcept
+    {
+        AMREX_ASSERT(dst_index < m_capacity);
+        const auto* src = buffer + src_offset;
+
+        memcpy(m_idcpu + dst_index, src, sizeof(uint64_t));
+        src += sizeof(uint64_t);
+
+        for (int i = 0; i < m_n_real; ++i) {
+            if (comm_real[i]) {
+                memcpy(m_rdata + i * m_capacity + dst_index, src, sizeof(RType));
+                src += sizeof(RType);
+            }
+        }
+
+        for (int i = 0; i < m_n_int; ++i) {
+            if (comm_int[i]) {
+                memcpy(m_idata + i * m_capacity + dst_index, src, sizeof(IType));
+                src += sizeof(IType);
+            }
+        }
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    Particle<0, 0> getSuperParticle (size_type index) const noexcept
+    {
+        return {{
+            { AMREX_D_DECL( pos(0, index), pos(1, index), pos(2, index) ) },
+            idcpu(index)
+        }};
+    }
+};
+
+template <class RType, class IType>
+struct RTSoAParticle
+{
+    using PTD = ParticleTileDataRT<RType, IType>;
+    using ConstType = RTSoAParticle<std::add_const_t<RType>, std::add_const_t<IType>>;
+    static constexpr bool is_constsoa_particle = PTD::is_const;
+    static constexpr int NReal = 0;
+    static constexpr int NInt = 0;
+    static constexpr bool is_soa_particle = true;
+    static constexpr bool is_rtsoa_particle = true;
+    using size_type = typename PTD::size_type;
+    using RealType = RType;
+    using IntType = IType;
+
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RTSoAParticle (PTD const& ptd, size_type i) noexcept :
+        m_particle_tile_data(ptd),
+        m_index(i) {}
+
+    template <class aRType, class aIType>
+    friend struct RTSoAParticle;
+
+    template <class aRType, class aIType>
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RTSoAParticle (RTSoAParticle<aRType, aIType> const& rhs) noexcept :
+        m_particle_tile_data(rhs.m_particle_tile_data),
+        m_index(rhs.m_index) {}
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) cpu () const & { return this->m_particle_tile_data.cpu(m_index); }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) id () const & { return this->m_particle_tile_data.id(m_index); }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    decltype(auto) idcpu () const & { return this->m_particle_tile_data.idcpu(m_index); }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RType& rdata (const int comp_index) const & {
+        return this->m_particle_tile_data.rdata(comp_index, m_index);
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    IType& idata (const int comp_index) const & {
+        return this->m_particle_tile_data.idata(comp_index, m_index);
+    }
+
+    //functions to get positions of the particle in the SOA data
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RealVect pos () const & {
+        return RealVect(AMREX_D_DECL(
+            this->m_particle_tile_data.pos(0, m_index),
+            this->m_particle_tile_data.pos(1, m_index),
+            this->m_particle_tile_data.pos(2, m_index)
+        ));
+    }
+
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    RType& pos (int position_index) const & {
+        return this->m_particle_tile_data.pos(position_index, m_index);
+    }
+
+    static Long NextID ();
+
+    /**
+    * \brief This version can only be used inside omp critical.
+    */
+    static Long UnprotectedNextID ();
+
+    /**
+    * \brief Reset on restart.
+    *
+    * \param nextid
+    */
+    static void NextID (Long nextid);
+
+private:
+
+    static_assert(std::is_trivially_copyable<PTD>(), "ParticleTileData is not trivially copyable");
+
+    PTD m_particle_tile_data;
+    size_type m_index;
+};
+
+struct NextIDRTSoA {
+    inline static Long the_next_id = 1;
+};
+
+template <class RType, class IType>
+Long
+RTSoAParticle<RType, IType>::NextID ()
+{
+    Long next;
+// we should be able to test on _OPENMP < 201107 for capture (version 3.1)
+// but we must work around a bug in gcc < 4.9
+#if defined(AMREX_USE_OMP) && defined(_OPENMP) && _OPENMP < 201307
+#pragma omp critical (amrex_particle_nextid)
+#elif defined(AMREX_USE_OMP)
+#pragma omp atomic capture
+#endif
+    next = NextIDRTSoA::the_next_id++;
+
+    if (next > LongParticleIds::LastParticleID) {
+        amrex::Abort("RTSoAParticle<RType, IType>::NextID() -- too many particles");
+    }
+
+    return next;
+}
+
+template <class RType, class IType>
+Long
+RTSoAParticle<RType, IType>::UnprotectedNextID ()
+{
+    Long next = NextIDRTSoA::the_next_id++;
+    if (next > LongParticleIds::LastParticleID) {
+        amrex::Abort("RTSoAParticle<RType, IType>::NextID() -- too many particles");
+    }
+    return next;
+}
+
+template <class RType, class IType>
+void
+RTSoAParticle<RType, IType>::NextID (Long nextid)
+{
+    NextIDRTSoA::the_next_id = nextid;
+}
+
+template<class RType, class IType>
+using ConstParticleTileDataRT = ParticleTileDataRT<const RType, const IType>;
+
+template <class RType=ParticleReal, class IType=int>
+struct ParticleTileRT
+{
+    using ParticleType = RTSoAParticle<RType, IType>;
+    using RealType = RType;
+    using IntType = IType;
+
+    using ParticleTileDataType = ParticleTileDataRT<RType, IType>;
+    using ConstParticleTileDataType = ParticleTileDataRT<const RType, const IType>;
+
+    using size_type = typename ParticleTileDataType::size_type;
+
+    using AoS = ThisParticleTileHasNoAoS;
+    using SoA = ParticleTileRT<RType, IType>;
+
+    using RealVector = ArrayView<RType>;
+    using IntVector = ArrayView<IType>;
+
+    ParticleTileRT () = default;
+
+#ifndef _WIN32  // workaround windows compiler bug
+    ~ParticleTileRT () = default;
+
+    ParticleTileRT (ParticleTileRT const&) = delete;
+    ParticleTileRT (ParticleTileRT &&) noexcept = default;
+
+    ParticleTileRT& operator= (ParticleTileRT const&) = delete;
+    ParticleTileRT& operator= (ParticleTileRT &&) noexcept = default;
+#endif
+
+    void define (
+        int a_num_real_comps,
+        int a_num_int_comps,
+        std::vector<std::string>* a_rdata_names=nullptr,
+        std::vector<std::string>* a_idata_names=nullptr,
+        Arena* a_arena=nullptr
+    )
+    {
+        if (m_defined) {
+            AMREX_ALWAYS_ASSERT(a_arena == nullptr || a_arena == m_arena);
+            if (a_num_real_comps != m_n_real || a_num_int_comps != m_n_int) {
+                realloc_and_move(m_size, m_capacity, a_num_real_comps, a_num_int_comps);
+            }
+        } else {
+            AMREX_ALWAYS_ASSERT(a_arena != nullptr);
+            m_defined = true;
+            m_arena = a_arena;
+            m_n_real = a_num_real_comps;
+            m_n_int = a_num_int_comps;
+        }
+
+        if (a_rdata_names != nullptr) {
+            m_real_names = a_rdata_names;
+        }
+        if (a_idata_names != nullptr) {
+            m_int_names = a_idata_names;
+        }
+    }
+
+    [[nodiscard]] bool empty () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return size() == 0;
+    }
+
+    /**
+    * \brief Returns the total number of particles (real and neighbor)
+    *
+    */
+    size_type size () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_size;
+    }
+
+    /**
+    * \brief Returns the number of real particles (excluding neighbors)
+    *
+    */
+    size_type numParticles () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return numRealParticles();
+    }
+
+    /**
+    * \brief Returns the number of real particles (excluding neighbors)
+    *
+    */
+    size_type numRealParticles () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_size;
+    }
+
+    /**
+    * \brief Returns the number of neighbor particles (excluding reals)
+    *
+    */
+    size_type numNeighborParticles () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return 0;
+    }
+
+    /**
+    * \brief Returns the total number of particles, real and neighbor
+    *
+    */
+    size_type numTotalParticles () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_size;
+    }
+
+    size_type getNumNeighbors () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return 0;
+    }
+
+    [[nodiscard]] int NumRealComps () const noexcept
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_n_real;
+    }
+
+    [[nodiscard]] int NumIntComps () const noexcept
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_n_int;
+    }
+
+    [[nodiscard]] int NumRuntimeRealComps () const noexcept
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_n_real;
+    }
+
+    [[nodiscard]] int NumRuntimeIntComps () const noexcept
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_n_int;
+    }
+
+    /** Get the names for the real SoA components **/
+    [[nodiscard]] std::vector<std::string> GetRealNames () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        if (m_real_names != nullptr) {
+            return *m_real_names;
+        } else {
+            return std::vector<std::string>();
+        }
+    }
+
+    /** Get the names for the int SoA components **/
+    [[nodiscard]] std::vector<std::string> GetIntNames () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        if (m_int_names != nullptr) {
+            return *m_int_names;
+        } else {
+            return std::vector<std::string>();
+        }
+    }
+
+    [[nodiscard]] ArrayView<uint64_t> GetIdCPUData () {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return {m_idcpu_data.data(), m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<const uint64_t> GetIdCPUData () const {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return {m_idcpu_data.data(), m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<RType> GetRealData (int comp_index) {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        AMREX_ALWAYS_ASSERT(0 <= comp_index && comp_index < m_n_real);
+        return {m_real_data.data() + comp_index * m_capacity, m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<const RType> GetRealData (int comp_index) const {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        AMREX_ALWAYS_ASSERT(0 <= comp_index && comp_index < m_n_real);
+        return {m_real_data.data() + comp_index * m_capacity, m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<IType> GetIntData (int comp_index) {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        AMREX_ALWAYS_ASSERT(0 <= comp_index && comp_index < m_n_int);
+        return {m_int_data.data() + comp_index * m_capacity, m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<const IType> GetIntData (int comp_index) const {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        AMREX_ALWAYS_ASSERT(0 <= comp_index && comp_index < m_n_int);
+        return {m_int_data.data() + comp_index * m_capacity, m_capacity};
+    }
+
+    [[nodiscard]] ArrayView<RType> GetRealData (std::string const & name) {
+        return GetRealData(get_idx_from_str(name, m_real_names));
+    }
+
+    [[nodiscard]] ArrayView<const RType> GetRealData (std::string const & name) const {
+        return GetRealData(get_idx_from_str(name, m_real_names));
+    }
+
+    [[nodiscard]] ArrayView<IType> GetIntData (std::string const & name) {
+        return GetIntData(get_idx_from_str(name, m_int_names));
+    }
+
+    [[nodiscard]] ArrayView<const IType> GetIntData (std::string const & name) const {
+        return GetIntData(get_idx_from_str(name, m_int_names));
+    }
+
+    void resize (size_type new_size)
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+
+        if (new_size <= m_capacity && new_size * 10 > m_capacity) {
+            m_size = new_size;
+        } else {
+            realloc_and_move(new_size, new_size, m_n_real, m_n_int);
+        }
+    }
+
+    void resize_geometric_grow (size_type new_size)
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+
+        if (new_size <= m_capacity && new_size * 10 > m_capacity) {
+            m_size = new_size;
+        } else {
+            realloc_and_move(new_size, std::max(new_size, m_capacity + m_capacity / 4),
+                             m_n_real, m_n_int);
+        }
+    }
+
+    void reserve (size_type new_capacity)
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+
+        if (new_capacity > m_capacity) {
+            realloc_and_move(m_size, new_capacity, m_n_real, m_n_int);
+        }
+    }
+
+    void realloc_and_move (size_type new_size, size_type new_capacity,
+                           int new_n_real, int new_n_int)
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+
+        if (new_capacity <= new_size) {
+            new_capacity = new_size;
+        }
+
+        decltype(m_idcpu_data) new_idcpu_data;
+        decltype(m_real_data) new_real_data;
+        decltype(m_int_data) new_int_data;
+
+        new_idcpu_data.setArena(m_arena);
+        new_real_data.setArena(m_arena);
+        new_int_data.setArena(m_arena);
+
+        new_idcpu_data.resize(new_capacity);
+        new_real_data.resize(new_capacity * new_n_real);
+        new_int_data.resize(new_capacity * new_n_int);
+
+        auto p_idcpu = new_idcpu_data.data();
+        auto p_real = new_real_data.data();
+        auto p_int = new_int_data.data();
+
+        auto old_ptd = getParticleTileData();
+        auto copy_size = std::min(m_size, new_size);
+        auto copy_n_real = std::min(m_n_real, new_n_real);
+        auto copy_n_int = std::min(m_n_int, new_n_int);
+
+        m_size = new_size;
+        m_capacity = new_capacity;
+        m_n_real = new_n_real;
+        m_n_int = new_n_int;
+        m_idcpu_data.swap(new_idcpu_data);
+        m_real_data.swap(new_real_data);
+        m_int_data.swap(new_int_data);
+
+        auto new_ptd = getParticleTileData();
+
+        if (new_size == 0) {
+            return;
+        }
+
+        if (m_arena->isDeviceAccessible()) {
+            ParallelFor(copy_size,
+                [=] AMREX_GPU_DEVICE (size_type i) noexcept {
+                    new_ptd.idcpu(i) = old_ptd.idcpu(i);
+
+                    for (int n = 0;  n < copy_n_real; ++n) {
+                        new_ptd.rdata(n, i) = old_ptd.rdata(n, i);
+                    }
+
+                    for (int n = 0;  n < copy_n_int; ++n) {
+                        new_ptd.idata(n, i) = old_ptd.idata(n, i);
+                    }
+                });
+            Gpu::streamSynchronize();
+        } else {
+            for (size_type i = 0; i < copy_size; ++i) {
+                new_ptd.idcpu(i) = old_ptd.idcpu(i);
+
+                for (int n = 0;  n < copy_n_real; ++n) {
+                    new_ptd.rdata(n, i) = old_ptd.rdata(n, i);
+                }
+
+                for (int n = 0;  n < copy_n_int; ++n) {
+                    new_ptd.idata(n, i) = old_ptd.idata(n, i);
+                }
+            }
+        }
+    }
+
+    void shrink_to_fit ()
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+
+        if (m_size != m_capacity) {
+            realloc_and_move(m_size, m_size, m_n_real, m_n_int);
+        }
+    }
+
+    size_type capacity () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return m_capacity * (sizeof(uint64_t) + m_n_real * sizeof(RType) + m_n_int * sizeof(IType));
+    }
+
+    void swap (ParticleTileRT<RType, IType>& other) noexcept
+    {
+        std::swap(m_defined, other.m_defined);
+        std::swap(m_arena, other.m_arena);
+        std::swap(m_capacity, other.m_capacity);
+        std::swap(m_size, other.m_size);
+        std::swap(m_n_real, other.m_n_real);
+        std::swap(m_n_int, other.m_n_int);
+        m_idcpu_data.swap(other.m_idcpu_data);
+        m_real_data.swap(other.m_real_data);
+        m_int_data.swap(other.m_int_data);
+        std::swap(m_real_names, other.m_real_names);
+        std::swap(m_int_names, other.m_int_names);
+    }
+
+    ParticleTileDataType getParticleTileData ()
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return ParticleTileDataType{
+            m_capacity,
+            m_idcpu_data.data(),
+            m_real_data.data(),
+            m_int_data.data(),
+            m_n_real,
+            m_n_int
+        };
+    }
+
+    ConstParticleTileDataType getConstParticleTileData () const
+    {
+        AMREX_ALWAYS_ASSERT(m_defined);
+        return ConstParticleTileDataType{
+            m_capacity,
+            m_idcpu_data.data(),
+            m_real_data.data(),
+            m_int_data.data(),
+            m_n_real,
+            m_n_int
+        };
+    }
+
+    auto&       GetStructOfArrays ()       { return *this; }
+    const auto& GetStructOfArrays () const { return *this; }
+
+    [[nodiscard]] Arena* arena() const {
+        return m_arena;
+    }
+
+private:
+
+    static int get_idx_from_str (std::string const & name, std::vector<std::string>* name_list) {
+        AMREX_ALWAYS_ASSERT(name_list != nullptr);
+        auto const pos = std::find(name_list->begin(), name_list->end(), name);
+        AMREX_ALWAYS_ASSERT(pos != name_list->end());
+        return static_cast<int>(std::distance(name_list->begin(), pos));
+    }
+
+    bool m_defined = false;
+
+    Arena* m_arena = nullptr;
+
+    size_type m_capacity = 0;
+    size_type m_size = 0;
+
+    int m_n_real = 0;
+    int m_n_int = 0;
+
+    amrex::PODVector<uint64_t, PolymorphicArenaAllocator<uint64_t>> m_idcpu_data;
+    amrex::PODVector<RType, PolymorphicArenaAllocator<RType>> m_real_data;
+    amrex::PODVector<IType, PolymorphicArenaAllocator<IType>> m_int_data;
+
+    std::vector<std::string>* m_real_names = nullptr;
+    std::vector<std::string>* m_int_names = nullptr;
+};
+
+} // namespace amrex
+
+#endif // AMREX_PARTICLETILERT_H_
diff --git a/Src/Particle/AMReX_ParticleTransformation.H b/Src/Particle/AMReX_ParticleTransformation.H
index 1b0b29b662..bcd9e992f4 100644
--- a/Src/Particle/AMReX_ParticleTransformation.H
+++ b/Src/Particle/AMReX_ParticleTransformation.H
@@ -7,6 +7,7 @@
 #include <AMReX_Gpu.H>
 #include <AMReX_Print.H>
 #include <AMReX_ParticleTile.H>
+#include <AMReX_ParticleTileRT.H>
 #include <AMReX_ParticleUtil.H>
 
 namespace amrex
@@ -146,6 +147,60 @@ void swapParticle (const ParticleTileData<T_ParticleType, NAR, NAI>& dst,
     }
 }
 
+/**
+ * \brief A general single particle copying routine that can run on the GPU.
+ *
+ * \param dst the destination tile
+ * \param src the source tile
+ * \param src_i the index in the source to read from
+ * \param dst_i the index in the destination to write to
+ *
+ */
+template <class DRType, class DIType, class SRType, class SIType>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void copyParticle (const ParticleTileDataRT<DRType, DIType>& dst,
+                   const ParticleTileDataRT<SRType, SIType>& src,
+                   typename ParticleTileDataRT<SRType, SIType>::size_type src_i,
+                   typename ParticleTileDataRT<DRType, DIType>::size_type dst_i) noexcept
+{
+    dst.idcpu(dst_i) = src.idcpu(src_i);
+
+    for (int j = 0; j < dst.m_n_real && j < src.m_n_real; ++j) {
+        dst.rdata(j)[dst_i] = src.rdata(j)[src_i];
+    }
+
+    for (int j = 0; j < dst.m_n_int && j < src.m_n_int; ++j) {
+        dst.idata(j)[dst_i] = src.idata(j)[src_i];
+    }
+}
+
+/**
+ * \brief A general single particle swapping routine that can run on the GPU.
+ *
+ * \param dst the destination tile
+ * \param src the source tile
+ * \param src_i the index in the source to read from
+ * \param dst_i the index in the destination to write to
+ *
+ */
+template <class DRType, class DIType, class SRType, class SIType>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+void swapParticle (const ParticleTileDataRT<DRType, DIType>& dst,
+                   const ParticleTileDataRT<SRType, SIType>& src,
+                   typename ParticleTileDataRT<SRType, SIType>::size_type src_i,
+                   typename ParticleTileDataRT<DRType, DIType>::size_type dst_i) noexcept
+{
+    amrex::Swap(dst.idcpu(dst_i), src.idcpu(src_i));
+
+    for (int j = 0; j < dst.m_n_real && j < src.m_n_real; ++j) {
+        amrex::Swap(dst.rdata(j)[dst_i], src.rdata(j)[src_i]);
+    }
+
+    for (int j = 0; j < dst.m_n_int && j < src.m_n_int; ++j) {
+        amrex::Swap(dst.idata(j)[dst_i], src.idata(j)[src_i]);
+    }
+}
+
 /**
  * \brief Copy particles from src to dst. This version copies all the
  * particles, writing them to the beginning of dst.
@@ -213,7 +268,8 @@ template <typename DstTile, typename SrcTile, typename F>
 void transformParticles (DstTile& dst, const SrcTile& src, F&& f) noexcept
 {
     auto np = src.numParticles();
-    transformParticles(dst, src, 0, 0, np, std::forward<F>(f));
+    using Index = decltype(np);
+    transformParticles(dst, src, Index{0}, Index{0}, np, std::forward<F>(f));
 }
 
 /**
@@ -270,7 +326,8 @@ template <typename DstTile1, typename DstTile2, typename SrcTile, typename F>
 void transformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& src, F&& f) noexcept
 {
     auto np = src.numParticles();
-    transformParticles(dst1, dst2, src, 0, 0, 0, np, std::forward<F>(f));
+    using Index = decltype(np);
+    transformParticles(dst1, dst2, src, Index{0}, Index{0}, Index{0}, np, std::forward<F>(f));
 }
 
 /**
@@ -329,7 +386,7 @@ template <typename DstTile, typename SrcTile, typename Index, typename N,
           std::enable_if_t<std::is_integral_v<Index>, int> foo = 0>
 Index filterParticles (DstTile& dst, const SrcTile& src, const Index* mask) noexcept
 {
-    return filterParticles(dst, src, mask, 0, 0, src.numParticles());
+    return filterParticles(dst, src, mask, Index{0}, Index{0}, src.numParticles());
 }
 
 /**
@@ -500,7 +557,7 @@ template <typename DstTile, typename SrcTile, typename Index, typename F,
           std::enable_if_t<std::is_integral_v<Index>, int> foo = 0>
 Index filterAndTransformParticles (DstTile& dst, const SrcTile& src, Index* mask, F&& f) noexcept
 {
-    return filterAndTransformParticles(dst, src, mask, std::forward<F>(f), 0, 0);
+    return filterAndTransformParticles(dst, src, mask, std::forward<F>(f), Index{0}, Index{0});
 }
 
 /**
@@ -522,7 +579,9 @@ template <typename DstTile, typename SrcTile, typename Pred, typename F,
           std::enable_if_t<!std::is_pointer_v<std::decay_t<Pred>>,int> foo = 0>
 int filterAndTransformParticles (DstTile& dst, const SrcTile& src, Pred&& p, F&& f) noexcept
 {
-    return filterAndTransformParticles(dst, src, std::forward<Pred>(p), std::forward<F>(f), 0, 0);
+    using Index = decltype(src.numParticles());
+    return filterAndTransformParticles(dst, src, std::forward<Pred>(p), std::forward<F>(f),
+                                       Index{0}, Index{0});
 }
 
 /**
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index ac91573f85..918fc67252 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -11,6 +11,7 @@
 #include <AMReX_MFIter.H>
 #include <AMReX_ParGDB.H>
 #include <AMReX_ParticleTile.H>
+#include <AMReX_ParticleTileRT.H>
 #include <AMReX_ParticleBufferMap.H>
 #include <AMReX_TypeTraits.H>
 #include <AMReX_Scan.H>
@@ -377,17 +378,12 @@ IntVect getParticleCell (PTD const& ptd, int i,
                          amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> const& dxi,
                          const Box& domain) noexcept
 {
-    if constexpr (PTD::ParticleType::is_soa_particle)
-    {
-        IntVect iv(
-                   AMREX_D_DECL(int(amrex::Math::floor((ptd.m_rdata[0][i]-plo[0])*dxi[0])),
-                                int(amrex::Math::floor((ptd.m_rdata[1][i]-plo[1])*dxi[1])),
-                                int(amrex::Math::floor((ptd.m_rdata[2][i]-plo[2])*dxi[2]))));
-        iv += domain.smallEnd();
-        return iv;
-    } else {
-        return getParticleCell(ptd.m_aos[i], plo, dxi, domain);;
-    }
+    IntVect iv(
+                AMREX_D_DECL(int(amrex::Math::floor((ptd.pos(0, i)-plo[0])*dxi[0])),
+                             int(amrex::Math::floor((ptd.pos(1, i)-plo[1])*dxi[1])),
+                             int(amrex::Math::floor((ptd.pos(2, i)-plo[2])*dxi[2]))));
+    iv += domain.smallEnd();
+    return iv;
 }
 
 struct DefaultAssignor
@@ -865,6 +861,157 @@ std::string getDefaultCompNameInt (const int i) {
     return name;
 }
 
+template <class PTile, class index_type,
+          std::enable_if_t<!PTile::ParticleType::is_rtsoa_particle, int> foo = 0>
+void
+ReorderParticles (PTile& ptile, const index_type* permutations)
+{
+    const size_t np       = ptile.numParticles();
+    const size_t np_total = np + ptile.numNeighborParticles();
+
+#if defined(AMREX_USE_CUDA) && defined(_WIN32)
+    if           (!PTile::ParticleType::is_soa_particle)
+#else
+    if constexpr (!PTile::ParticleType::is_soa_particle)
+#endif
+    {
+        static_assert(sizeof(typename PTile::ParticleType)%4 == 0 && sizeof(uint32_t) == 4);
+        using tmp_t = std::conditional_t<sizeof(typename PTile::ParticleType)%8 == 0,
+                                            uint64_t, uint32_t>;
+        constexpr std::size_t nchunks = sizeof(typename PTile::ParticleType) / sizeof(tmp_t);
+        Gpu::DeviceVector<tmp_t> tmp(np);
+        auto* ptmp = tmp.data();
+        auto* paos = (tmp_t*)(ptile.getParticleTileData().m_aos);
+        for (std::size_t ichunk = 0; ichunk < nchunks; ++ichunk) {
+            // Do not need to reorder neighbor particles
+            AMREX_HOST_DEVICE_FOR_1D(np, i,
+            {
+                ptmp[i] = paos[permutations[i]*nchunks+ichunk];
+            });
+            AMREX_HOST_DEVICE_FOR_1D(np, i,
+            {
+                paos[i*nchunks+ichunk] = ptmp[i];
+            });
+        }
+        Gpu::streamSynchronize();
+    } else {
+        typename PTile::SoA::IdCPU tmp_idcpu;
+        if constexpr (PTile::has_polymorphic_allocator) {
+            tmp_idcpu.setArena(ptile.GetStructOfArrays().GetIdCPUData().arena());
+        }
+        tmp_idcpu.resize(np_total);
+        auto src = ptile.GetStructOfArrays().GetIdCPUData().data();
+        uint64_t* dst = tmp_idcpu.data();
+        AMREX_HOST_DEVICE_FOR_1D( np_total, i,
+        {
+            dst[i] = i < np ? src[permutations[i]] : src[i];
+        });
+
+        Gpu::streamSynchronize();
+
+        ptile.GetStructOfArrays().GetIdCPUData().swap(tmp_idcpu);
+    }
+
+    { // Create a scope for the temporary vector below
+        typename PTile::RealVector tmp_real;
+        if (ptile.NumRealComps() > 0) {
+            if constexpr (PTile::has_polymorphic_allocator) {
+                tmp_real.setArena(ptile.GetStructOfArrays().GetRealData(0).arena());
+            }
+            tmp_real.resize(np_total);
+        }
+        for (int comp = 0; comp < ptile.NumRealComps(); ++comp) {
+            auto src = ptile.GetStructOfArrays().GetRealData(comp).data();
+            ParticleReal* dst = tmp_real.data();
+            AMREX_HOST_DEVICE_FOR_1D( np_total, i,
+            {
+                dst[i] = i < np ? src[permutations[i]] : src[i];
+            });
+
+            Gpu::streamSynchronize();
+
+            ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real);
+        }
+    }
+
+    typename PTile::IntVector tmp_int;
+    if (ptile.NumIntComps() > 0) {
+        if constexpr (PTile::has_polymorphic_allocator) {
+            tmp_int.setArena(ptile.GetStructOfArrays().GetIntData(0).arena());
+        }
+        tmp_int.resize(np_total);
+    }
+
+    for (int comp = 0; comp < ptile.NumIntComps(); ++comp) {
+        auto src = ptile.GetStructOfArrays().GetIntData(comp).data();
+        int* dst = tmp_int.data();
+        AMREX_HOST_DEVICE_FOR_1D( np_total , i,
+        {
+            dst[i] = i < np ? src[permutations[i]] : src[i];
+        });
+
+        Gpu::streamSynchronize();
+
+        ptile.GetStructOfArrays().GetIntData(comp).swap(tmp_int);
+    }
+}
+
+template <class PTile, class index_type,
+          std::enable_if_t<PTile::ParticleType::is_rtsoa_particle, int> foo = 0>
+void
+ReorderParticles (PTile& ptile, const index_type* permutations)
+{
+    const size_t np = ptile.numParticles();
+    {
+        amrex::Gpu::AsyncVector<uint64_t> tmp_idcpu(np);
+
+        auto src = ptile.GetIdCPUData().data();
+        uint64_t* dst = tmp_idcpu.data();
+        amrex::ParallelFor(np,
+            [=] AMREX_GPU_DEVICE (size_t i) {
+                dst[i] = src[permutations[i]];
+            });
+        amrex::ParallelFor(np,
+            [=] AMREX_GPU_DEVICE (size_t i) {
+                src[i] = dst[i];
+            });
+    }
+    {
+        amrex::Gpu::AsyncVector<typename PTile::RealType> tmp_real(np);
+
+        for (int comp = 0; comp < ptile.NumRealComps(); ++comp) {
+            auto src = ptile.GetRealData(comp).data();
+            auto dst = tmp_real.data();
+            amrex::ParallelFor(np,
+                [=] AMREX_GPU_DEVICE (size_t i) {
+                    dst[i] = src[permutations[i]];
+                });
+            amrex::ParallelFor(np,
+                [=] AMREX_GPU_DEVICE (size_t i) {
+                    src[i] = dst[i];
+                });
+        }
+    }
+    {
+        amrex::Gpu::AsyncVector<typename PTile::IntType> tmp_int(np);
+
+        for (int comp = 0; comp < ptile.NumIntComps(); ++comp) {
+            auto src = ptile.GetIntData(comp).data();
+            auto dst = tmp_int.data();
+            amrex::ParallelFor(np,
+                [=] AMREX_GPU_DEVICE (size_t i) {
+                    dst[i] = src[permutations[i]];
+                });
+            amrex::ParallelFor(np,
+                [=] AMREX_GPU_DEVICE (size_t i) {
+                    src[i] = dst[i];
+                });
+        }
+    }
+    // stream sync for deallocation of permutations variable in user code
+    Gpu::streamSynchronize();
+}
+
 #ifdef AMREX_USE_HDF5_ASYNC
 void async_vol_es_wait_particle();
 void async_vol_es_wait_close_particle();
diff --git a/Src/Particle/AMReX_Particles.H b/Src/Particle/AMReX_Particles.H
index 281f350b32..d36e02273f 100644
--- a/Src/Particle/AMReX_Particles.H
+++ b/Src/Particle/AMReX_Particles.H
@@ -10,6 +10,7 @@
 #include <AMReX_ArrayOfStructs.H>
 #include <AMReX_Particle.H>
 #include <AMReX_ParticleTile.H>
+#include <AMReX_ParticleTileRT.H>
 #include <AMReX_ParticleUtil.H>
 #include <AMReX_ParticleReduce.H>
 #include <AMReX_ParticleBufferMap.H>
diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H
index 51a05a6a33..255d692ffa 100644
--- a/Src/Particle/AMReX_WriteBinaryParticleData.H
+++ b/Src/Particle/AMReX_WriteBinaryParticleData.H
@@ -237,47 +237,56 @@ void packParticleIDs (I* idata, const std::uint64_t idcpu, bool is_checkpoint) n
 
 template<class PTD>
 AMREX_GPU_HOST_DEVICE void
-rPackParticleData (const PTD& ptd, int idx, typename PTD::RealType * rdata_ptr,
+rPackParticleData (const PTD& ptd, int idx, std::remove_const_t<typename PTD::RealType> * rdata_ptr,
                    const int * write_real_comp)
 {
     std::size_t rout_index = 0;
 
-    for (int j = 0; j < AMREX_SPACEDIM; ++j) {
-        rdata_ptr[rout_index] = ptd.pos(j, idx);
-        rout_index++;
-    }
+    if constexpr (!PTD::ParticleType::is_rtsoa_particle) {
+        for (int j = 0; j < AMREX_SPACEDIM; ++j) {
+            rdata_ptr[rout_index] = ptd.pos(j, idx);
+            rout_index++;
+        }
 
-    if constexpr (!PTD::ParticleType::is_soa_particle) {
-        const auto& p = ptd[idx];
+        if constexpr (!PTD::ParticleType::is_soa_particle) {
+            const auto& p = ptd[idx];
 
-        for (int j = 0; j < PTD::ParticleType::NReal; ++j) {
-            if (write_real_comp[j]) {
-                rdata_ptr[rout_index] = p.rdata(j);
-                rout_index++;
+            for (int j = 0; j < PTD::ParticleType::NReal; ++j) {
+                if (write_real_comp[j]) {
+                    rdata_ptr[rout_index] = p.rdata(j);
+                    rout_index++;
+                }
             }
         }
-    }
 
-    constexpr int real_start_offset = PTD::ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
+        constexpr int real_start_offset = PTD::ParticleType::is_soa_particle ? AMREX_SPACEDIM : 0;
 
-    for (int j = real_start_offset; j < PTD::NAR; ++j) {
-        if (write_real_comp[PTD::ParticleType::NReal + j - real_start_offset]) {
-            rdata_ptr[rout_index] = ptd.rdata(j)[idx];
-            rout_index++;
+        for (int j = real_start_offset; j < PTD::NAR; ++j) {
+            if (write_real_comp[PTD::ParticleType::NReal + j - real_start_offset]) {
+                rdata_ptr[rout_index] = ptd.rdata(j)[idx];
+                rout_index++;
+            }
         }
-    }
 
-    for (int j = 0; j < ptd.m_num_runtime_real; ++j) {
-        if (write_real_comp[PTD::ParticleType::NReal + PTD::NAR + j - real_start_offset]) {
-            rdata_ptr[rout_index] = ptd.m_runtime_rdata[j][idx];
-            rout_index++;
+        for (int j = 0; j < ptd.m_num_runtime_real; ++j) {
+            if (write_real_comp[PTD::ParticleType::NReal + PTD::NAR + j - real_start_offset]) {
+                rdata_ptr[rout_index] = ptd.m_runtime_rdata[j][idx];
+                rout_index++;
+            }
+        }
+    } else {
+        for (int j = 0; j < ptd.m_n_real; ++j) {
+            if ((j < AMREX_SPACEDIM) || write_real_comp[j - AMREX_SPACEDIM]) {
+                rdata_ptr[rout_index] = ptd.rdata(j)[idx];
+                rout_index++;
+            }
         }
     }
 }
 
 template<class PTD>
 AMREX_GPU_HOST_DEVICE void
-iPackParticleData (const PTD& ptd, int idx, typename PTD::IntType * idata_ptr,
+iPackParticleData (const PTD& ptd, int idx, std::remove_const_t<typename PTD::IntType> * idata_ptr,
                    const int * write_int_comp, bool is_checkpoint)
 {
     std::size_t iout_index = 0;
@@ -285,28 +294,37 @@ iPackParticleData (const PTD& ptd, int idx, typename PTD::IntType * idata_ptr,
     packParticleIDs(&idata_ptr[iout_index], ptd.idcpu(idx), is_checkpoint);
     iout_index += 2;
 
-    if constexpr (!PTD::ParticleType::is_soa_particle) {
-        const auto& p = ptd[idx];
+    if constexpr (!PTD::ParticleType::is_rtsoa_particle) {
+        if constexpr (!PTD::ParticleType::is_soa_particle) {
+            const auto& p = ptd[idx];
 
-        for (int j = 0; j < PTD::ParticleType::NInt; ++j) {
-            if (write_int_comp[j]) {
-                idata_ptr[iout_index] = p.idata(j);
-                iout_index++;
+            for (int j = 0; j < PTD::ParticleType::NInt; ++j) {
+                if (write_int_comp[j]) {
+                    idata_ptr[iout_index] = p.idata(j);
+                    iout_index++;
+                }
             }
         }
-    }
 
-    for (int j = 0; j < PTD::NAI; ++j) {
-        if (write_int_comp[PTD::ParticleType::NInt + j]) {
-            idata_ptr[iout_index] = ptd.idata(j)[idx];
-            iout_index++;
+        for (int j = 0; j < PTD::NAI; ++j) {
+            if (write_int_comp[PTD::ParticleType::NInt + j]) {
+                idata_ptr[iout_index] = ptd.idata(j)[idx];
+                iout_index++;
+            }
         }
-    }
 
-    for (int j = 0; j < ptd.m_num_runtime_int; ++j) {
-        if (write_int_comp[PTD::ParticleType::NInt + PTD::NAI + j]) {
-            idata_ptr[iout_index] = ptd.m_runtime_idata[j][idx];
-            iout_index++;
+        for (int j = 0; j < ptd.m_num_runtime_int; ++j) {
+            if (write_int_comp[PTD::ParticleType::NInt + PTD::NAI + j]) {
+                idata_ptr[iout_index] = ptd.m_runtime_idata[j][idx];
+                iout_index++;
+            }
+        }
+    } else {
+        for (int j = 0; j < ptd.m_n_int; ++j) {
+            if (write_int_comp[j]) {
+                idata_ptr[iout_index] = ptd.rdata(j)[idx];
+                iout_index++;
+            }
         }
     }
 }
@@ -315,7 +333,7 @@ template <class PC>
 void
 packIODataGpu (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int lev, int grid,
                const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
-               const Vector<std::map<std::pair<int, int>, typename PC::IntVector>>& particle_io_flags,
+               const Vector<std::map<std::pair<int, int>, typename PC::FlagsVector>>& particle_io_flags,
                const Vector<int>& tiles, int np, bool is_checkpoint)
 {
     int num_output_int = 0;
@@ -389,7 +407,7 @@ template <class PC>
 void
 packIODataCpu (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int lev, int grid,
                const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
-               const Vector<std::map<std::pair<int, int>, typename PC::IntVector>>& particle_io_flags,
+               const Vector<std::map<std::pair<int, int>, typename PC::FlagsVector>>& particle_io_flags,
                const Vector<int>& tiles, int np, bool is_checkpoint)
 {
     int num_output_int = 0;
@@ -433,10 +451,10 @@ template <class PC>
 void
 packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int lev, int grid,
             const Vector<int>& write_real_comp, const Vector<int>& write_int_comp,
-            const Vector<std::map<std::pair<int, int>, typename PC::IntVector>>& particle_io_flags,
+            const Vector<std::map<std::pair<int, int>, typename PC::FlagsVector>>& particle_io_flags,
             const Vector<int>& tiles, int np, bool is_checkpoint)
 {
-    if constexpr (IsPolymorphicArenaAllocator<typename PC::IntVector::allocator_type>::value) {
+    if constexpr (IsPolymorphicArenaAllocator<typename PC::FlagsVector::allocator_type>::value) {
         if (pc.arena()->isManaged() || pc.arena()->isDevice()) {
             packIODataGpu(idata, rdata, pc, lev, grid, write_real_comp, write_int_comp,
                           particle_io_flags, tiles, np, is_checkpoint);
@@ -445,7 +463,7 @@ packIOData (Vector<int>& idata, Vector<ParticleReal>& rdata, const PC& pc, int l
                           particle_io_flags, tiles, np, is_checkpoint);
         }
     } else {
-        if constexpr (RunOnGpu<typename PC::IntVector::allocator_type>::value) {
+        if constexpr (RunOnGpu<typename PC::FlagsVector::allocator_type>::value) {
             packIODataGpu(idata, rdata, pc, lev, grid, write_real_comp, write_int_comp,
                           particle_io_flags, tiles, np, is_checkpoint);
         } else {
@@ -506,8 +524,9 @@ void WriteBinaryParticleDataSync (PC const& pc,
     Long maxnextid;
 
     // evaluate f for every particle to determine which ones to output
-    Vector<std::map<std::pair<int, int>, typename PC::IntVector > >
+    Vector<std::map<std::pair<int, int>, typename PC::FlagsVector > >
         particle_io_flags(pc.GetParticles().size());
+
     for (int lev = 0; lev < pc.GetParticles().size();  lev++)
     {
         const auto& pmap = pc.GetParticles(lev);
@@ -898,8 +917,11 @@ void WriteBinaryParticleDataAsync (PC const& pc,
     }
 
     // make tmp particle tiles in pinned memory to write
-    using PinnedPTile = ParticleTile<typename PC::ParticleType, NArrayReal, NArrayInt,
-                                     PolymorphicArenaAllocator>;
+    using PinnedPTile = std::conditional_t<
+        PC::is_rtsoa_pc,
+        typename PC::ParticleTileType,
+        ParticleTile<typename PC::ParticleType, NArrayReal, NArrayInt, PolymorphicArenaAllocator>
+    >;
     auto myptiles = std::make_shared<Vector<std::map<std::pair<int, int>,PinnedPTile> > >();
     myptiles->resize(pc.finestLevel()+1);
     for (int lev = 0; lev <= pc.finestLevel(); lev++)
diff --git a/Src/Particle/CMakeLists.txt b/Src/Particle/CMakeLists.txt
index 7f475b0dbc..1ffa29775c 100644
--- a/Src/Particle/CMakeLists.txt
+++ b/Src/Particle/CMakeLists.txt
@@ -30,6 +30,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        AMReX_StructOfArrays.H
        AMReX_ArrayOfStructs.H
        AMReX_ParticleTile.H
+       AMReX_ParticleTileRT.H
        AMReX_MakeParticle.H
        AMReX_NeighborParticlesCPUImpl.H
        AMReX_NeighborParticlesGPUImpl.H
diff --git a/Src/Particle/Make.package b/Src/Particle/Make.package
index 7959368ef8..95ad4670d7 100644
--- a/Src/Particle/Make.package
+++ b/Src/Particle/Make.package
@@ -13,6 +13,7 @@ CEXE_headers += AMReX_ParIter.H
 CEXE_headers += AMReX_StructOfArrays.H
 CEXE_headers += AMReX_ArrayOfStructs.H
 CEXE_headers += AMReX_ParticleTile.H
+CEXE_headers += AMReX_ParticleTileRT.H
 CEXE_headers += AMReX_MakeParticle.H
 
 CEXE_headers += AMReX_NeighborParticles.H