Example: tiled nbody using bulk copies
Vectorization is done with the MIPP library: https://github.com/aff3ct/MIPP
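Before the full listing, here is a minimal standalone sketch of the MIPP primitives it relies on (mipp::N, mipp::load, mipp::fmadd, mipp::hadd). Array sizes and values are arbitrary, chosen only to illustrate the calls:

#define MIPP_ALIGNED_LOADS
#include "mipp/mipp.h"
#include <iostream>

int main()
{
    const int simd = mipp::N<float>(); // float lanes per SIMD register

    alignas(64) float a[64], b[64], c[64]; // 64 covers any current lane count
    for (int i = 0; i < 64; i++) { a[i] = 1.0f; b[i] = 2.0f; c[i] = 3.0f; }

    mipp::Reg<float> ra = mipp::load(a); // aligned vector load
    mipp::Reg<float> rb = mipp::load(b);
    mipp::Reg<float> rc = mipp::load(c);

    // per-lane fused multiply-add: ra * rb + rc
    mipp::Reg<float> r = mipp::fmadd(ra, rb, rc);

    // horizontal add: reduce all lanes to one scalar (prints 5 * simd)
    std::cout << mipp::hadd(r) << std::endl;
    return 0;
}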
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
// a simd tool from github
#define MIPP_ALIGNED_LOADS
#include "mipp/mipp.h"
int main()
{
const int simd = mipp::N<float>();
std::cout<<"simd width: "<<simd<<std::endl;
const int n = simd * 5000;
std::cout<<"particles: "<< n <<std::endl;
GraphicsCardSupplyDepot depot;
VirtualMultiArray<float> xVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> yVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> zVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vxVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vyVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vzVir(n, depot.requestGpus(), 1000, 1);
mipp::Reg<float> smoothing = 0.0001f;
// mapped array access
{
CpuBenchmarker bench(((size_t)n) * n * sizeof(float) * 3, "mapped access");
const int tileSize = 1000;
const int regTile = 100;
#pragma omp parallel for
for (int i = 0; i < n; i += regTile)
{
std::vector<float> x0 = xVir.readOnlyGetN(i,regTile);
std::vector<float> y0 = yVir.readOnlyGetN(i,regTile);
std::vector<float> z0 = zVir.readOnlyGetN(i,regTile);
mipp::Reg<float> fma1[regTile];
mipp::Reg<float> fma2[regTile];
mipp::Reg<float> fma3[regTile];
for(int j=0;j<regTile;j++)
{
fma1[j]=0.0f;
fma2[j]=0.0f;
fma3[j]=0.0f;
}
for (int ii = 0; ii < n; ii += tileSize)
{
xVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrX1) {
const float* __restrict__ const ptrX = ptrX1 + ii;
yVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrY1) {
const float* __restrict__ const ptrY = ptrY1 + ii;
zVir.mappedReadWriteAccess(ii, tileSize, [&, ii](float* ptrZ1) {
const float* __restrict__ const ptrZ = ptrZ1 + ii;
for(int reg0 = 0; reg0 < regTile; reg0+=4)
for (int ld = 0; ld < tileSize; ld += simd)
{
mipp::Reg<float> x = mipp::load(ptrX + ld);
mipp::Reg<float> y = mipp::load(ptrY + ld);
mipp::Reg<float> z = mipp::load(ptrZ + ld);
for(int reg1 = 0; reg1<4; reg1++)
{
const int reg = reg0 + reg1;
const mipp::Reg<float> x0r = x0[reg];
const mipp::Reg<float> y0r = y0[reg];
const mipp::Reg<float> z0r = z0[reg];
const mipp::Reg<float> dx = mipp::sub(x,x0r);
const mipp::Reg<float> dy = mipp::sub(y,y0r);
const mipp::Reg<float> dz = mipp::sub(z,z0r);
const mipp::Reg<float> dx2 = mipp::mul(dx,dx);
const mipp::Reg<float> dy2 = mipp::mul(dy,dy);
const mipp::Reg<float> dz2 = mipp::mul(dz,dz);
const mipp::Reg<float> dxy2 = mipp::add(dx2,dy2);
const mipp::Reg<float> dz2s = mipp::add(dz2,smoothing);
const mipp::Reg<float> smoothed = mipp::add(dxy2,dz2s);
const mipp::Reg<float> r = mipp::rsqrt(smoothed);
const mipp::Reg<float> r3 = mipp::mul(mipp::mul(r, r), r);
fma1[reg] = mipp::fmadd(dx, r3, fma1[reg]);
fma2[reg] = mipp::fmadd(dy, r3, fma2[reg]);
fma3[reg] = mipp::fmadd(dz, r3, fma3[reg]);
}
}
}, false, true, false);
}, false, true, false);
}, false, true, false);
}
for(int j=0;j<regTile;j++)
{
vxVir.set(i+j, vxVir.get(i+j) + mipp::hadd(fma1[j]));
vyVir.set(i+j, vyVir.get(i+j) + mipp::hadd(fma2[j]));
vzVir.set(i+j, vzVir.get(i+j) + mipp::hadd(fma3[j]));
}
}
}
return 0;
}
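For clarity, the SIMD inner loop accumulates, for each source particle, the smoothing-softened inverse-square interaction with every other particle. A plain scalar version of the same math follows; the function and variable names here are illustrative, not part of the library:

#include <cmath>

// Scalar equivalent of the SIMD inner loop: for one source particle
// (x0, y0, z0), accumulate sum over j of d * (|d|^2 + smoothing)^(-3/2),
// where d = p_j - p_0, over all n particles in x[], y[], z[].
static void accumulate(const float* x, const float* y, const float* z, int n,
                       float x0, float y0, float z0,
                       float& ax, float& ay, float& az)
{
    const float smoothing = 0.0001f;
    for (int j = 0; j < n; j++)
    {
        const float dx = x[j] - x0;
        const float dy = y[j] - y0;
        const float dz = z[j] - z0;
        const float r2 = dx * dx + dy * dy + dz * dz + smoothing;
        const float r = 1.0f / std::sqrt(r2); // mipp::rsqrt in the listing
        const float r3 = r * r * r;
        ax += dx * r3; // mipp::fmadd in the listing
        ay += dy * r3;
        az += dz * r3;
    }
}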
The arrays are never initialized, so they mostly contain NaN values, and floating-point operations on NaNs are slow on these CPUs. The results below were measured without initialization:
Ubuntu 18.04 LTS, g++-10, FX8150 (2.1GHz) output:
simd width: 8
particles: 40000
mapped access: 3813243273 nanoseconds (bandwidth = 5035.08 MB/s)
Windows 10, Visual Studio 2019 (2.1GHz):
simd width: 8
particles: 40000
mapped access: 9047568600 nanoseconds (bandwidth = 2122.12 MB/s)
Windows 10, Visual Studio 2019 (3.6GHz):
simd width: 8
particles: 40000
mapped access: 4366219200 nanoseconds (bandwidth = 4397.40 MB/s)
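To avoid the NaN penalty, an initialization pass could be run before the benchmark block, using the set() method already shown in the listing. A minimal sketch; the values are arbitrary, only meant to give every particle a finite position:

// hypothetical initialization pass: assigns finite positions and zero
// velocities so the benchmark measures normal-path FP performance
for (int i = 0; i < n; i++)
{
    xVir.set(i, (float)(i % 100));
    yVir.set(i, (float)((i / 100) % 100));
    zVir.set(i, (float)(i / 10000));
    vxVir.set(i, 0.0f);
    vyVir.set(i, 0.0f);
    vzVir.set(i, 0.0f);
}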