Example: tiled nbody using bulk copies
Vectorization is done with the MIPP library: https://github.com/aff3ct/MIPP
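Before the full listing, here is a minimal standalone sketch of the MIPP primitives it relies on (mipp::N, mipp::load, mipp::fmadd, mipp::hadd). Array sizes and values are arbitrary, chosen only to illustrate the calls:

#define MIPP_ALIGNED_LOADS
#include "mipp/mipp.h"
#include <iostream>

int main()
{
    const int simd = mipp::N<float>(); // float lanes per SIMD register

    alignas(64) float a[64], b[64], c[64]; // 64 covers any current lane count
    for (int i = 0; i < 64; i++) { a[i] = 1.0f; b[i] = 2.0f; c[i] = 3.0f; }

    mipp::Reg<float> ra = mipp::load(a); // aligned vector load
    mipp::Reg<float> rb = mipp::load(b);
    mipp::Reg<float> rc = mipp::load(c);

    // per-lane fused multiply-add: ra * rb + rc
    mipp::Reg<float> r = mipp::fmadd(ra, rb, rc);

    // horizontal add: reduce all lanes to one scalar (prints 5 * simd)
    std::cout << mipp::hadd(r) << std::endl;
    return 0;
}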
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
// a simd tool from github
#define MIPP_ALIGNED_LOADS
#include "mipp/mipp.h"
int main()
{
const int simd = mipp::N<float>();
std::cout<<"simd width: "<<simd<<std::endl;
const int n = simd * 5000;
std::cout<<"particles: "<< n <<std::endl;
GraphicsCardSupplyDepot depot;
VirtualMultiArray<float> xVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> yVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> zVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vxVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vyVir(n, depot.requestGpus(), 1000, 1);
VirtualMultiArray<float> vzVir(n, depot.requestGpus(), 1000, 1);
mipp::Reg<float> smoothing = 0.0001f;
// mapped array access
{
CpuBenchmarker bench(((size_t)n) * n * sizeof(float) * 3, "mapped access");
const int tileSize = 1000;
const int regTile = 100;
#pragma omp parallel for
for (int i = 0; i < n; i += regTile)
{
std::vector<float> x0 = xVir.readOnlyGetN(i,regTile);
std::vector<float> y0 = yVir.readOnlyGetN(i,regTile);
std::vector<float> z0 = zVir.readOnlyGetN(i,regTile);
mipp::Reg<float> fma1[regTile];
mipp::Reg<float> fma2[regTile];
mipp::Reg<float> fma3[regTile];
for(int j=0;j<regTile;j++)
{
fma1[j]=0.0f;
fma2[j]=0.0f;
fma3[j]=0.0f;
}
for (int ii = 0; ii < n; ii += tileSize)
{
xVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrX1) {
const float* __restrict__ const ptrX = ptrX1 + ii;
yVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrY1) {
const float* __restrict__ const ptrY = ptrY1 + ii;
zVir.mappedReadWriteAccess(ii, tileSize, [&, ii](float* ptrZ1) {
const float* __restrict__ const ptrZ = ptrZ1 + ii;
for(int reg0 = 0; reg0 < regTile; reg0+=4)
for (int ld = 0; ld < tileSize; ld += simd)
{
mipp::Reg<float> x = mipp::load(ptrX + ld);
mipp::Reg<float> y = mipp::load(ptrY + ld);
mipp::Reg<float> z = mipp::load(ptrZ + ld);
for(int reg1 = 0; reg1<4; reg1++)
{
const int reg = reg0 + reg1;
const mipp::Reg<float> x0r = x0[reg];
const mipp::Reg<float> y0r = y0[reg];
const mipp::Reg<float> z0r = z0[reg];
const mipp::Reg<float> dx = mipp::sub(x,x0r);
const mipp::Reg<float> dy = mipp::sub(y,y0r);
const mipp::Reg<float> dz = mipp::sub(z,z0r);
const mipp::Reg<float> dx2 = mipp::mul(dx,dx);
const mipp::Reg<float> dy2 = mipp::mul(dy,dy);
const mipp::Reg<float> dz2 = mipp::mul(dz,dz);
const mipp::Reg<float> dxy2 = mipp::add(dx2,dy2);
const mipp::Reg<float> dz2s = mipp::add(dz2,smoothing);
const mipp::Reg<float> smoothed = mipp::add(dxy2,dz2s);
const mipp::Reg<float> r = mipp::rsqrt(smoothed);
const mipp::Reg<float> r3 = mipp::mul(mipp::mul(r, r), r);
fma1[reg] = mipp::fmadd(dx, r3, fma1[reg]);
fma2[reg] = mipp::fmadd(dy, r3, fma2[reg]);
fma3[reg] = mipp::fmadd(dz, r3, fma3[reg]);
}
}
}, false, true, false);
}, false, true, false);
}, false, true, false);
}
for(int j=0;j<regTile;j++)
{
vxVir.set(i+j, vxVir.get(i+j) + mipp::hadd(fma1[j]));
vyVir.set(i+j, vyVir.get(i+j) + mipp::hadd(fma2[j]));
vzVir.set(i+j, vzVir.get(i+j) + mipp::hadd(fma3[j]));
}
}
}
return 0;
}
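For clarity, the SIMD inner loop accumulates, for each source particle, the smoothing-softened inverse-square interaction with every other particle. A plain scalar version of the same math follows; the function and variable names here are illustrative, not part of the library:

#include <cmath>

// Scalar equivalent of the SIMD inner loop: for one source particle
// (x0, y0, z0), accumulate sum over j of d * (|d|^2 + smoothing)^(-3/2),
// where d = p_j - p_0, over all n particles in x[], y[], z[].
static void accumulate(const float* x, const float* y, const float* z, int n,
                       float x0, float y0, float z0,
                       float& ax, float& ay, float& az)
{
    const float smoothing = 0.0001f;
    for (int j = 0; j < n; j++)
    {
        const float dx = x[j] - x0;
        const float dy = y[j] - y0;
        const float dz = z[j] - z0;
        const float r2 = dx * dx + dy * dy + dz * dz + smoothing;
        const float r = 1.0f / std::sqrt(r2); // mipp::rsqrt in the listing
        const float r3 = r * r * r;
        ax += dx * r3; // mipp::fmadd in the listing
        ay += dy * r3;
        az += dz * r3;
    }
}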
The arrays are never initialized, so they mostly contain NaN values, and floating-point operations on NaNs are slow on these CPUs. The results below were measured without initialization:
Ubuntu 18.04 LTS, g++-10, FX8150 (2.1GHz) output:
simd width: 8
particles: 40000
mapped access: 3813243273 nanoseconds (bandwidth = 5035.08 MB/s)
Windows 10, Visual Studio 2019 (2.1GHz):
simd width: 8
particles: 40000
mapped access: 9047568600 nanoseconds (bandwidth = 2122.12 MB/s)
Windows 10, Visual Studio 2019 (3.6GHz):
simd width: 8
particles: 40000
mapped access: 4366219200 nanoseconds (bandwidth = 4397.40 MB/s)
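To avoid the NaN penalty, an initialization pass could be run before the benchmark block, using the set() method already shown in the listing. A minimal sketch; the values are arbitrary, only meant to give every particle a finite position:

// hypothetical initialization pass: assigns finite positions and zero
// velocities so the benchmark measures normal-path FP performance
for (int i = 0; i < n; i++)
{
    xVir.set(i, (float)(i % 100));
    yVir.set(i, (float)((i / 100) % 100));
    zVir.set(i, (float)(i / 10000));
    vxVir.set(i, 0.0f);
    vyVir.set(i, 0.0f);
    vzVir.set(i, 0.0f);
}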