
Example: tiled nbody using bulk copies


In this sample code, the bandwidth of an (extremely) unoptimized nbody force calculation is compared against a tiled version that uses the readOnlyGetN() and writeOnlySetN() bulk-copy methods, and against a third version that uses mappedReadWriteAccess(). Note that the same virtual array is used outside its original instance's scope, since the internal data is shared between all copies of an instance.
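As a minimal sketch of the access styles compared below (arrParticle stands for a VirtualMultiArray<Particle> constructed as in main() further down; the snippet only restates calls that appear in the full listing):

// scalar access: one element per call
Particle p = arrParticle.get(5);
arrParticle.set(5, p);

// bulk access: many elements per call, amortizing the per-access overhead
auto tile = arrParticle.readOnlyGetN(100, 50);    // elements 100..149, indexable with []
std::vector<Particle> updated(50);
for(int k=0;k<50;k++){ updated[k] = tile[k]; }
arrParticle.writeOnlySetN(100, updated);          // writes elements 100..149

// copies are shallow and share the same underlying pages,
// which is why main() can hand the arrays to the Nbody structs and let the originals go out of scope
VirtualMultiArray<Particle> alias = arrParticle;

The full benchmark below compares scalar get/set, tiled bulk copies, and mapped access.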

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <vector>   // std::vector for the bulk transfers
#include <chrono>   // benchmark timing
#include <cmath>    // sqrtf
#include "omp.h"

// this example is written for simplicity, not peak performance
// it only benchmarks bandwidth under multi-threaded access with different access methods
// that is why AoS is used instead of SoA; SoA with aligned buffers would work much better for AVX/SSE and tiling

struct Force
{
	float x,y,z;
	Force():x(0),y(0),z(0){}
};
struct Particle
{
	float x,y,z;
	float vx,vy,vz;
	float m;
	Particle():x(0),y(0),z(0),vx(0),vy(0),vz(0),m(1){}
	void computeForce(const Particle & p, Force & f)
	{
		float dx = x - p.x;
		float dy = y - p.y;
		float dz = z - p.z;
		float mm = m * p.m;
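		// softened inverse-square force: the +0.001f term avoids a singularity when two particles coincide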
		float r = 1.0f/sqrtf(dx*dx+dy*dy+dz*dz+0.001f);
		float rm = r*r*r*mm;
		f.x+= dx * rm;
		f.y+= dy * rm;
		f.z+= dz * rm;
	}
};



struct Nbody
{
public:
	Nbody():n(0){}
	Nbody(const int nP):n(nP){}
	Nbody(const int nP, VirtualMultiArray<Particle> particles, VirtualMultiArray<Force> forces):n(nP),vmaParticle(particles),vmaForce(forces){}
	void compute()
	{
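		// scalar path: every element goes through get()/set() one at a time,
		// so each of the n*n inner iterations pays the full virtual-array access cost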
		#pragma omp parallel for schedule(guided)
		for(int i=0;i<n;i++)
		{
			auto pi = vmaParticle.get(i);
			Force force;
			for(int j=0;j<n;j++)
			{
				auto pj = vmaParticle.get(j);
				pi.computeForce(pj,force);
			}
			vmaForce.set(i,force);
		}
	}


	int n;
	VirtualMultiArray<Particle> vmaParticle;
	VirtualMultiArray<Force> vmaForce;
};

// an optimized implementation: tiling with bulk readOnlyGetN/writeOnlySetN copies
struct NbodyTiling
{
public:
	NbodyTiling():n(0){}
	NbodyTiling(const int nP):n(nP){}
	NbodyTiling(const int nP, VirtualMultiArray<Particle> particles, VirtualMultiArray<Force> forces):n(nP),vmaParticle(particles),vmaForce(forces){}
	void compute()
	{
		#pragma omp parallel for schedule(guided)
		for(int i=0;i<n;i+=100)
		{
			// bulk read
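			// one readOnlyGetN() call moves a tile of 100 particles at once, amortizing the per-access overhead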
			auto pi = vmaParticle.readOnlyGetN(i,100);
			std::vector<Force> forceVec(100);
			Force * const __restrict__ force = forceVec.data();
			for(int j=0;j<n;j+=100)
			{
				// the 100x100 tile also fits in the cache

				// bulk read

				auto pj = vmaParticle.readOnlyGetN(j,100);

				#pragma omp declare reduction( + : Force : omp_out.x += omp_in.x, omp_out.y += omp_in.y, omp_out.z += omp_in.z )
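				// the user-defined reduction above is what allows the simd array-section reduction on the Force type below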

				#pragma omp simd reduction(+:force[:100])
				for(int ii=0;ii<100;ii++)
				{
					for(int jj=0;jj<100;jj++)
					{
						pi[ii].computeForce(pj[jj],force[ii]);
					}
				}
			}
			vmaForce.writeOnlySetN(i,forceVec);
		}
	}

	int n;
	VirtualMultiArray<Particle> vmaParticle;
	VirtualMultiArray<Force> vmaForce;
};


// an optimized implementation: tiling with mapped access for the inner tile
struct NbodyTilingMapped
{
public:
	NbodyTilingMapped():n(0){}
	NbodyTilingMapped(const int nP):n(nP){}
	NbodyTilingMapped(const int nP, VirtualMultiArray<Particle> particles, VirtualMultiArray<Force> forces):n(nP),vmaParticle(particles),vmaForce(forces){}
	void compute()
	{
		#pragma omp parallel for schedule(guided)
		for(int i=0;i<n;i+=100)
		{
			// bulk read
			auto pi = vmaParticle.readOnlyGetN(i,100);
			std::vector<Force> forceVec(100);
			Force * const __restrict__ force = forceVec.data();
			for(int j=0;j<n;j+=100)
			{
				// the 100x100 tile also fits in the cache

				// mapped read

				vmaParticle.mappedReadWriteAccess(j,100,[&](Particle * tmpBuffer){
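					// tmpBuffer is offset so that tmpBuffer[j] corresponds to element j of the virtual array;
					// pj[0..99] therefore covers elements j..j+99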

					Particle * __restrict__ pj = tmpBuffer + j;
					#pragma omp declare reduction( + : Force : omp_out.x += omp_in.x, omp_out.y += omp_in.y, omp_out.z += omp_in.z )

					#pragma omp simd reduction(+:force[:100])
					for(int ii=0;ii<100;ii++)
					{
						for(int jj=0;jj<100;jj++)
						{
							pi[ii].computeForce(pj[jj],force[ii]);
						}
					}


				},false,true,false);


			}
			vmaForce.writeOnlySetN(i,forceVec);
		}
	}

	int n;
	VirtualMultiArray<Particle> vmaParticle;
	VirtualMultiArray<Force> vmaForce;
};

int main(int argC, char ** argV)
{
	const size_t n = 5000;
	std::cout<<"Preparing virtual array..."<<std::endl;
	Nbody nbody(n);
	NbodyTiling optimizedNbody(n);
	NbodyTilingMapped optimizedMappedNbody(n);
	{
		const size_t pageSize=25;
		const std::vector<int> gpuMultipliers = PcieBandwidthBenchmarker().bestBandwidth(2);
		const int maxActivePagesPerGpu = 5;
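		// pageSize: elements per virtual-array page; maxActivePagesPerGpu: pages kept active per GPU;
		// gpuMultipliers: relative weights for distributing pages, taken from the PCIe bandwidth benchmark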

		GraphicsCardSupplyDepot depot;
		VirtualMultiArray<Particle> arrParticle(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,gpuMultipliers);
		VirtualMultiArray<Force> arrForce(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,gpuMultipliers);

		nbody.vmaParticle=arrParticle;
		nbody.vmaForce=arrForce;

		optimizedNbody.vmaParticle=arrParticle;
		optimizedNbody.vmaForce=arrForce;

		optimizedMappedNbody.vmaParticle = arrParticle;
		optimizedMappedNbody.vmaForce = arrForce;
	}
	// now all three nbody structs share the same data space
	// and the original arrParticle/arrForce can be safely destructed; the depot can be destructed too


	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"benchmarking bandwidth (scalar get-set)"<<std::endl;
	std::chrono::milliseconds t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
	nbody.compute();
	std::chrono::milliseconds t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
	double timing1 = t2.count()-t1.count();
	std::cout<<(timing1)<<"ms"<<std::endl;
	std::cout<<(sizeof(Particle)*(size_t)n)*n/(timing1/1000.0)/1000000.0<<" MB/s bandwidth"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;

	std::cout<<std::endl<<std::endl;

	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"benchmarking bandwidth (tiling with bulk get-set)"<<std::endl;
	t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
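	// the tiled version is fast enough to be run 20 times and averaged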
	for(int i=0;i<20;i++)
		optimizedNbody.compute();
	t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
	double timing2 = (t2.count()-t1.count())/20.0;
	std::cout<<(timing2)<<"ms"<<std::endl;
	std::cout<<(sizeof(Particle)*(size_t)n)*n/((timing2)/1000.0)/1000000.0<<" MB/s bandwidth"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;

	std::cout<<"Tiling vs unoptimized: "<<(timing1/timing2)<<"x reading bandwidth speedup"<<std::endl;

	std::cout<<std::endl<<std::endl;

	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"benchmarking bandwidth (tiling with mapped access)"<<std::endl;
	t1 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
	for(int i=0;i<20;i++)
		optimizedMappedNbody.compute();
	t2 =  std::chrono::duration_cast< std::chrono::milliseconds >(std::chrono::system_clock::now().time_since_epoch());
	double timing3 = (t2.count()-t1.count())/20.0;
	std::cout<<(timing3)<<"ms"<<std::endl;
	std::cout<<(sizeof(Particle)*(size_t)n)*n/((timing3)/1000.0)/1000000.0<<" MB/s bandwidth"<<std::endl;
	std::cout<<"#################################################"<<std::endl;
	std::cout<<"#################################################"<<std::endl;

	std::cout<<"Mapped access vs bulk read/write: "<<(timing2/timing3)<<"x reading bandwidth speedup"<<std::endl;

	return 0;
}
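
The tiled paths can be sanity-checked against the scalar reference. A minimal sketch, assuming the variables from main() above and relying on the fact that all three structs share the same underlying force array (refForce and maxDiff are hypothetical locals); it would have to run between the scalar benchmark and the 20-iteration tiled loops, since those keep overwriting the shared forces:

	// capture the scalar result before a tiled run overwrites the shared force array
	std::vector<Force> refForce(n);
	for(size_t i=0;i<n;i+=100)
	{
		auto blk = nbody.vmaForce.readOnlyGetN(i,100);
		for(int k=0;k<100;k++) refForce[i+k]=blk[k];
	}

	optimizedNbody.compute();   // writes into the same shared storage

	double maxDiff = 0.0;
	for(size_t i=0;i<n;i++)
	{
		Force f = optimizedNbody.vmaForce.get(i);
		double d = std::fabs(f.x - refForce[i].x);
		if(d > maxDiff) maxDiff = d;
	}
	std::cout<<"max |fx| difference between scalar and tiled: "<<maxDiff<<std::endl;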

Output on the development computer:

Preparing virtual array...
#################################################
#################################################
benchmarking bandwidth (scalar get-set)
11243ms
62.261 MB/s bandwidth
#################################################
#################################################


#################################################
#################################################
benchmarking bandwidth (tiling with bulk get-set)
154.8ms
4521.96 MB/s bandwidth
#################################################
#################################################
Tiling vs unoptimized: 72.6292x reading bandwidth speedup


#################################################
#################################################
benchmarking bandwidth (tiling with mapped access)
149.35ms
4686.98 MB/s bandwidth
#################################################
#################################################
Mapped access vs bulk read/write: 1.03649x reading bandwidth speedup
