
Overlapped read performance


Generally, overlapping reads lets page caching increase effective bandwidth, while too much overlap causes page-lock contention. Here "overlapped" means that consecutive iterations request element windows that share nearly all of their elements, as sketched below.
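A minimal illustration (plain index arithmetic, not library code) of the two access patterns benchmarked below:

// overlapped: each iteration i reads the 1000-element window [i, i+1000),
// so neighboring iterations share 999 elements and can be served from cache:
//   i = 1000 -> [1000, 2000)
//   i = 1001 -> [1001, 2001)
//
// non-overlapped: iterations advance by the full window size, so windows are disjoint:
//   i = 0    -> [0, 1000)
//   i = 1000 -> [1000, 2000)

The following program tests the performance of all read methods: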

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <vector> // scratch buffers for the userPtr variants
#include <chrono> // timing
#include <omp.h>


class Obj // sizeof(Obj) = 104 bytes: 4-byte payload + 100-byte filler
{
public:
	Obj() { b = -1; }
	Obj(int i) { b = i; }
	int b;
	char buf[100];
};

int main(int argC, char** argV)
{

	std::cout << "preparing virtual array..." << std::endl;
	size_t n = 1000000; // number of elements in the virtual array
	size_t p = 1000; // page size (elements per page)
	GraphicsCardSupplyDepot gpu;
	VirtualMultiArray<Obj> arr(n, gpu.requestGpus(), p);

	std::cout << "initializing data..." << std::endl;
#pragma omp parallel for
	for (int i = 0; i < n; i++)
	{
		arr.set(i, Obj(i));
	}
	std::cout << "non-overlapped n = " << sizeof(Obj) * n << " bytes" << std::endl;
	std::cout << "overlapped n x 1000 = " << sizeof(Obj) * n * 1000 << " bytes" << std::endl;
	{
		std::cout << "<overlapped> get() x n x 1000:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			for (int j = 0; j < 1000; j++)
			{
				if (arr.get(i + j).b != i + j)
				{
					std::cout << "err" << std::endl;
				}
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}

	{
		std::cout << "<overlapped> readOnlyGetN() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			auto data = arr.readOnlyGetN(i, 1000); // bulk-read of 1000 elements starting at index i
			for (int j = 0; j < 1000; j++)
			{
				if (data[j].b != i + j)
				{
					std::cout << "err" << std::endl;
				}
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}

	{
		std::cout << "<overlapped> mappedReadWriteAccess() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false); // pinBuffer=false, read=true, write=false: read-only mapping (assumed parameter order)
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}


	{
		std::cout << "<overlapped> mappedReadWriteAccess(userPtr) x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(1000 * omp_get_max_threads());

#pragma omp parallel for
		for (int i = 1000; i < n - 1000; i++)
		{
			int threadId = omp_get_thread_num();
			arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false, tmp.data() + (threadId * 1000));
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}


	{
		std::cout << "<overlapped, variable length> mappedReadWriteAccess() x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
#pragma omp parallel for
		for (int i = 1000; i < n - 2000; i++)
		{
			arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000 + i % 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false);
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}

	{
		std::cout << "<overlapped, variable length> mappedReadWriteAccess(userPtr) x n:    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(2000 * omp_get_max_threads());
#pragma omp parallel for
		for (int i = 1000; i < n - 2000; i++)
		{
			int threadId = omp_get_thread_num();
			arr.mappedReadWriteAccess(i, 1000 + i % 1000, [&](Obj* ptr)
				{
					for (int j = i; j < i + 1000 + i % 1000; j++)
					{
						if (ptr[j].b != j)
						{
							std::cout << "err" << std::endl;
						}
					}
				}, false, true, false, tmp.data() + (threadId * 2000));
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << t2.count() - t1.count() << "ms" << std::endl;
	}

	{
		std::cout << "<non-overlapped> mappedReadWriteAccess(userPtr):    " << std::flush;
		std::chrono::milliseconds t1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::vector<Obj> tmp(1000 * omp_get_max_threads());
		for (int k = 0; k < 100; k++)
		{
#pragma omp parallel for
			for (int i = 0; i < n; i += 1000)
			{
				int threadId = omp_get_thread_num();
				arr.mappedReadWriteAccess(i, 1000, [&](Obj* ptr)
					{
						for (int j = i; j < i + 1000; j++)
						{
							if (ptr[j].b != j)
							{
								std::cout << "err" << std::endl;
							}
						}
					}, false, true, false, tmp.data() + (threadId * 1000));
			}
		}
		std::chrono::milliseconds t2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
		std::cout << (t2.count() - t1.count()) / 100.0 << "ms" << std::endl;
	}

	return 0;
}
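The benchmark needs OpenMP enabled at compile time. A typical invocation would look like the following (assumed flags; the exact OpenCL link option depends on the platform):

g++ -O3 -fopenmp main.cpp -lOpenCL -o overlap_benchmark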

Output on the development computer:

preparing virtual array...
initializing data...
non-overlapped n = 104000000 bytes
overlapped n x 1000 = 104000000000 bytes
<overlapped> get() x n x 1000:    67713ms
<overlapped> readOnlyGetN() x n:    8911ms
<overlapped> mappedReadWriteAccess() x n:    3413ms
<overlapped> mappedReadWriteAccess(userPtr) x n:    3249ms
<overlapped, variable length> mappedReadWriteAccess() x n:    11696ms
<overlapped, variable length> mappedReadWriteAccess(userPtr) x n:    11355ms
<non-overlapped> mappedReadWriteAccess(userPtr):    27.98ms

Mapping achieved the best performance because it exposes an aligned raw pointer. On systems with memory fragmentation, the userPtr variant of mapping should work even better, since it re-uses a user-supplied buffer instead of allocating a new one per call.

The overlapped mappedReadWriteAccess(userPtr) x n result of 3249 ms means (n =) 1M scans of 1000 elements each, i.e. 1,000,000 x 104 kB ≈ 104 GB moved in 3.249 s, which is about 32 GB/s. So the LRU cache works.

The non-overlapped part does not test the LRU cache but the PCIe bandwidth: 27.98 ms per scan of the whole array, at 1M elements = 104 MB of data, gives about 3.7 GB/s of PCIe throughput.
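As a sanity check, both bandwidth figures follow directly from sizeof(Obj) = 104 bytes and the measured times. A minimal standalone snippet (hypothetical, not part of the library) reproduces the arithmetic:

#include <cstdio>

int main()
{
	// overlapped case: n = 1M windows x 1000 elements x 104 B, read in 3.249 s
	double overlappedBytes = 1000000.0 * 1000.0 * 104.0; // 104 GB in total
	std::printf("overlapped: %.1f GB/s\n", overlappedBytes / 3.249 / 1e9); // ~32.0 GB/s

	// non-overlapped case: one full scan of 1M elements (104 MB) takes 27.98 ms
	double scanBytes = 1000000.0 * 104.0; // 104 MB per scan
	std::printf("non-overlapped: %.2f GB/s\n", scanBytes / 0.02798 / 1e9); // ~3.72 GB/s
	return 0;
}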
