-
Notifications
You must be signed in to change notification settings - Fork 1
Player Movement Example With Custom Tile Index Calculation
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Sep 5, 2025
·
3 revisions
Code:
#include <stdio.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <thread>
#include <vector>
// Cached encoder tool for 2d terrain.
#include "CompressedTerrainCache.cuh"
// Busy-work kernel: every thread spins through a fixed-length arithmetic loop.
// Judging by its name it is meant to keep the GPU busy (clocks boosted) between
// decode calls; it takes no parameters and touches no memory.
//
// The accumulator is unsigned on purpose: `r += r * r` overflows almost
// immediately, and signed overflow is undefined behaviour, whereas unsigned
// arithmetic wraps modulo 2^32.
__global__ void k_keepGpuBoosted() {
    unsigned int r = static_cast<unsigned int>(clock());
    for (int i = 0; i < 1000000; i++) {
        r += r * r;
        // Mix in the clock counter every 100 iterations so the value is not a
        // pure function of the initial read.
        if (i % 100 == 0) {
            r += static_cast<unsigned int>(clock());
        }
    }
    // Consume the result in a (practically never taken) branch so the loop has
    // an observable effect.
    if (r == 0) {
        printf("!");
    }
}
// Example driver: builds a random 2D terrain, hands it to
// CompressedTerrainCache::TileManager, then moves a player along a circular
// orbit around the terrain center. For every movement step it collects the
// indices of all tiles whose center lies within the player's visibility
// radius, asks the tile manager to decode them, launches a busy-work kernel on
// the manager's stream and prints the decode statistics reported by the
// manager. Returns 0 on completion.
int main()
{
    // Player can see this far (in units).
    uint64_t playerVisibilityRadius = 2025;
    // Low velocity is more cache-friendly, high velocity causes more decoding and PCIE utilization.
    float playerOrbitAngularVelocity = 0.009f;
    // 2D terrain map size (in units). With 4-byte elements this is about 0.48 GB of host memory
    // (11001 * 11003 * 4 bytes) for the std::vector below.
    uint64_t terrainWidth = 11001;
    uint64_t terrainHeight = 11003;
    // 2D tile size (in units).
    uint64_t tileWidth = 64;
    uint64_t tileHeight = 64;
    // Tile cache size, in tiles (so that 64x64 cache can store 4096 tiles at once). Consumes device memory.
    uint64_t tileCacheSlotColumns = 64;
    uint64_t tileCacheSlotRows = 64;
    // Custom tile index calculation: tiles are ordered row-major,
    // index = tileX + tileY * numTilesX. Tile counts are rounded up so that
    // partial tiles at the right/bottom edges are included.
    uint64_t numTerrainElements = terrainWidth * terrainHeight;
    uint64_t numTilesX = (terrainWidth + tileWidth - 1) / tileWidth;
    uint64_t numTilesY = (terrainHeight + tileHeight - 1) / tileHeight;

    using TerrainObject = uint32_t;
    // Generating sample terrain (pseudo-random values from the unseeded rand()).
    std::vector<TerrainObject> terrain = std::vector<TerrainObject>(numTerrainElements);
    for (uint64_t y = 0; y < terrainHeight; y++) {
        for (uint64_t x = 0; x < terrainWidth; x++) {
            uint64_t index = x + y * terrainWidth;
            // LSB[0:1] = 2 -> object (static object id = MSB 16 bits)
            // LSB[0:1] = 1 = object (npc id = MSB 16 bits)
            // LSB[0:1] = 0 = empty
            terrain[index] = rand() % 0xFFFFF;
        }
    }

    int deviceIndex = 0;
    int numCpuThreads = std::thread::hardware_concurrency();
    std::cout << "Encoding tiles." << std::endl;
    CompressedTerrainCache::TileManager<TerrainObject> tileManager(terrain.data(), terrainWidth, terrainHeight, tileWidth, tileHeight, tileCacheSlotColumns, tileCacheSlotRows, numCpuThreads, deviceIndex);

    // Launch configuration of the busy-work kernel does not change between
    // iterations, so it is queried once.
    int minGridSize;
    int blockSize;
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)k_keepGpuBoosted, 0));

    // Statistics written by decodeSelectedTiles on every call.
    double timeDecode = 0.0f;
    double dataSizeDecode = 0.0f;
    double throughputDecode = 0.0;
    double averageThroughput = 0.0f;
    float playerAngularPosition = 0.0f;
    std::vector<uint32_t> tileIndices;
    int playerMovements = 250;

    // A tile is visible when floor(distance) < radius; for an integer radius
    // this is the same as squaredDistance < radius * radius, which avoids both
    // the square root and any floating-point rounding.
    const uint64_t visibilityRadiusSquared = playerVisibilityRadius * playerVisibilityRadius;

    for (int i = 0; i < playerMovements; i++) {
        tileIndices.clear();
        playerAngularPosition += playerOrbitAngularVelocity;

        // The player position depends only on the orbit angle, so compute it
        // once per movement instead of once per tile.
        uint64_t playerX = terrainWidth / 2 + cos(playerAngularPosition) * terrainWidth / 4;
        uint64_t playerY = terrainHeight / 2 + sin(playerAngularPosition) * terrainHeight / 4;

        for (uint64_t tileY = 0; tileY < numTilesY; tileY++) {
            // Signed deltas: the player may be left of / above the tile center.
            const int64_t deltaY = static_cast<int64_t>(playerY) - static_cast<int64_t>(tileY * tileHeight + tileHeight / 2);
            for (uint64_t tileX = 0; tileX < numTilesX; tileX++) {
                // Checking if player visibility range collides with the current tile.
                const int64_t deltaX = static_cast<int64_t>(playerX) - static_cast<int64_t>(tileX * tileWidth + tileWidth / 2);
                const uint64_t squaredDistance = static_cast<uint64_t>(deltaX * deltaX + deltaY * deltaY);
                if (squaredDistance < visibilityRadiusSquared) {
                    // Row-major tile index; fits in 32 bits for this terrain size.
                    tileIndices.push_back(static_cast<uint32_t>(tileX + tileY * numTilesX));
                }
            }
        }

        unsigned char* loadedTilesOnDevice_d = tileManager.decodeSelectedTiles(tileIndices, &timeDecode, &dataSizeDecode, &throughputDecode);
        // A real application would consume loadedTilesOnDevice_d on the GPU here.
        // k_keepGpuBoosted declares no parameters, so the argument array below is
        // not read by it; it only marks where the decoded tiles would be passed.
        void* args[] = { &loadedTilesOnDevice_d };
        CUDA_CHECK(cudaLaunchKernel((void*)k_keepGpuBoosted, dim3(minGridSize, 1, 1), dim3(blockSize, 1, 1), args, 0, tileManager.stream));
        CUDA_CHECK(cudaStreamSynchronize(tileManager.stream));
        std::cout << "time = " << timeDecode << " seconds, dataSizeDecode = " << dataSizeDecode << " GB, throughputDecode = " << throughputDecode << " GB/s" << std::endl;
        averageThroughput += throughputDecode;
    }
    std::cout << "------------------------------------------------" << std::endl;
    std::cout << "Average throughput = " << averageThroughput / playerMovements << " GB/s" << std::endl;
    return 0;
}
// output with RTX4070:
time = 0.000438048 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 117.668 GB/s
time = 0.000373792 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 137.895 GB/s
time = 0.000491392 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 104.961 GB/s
time = 0.00041456 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 124.176 GB/s
time = 0.000393888 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 130.693 GB/s
time = 0.000370464 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 138.957 GB/s
time = 0.00043056 seconds, dataSizeDecode = 0.0514949 GB, throughputDecode = 119.6 GB/s
time = 0.000426816 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 120.841 GB/s
time = 0.0004112 seconds, dataSizeDecode = 0.0515277 GB, throughputDecode = 125.311 GB/s
time = 0.000376224 seconds, dataSizeDecode = 0.0515932 GB, throughputDecode = 137.134 GB/s
------------------------------------------------
Average throughput = 138.887 GB/s
output with RTX5070:
time = 0.000261216 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 197.324 GB/s
time = 0.00024416 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 211.108 GB/s
time = 0.000244576 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 210.749 GB/s
time = 0.00027504 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 187.525 GB/s
time = 0.000244192 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 210.812 GB/s
time = 0.00024672 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 208.652 GB/s
time = 0.000208128 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 247.341 GB/s
time = 0.000226208 seconds, dataSizeDecode = 0.0514949 GB, throughputDecode = 227.644 GB/s
time = 0.000246496 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 209.24 GB/s
time = 0.000246112 seconds, dataSizeDecode = 0.0515277 GB, throughputDecode = 209.367 GB/s
time = 0.000241792 seconds, dataSizeDecode = 0.0515932 GB, throughputDecode = 213.379 GB/s
------------------------------------------------
Average throughput = 206.4 GB/s