Skip to content

Player Movement Example With Custom Tile Index Calculation

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Sep 5, 2025 · 3 revisions

Code:

#include <stdio.h>
#include <random>
#include <cmath>
// Cached encoder tool for 2d terrain.
#include "CompressedTerrainCache.cuh"
// Busy-work kernel: every thread runs one million iterations of integer arithmetic
// seeded and periodically perturbed by the device clock. As the name suggests, it is
// presumably meant to keep the GPU busy (clocks boosted) between decode calls.
// Takes no parameters and reads/writes no global memory; the launch configuration is
// chosen entirely by the caller.
__global__ void k_keepGpuBoosted() {
    // Unsigned on purpose: the repeated squaring overflows within a few iterations,
    // and wrap-around is only well-defined for unsigned types (signed overflow is UB).
    unsigned int r = static_cast<unsigned int>(clock());
    for (int i = 0; i < 1000000; i++) {
        r += r * r;
        // Mix the clock in every 100 iterations so the value stays runtime-dependent.
        if (i % 100 == 0) {
            r += static_cast<unsigned int>(clock());
        }
    }
    // Data-dependent side effect; presumably here so the loop is not removed as dead code.
    if (r == 0) {
        printf("!");
    }
}
// Example driver: a player orbits the center of a 2D terrain; on every movement step the
// tiles whose centers lie within the player's visibility radius are selected (row-major
// tile indices), decoded through CompressedTerrainCache::TileManager, and a busy-work
// kernel is launched on the manager's stream. Prints per-step decode statistics and the
// average throughput over all steps. Returns 0.
int main()
{
    // Player can see this far (in units).
    const uint64_t playerVisibilityRadius = 2025;
    // Low velocity is more cache-friendly, high velocity causes more decoding and PCIE utilization.
    const float playerOrbitAngularVelocity = 0.009f;
    // 2D terrain map size (in units). With 32-bit elements this is 11001 * 11003 * 4 bytes,
    // i.e. about 0.48 GB of host memory for the raw terrain.
    // NOTE(review): the original note says the raw terrain needs no device allocation - confirm against TileManager.
    const uint64_t terrainWidth = 11001;
    const uint64_t terrainHeight = 11003;
    // 2D tile size (in units).
    const uint64_t tileWidth = 64;
    const uint64_t tileHeight = 64;
    // Tile cache size, in tiles (so that 64x64 cache can store 4096 tiles at once). Consumes device memory.
    const uint64_t tileCacheSlotColumns = 64;
    const uint64_t tileCacheSlotRows = 64;
    // Tile ordering used below is row-major: index = tileX + tileY * numTilesX.
    // Tile counts are rounded up so partially covered edge tiles are included.
    const uint64_t numTerrainElements = terrainWidth * terrainHeight;
    const uint64_t numTilesX = (terrainWidth + tileWidth - 1) / tileWidth;
    const uint64_t numTilesY = (terrainHeight + tileHeight - 1) / tileHeight;

    using TerrainObject = uint32_t;

    // Generating sample terrain: every cell gets a pseudo-random value in [0, 0xFFFFF).
    // Intended bit layout of a cell (per the original notes):
    //   LSB[0:1] = 2 -> object (static object id = MSB 16 bits)
    //   LSB[0:1] = 1 -> object (npc id = MSB 16 bits)
    //   LSB[0:1] = 0 -> empty
    std::vector<TerrainObject> terrain(numTerrainElements);
    for (uint64_t y = 0; y < terrainHeight; y++) {
        for (uint64_t x = 0; x < terrainWidth; x++) {
            terrain[x + y * terrainWidth] = rand() % 0xFFFFF;
        }
    }

    int deviceIndex = 0;
    // hardware_concurrency() is allowed to return 0 when the value is not computable;
    // never hand a zero thread count to the tile manager.
    int numCpuThreads = static_cast<int>(std::thread::hardware_concurrency());
    if (numCpuThreads < 1) {
        numCpuThreads = 1;
    }
    std::cout << "Encoding tiles." << std::endl;
    CompressedTerrainCache::TileManager<TerrainObject> tileManager(terrain.data(), terrainWidth, terrainHeight, tileWidth, tileHeight, tileCacheSlotColumns, tileCacheSlotRows, numCpuThreads, deviceIndex);

    // Filled in by decodeSelectedTiles on every call (printed below as seconds, GB, GB/s).
    double timeDecode = 0.0;
    double dataSizeDecode = 0.0;
    double throughputDecode = 0.0;
    double averageThroughput = 0.0;
    float playerAngularPosition = 0.0f;
    std::vector<uint32_t> tileIndices;
    const int playerMovements = 250;
    // Comparing squared distances keeps the visibility test in exact integer arithmetic.
    // For an integer radius, floor(sqrt(d2)) < radius is equivalent to d2 < radius * radius.
    const uint64_t visibilityRadiusSquared = playerVisibilityRadius * playerVisibilityRadius;
    for (int i = 0; i < playerMovements; i++) {
        tileIndices.clear();
        playerAngularPosition += playerOrbitAngularVelocity;
        // The player position depends only on the angle, so compute it once per movement
        // instead of once per tile. The orbit is centered on the terrain with a radius of
        // a quarter of the terrain size, so both coordinates stay positive.
        const int64_t playerX = static_cast<int64_t>(terrainWidth / 2 + cos(playerAngularPosition) * terrainWidth / 4);
        const int64_t playerY = static_cast<int64_t>(terrainHeight / 2 + sin(playerAngularPosition) * terrainHeight / 4);
        for (uint64_t tileY = 0; tileY < numTilesY; tileY++) {
            // Signed differences: the tile center may lie on either side of the player.
            const int64_t distanceY = playerY - static_cast<int64_t>(tileY * tileHeight + tileHeight / 2);
            for (uint64_t tileX = 0; tileX < numTilesX; tileX++) {
                // Checking if player visibility range reaches the center of the current tile.
                const int64_t distanceX = playerX - static_cast<int64_t>(tileX * tileWidth + tileWidth / 2);
                const uint64_t distanceSquared = static_cast<uint64_t>(distanceX * distanceX + distanceY * distanceY);
                if (distanceSquared < visibilityRadiusSquared) {
                    tileIndices.push_back(static_cast<uint32_t>(tileX + tileY * numTilesX));
                }
            }
        }
        unsigned char* loadedTilesOnDevice_d = tileManager.decodeSelectedTiles(tileIndices, &timeDecode, &dataSizeDecode, &throughputDecode);
        // Use the fetched terrain data on gpu.
        // NOTE(review): k_keepGpuBoosted takes no parameters, so this argument is not consumed by
        // the kernel; it only shows where a real consumer kernel would receive the pointer.
        void* args[] = { &loadedTilesOnDevice_d };
        int minGridSize;
        int blockSize;
        CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)k_keepGpuBoosted, 0));
        CUDA_CHECK(cudaLaunchKernel((void*)k_keepGpuBoosted, dim3(minGridSize, 1, 1), dim3(blockSize, 1, 1), args, 0, tileManager.stream));
        // Wait for the kernel before starting the next movement step.
        CUDA_CHECK(cudaStreamSynchronize(tileManager.stream));

        std::cout << "time = " << timeDecode << " seconds, dataSizeDecode = " << dataSizeDecode << " GB, throughputDecode = " << throughputDecode << " GB/s" << std::endl;
        averageThroughput += throughputDecode;
    }
    std::cout << "------------------------------------------------" << std::endl;
    std::cout << "Average throughput = " << averageThroughput / playerMovements << " GB/s" << std::endl;
    return 0;
}

Output with RTX 4070 (excerpt of the 250 per-movement lines; the average is computed over all 250 movements):

time = 0.000438048 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 117.668 GB/s
time = 0.000373792 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 137.895 GB/s
time = 0.000491392 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 104.961 GB/s
time = 0.00041456 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 124.176 GB/s
time = 0.000393888 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 130.693 GB/s
time = 0.000370464 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 138.957 GB/s
time = 0.00043056 seconds, dataSizeDecode = 0.0514949 GB, throughputDecode = 119.6 GB/s
time = 0.000426816 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 120.841 GB/s
time = 0.0004112 seconds, dataSizeDecode = 0.0515277 GB, throughputDecode = 125.311 GB/s
time = 0.000376224 seconds, dataSizeDecode = 0.0515932 GB, throughputDecode = 137.134 GB/s
------------------------------------------------
Average throughput = 138.887 GB/s

Output with RTX 5070 (excerpt of the 250 per-movement lines; the average is computed over all 250 movements):

time = 0.000261216 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 197.324 GB/s
time = 0.00024416 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 211.108 GB/s
time = 0.000244576 seconds, dataSizeDecode = 0.0515441 GB, throughputDecode = 210.749 GB/s
time = 0.00027504 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 187.525 GB/s
time = 0.000244192 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 210.812 GB/s
time = 0.00024672 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 208.652 GB/s
time = 0.000208128 seconds, dataSizeDecode = 0.0514785 GB, throughputDecode = 247.341 GB/s
time = 0.000226208 seconds, dataSizeDecode = 0.0514949 GB, throughputDecode = 227.644 GB/s
time = 0.000246496 seconds, dataSizeDecode = 0.0515768 GB, throughputDecode = 209.24 GB/s
time = 0.000246112 seconds, dataSizeDecode = 0.0515277 GB, throughputDecode = 209.367 GB/s
time = 0.000241792 seconds, dataSizeDecode = 0.0515932 GB, throughputDecode = 213.379 GB/s
------------------------------------------------
Average throughput = 206.4 GB/s
Clone this wiki locally