Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ VULKAN_SHADER_BINS := $(VULKAN_SHADER_SRCS:.comp=.spv)
DEPS += $(VULKAN_SHADER_BINS)

%.spv: %.comp
$(CGLSLC) -c $< -o $@
$(CGLSLC) -c $< -o $@ --target-env=vulkan1.2
nn-vulkan-test: src/nn/nn-vulkan-test.cpp nn-quants.o nn-core.o nn-executor.o nn-vulkan.o ${DEPS}
$(CXX) $(CXXFLAGS) $(filter-out %.spv, $^) -o $@ $(LIBS)
endif
Expand Down
12 changes: 6 additions & 6 deletions src/nn/nn-vulkan-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,16 +662,16 @@ void testMultiheadAtt_F32_F32() {
int main() {
initQuants();

testRmsNorm_F32_F32_F32<3>();
testRmsNorm_F32_F32_F32<4>();
testRmsNorm_F32_F32_F32<1024>();
testRmsNorm_F32_F32_F32<3191>();
testRmsNorm_F32_F32_F32<3196>();

testSilu_F32_F32<3>();
testSilu_F32_F32<4>();
testSilu_F32_F32<32>();
testSilu_F32_F32<101>();
testSilu_F32_F32<104>();

testMul_F32_F32<32>();
testMul_F32_F32<47>();
testMul_F32_F32<48>();

testMergeAdd_F32_F32();

Expand All @@ -686,7 +686,7 @@ int main() {

testCast_F32_F32<128>();
testCast_F32_F32<32>();
testCast_F32_F32<9>();
testCast_F32_F32<8>();

testCast_F32_Q80<256>();
testCast_F32_Q80<64>();
Expand Down
81 changes: 61 additions & 20 deletions src/nn/nn-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,22 +522,48 @@ static std::vector<NnVulkanBatchInfo> buildBatchInfo(NnOpConfig *opConfig, NnVul
return offset;
}

static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSize, NnUint *groupCount) {
// Returns the smallest power of two that is >= n, clamped to [min, max].
// For n == 0 the pre-clamp result is 1. Callers are expected to pass
// power-of-two bounds (e.g. 32..256) so the clamped result stays a power of two.
static NnUint roundUpPow2(NnUint n, NnUint min, NnUint max) {
    NnUint pow2 = 1;
    while (pow2 < n)
        pow2 <<= 1; // double until we reach or pass n
    if (pow2 < min)
        return min;
    if (pow2 > max)
        return max;
    return pow2;
}

static uint32_t resolveShaderNThreads(const NnOpConfig *opConfig, const NnSize2D inputSize) {
if (opConfig->code == OP_MATMUL) {
if (opConfig->weightSize.floatType == F_Q40) {
constexpr NnUint maxThreads = 256; // Shader constant
NnUint t = roundUpPow2(inputSize.x / (Q40_BLOCK_SIZE * 2), 32, maxThreads);
return t;
}
}
return 0;
}

static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSize, NnUint *groupCount, const NnSize2D inputSize, const NnSize2D outputSize) {
groupCount[0] = 1;
groupCount[1] = batchSize;
groupCount[2] = 1;

if (opConfig->code == OP_CAST ||
opConfig->code == OP_MUL ||
opConfig->code == OP_SILU ||
opConfig->code == OP_SHIFT ||
opConfig->code == OP_MERGE_ADD)
groupCount[2] = 32;
else if (opConfig->code == OP_MATMUL) {
if (opConfig->code == OP_CAST) {
if (outputSize.floatType == F_Q80) {
groupCount[2] = outputSize.x / Q80_BLOCK_SIZE;
} else {
constexpr NnUint chunkSize = 4; // Shader constant
groupCount[2] = outputSize.x / chunkSize;
}
} else if (opConfig->code == OP_MERGE_ADD) {
if (inputSize.floatType == F_Q80) {
groupCount[2] = outputSize.x / Q80_BLOCK_SIZE; // Yes, outputSize is used here
} else {
groupCount[2] = 32;
}
} else if (opConfig->code == OP_MATMUL) {
if (opConfig->weightSize.floatType == F_Q40) {
// Must be synced with the shader
constexpr NnUint tileSizeN = 2;
constexpr NnUint tileSizeD = 16;
constexpr NnUint tileSizeN = 2; // Shader constant
constexpr NnUint tileSizeD = 8; // Shader constant
const NnUint blockSize = getBlockSize(opConfig->weightSize.floatType);
assert(opConfig->weightSize.y % (tileSizeN * blockSize) == 0);
assert(opConfig->weightSize.x % tileSizeD == 0);
Expand All @@ -550,8 +576,17 @@ static void resolveShaderGroups(const NnOpConfig *opConfig, const NnUint batchSi
groupCount[2] = ((NnMultiHeadAttOpConfig *)opConfig->config)->nHeads0;
else if (opConfig->code == OP_INV_RMS)
groupCount[2] = ((NnInvRmsOpConfig *)opConfig->config)->nColumns;
else if (opConfig->code == OP_RMS_NORM)
groupCount[2] = ((NnRmsNormOpConfig *)opConfig->config)->nColumns;
else if (
opConfig->code == OP_EMBEDDING ||
opConfig->code == OP_RMS_NORM ||
opConfig->code == OP_MUL ||
opConfig->code == OP_SILU ||
opConfig->code == OP_SHIFT
) {
constexpr NnUint chunkSize = 4; // Shader constant
assert(outputSize.x % chunkSize == 0);
groupCount[2] = outputSize.x / chunkSize;
}
}

static std::vector<uint32_t> readShader(const char *fileName) {
Expand Down Expand Up @@ -647,12 +682,10 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD

std::vector<vk::PipelineShaderStageCreateInfo> shaderCreateInfos(segmentConfig->nOps);

constexpr NnUint maxConsts = 3;
std::vector<NnUint> nConsts(segmentConfig->nOps);
std::vector<int> consts(segmentConfig->nOps * maxConsts);
std::vector<vk::SpecializationInfo> specInfos(segmentConfig->nOps);
std::vector<vk::SpecializationMapEntry> specMapEntries(segmentConfig->nOps * maxConsts);

std::vector<vk::SpecializationMapEntry> specEntries(segmentConfig->nOps);
std::vector<uint32_t> nThreads(segmentConfig->nOps);

for (NnUint opIndex = 0; opIndex < segmentConfig->nOps; opIndex++) {
NnOpConfig *opConfig = &segmentConfig->ops[opIndex];
NnSize2D inputSize = data->resolveBufferSize(&opConfig->input);
Expand All @@ -674,12 +707,17 @@ NnVulkanDeviceSegment::NnVulkanDeviceSegment(NnVulkanContext *context, NnVulkanD
code.data()
);

nThreads[opIndex] = resolveShaderNThreads(opConfig, inputSize);
specEntries[opIndex] = vk::SpecializationMapEntry(0, 0, sizeof(uint32_t));
specInfos[opIndex] = vk::SpecializationInfo(1, &specEntries[opIndex], sizeof(uint32_t), &nThreads[opIndex]);

vk::ShaderModule shaderModule = context->device.createShaderModule(shaderModuleCreateInfo);
vk::PipelineShaderStageCreateInfo shaderCreateInfo(
vk::PipelineShaderStageCreateFlags(),
vk::ShaderStageFlagBits::eCompute,
shaderModule,
"main"
"main",
&specInfos[opIndex]
);

shaderModules[opIndex] = shaderModule;
Expand Down Expand Up @@ -837,7 +875,10 @@ void NnVulkanDeviceSegment::forward(NnUint opIndex, NnUint nThreads, NnUint thre

NnUint opGroupCount[3];
for (NnUint opIndex = 0; opIndex < segmentConfig->nOps; opIndex++) {
resolveShaderGroups(&segmentConfig->ops[opIndex], batchSize, opGroupCount);
NnSize2D inputSize = data->resolveBufferSize(&segmentConfig->ops[opIndex].input);
NnSize2D outputSize = data->resolveBufferSize(&segmentConfig->ops[opIndex].output);

resolveShaderGroups(&segmentConfig->ops[opIndex], batchSize, opGroupCount, inputSize, outputSize);

if (opIndex > 0) {
std::vector<NnOpBufferUsage> *usages = &opBufferUsages[opIndex];
Expand Down
18 changes: 7 additions & 11 deletions src/nn/vulkan/cast-forward-f32-f32.comp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#version 450

#define N_THREADS 256
#extension GL_EXT_control_flow_attributes : enable

#define CHUNK_SIZE 4
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -17,20 +19,14 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint nWorkGroups = gl_NumWorkGroups.z;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;

const uint chunkIndex = gl_WorkGroupID.z;
const BatchInfo info = infos[batchIndex];
const uint slice = info.inputSizeX / nWorkGroups;
const uint rest = info.inputSizeX % nWorkGroups;
const uint offset = workGroupIndex * slice + min(rest, workGroupIndex);
const uint dim = slice + (workGroupIndex < rest ? 1 : 0);
const uint offset = chunkIndex * CHUNK_SIZE;
const uint xOffset = info.inputOffset + offset;
const uint yOffset = info.outputOffset + offset;

for (uint i = threadIndex; i < dim; i += N_THREADS) {
[[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
y[yOffset + i] = x[xOffset + i];
}
}
49 changes: 18 additions & 31 deletions src/nn/vulkan/cast-forward-f32-q80.comp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types : enable

#define Q80_BLOCK_SIZE 32
#define N_THREADS 256
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -27,37 +26,25 @@ layout(binding = 1) writeonly buffer outputBuffer { BlockQ80 y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint nWorkGroups = gl_NumWorkGroups.z;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;
const uint d = gl_WorkGroupID.z;

const BatchInfo info = infos[batchIndex];
const uint slice = info.outputSizeX / nWorkGroups;
const uint rest = info.outputSizeX % nWorkGroups;
const uint yStart = workGroupIndex * slice + min(rest, workGroupIndex);
const uint yEnd = yStart + slice + (workGroupIndex < rest ? 1 : 0);
const uint xOffset = info.inputOffset;
const uint yOffset = info.outputOffset;

for (uint i = yStart + threadIndex; i < yEnd; i += N_THREADS) {
const uint xiOffset = xOffset + i * Q80_BLOCK_SIZE;
const uint yiOffset = yOffset + i;

float amax = 0.0;
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = abs(x[xiOffset + j]);
amax = max(amax, v);
}

const float d = amax / 127.0f;
const float id = d != 0.0f ? 1.0f / d : 0.0f;

y[yiOffset].d = float16_t(d);

[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = x[xiOffset + j];
y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0f, 127.0f));
}
const uint xiOffset = info.inputOffset + d * Q80_BLOCK_SIZE;
const uint yiOffset = info.outputOffset + d;

float amax = 0.0;
[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
amax = max(amax, abs(x[xiOffset + j]));
}

const float dd = amax / 127.0f;
const float id = dd != 0.0f ? 1.0f / dd : 0.0f;

y[yiOffset].d = float16_t(dd);

[[unroll]] for (uint j = 0; j < Q80_BLOCK_SIZE; ++j) {
const float v = x[xiOffset + j];
y[yiOffset].qs[j] = int8_t(clamp(round(v * id), -127.0f, 127.0f));
}
}
27 changes: 11 additions & 16 deletions src/nn/vulkan/embedding-forward-f32-f32.comp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#version 450

#define N_THREADS 256
#extension GL_EXT_control_flow_attributes : enable

#define CHUNK_SIZE 4
#define N_BATCHES 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -20,23 +22,16 @@ layout(binding = 3) readonly buffer weightBuffer { float weight[]; };
shared uint sharedPosition;

void main() {
const uint threadIndex = gl_LocalInvocationID.x;
const uint batchIndex = gl_GlobalInvocationID.y;

if (threadIndex == 0) {
sharedPosition = uint(x[batchIndex]);
}
const uint batchIndex = gl_WorkGroupID.y;
const uint chunkIndex = gl_WorkGroupID.z;
const uint position = uint(x[batchIndex]);

barrier();

const uint position = sharedPosition;
const BatchInfo info = infos[batchIndex];
const uint offset = chunkIndex * CHUNK_SIZE;
const uint yOffset = info.outputOffset + offset;
const uint wOffset = position * info.outputSizeX + offset;

const uint outputSizeX = info.outputSizeX;
const uint yOffset = info.outputOffset;
const uint wOffset = position * outputSizeX;

for (uint i = threadIndex; i < outputSizeX; i += N_THREADS) {
[[unroll]] for (uint i = 0; i < CHUNK_SIZE; i++) {
y[yOffset + i] = weight[wOffset + i];
}
}
17 changes: 9 additions & 8 deletions src/nn/vulkan/matmul-forward-q80-q40-f32.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#extension GL_EXT_shader_16bit_storage : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable

#define N_THREADS 64
#define MAX_THREADS 256
#define N_BATCHES 32
#define TILE_SIZE_X 2
#define TILE_SIZE_D 16
#define TILE_SIZE_D 8

#define Q80_Q40_BLOCK_SIZE 32

layout(local_size_x = N_THREADS, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

struct BatchInfo {
uint inputOffset;
Expand All @@ -35,17 +35,18 @@ layout(binding = 1) writeonly buffer outputBuffer { float y[]; };
layout(binding = 2) readonly uniform batchInfosBuffer { BatchInfo infos[N_BATCHES]; };
layout(binding = 3) readonly buffer weightBuffer { BlockQ40 weight[]; };

shared float16_t sums[N_THREADS * TILE_SIZE_D];
shared float16_t sums[MAX_THREADS * TILE_SIZE_D];

void main() {
const uint nThreads = gl_WorkGroupSize.x;
const uint threadIndex = gl_LocalInvocationID.x;
const uint batchIndex = gl_WorkGroupID.y;
const uint workGroupIndex = gl_WorkGroupID.z;
const BatchInfo info = infos[batchIndex];

const uint xTiles = info.inputSizeX / TILE_SIZE_X;
const uint xSlice = xTiles / N_THREADS;
const uint xRest = xTiles % N_THREADS;
const uint xSlice = xTiles / nThreads;
const uint xRest = xTiles % nThreads;

const uint inputOffset = info.inputOffset;
const uint inputSizeX = info.inputSizeX;
Expand Down Expand Up @@ -97,15 +98,15 @@ void main() {

barrier();

[[unroll]] for (uint i = N_THREADS / 2; i > 0; i >>= 1) {
for (uint i = nThreads / 2; i > 0; i >>= 1) {
for (uint dt = 0; dt < TILE_SIZE_D; dt++) {
if (threadIndex < i) {
sums[threadIndex * TILE_SIZE_D + dt] += sums[(threadIndex + i) * TILE_SIZE_D + dt];
}
}
barrier();
}
for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += N_THREADS) {
for (uint dt = threadIndex; dt < TILE_SIZE_D; dt += nThreads) {
y[outputOffset + d + dt] = float(sums[dt]);
}
}
Loading
Loading