Skip to content
Open
43 changes: 12 additions & 31 deletions src/Algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Algorithm::isInit()
{
return this->mPipeline && this->mPipelineCache && this->mPipelineLayout &&
this->mDescriptorPool && this->mDescriptorSet &&
this->mDescriptorSetLayout && this->mShaderModule;
this->mDescriptorSetLayout && this->mShader;
}

void
Expand Down Expand Up @@ -73,18 +73,12 @@ Algorithm::destroy()
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mPipelineLayout = nullptr;
}

if (this->mFreeShaderModule && this->mShaderModule) {
KP_LOG_DEBUG("Kompute Algorithm Destroying shader module");
if (!this->mShaderModule) {
KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader "
"module but it is null");
}
this->mDevice->destroy(
*this->mShaderModule,
(vk::Optional<const vk::AllocationCallbacks>)nullptr);
this->mShaderModule = nullptr;
}

if (this->mShader)
{
this->mShader->destroy();
this->mShader = nullptr;
}

// We don't call freeDescriptorSet as the descriptor pool is not created
// with VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT more at
Expand Down Expand Up @@ -219,24 +213,11 @@ Algorithm::createParameters()
}

void
Algorithm::createShaderModule()
Algorithm::createShaderModule(const std::vector<uint32_t>& spirv)
{
KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");

vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
sizeof(uint32_t) *
this->mSpirv.size(),
this->mSpirv.data());

KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
this->mSpirv.size());
this->mFreeShaderModule = true;
this->mShaderModule = std::make_shared<vk::ShaderModule>();
this->mDevice->createShaderModule(
&shaderModuleInfo, nullptr, this->mShaderModule.get());
this->mFreeShaderModule = true;

KP_LOG_DEBUG("Kompute Algorithm create shader module success");
KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
this->mShader = std::make_shared<Shader>(this->mDevice, spirv);
KP_LOG_DEBUG("Kompute Algorithm create shader module success");
}

void
Expand Down Expand Up @@ -289,7 +270,7 @@ Algorithm::createPipeline()
vk::PipelineShaderStageCreateInfo shaderStage(
vk::PipelineShaderStageCreateFlags(),
vk::ShaderStageFlagBits::eCompute,
*this->mShaderModule,
this->mShader->getShaderModule(),
"main",
&specializationInfo);

Expand Down
3 changes: 2 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ add_library(kompute Algorithm.cpp
Tensor.cpp
Core.cpp
Image.cpp
Memory.cpp)
Memory.cpp
Shader.cpp)

add_library(kompute::kompute ALIAS kompute)

Expand Down
4 changes: 2 additions & 2 deletions src/Manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,11 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
uint32_t computeQueueFamilyIndex = 0;
bool computeQueueSupported = false;
for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
vk::QueueFamilyProperties queueFamilyProperties =
const vk::QueueFamilyProperties& queueFamilyProperties =
allQueueFamilyProperties[i];

if (queueFamilyProperties.queueFlags &
vk::QueueFlagBits::eCompute) {
vk::QueueFlagBits::eCompute ) {
computeQueueFamilyIndex = i;
computeQueueSupported = true;
break;
Expand Down
43 changes: 43 additions & 0 deletions src/Shader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include "kompute/Shader.hpp"

namespace kp {

/**
 * Constructor accepting a device and a SPIR-V binary.
 *
 * Compiles the provided SPIR-V words into a vk::ShaderModule owned by this
 * instance. Throws std::runtime_error if module creation fails.
 *
 * @param device The vk::Device for the shader module to be compiled for
 * @param spv The SPIR-V binary
 **/
Shader::Shader(const std::shared_ptr<vk::Device>& device,
               const std::vector<uint32_t>& spv) :
  mDevice(device)
{
    KP_LOG_DEBUG("Kompute Shader constructor started");
    KP_LOG_DEBUG("Kompute Shader Creating shader module. ShaderFileSize: {}",
                 spv.size());

    // codeSize is expressed in bytes, hence the conversion from the number
    // of 32-bit SPIR-V words.
    vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
                                                sizeof(uint32_t) * spv.size(),
                                                spv.data());

    // The pointer-based overload returns a vk::Result rather than throwing;
    // check it explicitly so a failed creation does not go unnoticed and
    // leave a null module behind.
    vk::Result result = this->mDevice->createShaderModule(
      &shaderModuleInfo, nullptr, &(this->mShaderModule));
    if (result != vk::Result::eSuccess) {
        throw std::runtime_error(
          "Kompute Shader failed to create shader module: " +
          vk::to_string(result));
    }

    KP_LOG_DEBUG("Kompute Shader constructor success");
}

/**
 * Getter for the underlying vk::ShaderModule.
 *
 * @return Reference to the managed shader module; throws
 *         std::runtime_error if the module was already destroyed.
 **/
const vk::ShaderModule& Shader::getShaderModule()
{
    if (!this->mDestroyed) {
        return this->mShaderModule;
    }
    throw std::runtime_error("Attempting to get vk::ShaderModule from destroyed kp::Shader instance");
}

/**
 * Destroys the underlying vk::ShaderModule.
 *
 * Safe to call multiple times; only the first call releases the module and
 * subsequent calls are no-ops.
 **/
void Shader::destroy()
{
    KP_LOG_DEBUG("Kompute Shader destroy started");
    if (!this->mDestroyed)
    {
        // Only log (and release) when there is actually something to destroy,
        // so repeated calls do not produce misleading log output.
        KP_LOG_DEBUG("Kompute Shader Destroying shader module");
        this->mDestroyed = true;
        this->mDevice->destroyShaderModule(this->mShaderModule);
    }
    KP_LOG_DEBUG("Kompute Shader destroy success");
}

/**
 * Destructor; releases the underlying shader module if it is still alive.
 * Delegates to destroy(), which is idempotent.
 **/
Shader::~Shader() { this->destroy(); }

} // end namespace kp
10 changes: 4 additions & 6 deletions src/include/kompute/Algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#endif

#include "kompute/Tensor.hpp"
#include "kompute/Shader.hpp"
#include "logger/Logger.hpp"

namespace kp {
Expand Down Expand Up @@ -95,7 +96,6 @@ class Algorithm
KP_LOG_DEBUG("Kompute Algorithm rebuild started");

this->mMemObjects = memObjects;
this->mSpirv = spirv;

if (specializationConstants.size()) {
if (this->mSpecializationConstantsData) {
Expand Down Expand Up @@ -137,7 +137,7 @@ class Algorithm
}

this->createParameters();
this->createShaderModule();
this->createShaderModule(spirv);
this->createPipeline();
}

Expand Down Expand Up @@ -303,8 +303,6 @@ class Algorithm
bool mFreeDescriptorPool = false;
std::shared_ptr<vk::DescriptorSet> mDescriptorSet;
bool mFreeDescriptorSet = false;
std::shared_ptr<vk::ShaderModule> mShaderModule;
bool mFreeShaderModule = false;
std::shared_ptr<vk::PipelineLayout> mPipelineLayout;
bool mFreePipelineLayout = false;
std::shared_ptr<vk::PipelineCache> mPipelineCache;
Expand All @@ -313,17 +311,17 @@ class Algorithm
bool mFreePipeline = false;

// -------------- ALWAYS OWNED RESOURCES
std::vector<uint32_t> mSpirv;
void* mSpecializationConstantsData = nullptr;
uint32_t mSpecializationConstantsDataTypeMemorySize = 0;
uint32_t mSpecializationConstantsSize = 0;
void* mPushConstantsData = nullptr;
uint32_t mPushConstantsDataTypeMemorySize = 0;
uint32_t mPushConstantsSize = 0;
Workgroup mWorkgroup;
std::shared_ptr<Shader> mShader = nullptr;

// Create util functions
void createShaderModule();
void createShaderModule(const std::vector<uint32_t>& spirv);
void createPipeline();

// Parameters
Expand Down
44 changes: 44 additions & 0 deletions src/include/kompute/Shader.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#pragma once

#include "kompute/Core.hpp"
#include "logger/Logger.hpp"
#include <memory>

namespace kp {

// Forward declaration of Shader. Note: this class does not currently inherit
// std::enable_shared_from_this, so this declaration is informational only.
class Shader;

/*
* Wrapper for Vulkan's shader modules.
*/
/**
 * Wrapper for Vulkan's shader modules.
 *
 * Owns the lifetime of a vk::ShaderModule created from a SPIR-V binary; the
 * module is released either explicitly via destroy() or on destruction.
 */
class Shader
{
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::Device> mDevice;

    // -------------- ALWAYS OWNED RESOURCES
    vk::ShaderModule mShaderModule;
    // Set once destroy() has released the module; guards against double-free
    // and use-after-destroy in getShaderModule().
    bool mDestroyed = false;

  public:

    /**
     * Constructor accepting a device and a SPIR-V binary
     *
     * @param device The vk::Device for the shader module to be compiled for
     * @param spv The SPIR-V binary
     **/
    Shader(const std::shared_ptr<vk::Device>& device,
           const std::vector<uint32_t>& spv);

    // Copying is disallowed: two copies would both attempt to destroy the
    // same underlying vk::ShaderModule (double-free). Use
    // std::shared_ptr<Shader> to share an instance instead.
    Shader(const Shader&) = delete;
    Shader& operator=(const Shader&) = delete;

    /**
     * Getter for the underlying vk::ShaderModule
     *
     * @return Reference to the managed shader module
     **/
    const vk::ShaderModule& getShaderModule();

    /**
     * Destroys the underlying shader module; safe to call more than once.
     **/
    void destroy();

    ~Shader();
};

} // End namespace kp
4 changes: 2 additions & 2 deletions test/TestAsyncOperations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
std::vector<std::shared_ptr<kp::Algorithm>> algosAsync;

for (uint32_t i = 0; i < numParallel; i++) {
inputsAsyncB.push_back(mgr.tensor(data));
algosAsync.push_back(mgr.algorithm({ inputsAsyncB[i] }, spirv));
inputsAsyncB.push_back(mgrAsync.tensor(data));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you changing this?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test was wrong and was causing my driver to crash. It was trying to use different kp::Manager (and therefore different underlying vk::Device) instances for allocating buffers, creating pipelines and executing them. You can't do this, even if the underlying hardware is the same. If you read the Vulkan validation errors while running the tests, you would see this:

[Jan  8 2026 11:08:47] [debug] [Manager.cpp:42] [VALIDATION]: Validation - vkCmdPipelineBarrier(): pBufferMemoryBarriers[0].buffer (VkBuffer 0x280000000028) was created, allocated or retrieved from VkDevice 0x5a2129296950, but command is using (or its dispatchable parameter is associated with) VkDevice 0x5a2129386e80

My change here fixes this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It still isn't completely fixed; see here for details: #445 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this is not "fixing" anything. This is a test that is currently set up on specific hardware, which is testing parallel execution. What you are doing is just not running the test. This is not "fixing" something.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It still crashes, but the validation error in my message no longer appears when I attempt to run the test. So it is a step in the right direction towards compliant API usage, but not a complete fix.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to see what this validation error may be, but your change basically makes this test redundant, so it's not correct. If you read through the test you are making it to not compare anything; the test is profiling a parallel queue with a non-parallel queue on a specific GPU that actually supports parallel processing (not async, parallel). Here's more info: https://medium.com/data-science/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc

Copy link
Author

@softcookiepp softcookiepp Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The simplified reasoning is that you are taking an underlying vk::PhysicalDevice and creating two different vk::Device instances with it, since each kp::Manager creates its own vk::Device. Though these may have the same physical GPU, the Vulkan driver may initialize separate resources every time a vk::Device is created for a given GPU.

What do these resources look like? It really depends on the driver. NVIDIA drivers (which given your comments, it seems you have) are more robust in this regard, while AMD drivers (which I am using right now) are quite a bit more unforgiving about things like this.

So when you try to use memory and pipelines allocated with mgr (which has its own vk::Device instance) with mgrAsync (which has a totally separate vk::Device), you are violating one of the core assumptions that Vulkan drivers make. You may as well be asking for it to mix and match resources created on different physical GPUs.

As for why the test is redundant now, I am really not sure; both mgr and mgrAsync are still used in this test. I just made sure mgrAsync did not accept tensors and algorithms created with mgr. If you could point to exactly which lines break the test and why, that would be very much appreciated.

algosAsync.push_back(mgrAsync.algorithm({ inputsAsyncB[i] }, spirv));
}

std::vector<std::shared_ptr<kp::Sequence>> sqs;
Expand Down